Compare commits

..

2 Commits

Author SHA1 Message Date
Konstantin Knizhnik
08cf2749ca Reduce number of iteration in test_physical_replication to reduce test time 2024-05-22 11:52:31 +03:00
Konstantin Knizhnik
c85fd74d34 Fix test_physical_replication test taken in acount autocommit behaviour of psycopg 2024-04-08 17:37:48 +03:00
147 changed files with 3989 additions and 6193 deletions

View File

@@ -150,7 +150,7 @@ runs:
# Use aws s3 cp (instead of aws s3 sync) to keep files from previous runs to make old URLs work, # Use aws s3 cp (instead of aws s3 sync) to keep files from previous runs to make old URLs work,
# and to keep files on the host to upload them to the database # and to keep files on the host to upload them to the database
time s5cmd --log error cp "${WORKDIR}/report/*" "s3://${BUCKET}/${REPORT_PREFIX}/${GITHUB_RUN_ID}/" time aws s3 cp --recursive --only-show-errors "${WORKDIR}/report" "s3://${BUCKET}/${REPORT_PREFIX}/${GITHUB_RUN_ID}"
# Generate redirect # Generate redirect
cat <<EOF > ${WORKDIR}/index.html cat <<EOF > ${WORKDIR}/index.html

View File

@@ -10,7 +10,7 @@ inputs:
required: true required: true
api_host: api_host:
desctiption: 'Neon API host' desctiption: 'Neon API host'
default: console-stage.neon.build default: console.stage.neon.tech
outputs: outputs:
dsn: dsn:
description: 'Created Branch DSN (for main database)' description: 'Created Branch DSN (for main database)'

View File

@@ -13,7 +13,7 @@ inputs:
required: true required: true
api_host: api_host:
desctiption: 'Neon API host' desctiption: 'Neon API host'
default: console-stage.neon.build default: console.stage.neon.tech
runs: runs:
using: "composite" using: "composite"

View File

@@ -13,7 +13,7 @@ inputs:
default: 15 default: 15
api_host: api_host:
desctiption: 'Neon API host' desctiption: 'Neon API host'
default: console-stage.neon.build default: console.stage.neon.tech
provisioner: provisioner:
desctiption: 'k8s-pod or k8s-neonvm' desctiption: 'k8s-pod or k8s-neonvm'
default: 'k8s-pod' default: 'k8s-pod'

View File

@@ -10,7 +10,7 @@ inputs:
required: true required: true
api_host: api_host:
desctiption: 'Neon API host' desctiption: 'Neon API host'
default: console-stage.neon.build default: console.stage.neon.tech
runs: runs:
using: "composite" using: "composite"

View File

@@ -18,7 +18,6 @@ on:
concurrency: concurrency:
group: ${{ github.workflow }}-${{ github.event.pull_request.number }} group: ${{ github.workflow }}-${{ github.event.pull_request.number }}
cancel-in-progress: false
env: env:
GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}

View File

@@ -21,7 +21,6 @@ defaults:
concurrency: concurrency:
group: build-build-tools-image-${{ inputs.image-tag }} group: build-build-tools-image-${{ inputs.image-tag }}
cancel-in-progress: false
# No permission for GITHUB_TOKEN by default; the **minimal required** set of permissions should be granted in each job. # No permission for GITHUB_TOKEN by default; the **minimal required** set of permissions should be granted in each job.
permissions: {} permissions: {}

View File

@@ -735,7 +735,7 @@ jobs:
run: | run: |
mkdir -p .docker-custom mkdir -p .docker-custom
echo DOCKER_CONFIG=$(pwd)/.docker-custom >> $GITHUB_ENV echo DOCKER_CONFIG=$(pwd)/.docker-custom >> $GITHUB_ENV
- uses: docker/setup-buildx-action@v2 - uses: docker/setup-buildx-action@v3
- uses: docker/login-action@v3 - uses: docker/login-action@v3
with: with:
@@ -792,7 +792,7 @@ jobs:
run: | run: |
mkdir -p .docker-custom mkdir -p .docker-custom
echo DOCKER_CONFIG=$(pwd)/.docker-custom >> $GITHUB_ENV echo DOCKER_CONFIG=$(pwd)/.docker-custom >> $GITHUB_ENV
- uses: docker/setup-buildx-action@v2 - uses: docker/setup-buildx-action@v3
with: with:
# Disable parallelism for docker buildkit. # Disable parallelism for docker buildkit.
# As we already build everything with `make -j$(nproc)`, running it in additional level of parallelisam blows up the Runner. # As we already build everything with `make -j$(nproc)`, running it in additional level of parallelisam blows up the Runner.
@@ -865,7 +865,7 @@ jobs:
run: run:
shell: sh -eu {0} shell: sh -eu {0}
env: env:
VM_BUILDER_VERSION: v0.28.1 VM_BUILDER_VERSION: v0.23.2
steps: steps:
- name: Checkout - name: Checkout
@@ -1133,6 +1133,8 @@ jobs:
-f deployPreprodRegion=true -f deployPreprodRegion=true
gh workflow --repo neondatabase/aws run deploy-prod.yml --ref main \ gh workflow --repo neondatabase/aws run deploy-prod.yml --ref main \
-f deployPgSniRouter=false \
-f deployProxy=false \
-f deployStorage=true \ -f deployStorage=true \
-f deployStorageBroker=true \ -f deployStorageBroker=true \
-f deployStorageController=true \ -f deployStorageController=true \

View File

@@ -28,9 +28,7 @@ jobs:
- name: Get build-tools image tag for the current commit - name: Get build-tools image tag for the current commit
id: get-build-tools-tag id: get-build-tools-tag
env: env:
# Usually, for COMMIT_SHA, we use `github.event.pull_request.head.sha || github.sha`, but here, even for PRs, COMMIT_SHA: ${{ github.event.pull_request.head.sha || github.sha }}
# we want to use `github.sha` i.e. point to a phantom merge commit to determine the image tag correctly.
COMMIT_SHA: ${{ github.sha }}
GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
run: | run: |
LAST_BUILD_TOOLS_SHA=$( LAST_BUILD_TOOLS_SHA=$(

View File

@@ -20,7 +20,6 @@ defaults:
concurrency: concurrency:
group: pin-build-tools-image-${{ inputs.from-tag }} group: pin-build-tools-image-${{ inputs.from-tag }}
cancel-in-progress: false
permissions: {} permissions: {}

429
Cargo.lock generated
View File

@@ -270,12 +270,6 @@ dependencies = [
"critical-section", "critical-section",
] ]
[[package]]
name = "atomic-take"
version = "1.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a8ab6b55fe97976e46f91ddbed8d147d966475dc29b2032757ba47e02376fbc3"
[[package]] [[package]]
name = "autocfg" name = "autocfg"
version = "1.1.0" version = "1.1.0"
@@ -304,7 +298,7 @@ dependencies = [
"fastrand 2.0.0", "fastrand 2.0.0",
"hex", "hex",
"http 0.2.9", "http 0.2.9",
"hyper 0.14.26", "hyper",
"ring 0.17.6", "ring 0.17.6",
"time", "time",
"tokio", "tokio",
@@ -341,7 +335,7 @@ dependencies = [
"bytes", "bytes",
"fastrand 2.0.0", "fastrand 2.0.0",
"http 0.2.9", "http 0.2.9",
"http-body 0.4.5", "http-body",
"percent-encoding", "percent-encoding",
"pin-project-lite", "pin-project-lite",
"tracing", "tracing",
@@ -392,7 +386,7 @@ dependencies = [
"aws-types", "aws-types",
"bytes", "bytes",
"http 0.2.9", "http 0.2.9",
"http-body 0.4.5", "http-body",
"once_cell", "once_cell",
"percent-encoding", "percent-encoding",
"regex-lite", "regex-lite",
@@ -520,7 +514,7 @@ dependencies = [
"crc32fast", "crc32fast",
"hex", "hex",
"http 0.2.9", "http 0.2.9",
"http-body 0.4.5", "http-body",
"md-5", "md-5",
"pin-project-lite", "pin-project-lite",
"sha1", "sha1",
@@ -552,7 +546,7 @@ dependencies = [
"bytes-utils", "bytes-utils",
"futures-core", "futures-core",
"http 0.2.9", "http 0.2.9",
"http-body 0.4.5", "http-body",
"once_cell", "once_cell",
"percent-encoding", "percent-encoding",
"pin-project-lite", "pin-project-lite",
@@ -591,15 +585,15 @@ dependencies = [
"aws-smithy-types", "aws-smithy-types",
"bytes", "bytes",
"fastrand 2.0.0", "fastrand 2.0.0",
"h2 0.3.26", "h2",
"http 0.2.9", "http 0.2.9",
"http-body 0.4.5", "http-body",
"hyper 0.14.26", "hyper",
"hyper-rustls", "hyper-rustls",
"once_cell", "once_cell",
"pin-project-lite", "pin-project-lite",
"pin-utils", "pin-utils",
"rustls 0.21.11", "rustls 0.21.9",
"tokio", "tokio",
"tracing", "tracing",
] ]
@@ -632,7 +626,7 @@ dependencies = [
"bytes-utils", "bytes-utils",
"futures-core", "futures-core",
"http 0.2.9", "http 0.2.9",
"http-body 0.4.5", "http-body",
"itoa", "itoa",
"num-integer", "num-integer",
"pin-project-lite", "pin-project-lite",
@@ -681,8 +675,8 @@ dependencies = [
"bytes", "bytes",
"futures-util", "futures-util",
"http 0.2.9", "http 0.2.9",
"http-body 0.4.5", "http-body",
"hyper 0.14.26", "hyper",
"itoa", "itoa",
"matchit", "matchit",
"memchr", "memchr",
@@ -697,7 +691,7 @@ dependencies = [
"sha1", "sha1",
"sync_wrapper", "sync_wrapper",
"tokio", "tokio",
"tokio-tungstenite 0.20.0", "tokio-tungstenite",
"tower", "tower",
"tower-layer", "tower-layer",
"tower-service", "tower-service",
@@ -713,7 +707,7 @@ dependencies = [
"bytes", "bytes",
"futures-util", "futures-util",
"http 0.2.9", "http 0.2.9",
"http-body 0.4.5", "http-body",
"mime", "mime",
"rustversion", "rustversion",
"tower-layer", "tower-layer",
@@ -1130,7 +1124,7 @@ version = "4.3.0"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "191d9573962933b4027f932c600cd252ce27a8ad5979418fe78e43c07996f27b" checksum = "191d9573962933b4027f932c600cd252ce27a8ad5979418fe78e43c07996f27b"
dependencies = [ dependencies = [
"heck 0.4.1", "heck",
"proc-macro2", "proc-macro2",
"quote", "quote",
"syn 2.0.52", "syn 2.0.52",
@@ -1202,7 +1196,7 @@ dependencies = [
"compute_api", "compute_api",
"flate2", "flate2",
"futures", "futures",
"hyper 0.14.26", "hyper",
"nix 0.27.1", "nix 0.27.1",
"notify", "notify",
"num_cpus", "num_cpus",
@@ -1319,7 +1313,7 @@ dependencies = [
"git-version", "git-version",
"hex", "hex",
"humantime", "humantime",
"hyper 0.14.26", "hyper",
"nix 0.27.1", "nix 0.27.1",
"once_cell", "once_cell",
"pageserver_api", "pageserver_api",
@@ -1468,9 +1462,12 @@ dependencies = [
[[package]] [[package]]
name = "crossbeam-utils" name = "crossbeam-utils"
version = "0.8.19" version = "0.8.15"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "248e3bacc7dc6baa3b21e405ee045c3047101a49145e7e9eca583ab4c2ca5345" checksum = "3c063cd8cc95f5c377ed0d4b49a4b21f632396ff690e8470c29b3359b346984b"
dependencies = [
"cfg-if",
]
[[package]] [[package]]
name = "crossterm" name = "crossterm"
@@ -1843,12 +1840,23 @@ checksum = "5443807d6dff69373d433ab9ef5378ad8df50ca6298caf15de6e52e24aaf54d5"
[[package]] [[package]]
name = "errno" name = "errno"
version = "0.3.8" version = "0.3.1"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a258e46cdc063eb8519c00b9fc845fc47bcfca4130e2f08e88665ceda8474245" checksum = "4bcfec3a70f97c962c307b2d2c56e358cf1d00b558d74262b5f929ee8cc7e73a"
dependencies = [ dependencies = [
"errno-dragonfly",
"libc",
"windows-sys 0.48.0",
]
[[package]]
name = "errno-dragonfly"
version = "0.1.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "aa68f1b12764fab894d2755d2518754e71b4fd80ecfb822714a1206c2aab39bf"
dependencies = [
"cc",
"libc", "libc",
"windows-sys 0.52.0",
] ]
[[package]] [[package]]
@@ -2205,25 +2213,6 @@ dependencies = [
"tracing", "tracing",
] ]
[[package]]
name = "h2"
version = "0.4.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "816ec7294445779408f36fe57bc5b7fc1cf59664059096c65f905c1c61f58069"
dependencies = [
"bytes",
"fnv",
"futures-core",
"futures-sink",
"futures-util",
"http 1.1.0",
"indexmap 2.0.1",
"slab",
"tokio",
"tokio-util",
"tracing",
]
[[package]] [[package]]
name = "half" name = "half"
version = "1.8.2" version = "1.8.2"
@@ -2305,12 +2294,6 @@ version = "0.4.1"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "95505c38b4572b2d910cecb0281560f54b440a19336cbbcb27bf6ce6adc6f5a8" checksum = "95505c38b4572b2d910cecb0281560f54b440a19336cbbcb27bf6ce6adc6f5a8"
[[package]]
name = "heck"
version = "0.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea"
[[package]] [[package]]
name = "hermit-abi" name = "hermit-abi"
version = "0.3.3" version = "0.3.3"
@@ -2395,29 +2378,6 @@ dependencies = [
"pin-project-lite", "pin-project-lite",
] ]
[[package]]
name = "http-body"
version = "1.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1cac85db508abc24a2e48553ba12a996e87244a0395ce011e62b37158745d643"
dependencies = [
"bytes",
"http 1.1.0",
]
[[package]]
name = "http-body-util"
version = "0.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "41cb79eb393015dadd30fc252023adb0b2400a0caee0fa2a077e6e21a551e840"
dependencies = [
"bytes",
"futures-util",
"http 1.1.0",
"http-body 1.0.0",
"pin-project-lite",
]
[[package]] [[package]]
name = "http-types" name = "http-types"
version = "2.12.0" version = "2.12.0"
@@ -2476,9 +2436,9 @@ dependencies = [
"futures-channel", "futures-channel",
"futures-core", "futures-core",
"futures-util", "futures-util",
"h2 0.3.26", "h2",
"http 0.2.9", "http 0.2.9",
"http-body 0.4.5", "http-body",
"httparse", "httparse",
"httpdate", "httpdate",
"itoa", "itoa",
@@ -2490,26 +2450,6 @@ dependencies = [
"want", "want",
] ]
[[package]]
name = "hyper"
version = "1.2.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "186548d73ac615b32a73aafe38fb4f56c0d340e110e5a200bcadbaf2e199263a"
dependencies = [
"bytes",
"futures-channel",
"futures-util",
"h2 0.4.4",
"http 1.1.0",
"http-body 1.0.0",
"httparse",
"httpdate",
"itoa",
"pin-project-lite",
"smallvec",
"tokio",
]
[[package]] [[package]]
name = "hyper-rustls" name = "hyper-rustls"
version = "0.24.0" version = "0.24.0"
@@ -2517,9 +2457,9 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0646026eb1b3eea4cd9ba47912ea5ce9cc07713d105b1a14698f4e6433d348b7" checksum = "0646026eb1b3eea4cd9ba47912ea5ce9cc07713d105b1a14698f4e6433d348b7"
dependencies = [ dependencies = [
"http 0.2.9", "http 0.2.9",
"hyper 0.14.26", "hyper",
"log", "log",
"rustls 0.21.11", "rustls 0.21.9",
"rustls-native-certs 0.6.2", "rustls-native-certs 0.6.2",
"tokio", "tokio",
"tokio-rustls 0.24.0", "tokio-rustls 0.24.0",
@@ -2531,7 +2471,7 @@ version = "0.4.1"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "bbb958482e8c7be4bc3cf272a766a2b0bf1a6755e7a6ae777f017a31d11b13b1" checksum = "bbb958482e8c7be4bc3cf272a766a2b0bf1a6755e7a6ae777f017a31d11b13b1"
dependencies = [ dependencies = [
"hyper 0.14.26", "hyper",
"pin-project-lite", "pin-project-lite",
"tokio", "tokio",
"tokio-io-timeout", "tokio-io-timeout",
@@ -2544,7 +2484,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d6183ddfa99b85da61a140bea0efc93fdf56ceaa041b37d553518030827f9905" checksum = "d6183ddfa99b85da61a140bea0efc93fdf56ceaa041b37d553518030827f9905"
dependencies = [ dependencies = [
"bytes", "bytes",
"hyper 0.14.26", "hyper",
"native-tls", "native-tls",
"tokio", "tokio",
"tokio-native-tls", "tokio-native-tls",
@@ -2552,33 +2492,15 @@ dependencies = [
[[package]] [[package]]
name = "hyper-tungstenite" name = "hyper-tungstenite"
version = "0.13.0" version = "0.11.1"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7a343d17fe7885302ed7252767dc7bb83609a874b6ff581142241ec4b73957ad" checksum = "7cc7dcb1ab67cd336f468a12491765672e61a3b6b148634dbfe2fe8acd3fe7d9"
dependencies = [ dependencies = [
"http-body-util", "hyper",
"hyper 1.2.0",
"hyper-util",
"pin-project-lite", "pin-project-lite",
"tokio", "tokio",
"tokio-tungstenite 0.21.0", "tokio-tungstenite",
"tungstenite 0.21.0", "tungstenite",
]
[[package]]
name = "hyper-util"
version = "0.1.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ca38ef113da30126bbff9cd1705f9273e15d45498615d138b0c20279ac7a76aa"
dependencies = [
"bytes",
"futures-util",
"http 1.1.0",
"http-body 1.0.0",
"hyper 1.2.0",
"pin-project-lite",
"socket2 0.5.5",
"tokio",
] ]
[[package]] [[package]]
@@ -2872,12 +2794,6 @@ version = "0.3.8"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ef53942eb7bf7ff43a617b3e2c1c4a5ecf5944a7c1bc12d7ee39bbb15e5c1519" checksum = "ef53942eb7bf7ff43a617b3e2c1c4a5ecf5944a7c1bc12d7ee39bbb15e5c1519"
[[package]]
name = "linux-raw-sys"
version = "0.4.13"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "01cda141df6706de531b6c46c3a33ecca755538219bd484262fa09410c13539c"
[[package]] [[package]]
name = "lock_api" name = "lock_api"
version = "0.4.10" version = "0.4.10"
@@ -2932,12 +2848,11 @@ checksum = "490cc448043f947bae3cbee9c203358d62dbee0db12107a74be5c30ccfd09771"
[[package]] [[package]]
name = "measured" name = "measured"
version = "0.0.21" version = "0.0.13"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "652bc741286361c06de8cb4d89b21a6437f120c508c51713663589eeb9928ac5" checksum = "f246648d027839a34b420e27c7de1165ace96e19ef894985d0a6ff89a7840a9f"
dependencies = [ dependencies = [
"bytes", "bytes",
"crossbeam-utils",
"hashbrown 0.14.0", "hashbrown 0.14.0",
"itoa", "itoa",
"lasso", "lasso",
@@ -2950,27 +2865,16 @@ dependencies = [
[[package]] [[package]]
name = "measured-derive" name = "measured-derive"
version = "0.0.21" version = "0.0.13"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6ea497f33e1e856a376c32ad916f69a0bd3c597db1f912a399f842b01a4a685d" checksum = "edaa5cc22d99d5d6d7d99c3b5b5f7e7f8034c22f1b5d62a1adecd2ed005d9b80"
dependencies = [ dependencies = [
"heck 0.5.0", "heck",
"proc-macro2", "proc-macro2",
"quote", "quote",
"syn 2.0.52", "syn 2.0.52",
] ]
[[package]]
name = "measured-process"
version = "0.0.21"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b364ccb66937a814b6b2ad751d1a2f7a9d5a78c761144036825fb36bb0771000"
dependencies = [
"libc",
"measured",
"procfs 0.16.0",
]
[[package]] [[package]]
name = "memchr" name = "memchr"
version = "2.6.4" version = "2.6.4"
@@ -3010,10 +2914,8 @@ version = "0.1.0"
dependencies = [ dependencies = [
"chrono", "chrono",
"libc", "libc",
"measured",
"measured-process",
"once_cell", "once_cell",
"procfs 0.14.2", "procfs",
"prometheus", "prometheus",
"rand 0.8.5", "rand 0.8.5",
"rand_distr", "rand_distr",
@@ -3563,17 +3465,12 @@ dependencies = [
"camino", "camino",
"clap", "clap",
"git-version", "git-version",
"humantime",
"pageserver", "pageserver",
"pageserver_api",
"postgres_ffi", "postgres_ffi",
"remote_storage",
"serde", "serde",
"serde_json", "serde_json",
"svg_fmt", "svg_fmt",
"tokio", "tokio",
"tokio-util",
"toml_edit",
"utils", "utils",
"workspace_hack", "workspace_hack",
] ]
@@ -3609,7 +3506,7 @@ dependencies = [
"hex-literal", "hex-literal",
"humantime", "humantime",
"humantime-serde", "humantime-serde",
"hyper 0.14.26", "hyper",
"itertools", "itertools",
"leaky-bucket", "leaky-bucket",
"md5", "md5",
@@ -3628,7 +3525,7 @@ dependencies = [
"postgres_connection", "postgres_connection",
"postgres_ffi", "postgres_ffi",
"pq_proto", "pq_proto",
"procfs 0.14.2", "procfs",
"rand 0.8.5", "rand 0.8.5",
"regex", "regex",
"remote_storage", "remote_storage",
@@ -3719,6 +3616,7 @@ dependencies = [
"anyhow", "anyhow",
"async-compression", "async-compression",
"async-stream", "async-stream",
"async-trait",
"byteorder", "byteorder",
"bytes", "bytes",
"chrono", "chrono",
@@ -4059,7 +3957,7 @@ dependencies = [
"futures", "futures",
"once_cell", "once_cell",
"pq_proto", "pq_proto",
"rustls 0.22.4", "rustls 0.22.2",
"rustls-pemfile 2.1.1", "rustls-pemfile 2.1.1",
"serde", "serde",
"thiserror", "thiserror",
@@ -4188,29 +4086,6 @@ dependencies = [
"rustix 0.36.16", "rustix 0.36.16",
] ]
[[package]]
name = "procfs"
version = "0.16.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "731e0d9356b0c25f16f33b5be79b1c57b562f141ebfcdb0ad8ac2c13a24293b4"
dependencies = [
"bitflags 2.4.1",
"hex",
"lazy_static",
"procfs-core",
"rustix 0.38.28",
]
[[package]]
name = "procfs-core"
version = "0.16.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2d3554923a69f4ce04c4a754260c338f505ce22642d3830e049a399fc2059a29"
dependencies = [
"bitflags 2.4.1",
"hex",
]
[[package]] [[package]]
name = "prometheus" name = "prometheus"
version = "0.13.3" version = "0.13.3"
@@ -4223,7 +4098,7 @@ dependencies = [
"libc", "libc",
"memchr", "memchr",
"parking_lot 0.12.1", "parking_lot 0.12.1",
"procfs 0.14.2", "procfs",
"thiserror", "thiserror",
] ]
@@ -4244,7 +4119,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "119533552c9a7ffacc21e099c24a0ac8bb19c2a2a3f363de84cd9b844feab270" checksum = "119533552c9a7ffacc21e099c24a0ac8bb19c2a2a3f363de84cd9b844feab270"
dependencies = [ dependencies = [
"bytes", "bytes",
"heck 0.4.1", "heck",
"itertools", "itertools",
"lazy_static", "lazy_static",
"log", "log",
@@ -4288,7 +4163,6 @@ dependencies = [
"anyhow", "anyhow",
"async-compression", "async-compression",
"async-trait", "async-trait",
"atomic-take",
"aws-config", "aws-config",
"aws-sdk-iam", "aws-sdk-iam",
"aws-sigv4", "aws-sigv4",
@@ -4312,17 +4186,13 @@ dependencies = [
"hmac", "hmac",
"hostname", "hostname",
"http 1.1.0", "http 1.1.0",
"http-body-util",
"humantime", "humantime",
"hyper 0.14.26", "hyper",
"hyper 1.2.0",
"hyper-tungstenite", "hyper-tungstenite",
"hyper-util",
"ipnet", "ipnet",
"itertools", "itertools",
"lasso", "lasso",
"md5", "md5",
"measured",
"metrics", "metrics",
"native-tls", "native-tls",
"once_cell", "once_cell",
@@ -4350,7 +4220,7 @@ dependencies = [
"routerify", "routerify",
"rstest", "rstest",
"rustc-hash", "rustc-hash",
"rustls 0.22.4", "rustls 0.22.2",
"rustls-pemfile 2.1.1", "rustls-pemfile 2.1.1",
"scopeguard", "scopeguard",
"serde", "serde",
@@ -4542,7 +4412,7 @@ dependencies = [
"itoa", "itoa",
"percent-encoding", "percent-encoding",
"pin-project-lite", "pin-project-lite",
"rustls 0.22.4", "rustls 0.22.2",
"rustls-native-certs 0.7.0", "rustls-native-certs 0.7.0",
"rustls-pemfile 2.1.1", "rustls-pemfile 2.1.1",
"rustls-pki-types", "rustls-pki-types",
@@ -4651,7 +4521,7 @@ dependencies = [
"futures-util", "futures-util",
"http-types", "http-types",
"humantime", "humantime",
"hyper 0.14.26", "hyper",
"itertools", "itertools",
"metrics", "metrics",
"once_cell", "once_cell",
@@ -4681,10 +4551,10 @@ dependencies = [
"encoding_rs", "encoding_rs",
"futures-core", "futures-core",
"futures-util", "futures-util",
"h2 0.3.26", "h2",
"http 0.2.9", "http 0.2.9",
"http-body 0.4.5", "http-body",
"hyper 0.14.26", "hyper",
"hyper-rustls", "hyper-rustls",
"hyper-tls", "hyper-tls",
"ipnet", "ipnet",
@@ -4696,7 +4566,7 @@ dependencies = [
"once_cell", "once_cell",
"percent-encoding", "percent-encoding",
"pin-project-lite", "pin-project-lite",
"rustls 0.21.11", "rustls 0.21.9",
"rustls-pemfile 1.0.2", "rustls-pemfile 1.0.2",
"serde", "serde",
"serde_json", "serde_json",
@@ -4742,7 +4612,7 @@ dependencies = [
"futures", "futures",
"getrandom 0.2.11", "getrandom 0.2.11",
"http 0.2.9", "http 0.2.9",
"hyper 0.14.26", "hyper",
"parking_lot 0.11.2", "parking_lot 0.11.2",
"reqwest", "reqwest",
"reqwest-middleware", "reqwest-middleware",
@@ -4829,7 +4699,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "496c1d3718081c45ba9c31fbfc07417900aa96f4070ff90dc29961836b7a9945" checksum = "496c1d3718081c45ba9c31fbfc07417900aa96f4070ff90dc29961836b7a9945"
dependencies = [ dependencies = [
"http 0.2.9", "http 0.2.9",
"hyper 0.14.26", "hyper",
"lazy_static", "lazy_static",
"percent-encoding", "percent-encoding",
"regex", "regex",
@@ -4941,24 +4811,11 @@ dependencies = [
"windows-sys 0.48.0", "windows-sys 0.48.0",
] ]
[[package]]
name = "rustix"
version = "0.38.28"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "72e572a5e8ca657d7366229cdde4bd14c4eb5499a9573d4d366fe1b599daa316"
dependencies = [
"bitflags 2.4.1",
"errno",
"libc",
"linux-raw-sys 0.4.13",
"windows-sys 0.52.0",
]
[[package]] [[package]]
name = "rustls" name = "rustls"
version = "0.21.11" version = "0.21.9"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7fecbfb7b1444f477b345853b1fce097a2c6fb637b2bfb87e6bc5db0f043fae4" checksum = "629648aced5775d558af50b2b4c7b02983a04b312126d45eeead26e7caa498b9"
dependencies = [ dependencies = [
"log", "log",
"ring 0.17.6", "ring 0.17.6",
@@ -4968,9 +4825,9 @@ dependencies = [
[[package]] [[package]]
name = "rustls" name = "rustls"
version = "0.22.4" version = "0.22.2"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "bf4ef73721ac7bcd79b2b315da7779d8fc09718c6b3d2d1b2d94850eb8c18432" checksum = "e87c9956bd9807afa1f77e0f7594af32566e830e088a5576d27c5b6f30f49d41"
dependencies = [ dependencies = [
"log", "log",
"ring 0.17.6", "ring 0.17.6",
@@ -5134,7 +4991,7 @@ dependencies = [
"git-version", "git-version",
"hex", "hex",
"humantime", "humantime",
"hyper 0.14.26", "hyper",
"metrics", "metrics",
"once_cell", "once_cell",
"parking_lot 0.12.1", "parking_lot 0.12.1",
@@ -5282,7 +5139,7 @@ checksum = "2e95efd0cefa32028cdb9766c96de71d96671072f9fb494dc9fb84c0ef93e52b"
dependencies = [ dependencies = [
"httpdate", "httpdate",
"reqwest", "reqwest",
"rustls 0.21.11", "rustls 0.21.9",
"sentry-backtrace", "sentry-backtrace",
"sentry-contexts", "sentry-contexts",
"sentry-core", "sentry-core",
@@ -5619,9 +5476,9 @@ dependencies = [
[[package]] [[package]]
name = "smallvec" name = "smallvec"
version = "1.13.1" version = "1.11.0"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e6ecd384b10a64542d77071bd64bd7b231f4ed5940fba55e98c3de13824cf3d7" checksum = "62bb4feee49fdd9f707ef802e22365a35de4b7b299de4763d44bfea899442ff9"
[[package]] [[package]]
name = "smol_str" name = "smol_str"
@@ -5713,7 +5570,7 @@ dependencies = [
"futures-util", "futures-util",
"git-version", "git-version",
"humantime", "humantime",
"hyper 0.14.26", "hyper",
"metrics", "metrics",
"once_cell", "once_cell",
"parking_lot 0.12.1", "parking_lot 0.12.1",
@@ -5744,7 +5601,7 @@ dependencies = [
"git-version", "git-version",
"hex", "hex",
"humantime", "humantime",
"hyper 0.14.26", "hyper",
"itertools", "itertools",
"lasso", "lasso",
"measured", "measured",
@@ -5773,7 +5630,7 @@ dependencies = [
"anyhow", "anyhow",
"clap", "clap",
"comfy-table", "comfy-table",
"hyper 0.14.26", "hyper",
"pageserver_api", "pageserver_api",
"pageserver_client", "pageserver_client",
"reqwest", "reqwest",
@@ -5814,7 +5671,7 @@ version = "0.24.3"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1e385be0d24f186b4ce2f9982191e7101bb737312ad61c1f2f984f34bcf85d59" checksum = "1e385be0d24f186b4ce2f9982191e7101bb737312ad61c1f2f984f34bcf85d59"
dependencies = [ dependencies = [
"heck 0.4.1", "heck",
"proc-macro2", "proc-macro2",
"quote", "quote",
"rustversion", "rustversion",
@@ -6193,7 +6050,7 @@ checksum = "0ea13f22eda7127c827983bdaf0d7fff9df21c8817bab02815ac277a21143677"
dependencies = [ dependencies = [
"futures", "futures",
"ring 0.17.6", "ring 0.17.6",
"rustls 0.22.4", "rustls 0.22.2",
"tokio", "tokio",
"tokio-postgres", "tokio-postgres",
"tokio-rustls 0.25.0", "tokio-rustls 0.25.0",
@@ -6206,7 +6063,7 @@ version = "0.24.0"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e0d409377ff5b1e3ca6437aa86c1eb7d40c134bfec254e44c830defa92669db5" checksum = "e0d409377ff5b1e3ca6437aa86c1eb7d40c134bfec254e44c830defa92669db5"
dependencies = [ dependencies = [
"rustls 0.21.11", "rustls 0.21.9",
"tokio", "tokio",
] ]
@@ -6216,7 +6073,7 @@ version = "0.25.0"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "775e0c0f0adb3a2f22a00c4745d728b479985fc15ee7ca6a2608388c5569860f" checksum = "775e0c0f0adb3a2f22a00c4745d728b479985fc15ee7ca6a2608388c5569860f"
dependencies = [ dependencies = [
"rustls 0.22.4", "rustls 0.22.2",
"rustls-pki-types", "rustls-pki-types",
"tokio", "tokio",
] ]
@@ -6256,19 +6113,7 @@ dependencies = [
"futures-util", "futures-util",
"log", "log",
"tokio", "tokio",
"tungstenite 0.20.1", "tungstenite",
]
[[package]]
name = "tokio-tungstenite"
version = "0.21.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c83b561d025642014097b66e6c1bb422783339e0909e4429cde4749d1990bc38"
dependencies = [
"futures-util",
"log",
"tokio",
"tungstenite 0.21.0",
] ]
[[package]] [[package]]
@@ -6335,10 +6180,10 @@ dependencies = [
"bytes", "bytes",
"futures-core", "futures-core",
"futures-util", "futures-util",
"h2 0.3.26", "h2",
"http 0.2.9", "http 0.2.9",
"http-body 0.4.5", "http-body",
"hyper 0.14.26", "hyper",
"hyper-timeout", "hyper-timeout",
"percent-encoding", "percent-encoding",
"pin-project", "pin-project",
@@ -6524,7 +6369,7 @@ dependencies = [
name = "tracing-utils" name = "tracing-utils"
version = "0.1.0" version = "0.1.0"
dependencies = [ dependencies = [
"hyper 0.14.26", "hyper",
"opentelemetry", "opentelemetry",
"opentelemetry-otlp", "opentelemetry-otlp",
"opentelemetry-semantic-conventions", "opentelemetry-semantic-conventions",
@@ -6561,25 +6406,6 @@ dependencies = [
"utf-8", "utf-8",
] ]
[[package]]
name = "tungstenite"
version = "0.21.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9ef1a641ea34f399a848dea702823bbecfb4c486f911735368f1f137cb8257e1"
dependencies = [
"byteorder",
"bytes",
"data-encoding",
"http 1.1.0",
"httparse",
"log",
"rand 0.8.5",
"sha1",
"thiserror",
"url",
"utf-8",
]
[[package]] [[package]]
name = "twox-hash" name = "twox-hash"
version = "1.6.3" version = "1.6.3"
@@ -6677,7 +6503,7 @@ dependencies = [
"base64 0.21.1", "base64 0.21.1",
"log", "log",
"once_cell", "once_cell",
"rustls 0.21.11", "rustls 0.21.9",
"rustls-webpki 0.100.2", "rustls-webpki 0.100.2",
"url", "url",
"webpki-roots 0.23.1", "webpki-roots 0.23.1",
@@ -6744,8 +6570,7 @@ dependencies = [
"heapless", "heapless",
"hex", "hex",
"hex-literal", "hex-literal",
"humantime", "hyper",
"hyper 0.14.26",
"jsonwebtoken", "jsonwebtoken",
"leaky-bucket", "leaky-bucket",
"metrics", "metrics",
@@ -7105,15 +6930,6 @@ dependencies = [
"windows-targets 0.48.0", "windows-targets 0.48.0",
] ]
[[package]]
name = "windows-sys"
version = "0.52.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "282be5f36a8ce781fad8c8ae18fa3f9beff57ec1b52cb3de0789201425d9a33d"
dependencies = [
"windows-targets 0.52.4",
]
[[package]] [[package]]
name = "windows-targets" name = "windows-targets"
version = "0.42.2" version = "0.42.2"
@@ -7144,21 +6960,6 @@ dependencies = [
"windows_x86_64_msvc 0.48.0", "windows_x86_64_msvc 0.48.0",
] ]
[[package]]
name = "windows-targets"
version = "0.52.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7dd37b7e5ab9018759f893a1952c9420d060016fc19a472b4bb20d1bdd694d1b"
dependencies = [
"windows_aarch64_gnullvm 0.52.4",
"windows_aarch64_msvc 0.52.4",
"windows_i686_gnu 0.52.4",
"windows_i686_msvc 0.52.4",
"windows_x86_64_gnu 0.52.4",
"windows_x86_64_gnullvm 0.52.4",
"windows_x86_64_msvc 0.52.4",
]
[[package]] [[package]]
name = "windows_aarch64_gnullvm" name = "windows_aarch64_gnullvm"
version = "0.42.2" version = "0.42.2"
@@ -7171,12 +6972,6 @@ version = "0.48.0"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "91ae572e1b79dba883e0d315474df7305d12f569b400fcf90581b06062f7e1bc" checksum = "91ae572e1b79dba883e0d315474df7305d12f569b400fcf90581b06062f7e1bc"
[[package]]
name = "windows_aarch64_gnullvm"
version = "0.52.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "bcf46cf4c365c6f2d1cc93ce535f2c8b244591df96ceee75d8e83deb70a9cac9"
[[package]] [[package]]
name = "windows_aarch64_msvc" name = "windows_aarch64_msvc"
version = "0.42.2" version = "0.42.2"
@@ -7189,12 +6984,6 @@ version = "0.48.0"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b2ef27e0d7bdfcfc7b868b317c1d32c641a6fe4629c171b8928c7b08d98d7cf3" checksum = "b2ef27e0d7bdfcfc7b868b317c1d32c641a6fe4629c171b8928c7b08d98d7cf3"
[[package]]
name = "windows_aarch64_msvc"
version = "0.52.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "da9f259dd3bcf6990b55bffd094c4f7235817ba4ceebde8e6d11cd0c5633b675"
[[package]] [[package]]
name = "windows_i686_gnu" name = "windows_i686_gnu"
version = "0.42.2" version = "0.42.2"
@@ -7207,12 +6996,6 @@ version = "0.48.0"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "622a1962a7db830d6fd0a69683c80a18fda201879f0f447f065a3b7467daa241" checksum = "622a1962a7db830d6fd0a69683c80a18fda201879f0f447f065a3b7467daa241"
[[package]]
name = "windows_i686_gnu"
version = "0.52.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b474d8268f99e0995f25b9f095bc7434632601028cf86590aea5c8a5cb7801d3"
[[package]] [[package]]
name = "windows_i686_msvc" name = "windows_i686_msvc"
version = "0.42.2" version = "0.42.2"
@@ -7225,12 +7008,6 @@ version = "0.48.0"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4542c6e364ce21bf45d69fdd2a8e455fa38d316158cfd43b3ac1c5b1b19f8e00" checksum = "4542c6e364ce21bf45d69fdd2a8e455fa38d316158cfd43b3ac1c5b1b19f8e00"
[[package]]
name = "windows_i686_msvc"
version = "0.52.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1515e9a29e5bed743cb4415a9ecf5dfca648ce85ee42e15873c3cd8610ff8e02"
[[package]] [[package]]
name = "windows_x86_64_gnu" name = "windows_x86_64_gnu"
version = "0.42.2" version = "0.42.2"
@@ -7243,12 +7020,6 @@ version = "0.48.0"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ca2b8a661f7628cbd23440e50b05d705db3686f894fc9580820623656af974b1" checksum = "ca2b8a661f7628cbd23440e50b05d705db3686f894fc9580820623656af974b1"
[[package]]
name = "windows_x86_64_gnu"
version = "0.52.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5eee091590e89cc02ad514ffe3ead9eb6b660aedca2183455434b93546371a03"
[[package]] [[package]]
name = "windows_x86_64_gnullvm" name = "windows_x86_64_gnullvm"
version = "0.42.2" version = "0.42.2"
@@ -7261,12 +7032,6 @@ version = "0.48.0"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7896dbc1f41e08872e9d5e8f8baa8fdd2677f29468c4e156210174edc7f7b953" checksum = "7896dbc1f41e08872e9d5e8f8baa8fdd2677f29468c4e156210174edc7f7b953"
[[package]]
name = "windows_x86_64_gnullvm"
version = "0.52.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "77ca79f2451b49fa9e2af39f0747fe999fcda4f5e241b2898624dca97a1f2177"
[[package]] [[package]]
name = "windows_x86_64_msvc" name = "windows_x86_64_msvc"
version = "0.42.2" version = "0.42.2"
@@ -7279,12 +7044,6 @@ version = "0.48.0"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1a515f5799fe4961cb532f983ce2b23082366b898e52ffbce459c86f67c8378a" checksum = "1a515f5799fe4961cb532f983ce2b23082366b898e52ffbce459c86f67c8378a"
[[package]]
name = "windows_x86_64_msvc"
version = "0.52.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "32b752e52a2da0ddfbdbcc6fceadfeede4c939ed16d13e648833a61dfb611ed8"
[[package]] [[package]]
name = "winnow" name = "winnow"
version = "0.4.6" version = "0.4.6"
@@ -7333,10 +7092,11 @@ dependencies = [
"futures-sink", "futures-sink",
"futures-util", "futures-util",
"getrandom 0.2.11", "getrandom 0.2.11",
"hashbrown 0.13.2",
"hashbrown 0.14.0", "hashbrown 0.14.0",
"hex", "hex",
"hmac", "hmac",
"hyper 0.14.26", "hyper",
"indexmap 1.9.3", "indexmap 1.9.3",
"itertools", "itertools",
"libc", "libc",
@@ -7354,7 +7114,7 @@ dependencies = [
"regex-automata 0.4.3", "regex-automata 0.4.3",
"regex-syntax 0.8.2", "regex-syntax 0.8.2",
"reqwest", "reqwest",
"rustls 0.21.11", "rustls 0.21.9",
"scopeguard", "scopeguard",
"serde", "serde",
"serde_json", "serde_json",
@@ -7374,6 +7134,7 @@ dependencies = [
"tower", "tower",
"tracing", "tracing",
"tracing-core", "tracing-core",
"tungstenite",
"url", "url",
"uuid", "uuid",
"zeroize", "zeroize",

View File

@@ -44,7 +44,6 @@ license = "Apache-2.0"
anyhow = { version = "1.0", features = ["backtrace"] } anyhow = { version = "1.0", features = ["backtrace"] }
arc-swap = "1.6" arc-swap = "1.6"
async-compression = { version = "0.4.0", features = ["tokio", "gzip", "zstd"] } async-compression = { version = "0.4.0", features = ["tokio", "gzip", "zstd"] }
atomic-take = "1.1.0"
azure_core = "0.18" azure_core = "0.18"
azure_identity = "0.18" azure_identity = "0.18"
azure_storage = "0.18" azure_storage = "0.18"
@@ -98,7 +97,7 @@ http-types = { version = "2", default-features = false }
humantime = "2.1" humantime = "2.1"
humantime-serde = "1.1.1" humantime-serde = "1.1.1"
hyper = "0.14" hyper = "0.14"
hyper-tungstenite = "0.13.0" hyper-tungstenite = "0.11"
inotify = "0.10.2" inotify = "0.10.2"
ipnet = "2.9.0" ipnet = "2.9.0"
itertools = "0.10" itertools = "0.10"
@@ -107,8 +106,7 @@ lasso = "0.7"
leaky-bucket = "1.0.1" leaky-bucket = "1.0.1"
libc = "0.2" libc = "0.2"
md5 = "0.7.0" md5 = "0.7.0"
measured = { version = "0.0.21", features=["lasso"] } measured = { version = "0.0.13", features=["default", "lasso"] }
measured-process = { version = "0.0.21" }
memoffset = "0.8" memoffset = "0.8"
native-tls = "0.2" native-tls = "0.2"
nix = { version = "0.27", features = ["fs", "process", "socket", "signal", "poll"] } nix = { version = "0.27", features = ["fs", "process", "socket", "signal", "poll"] }
@@ -252,7 +250,7 @@ debug = true
# disable debug symbols for all packages except this one to decrease binaries size # disable debug symbols for all packages except this one to decrease binaries size
[profile.release.package."*"] [profile.release.package."*"]
debug = true debug = false
[profile.release-line-debug] [profile.release-line-debug]
inherits = "release" inherits = "release"

View File

@@ -44,7 +44,6 @@ COPY --from=pg-build /home/nonroot/pg_install/v15/include/postgresql/server pg_i
COPY --from=pg-build /home/nonroot/pg_install/v16/include/postgresql/server pg_install/v16/include/postgresql/server COPY --from=pg-build /home/nonroot/pg_install/v16/include/postgresql/server pg_install/v16/include/postgresql/server
COPY --chown=nonroot . . COPY --chown=nonroot . .
ENV _RJEM_MALLOC_CONF="prof:true"
# Show build caching stats to check if it was used in the end. # Show build caching stats to check if it was used in the end.
# Has to be the part of the same RUN since cachepot daemon is killed in the end of this RUN, losing the compilation stats. # Has to be the part of the same RUN since cachepot daemon is killed in the end of this RUN, losing the compilation stats.
RUN set -e \ RUN set -e \

View File

@@ -58,12 +58,6 @@ RUN curl -fsSL "https://github.com/protocolbuffers/protobuf/releases/download/v$
&& mv protoc/include/google /usr/local/include/google \ && mv protoc/include/google /usr/local/include/google \
&& rm -rf protoc.zip protoc && rm -rf protoc.zip protoc
# s5cmd
ENV S5CMD_VERSION=2.2.2
RUN curl -sL "https://github.com/peak/s5cmd/releases/download/v${S5CMD_VERSION}/s5cmd_${S5CMD_VERSION}_Linux-$(uname -m | sed 's/x86_64/64bit/g' | sed 's/aarch64/arm64/g').tar.gz" | tar zxvf - s5cmd \
&& chmod +x s5cmd \
&& mv s5cmd /usr/local/bin/s5cmd
# LLVM # LLVM
ENV LLVM_VERSION=17 ENV LLVM_VERSION=17
RUN curl -fsSL 'https://apt.llvm.org/llvm-snapshot.gpg.key' | apt-key add - \ RUN curl -fsSL 'https://apt.llvm.org/llvm-snapshot.gpg.key' | apt-key add - \

View File

@@ -818,15 +818,9 @@ impl ComputeNode {
Client::connect(zenith_admin_connstr.as_str(), NoTls) Client::connect(zenith_admin_connstr.as_str(), NoTls)
.context("broken cloud_admin credential: tried connecting with cloud_admin but could not authenticate, and zenith_admin does not work either")?; .context("broken cloud_admin credential: tried connecting with cloud_admin but could not authenticate, and zenith_admin does not work either")?;
// Disable forwarding so that users don't get a cloud_admin role // Disable forwarding so that users don't get a cloud_admin role
client.simple_query("SET neon.forward_ddl = false")?;
let mut func = || { client.simple_query("CREATE USER cloud_admin WITH SUPERUSER")?;
client.simple_query("SET neon.forward_ddl = false")?; client.simple_query("GRANT zenith_admin TO cloud_admin")?;
client.simple_query("CREATE USER cloud_admin WITH SUPERUSER")?;
client.simple_query("GRANT zenith_admin TO cloud_admin")?;
Ok::<_, anyhow::Error>(())
};
func().context("apply_config setup cloud_admin")?;
drop(client); drop(client);
// reconnect with connstring with expected name // reconnect with connstring with expected name
@@ -838,29 +832,24 @@ impl ComputeNode {
}; };
// Disable DDL forwarding because control plane already knows about these roles/databases. // Disable DDL forwarding because control plane already knows about these roles/databases.
client client.simple_query("SET neon.forward_ddl = false")?;
.simple_query("SET neon.forward_ddl = false")
.context("apply_config SET neon.forward_ddl = false")?;
// Proceed with post-startup configuration. Note, that order of operations is important. // Proceed with post-startup configuration. Note, that order of operations is important.
let spec = &compute_state.pspec.as_ref().expect("spec must be set").spec; let spec = &compute_state.pspec.as_ref().expect("spec must be set").spec;
create_neon_superuser(spec, &mut client).context("apply_config create_neon_superuser")?; create_neon_superuser(spec, &mut client)?;
cleanup_instance(&mut client).context("apply_config cleanup_instance")?; cleanup_instance(&mut client)?;
handle_roles(spec, &mut client).context("apply_config handle_roles")?; handle_roles(spec, &mut client)?;
handle_databases(spec, &mut client).context("apply_config handle_databases")?; handle_databases(spec, &mut client)?;
handle_role_deletions(spec, connstr.as_str(), &mut client) handle_role_deletions(spec, connstr.as_str(), &mut client)?;
.context("apply_config handle_role_deletions")?;
handle_grants( handle_grants(
spec, spec,
&mut client, &mut client,
connstr.as_str(), connstr.as_str(),
self.has_feature(ComputeFeature::AnonExtension), self.has_feature(ComputeFeature::AnonExtension),
) )?;
.context("apply_config handle_grants")?; handle_extensions(spec, &mut client)?;
handle_extensions(spec, &mut client).context("apply_config handle_extensions")?; handle_extension_neon(&mut client)?;
handle_extension_neon(&mut client).context("apply_config handle_extension_neon")?; create_availability_check_data(&mut client)?;
create_availability_check_data(&mut client)
.context("apply_config create_availability_check_data")?;
// 'Close' connection // 'Close' connection
drop(client); drop(client);
@@ -868,7 +857,7 @@ impl ComputeNode {
// Run migrations separately to not hold up cold starts // Run migrations separately to not hold up cold starts
thread::spawn(move || { thread::spawn(move || {
let mut client = Client::connect(connstr.as_str(), NoTls)?; let mut client = Client::connect(connstr.as_str(), NoTls)?;
handle_migrations(&mut client).context("apply_config handle_migrations") handle_migrations(&mut client)
}); });
Ok(()) Ok(())
} }

View File

@@ -6,8 +6,8 @@ use std::path::Path;
use anyhow::Result; use anyhow::Result;
use crate::pg_helpers::escape_conf_value; use crate::pg_helpers::escape_conf_value;
use crate::pg_helpers::{GenericOptionExt, PgOptionsSerialize}; use crate::pg_helpers::PgOptionsSerialize;
use compute_api::spec::{ComputeMode, ComputeSpec, GenericOption}; use compute_api::spec::{ComputeMode, ComputeSpec};
/// Check that `line` is inside a text file and put it there if it is not. /// Check that `line` is inside a text file and put it there if it is not.
/// Create file if it doesn't exist. /// Create file if it doesn't exist.
@@ -92,27 +92,6 @@ pub fn write_postgres_conf(
} }
} }
if cfg!(target_os = "linux") {
// Check /proc/sys/vm/overcommit_memory -- if it equals 2 (i.e. linux memory overcommit is
// disabled), then the control plane has enabled swap and we should set
// dynamic_shared_memory_type = 'mmap'.
//
// This is (maybe?) temporary - for more, see https://github.com/neondatabase/cloud/issues/12047.
let overcommit_memory_contents = std::fs::read_to_string("/proc/sys/vm/overcommit_memory")
// ignore any errors - they may be expected to occur under certain situations (e.g. when
// not running in Linux).
.unwrap_or_else(|_| String::new());
if overcommit_memory_contents.trim() == "2" {
let opt = GenericOption {
name: "dynamic_shared_memory_type".to_owned(),
value: Some("mmap".to_owned()),
vartype: "enum".to_owned(),
};
write!(file, "{}", opt.to_pg_setting())?;
}
}
// If there are any extra options in the 'settings' field, append those // If there are any extra options in the 'settings' field, append those
if spec.cluster.settings.is_some() { if spec.cluster.settings.is_some() {
writeln!(file, "# Managed by compute_ctl: begin")?; writeln!(file, "# Managed by compute_ctl: begin")?;

View File

@@ -44,7 +44,7 @@ pub fn escape_conf_value(s: &str) -> String {
format!("'{}'", res) format!("'{}'", res)
} }
pub trait GenericOptionExt { trait GenericOptionExt {
fn to_pg_option(&self) -> String; fn to_pg_option(&self) -> String;
fn to_pg_setting(&self) -> String; fn to_pg_setting(&self) -> String;
} }

View File

@@ -2,7 +2,7 @@ use std::fs::File;
use std::path::Path; use std::path::Path;
use std::str::FromStr; use std::str::FromStr;
use anyhow::{anyhow, bail, Context, Result}; use anyhow::{anyhow, bail, Result};
use postgres::config::Config; use postgres::config::Config;
use postgres::{Client, NoTls}; use postgres::{Client, NoTls};
use reqwest::StatusCode; use reqwest::StatusCode;
@@ -698,8 +698,7 @@ pub fn handle_grants(
// it is important to run this after all grants // it is important to run this after all grants
if enable_anon_extension { if enable_anon_extension {
handle_extension_anon(spec, &db.owner, &mut db_client, false) handle_extension_anon(spec, &db.owner, &mut db_client, false)?;
.context("handle_grants handle_extension_anon")?;
} }
} }
@@ -814,36 +813,28 @@ $$;"#,
// Add new migrations below. // Add new migrations below.
]; ];
let mut func = || { let mut query = "CREATE SCHEMA IF NOT EXISTS neon_migration";
let query = "CREATE SCHEMA IF NOT EXISTS neon_migration"; client.simple_query(query)?;
client.simple_query(query)?;
let query = "CREATE TABLE IF NOT EXISTS neon_migration.migration_id (key INT NOT NULL PRIMARY KEY, id bigint NOT NULL DEFAULT 0)"; query = "CREATE TABLE IF NOT EXISTS neon_migration.migration_id (key INT NOT NULL PRIMARY KEY, id bigint NOT NULL DEFAULT 0)";
client.simple_query(query)?; client.simple_query(query)?;
let query = "INSERT INTO neon_migration.migration_id VALUES (0, 0) ON CONFLICT DO NOTHING"; query = "INSERT INTO neon_migration.migration_id VALUES (0, 0) ON CONFLICT DO NOTHING";
client.simple_query(query)?; client.simple_query(query)?;
let query = "ALTER SCHEMA neon_migration OWNER TO cloud_admin"; query = "ALTER SCHEMA neon_migration OWNER TO cloud_admin";
client.simple_query(query)?; client.simple_query(query)?;
let query = "REVOKE ALL ON SCHEMA neon_migration FROM PUBLIC"; query = "REVOKE ALL ON SCHEMA neon_migration FROM PUBLIC";
client.simple_query(query)?; client.simple_query(query)?;
Ok::<_, anyhow::Error>(())
};
func().context("handle_migrations prepare")?;
let query = "SELECT id FROM neon_migration.migration_id"; query = "SELECT id FROM neon_migration.migration_id";
let row = client let row = client.query_one(query, &[])?;
.query_one(query, &[])
.context("handle_migrations get migration_id")?;
let mut current_migration: usize = row.get::<&str, i64>("id") as usize; let mut current_migration: usize = row.get::<&str, i64>("id") as usize;
let starting_migration_id = current_migration; let starting_migration_id = current_migration;
let query = "BEGIN"; query = "BEGIN";
client client.simple_query(query)?;
.simple_query(query)
.context("handle_migrations begin")?;
while current_migration < migrations.len() { while current_migration < migrations.len() {
let migration = &migrations[current_migration]; let migration = &migrations[current_migration];
@@ -851,9 +842,7 @@ $$;"#,
info!("Skip migration id={}", current_migration); info!("Skip migration id={}", current_migration);
} else { } else {
info!("Running migration:\n{}\n", migration); info!("Running migration:\n{}\n", migration);
client.simple_query(migration).with_context(|| { client.simple_query(migration)?;
format!("handle_migrations current_migration={}", current_migration)
})?;
} }
current_migration += 1; current_migration += 1;
} }
@@ -861,14 +850,10 @@ $$;"#,
"UPDATE neon_migration.migration_id SET id={}", "UPDATE neon_migration.migration_id SET id={}",
migrations.len() migrations.len()
); );
client client.simple_query(&setval)?;
.simple_query(&setval)
.context("handle_migrations update id")?;
let query = "COMMIT"; query = "COMMIT";
client client.simple_query(query)?;
.simple_query(query)
.context("handle_migrations commit")?;
info!( info!(
"Ran {} migrations", "Ran {} migrations",

View File

@@ -86,10 +86,7 @@ where
.stdout(process_log_file) .stdout(process_log_file)
.stderr(same_file_for_stderr) .stderr(same_file_for_stderr)
.args(args); .args(args);
let filled_cmd = fill_remote_storage_secrets_vars(fill_rust_env_vars(background_command));
let filled_cmd = fill_env_vars_prefixed_neon(fill_remote_storage_secrets_vars(
fill_rust_env_vars(background_command),
));
filled_cmd.envs(envs); filled_cmd.envs(envs);
let pid_file_to_check = match &initial_pid_file { let pid_file_to_check = match &initial_pid_file {
@@ -271,15 +268,6 @@ fn fill_remote_storage_secrets_vars(mut cmd: &mut Command) -> &mut Command {
cmd cmd
} }
fn fill_env_vars_prefixed_neon(mut cmd: &mut Command) -> &mut Command {
for (var, val) in std::env::vars() {
if var.starts_with("NEON_PAGESERVER_") {
cmd = cmd.env(var, val);
}
}
cmd
}
/// Add a `pre_exec` to the cmd that, inbetween fork() and exec(), /// Add a `pre_exec` to the cmd that, inbetween fork() and exec(),
/// 1. Claims a pidfile with a fcntl lock on it and /// 1. Claims a pidfile with a fcntl lock on it and
/// 2. Sets up the pidfile's file descriptor so that it (and the lock) /// 2. Sets up the pidfile's file descriptor so that it (and the lock)

View File

@@ -1231,7 +1231,7 @@ async fn try_stop_all(env: &local_env::LocalEnv, immediate: bool) {
match ComputeControlPlane::load(env.clone()) { match ComputeControlPlane::load(env.clone()) {
Ok(cplane) => { Ok(cplane) => {
for (_k, node) in cplane.endpoints { for (_k, node) in cplane.endpoints {
if let Err(e) = node.stop(if immediate { "immediate" } else { "fast" }, false) { if let Err(e) = node.stop(if immediate { "immediate" } else { "fast " }, false) {
eprintln!("postgres stop failed: {e:#}"); eprintln!("postgres stop failed: {e:#}");
} }
} }
@@ -1417,7 +1417,6 @@ fn cli() -> Command {
.subcommand( .subcommand(
Command::new("timeline") Command::new("timeline")
.about("Manage timelines") .about("Manage timelines")
.arg_required_else_help(true)
.subcommand(Command::new("list") .subcommand(Command::new("list")
.about("List all timelines, available to this pageserver") .about("List all timelines, available to this pageserver")
.arg(tenant_id_arg.clone())) .arg(tenant_id_arg.clone()))

View File

@@ -156,7 +156,6 @@ pub struct SafekeeperConf {
pub remote_storage: Option<String>, pub remote_storage: Option<String>,
pub backup_threads: Option<u32>, pub backup_threads: Option<u32>,
pub auth_enabled: bool, pub auth_enabled: bool,
pub listen_addr: Option<String>,
} }
impl Default for SafekeeperConf { impl Default for SafekeeperConf {
@@ -170,7 +169,6 @@ impl Default for SafekeeperConf {
remote_storage: None, remote_storage: None,
backup_threads: None, backup_threads: None,
auth_enabled: false, auth_enabled: false,
listen_addr: None,
} }
} }
} }

View File

@@ -70,31 +70,24 @@ pub struct SafekeeperNode {
pub pg_connection_config: PgConnectionConfig, pub pg_connection_config: PgConnectionConfig,
pub env: LocalEnv, pub env: LocalEnv,
pub http_client: reqwest::Client, pub http_client: reqwest::Client,
pub listen_addr: String,
pub http_base_url: String, pub http_base_url: String,
} }
impl SafekeeperNode { impl SafekeeperNode {
pub fn from_env(env: &LocalEnv, conf: &SafekeeperConf) -> SafekeeperNode { pub fn from_env(env: &LocalEnv, conf: &SafekeeperConf) -> SafekeeperNode {
let listen_addr = if let Some(ref listen_addr) = conf.listen_addr {
listen_addr.clone()
} else {
"127.0.0.1".to_string()
};
SafekeeperNode { SafekeeperNode {
id: conf.id, id: conf.id,
conf: conf.clone(), conf: conf.clone(),
pg_connection_config: Self::safekeeper_connection_config(&listen_addr, conf.pg_port), pg_connection_config: Self::safekeeper_connection_config(conf.pg_port),
env: env.clone(), env: env.clone(),
http_client: reqwest::Client::new(), http_client: reqwest::Client::new(),
http_base_url: format!("http://{}:{}/v1", listen_addr, conf.http_port), http_base_url: format!("http://127.0.0.1:{}/v1", conf.http_port),
listen_addr,
} }
} }
/// Construct libpq connection string for connecting to this safekeeper. /// Construct libpq connection string for connecting to this safekeeper.
fn safekeeper_connection_config(addr: &str, port: u16) -> PgConnectionConfig { fn safekeeper_connection_config(port: u16) -> PgConnectionConfig {
PgConnectionConfig::new_host_port(url::Host::parse(addr).unwrap(), port) PgConnectionConfig::new_host_port(url::Host::parse("127.0.0.1").unwrap(), port)
} }
pub fn datadir_path_by_id(env: &LocalEnv, sk_id: NodeId) -> PathBuf { pub fn datadir_path_by_id(env: &LocalEnv, sk_id: NodeId) -> PathBuf {
@@ -118,8 +111,8 @@ impl SafekeeperNode {
); );
io::stdout().flush().unwrap(); io::stdout().flush().unwrap();
let listen_pg = format!("{}:{}", self.listen_addr, self.conf.pg_port); let listen_pg = format!("127.0.0.1:{}", self.conf.pg_port);
let listen_http = format!("{}:{}", self.listen_addr, self.conf.http_port); let listen_http = format!("127.0.0.1:{}", self.conf.http_port);
let id = self.id; let id = self.id;
let datadir = self.datadir_path(); let datadir = self.datadir_path();
@@ -146,7 +139,7 @@ impl SafekeeperNode {
availability_zone, availability_zone,
]; ];
if let Some(pg_tenant_only_port) = self.conf.pg_tenant_only_port { if let Some(pg_tenant_only_port) = self.conf.pg_tenant_only_port {
let listen_pg_tenant_only = format!("{}:{}", self.listen_addr, pg_tenant_only_port); let listen_pg_tenant_only = format!("127.0.0.1:{}", pg_tenant_only_port);
args.extend(["--listen-pg-tenant-only".to_owned(), listen_pg_tenant_only]); args.extend(["--listen-pg-tenant-only".to_owned(), listen_pg_tenant_only]);
} }
if !self.conf.sync { if !self.conf.sync {

View File

@@ -1,15 +1,15 @@
use std::{collections::HashMap, str::FromStr, time::Duration}; use std::{collections::HashMap, str::FromStr};
use clap::{Parser, Subcommand}; use clap::{Parser, Subcommand};
use hyper::{Method, StatusCode}; use hyper::Method;
use pageserver_api::{ use pageserver_api::{
controller_api::{ controller_api::{
NodeAvailabilityWrapper, NodeDescribeResponse, ShardSchedulingPolicy, NodeAvailabilityWrapper, NodeDescribeResponse, ShardSchedulingPolicy,
TenantDescribeResponse, TenantPolicyRequest, TenantDescribeResponse, TenantPolicyRequest,
}, },
models::{ models::{
LocationConfigSecondary, ShardParameters, TenantConfig, TenantConfigRequest, ShardParameters, TenantConfig, TenantConfigRequest, TenantCreateRequest,
TenantCreateRequest, TenantShardSplitRequest, TenantShardSplitResponse, TenantShardSplitRequest, TenantShardSplitResponse,
}, },
shard::{ShardStripeSize, TenantShardId}, shard::{ShardStripeSize, TenantShardId},
}; };
@@ -120,12 +120,6 @@ enum Command {
#[arg(long)] #[arg(long)]
tenant_id: TenantId, tenant_id: TenantId,
}, },
/// For a tenant which hasn't been onboarded to the storage controller yet, add it in secondary
/// mode so that it can warm up content on a pageserver.
TenantWarmup {
#[arg(long)]
tenant_id: TenantId,
},
} }
#[derive(Parser)] #[derive(Parser)]
@@ -587,94 +581,6 @@ async fn main() -> anyhow::Result<()> {
} }
println!("{table}"); println!("{table}");
} }
Command::TenantWarmup { tenant_id } => {
let describe_response = storcon_client
.dispatch::<(), TenantDescribeResponse>(
Method::GET,
format!("control/v1/tenant/{tenant_id}"),
None,
)
.await;
match describe_response {
Ok(describe) => {
if matches!(describe.policy, PlacementPolicy::Secondary) {
// Fine: it's already known to controller in secondary mode: calling
// again to put it into secondary mode won't cause problems.
} else {
anyhow::bail!("Tenant already present with policy {:?}", describe.policy);
}
}
Err(mgmt_api::Error::ApiError(StatusCode::NOT_FOUND, _)) => {
// Fine: this tenant isn't know to the storage controller yet.
}
Err(e) => {
// Unexpected API error
return Err(e.into());
}
}
vps_client
.location_config(
TenantShardId::unsharded(tenant_id),
pageserver_api::models::LocationConfig {
mode: pageserver_api::models::LocationConfigMode::Secondary,
generation: None,
secondary_conf: Some(LocationConfigSecondary { warm: true }),
shard_number: 0,
shard_count: 0,
shard_stripe_size: ShardParameters::DEFAULT_STRIPE_SIZE.0,
tenant_conf: TenantConfig::default(),
},
None,
true,
)
.await?;
let describe_response = storcon_client
.dispatch::<(), TenantDescribeResponse>(
Method::GET,
format!("control/v1/tenant/{tenant_id}"),
None,
)
.await?;
let secondary_ps_id = describe_response
.shards
.first()
.unwrap()
.node_secondary
.first()
.unwrap();
println!("Tenant {tenant_id} warming up on pageserver {secondary_ps_id}");
loop {
let (status, progress) = vps_client
.tenant_secondary_download(
TenantShardId::unsharded(tenant_id),
Some(Duration::from_secs(10)),
)
.await?;
println!(
"Progress: {}/{} layers, {}/{} bytes",
progress.layers_downloaded,
progress.layers_total,
progress.bytes_downloaded,
progress.bytes_total
);
match status {
StatusCode::OK => {
println!("Download complete");
break;
}
StatusCode::ACCEPTED => {
// Loop
}
_ => {
anyhow::bail!("Unexpected download status: {status}");
}
}
}
}
} }
Ok(()) Ok(())

View File

@@ -1,150 +0,0 @@
# Storage Controller
## Concepts
The storage controller sits between administrative API clients and pageservers, and handles the details of mapping tenants to pageserver tenant shards. For example, creating a tenant is one API call to the storage controller,
which is mapped into many API calls to many pageservers (for multiple shards, and for secondary locations).
It implements a pageserver-compatible API that may be used for CRUD operations on tenants and timelines, translating these requests into appropriate operations on the shards within a tenant, which may be on many different pageservers. Using this API, the storage controller may be used in the same way as the pageserver's administrative HTTP API, hiding
the underlying details of how data is spread across multiple nodes.
The storage controller also manages generations, high availability (via secondary locations) and live migrations for tenants under its management. This is done with a reconciliation loop pattern, where tenants have an “intent” state and a “reconcile” task that tries to make the outside world match the intent.
## APIs
The storage controllers HTTP server implements four logically separate APIs:
- `/v1/...` path is the pageserver-compatible API. This has to be at the path root because thats where clients expect to find it on a pageserver.
- `/control/v1/...` path is the storage controllers API, which enables operations such as registering and management pageservers, or executing shard splits.
- `/debug/v1/...` path contains endpoints which are either exclusively used in tests, or are for use by engineers when supporting a deployed system.
- `/upcall/v1/...` path contains endpoints that are called by pageservers. This includes the `/re-attach` and `/validate` APIs used by pageservers
to ensure data safety with generation numbers.
The API is authenticated with a JWT token, and tokens must have scope `pageserverapi` (i.e. the same scope as pageservers APIs).
See the `http.rs` file in the source for where the HTTP APIs are implemented.
## Database
The storage controller uses a postgres database to persist a subset of its state. Note that the storage controller does _not_ keep all its state in the database: this is a design choice to enable most operations to be done efficiently in memory, rather than having to read from the database. See `persistence.rs` for a more comprehensive comment explaining what we do and do not persist: a useful metaphor is that we persist objects like tenants and nodes, but we do not
persist the _relationships_ between them: the attachment state of a tenant's shards to nodes is kept in memory and
rebuilt on startup.
The file `[persistence.rs](http://persistence.rs)` contains all the code for accessing the database, and has a large doc comment that goes into more detail about exactly what we persist and why.
The `diesel` crate is used for defining models & migrations.
Running a local cluster with `cargo neon` automatically starts a vanilla postgress process to host the storage controllers database.
### Diesel tip: migrations
If you need to modify the database schema, heres how to create a migration:
- Install the diesel CLI with `cargo install diesel_cli`
- Use `diesel migration generate <name>` to create a new migration
- Populate the SQL files in the `migrations/` subdirectory
- Use `DATABASE_URL=... diesel migration run` to apply the migration you just wrote: this will update the `[schema.rs](http://schema.rs)` file automatically.
- This requires a running database: the easiest way to do that is to just run `cargo neon init ; cargo neon start`, which will leave a database available at `postgresql://localhost:1235/attachment_service`
- Commit the migration files and the changes to schema.rs
- If you need to iterate, you can rewind migrations with `diesel migration revert -a` and then `diesel migration run` again.
- The migrations are build into the storage controller binary, and automatically run at startup after it is deployed, so once youve committed a migration no further steps are needed.
## storcon_cli
The `storcon_cli` tool enables interactive management of the storage controller. This is usually
only necessary for debug, but may also be used to manage nodes (e.g. marking a node as offline).
`storcon_cli --help` includes details on commands.
# Deploying
This section is aimed at engineers deploying the storage controller outside of Neon's cloud platform, as
part of a self-hosted system.
_General note: since the default `neon_local` environment includes a storage controller, this is a useful
reference when figuring out deployment._
## Database
It is **essential** that the database used by the storage controller is durable (**do not store it on ephemeral
local disk**). This database contains pageserver generation numbers, which are essential to data safety on the pageserver.
The resource requirements for the database are very low: a single CPU core and 1GiB of memory should work well for most deployments. The physical size of the database is typically under a gigabyte.
Set the URL to the database using the `--database-url` CLI option.
There is no need to run migrations manually: the storage controller automatically applies migrations
when it starts up.
## Configure pageservers to use the storage controller
1. The pageserver `control_plane_api` and `control_plane_api_token` should be set in the `pageserver.toml` file. The API setting should
point to the "upcall" prefix, for example `http://127.0.0.1:1234/upcall/v1/` is used in neon_local clusters.
2. Create a `metadata.json` file in the same directory as `pageserver.toml`: this enables the pageserver to automatically register itself
with the storage controller when it starts up. See the example below for the format of this file.
### Example `metadata.json`
```
{"host":"acmehost.localdomain","http_host":"acmehost.localdomain","http_port":9898,"port":64000}
```
- `port` and `host` refer to the _postgres_ port and host, and these must be accessible from wherever
postgres runs.
- `http_port` and `http_host` refer to the pageserver's HTTP api, this must be accessible from where
the storage controller runs.
## Handle compute notifications.
The storage controller independently moves tenant attachments between pageservers in response to
changes such as a pageserver node becoming unavailable, or the tenant's shard count changing. To enable
postgres clients to handle such changes, the storage controller calls an API hook when a tenant's pageserver
location changes.
The hook is configured using the storage controller's `--compute-hook-url` CLI option. If the hook requires
JWT auth, the token may be provided with `--control-plane-jwt-token`. The hook will be invoked with a `PUT` request.
In the Neon cloud service, this hook is implemented by Neon's internal cloud control plane. In `neon_local` systems
the storage controller integrates directly with neon_local to reconfigure local postgres processes instead of calling
the compute hook.
When implementing an on-premise Neon deployment, you must implement a service that handles the compute hook. This is not complicated:
the request body has format of the `ComputeHookNotifyRequest` structure, provided below for convenience.
```
struct ComputeHookNotifyRequestShard {
node_id: NodeId,
shard_number: ShardNumber,
}
struct ComputeHookNotifyRequest {
tenant_id: TenantId,
stripe_size: Option<ShardStripeSize>,
shards: Vec<ComputeHookNotifyRequestShard>,
}
```
When a notification is received:
1. Modify postgres configuration for this tenant:
- set `neon.pageserver_connstr` to a comma-separated list of postgres connection strings to pageservers according to the `shards` list. The
shards identified by `NodeId` must be converted to the address+port of the node.
- if stripe_size is not None, set `neon.stripe_size` to this value
2. Send SIGHUP to postgres to reload configuration
3. Respond with 200 to the notification request. Do not return success if postgres was not updated: if an error is returned, the controller
will retry the notification until it succeeds.
### Example notification body
```
{
"tenant_id": "1f359dd625e519a1a4e8d7509690f6fc",
"stripe_size": 32768,
"shards": [
{"node_id": 344, "shard_number": 0},
{"node_id": 722, "shard_number": 1}
]
}
```

View File

@@ -10,13 +10,11 @@ libc.workspace = true
once_cell.workspace = true once_cell.workspace = true
chrono.workspace = true chrono.workspace = true
twox-hash.workspace = true twox-hash.workspace = true
measured.workspace = true
workspace_hack.workspace = true workspace_hack.workspace = true
[target.'cfg(target_os = "linux")'.dependencies] [target.'cfg(target_os = "linux")'.dependencies]
procfs.workspace = true procfs.workspace = true
measured-process.workspace = true
[dev-dependencies] [dev-dependencies]
rand = "0.8" rand = "0.8"

View File

@@ -7,19 +7,14 @@
//! use significantly less memory than this, but can only approximate the cardinality. //! use significantly less memory than this, but can only approximate the cardinality.
use std::{ use std::{
hash::{BuildHasher, BuildHasherDefault, Hash}, collections::HashMap,
sync::atomic::AtomicU8, hash::{BuildHasher, BuildHasherDefault, Hash, Hasher},
sync::{atomic::AtomicU8, Arc, RwLock},
}; };
use measured::{ use prometheus::{
label::{LabelGroupVisitor, LabelName, LabelValue, LabelVisitor}, core::{self, Describer},
metric::{ proto, Opts,
group::{Encoding, MetricValue},
name::MetricNameEncoder,
Metric, MetricType, MetricVec,
},
text::TextEncoder,
LabelGroup,
}; };
use twox_hash::xxh3; use twox_hash::xxh3;
@@ -98,25 +93,203 @@ macro_rules! register_hll {
/// ``` /// ```
/// ///
/// See <https://en.wikipedia.org/wiki/HyperLogLog#Practical_considerations> for estimates on alpha /// See <https://en.wikipedia.org/wiki/HyperLogLog#Practical_considerations> for estimates on alpha
pub type HyperLogLogVec<L, const N: usize> = MetricVec<HyperLogLogState<N>, L>; #[derive(Clone)]
pub type HyperLogLog<const N: usize> = Metric<HyperLogLogState<N>>; pub struct HyperLogLogVec<const N: usize> {
core: Arc<HyperLogLogVecCore<N>>,
pub struct HyperLogLogState<const N: usize> {
shards: [AtomicU8; N],
} }
impl<const N: usize> Default for HyperLogLogState<N> {
fn default() -> Self { struct HyperLogLogVecCore<const N: usize> {
#[allow(clippy::declare_interior_mutable_const)] pub children: RwLock<HashMap<u64, HyperLogLog<N>, BuildHasherDefault<xxh3::Hash64>>>,
const ZERO: AtomicU8 = AtomicU8::new(0); pub desc: core::Desc,
Self { shards: [ZERO; N] } pub opts: Opts,
}
impl<const N: usize> core::Collector for HyperLogLogVec<N> {
fn desc(&self) -> Vec<&core::Desc> {
vec![&self.core.desc]
}
fn collect(&self) -> Vec<proto::MetricFamily> {
let mut m = proto::MetricFamily::default();
m.set_name(self.core.desc.fq_name.clone());
m.set_help(self.core.desc.help.clone());
m.set_field_type(proto::MetricType::GAUGE);
let mut metrics = Vec::new();
for child in self.core.children.read().unwrap().values() {
child.core.collect_into(&mut metrics);
}
m.set_metric(metrics);
vec![m]
} }
} }
impl<const N: usize> MetricType for HyperLogLogState<N> { impl<const N: usize> HyperLogLogVec<N> {
type Metadata = (); /// Create a new [`HyperLogLogVec`] based on the provided
/// [`Opts`] and partitioned by the given label names. At least one label name must be
/// provided.
pub fn new(opts: Opts, label_names: &[&str]) -> prometheus::Result<Self> {
assert!(N.is_power_of_two());
let variable_names = label_names.iter().map(|s| (*s).to_owned()).collect();
let opts = opts.variable_labels(variable_names);
let desc = opts.describe()?;
let v = HyperLogLogVecCore {
children: RwLock::new(HashMap::default()),
desc,
opts,
};
Ok(Self { core: Arc::new(v) })
}
/// `get_metric_with_label_values` returns the [`HyperLogLog<P>`] for the given slice
/// of label values (same order as the VariableLabels in Desc). If that combination of
/// label values is accessed for the first time, a new [`HyperLogLog<P>`] is created.
///
/// An error is returned if the number of label values is not the same as the
/// number of VariableLabels in Desc.
pub fn get_metric_with_label_values(
&self,
vals: &[&str],
) -> prometheus::Result<HyperLogLog<N>> {
self.core.get_metric_with_label_values(vals)
}
/// `with_label_values` works as `get_metric_with_label_values`, but panics if an error
/// occurs.
pub fn with_label_values(&self, vals: &[&str]) -> HyperLogLog<N> {
self.get_metric_with_label_values(vals).unwrap()
}
} }
impl<const N: usize> HyperLogLogState<N> { impl<const N: usize> HyperLogLogVecCore<N> {
pub fn get_metric_with_label_values(
&self,
vals: &[&str],
) -> prometheus::Result<HyperLogLog<N>> {
let h = self.hash_label_values(vals)?;
if let Some(metric) = self.children.read().unwrap().get(&h).cloned() {
return Ok(metric);
}
self.get_or_create_metric(h, vals)
}
pub(crate) fn hash_label_values(&self, vals: &[&str]) -> prometheus::Result<u64> {
if vals.len() != self.desc.variable_labels.len() {
return Err(prometheus::Error::InconsistentCardinality {
expect: self.desc.variable_labels.len(),
got: vals.len(),
});
}
let mut h = xxh3::Hash64::default();
for val in vals {
h.write(val.as_bytes());
}
Ok(h.finish())
}
fn get_or_create_metric(
&self,
hash: u64,
label_values: &[&str],
) -> prometheus::Result<HyperLogLog<N>> {
let mut children = self.children.write().unwrap();
// Check exist first.
if let Some(metric) = children.get(&hash).cloned() {
return Ok(metric);
}
let metric = HyperLogLog::with_opts_and_label_values(&self.opts, label_values)?;
children.insert(hash, metric.clone());
Ok(metric)
}
}
/// HLL is a probabilistic cardinality measure.
///
/// How to use this time-series for a metric name `my_metrics_total_hll`:
///
/// ```promql
/// # harmonic mean
/// 1 / (
/// sum (
/// 2 ^ -(
/// # HLL merge operation
/// max (my_metrics_total_hll{}) by (hll_shard, other_labels...)
/// )
/// ) without (hll_shard)
/// )
/// * alpha
/// * shards_count
/// * shards_count
/// ```
///
/// If you want an estimate over time, you can use the following query:
///
/// ```promql
/// # harmonic mean
/// 1 / (
/// sum (
/// 2 ^ -(
/// # HLL merge operation
/// max (
/// max_over_time(my_metrics_total_hll{}[$__rate_interval])
/// ) by (hll_shard, other_labels...)
/// )
/// ) without (hll_shard)
/// )
/// * alpha
/// * shards_count
/// * shards_count
/// ```
///
/// In the case of low cardinality, you might want to use the linear counting approximation:
///
/// ```promql
/// # LinearCounting(m, V) = m log (m / V)
/// shards_count * ln(shards_count /
/// # calculate V = how many shards contain a 0
/// count(max (proxy_connecting_endpoints{}) by (hll_shard, protocol) == 0) without (hll_shard)
/// )
/// ```
///
/// See <https://en.wikipedia.org/wiki/HyperLogLog#Practical_considerations> for estimates on alpha
#[derive(Clone)]
pub struct HyperLogLog<const N: usize> {
core: Arc<HyperLogLogCore<N>>,
}
impl<const N: usize> HyperLogLog<N> {
/// Create a [`HyperLogLog`] with the `name` and `help` arguments.
pub fn new<S1: Into<String>, S2: Into<String>>(name: S1, help: S2) -> prometheus::Result<Self> {
assert!(N.is_power_of_two());
let opts = Opts::new(name, help);
Self::with_opts(opts)
}
/// Create a [`HyperLogLog`] with the `opts` options.
pub fn with_opts(opts: Opts) -> prometheus::Result<Self> {
Self::with_opts_and_label_values(&opts, &[])
}
fn with_opts_and_label_values(opts: &Opts, label_values: &[&str]) -> prometheus::Result<Self> {
let desc = opts.describe()?;
let labels = make_label_pairs(&desc, label_values)?;
let v = HyperLogLogCore {
shards: [0; N].map(AtomicU8::new),
desc,
labels,
};
Ok(Self { core: Arc::new(v) })
}
pub fn measure(&self, item: &impl Hash) { pub fn measure(&self, item: &impl Hash) {
// changing the hasher will break compatibility with previous measurements. // changing the hasher will break compatibility with previous measurements.
self.record(BuildHasherDefault::<xxh3::Hash64>::default().hash_one(item)); self.record(BuildHasherDefault::<xxh3::Hash64>::default().hash_one(item));
@@ -126,11 +299,42 @@ impl<const N: usize> HyperLogLogState<N> {
let p = N.ilog2() as u8; let p = N.ilog2() as u8;
let j = hash & (N as u64 - 1); let j = hash & (N as u64 - 1);
let rho = (hash >> p).leading_zeros() as u8 + 1 - p; let rho = (hash >> p).leading_zeros() as u8 + 1 - p;
self.shards[j as usize].fetch_max(rho, std::sync::atomic::Ordering::Relaxed); self.core.shards[j as usize].fetch_max(rho, std::sync::atomic::Ordering::Relaxed);
}
}
struct HyperLogLogCore<const N: usize> {
shards: [AtomicU8; N],
desc: core::Desc,
labels: Vec<proto::LabelPair>,
}
impl<const N: usize> core::Collector for HyperLogLog<N> {
fn desc(&self) -> Vec<&core::Desc> {
vec![&self.core.desc]
} }
fn take_sample(&self) -> [u8; N] { fn collect(&self) -> Vec<proto::MetricFamily> {
self.shards.each_ref().map(|x| { let mut m = proto::MetricFamily::default();
m.set_name(self.core.desc.fq_name.clone());
m.set_help(self.core.desc.help.clone());
m.set_field_type(proto::MetricType::GAUGE);
let mut metrics = Vec::new();
self.core.collect_into(&mut metrics);
m.set_metric(metrics);
vec![m]
}
}
impl<const N: usize> HyperLogLogCore<N> {
fn collect_into(&self, metrics: &mut Vec<proto::Metric>) {
self.shards.iter().enumerate().for_each(|(i, x)| {
let mut shard_label = proto::LabelPair::default();
shard_label.set_name("hll_shard".to_owned());
shard_label.set_value(format!("{i}"));
// We reset the counter to 0 so we can perform a cardinality measure over any time slice in prometheus. // We reset the counter to 0 so we can perform a cardinality measure over any time slice in prometheus.
// This seems like it would be a race condition, // This seems like it would be a race condition,
@@ -140,90 +344,85 @@ impl<const N: usize> HyperLogLogState<N> {
// TODO: maybe we shouldn't reset this on every collect, instead, only after a time window. // TODO: maybe we shouldn't reset this on every collect, instead, only after a time window.
// this would mean that a dev port-forwarding the metrics url won't break the sampling. // this would mean that a dev port-forwarding the metrics url won't break the sampling.
x.swap(0, std::sync::atomic::Ordering::Relaxed) let v = x.swap(0, std::sync::atomic::Ordering::Relaxed);
let mut m = proto::Metric::default();
let mut c = proto::Gauge::default();
c.set_value(v as f64);
m.set_gauge(c);
let mut labels = Vec::with_capacity(self.labels.len() + 1);
labels.extend_from_slice(&self.labels);
labels.push(shard_label);
m.set_label(labels);
metrics.push(m);
}) })
} }
} }
impl<W: std::io::Write, const N: usize> measured::metric::MetricEncoding<TextEncoder<W>>
for HyperLogLogState<N> fn make_label_pairs(
{ desc: &core::Desc,
fn write_type( label_values: &[&str],
name: impl MetricNameEncoder, ) -> prometheus::Result<Vec<proto::LabelPair>> {
enc: &mut TextEncoder<W>, if desc.variable_labels.len() != label_values.len() {
) -> Result<(), std::io::Error> { return Err(prometheus::Error::InconsistentCardinality {
enc.write_type(&name, measured::text::MetricType::Gauge) expect: desc.variable_labels.len(),
got: label_values.len(),
});
} }
fn collect_into(
&self,
_: &(),
labels: impl LabelGroup,
name: impl MetricNameEncoder,
enc: &mut TextEncoder<W>,
) -> Result<(), std::io::Error> {
struct I64(i64);
impl LabelValue for I64 {
fn visit<V: LabelVisitor>(&self, v: V) -> V::Output {
v.write_int(self.0)
}
}
struct HllShardLabel { let total_len = desc.variable_labels.len() + desc.const_label_pairs.len();
hll_shard: i64, if total_len == 0 {
} return Ok(vec![]);
impl LabelGroup for HllShardLabel {
fn visit_values(&self, v: &mut impl LabelGroupVisitor) {
const LE: &LabelName = LabelName::from_str("hll_shard");
v.write_value(LE, &I64(self.hll_shard));
}
}
self.take_sample()
.into_iter()
.enumerate()
.try_for_each(|(hll_shard, val)| {
enc.write_metric_value(
name.by_ref(),
labels.by_ref().compose_with(HllShardLabel {
hll_shard: hll_shard as i64,
}),
MetricValue::Int(val as i64),
)
})
} }
if desc.variable_labels.is_empty() {
return Ok(desc.const_label_pairs.clone());
}
let mut label_pairs = Vec::with_capacity(total_len);
for (i, n) in desc.variable_labels.iter().enumerate() {
let mut label_pair = proto::LabelPair::default();
label_pair.set_name(n.clone());
label_pair.set_value(label_values[i].to_owned());
label_pairs.push(label_pair);
}
for label_pair in &desc.const_label_pairs {
label_pairs.push(label_pair.clone());
}
label_pairs.sort();
Ok(label_pairs)
} }
#[cfg(test)] #[cfg(test)]
mod tests { mod tests {
use std::collections::HashSet; use std::collections::HashSet;
use measured::{label::StaticLabelSet, FixedCardinalityLabel}; use prometheus::{proto, Opts};
use rand::{rngs::StdRng, Rng, SeedableRng}; use rand::{rngs::StdRng, Rng, SeedableRng};
use rand_distr::{Distribution, Zipf}; use rand_distr::{Distribution, Zipf};
use crate::HyperLogLogVec; use crate::HyperLogLogVec;
#[derive(FixedCardinalityLabel, Clone, Copy)] fn collect(hll: &HyperLogLogVec<32>) -> Vec<proto::Metric> {
#[label(singleton = "x")] let mut metrics = vec![];
enum Label { hll.core
A, .children
B, .read()
.unwrap()
.values()
.for_each(|c| c.core.collect_into(&mut metrics));
metrics
} }
fn get_cardinality(metrics: &[proto::Metric], filter: impl Fn(&proto::Metric) -> bool) -> f64 {
fn collect(hll: &HyperLogLogVec<StaticLabelSet<Label>, 32>) -> ([u8; 32], [u8; 32]) {
// cannot go through the `hll.collect_family_into` interface yet...
// need to see if I can fix the conflicting impls problem in measured.
(
hll.get_metric(hll.with_labels(Label::A)).take_sample(),
hll.get_metric(hll.with_labels(Label::B)).take_sample(),
)
}
fn get_cardinality(samples: &[[u8; 32]]) -> f64 {
let mut buckets = [0.0; 32]; let mut buckets = [0.0; 32];
for &sample in samples { for metric in metrics.chunks_exact(32) {
for (i, m) in sample.into_iter().enumerate() { if filter(&metric[0]) {
buckets[i] = f64::max(buckets[i], m as f64); for (i, m) in metric.iter().enumerate() {
buckets[i] = f64::max(buckets[i], m.get_gauge().get_value());
}
} }
} }
@@ -238,7 +437,7 @@ mod tests {
} }
fn test_cardinality(n: usize, dist: impl Distribution<f64>) -> ([usize; 3], [f64; 3]) { fn test_cardinality(n: usize, dist: impl Distribution<f64>) -> ([usize; 3], [f64; 3]) {
let hll = HyperLogLogVec::<StaticLabelSet<Label>, 32>::new(); let hll = HyperLogLogVec::<32>::new(Opts::new("foo", "bar"), &["x"]).unwrap();
let mut iter = StdRng::seed_from_u64(0x2024_0112).sample_iter(dist); let mut iter = StdRng::seed_from_u64(0x2024_0112).sample_iter(dist);
let mut set_a = HashSet::new(); let mut set_a = HashSet::new();
@@ -246,20 +445,18 @@ mod tests {
for x in iter.by_ref().take(n) { for x in iter.by_ref().take(n) {
set_a.insert(x.to_bits()); set_a.insert(x.to_bits());
hll.get_metric(hll.with_labels(Label::A)) hll.with_label_values(&["a"]).measure(&x.to_bits());
.measure(&x.to_bits());
} }
for x in iter.by_ref().take(n) { for x in iter.by_ref().take(n) {
set_b.insert(x.to_bits()); set_b.insert(x.to_bits());
hll.get_metric(hll.with_labels(Label::B)) hll.with_label_values(&["b"]).measure(&x.to_bits());
.measure(&x.to_bits());
} }
let merge = &set_a | &set_b; let merge = &set_a | &set_b;
let (a, b) = collect(&hll); let metrics = collect(&hll);
let len = get_cardinality(&[a, b]); let len = get_cardinality(&metrics, |_| true);
let len_a = get_cardinality(&[a]); let len_a = get_cardinality(&metrics, |l| l.get_label()[0].get_value() == "a");
let len_b = get_cardinality(&[b]); let len_b = get_cardinality(&metrics, |l| l.get_label()[0].get_value() == "b");
([merge.len(), set_a.len(), set_b.len()], [len, len_a, len_b]) ([merge.len(), set_a.len(), set_b.len()], [len, len_a, len_b])
} }

View File

@@ -4,17 +4,6 @@
//! a default registry. //! a default registry.
#![deny(clippy::undocumented_unsafe_blocks)] #![deny(clippy::undocumented_unsafe_blocks)]
use measured::{
label::{LabelGroupSet, LabelGroupVisitor, LabelName, NoLabels},
metric::{
counter::CounterState,
gauge::GaugeState,
group::{Encoding, MetricValue},
name::{MetricName, MetricNameEncoder},
MetricEncoding, MetricFamilyEncoding,
},
FixedCardinalityLabel, LabelGroup, MetricGroup,
};
use once_cell::sync::Lazy; use once_cell::sync::Lazy;
use prometheus::core::{ use prometheus::core::{
Atomic, AtomicU64, Collector, GenericCounter, GenericCounterVec, GenericGauge, GenericGaugeVec, Atomic, AtomicU64, Collector, GenericCounter, GenericCounterVec, GenericGauge, GenericGaugeVec,
@@ -22,7 +11,6 @@ use prometheus::core::{
pub use prometheus::opts; pub use prometheus::opts;
pub use prometheus::register; pub use prometheus::register;
pub use prometheus::Error; pub use prometheus::Error;
use prometheus::Registry;
pub use prometheus::{core, default_registry, proto}; pub use prometheus::{core, default_registry, proto};
pub use prometheus::{exponential_buckets, linear_buckets}; pub use prometheus::{exponential_buckets, linear_buckets};
pub use prometheus::{register_counter_vec, Counter, CounterVec}; pub use prometheus::{register_counter_vec, Counter, CounterVec};
@@ -35,12 +23,13 @@ pub use prometheus::{register_int_counter_vec, IntCounterVec};
pub use prometheus::{register_int_gauge, IntGauge}; pub use prometheus::{register_int_gauge, IntGauge};
pub use prometheus::{register_int_gauge_vec, IntGaugeVec}; pub use prometheus::{register_int_gauge_vec, IntGaugeVec};
pub use prometheus::{Encoder, TextEncoder}; pub use prometheus::{Encoder, TextEncoder};
use prometheus::{Registry, Result};
pub mod launch_timestamp; pub mod launch_timestamp;
mod wrappers; mod wrappers;
pub use wrappers::{CountedReader, CountedWriter}; pub use wrappers::{CountedReader, CountedWriter};
mod hll; mod hll;
pub use hll::{HyperLogLog, HyperLogLogState, HyperLogLogVec}; pub use hll::{HyperLogLog, HyperLogLogVec};
#[cfg(target_os = "linux")] #[cfg(target_os = "linux")]
pub mod more_process_metrics; pub mod more_process_metrics;
@@ -70,7 +59,7 @@ static INTERNAL_REGISTRY: Lazy<Registry> = Lazy::new(Registry::new);
/// Register a collector in the internal registry. MUST be called before the first call to `gather()`. /// Register a collector in the internal registry. MUST be called before the first call to `gather()`.
/// Otherwise, we can have a deadlock in the `gather()` call, trying to register a new collector /// Otherwise, we can have a deadlock in the `gather()` call, trying to register a new collector
/// while holding the lock. /// while holding the lock.
pub fn register_internal(c: Box<dyn Collector>) -> prometheus::Result<()> { pub fn register_internal(c: Box<dyn Collector>) -> Result<()> {
INTERNAL_REGISTRY.register(c) INTERNAL_REGISTRY.register(c)
} }
@@ -107,127 +96,6 @@ pub const DISK_WRITE_SECONDS_BUCKETS: &[f64] = &[
0.000_050, 0.000_100, 0.000_500, 0.001, 0.003, 0.005, 0.01, 0.05, 0.1, 0.3, 0.5, 0.000_050, 0.000_100, 0.000_500, 0.001, 0.003, 0.005, 0.01, 0.05, 0.1, 0.3, 0.5,
]; ];
pub struct BuildInfo {
pub revision: &'static str,
pub build_tag: &'static str,
}
// todo: allow label group without the set
impl LabelGroup for BuildInfo {
fn visit_values(&self, v: &mut impl LabelGroupVisitor) {
const REVISION: &LabelName = LabelName::from_str("revision");
v.write_value(REVISION, &self.revision);
const BUILD_TAG: &LabelName = LabelName::from_str("build_tag");
v.write_value(BUILD_TAG, &self.build_tag);
}
}
impl<T: Encoding> MetricFamilyEncoding<T> for BuildInfo
where
GaugeState: MetricEncoding<T>,
{
fn collect_family_into(
&self,
name: impl measured::metric::name::MetricNameEncoder,
enc: &mut T,
) -> Result<(), T::Err> {
enc.write_help(&name, "Build/version information")?;
GaugeState::write_type(&name, enc)?;
GaugeState {
count: std::sync::atomic::AtomicI64::new(1),
}
.collect_into(&(), self, name, enc)
}
}
#[derive(MetricGroup)]
#[metric(new(build_info: BuildInfo))]
pub struct NeonMetrics {
#[cfg(target_os = "linux")]
#[metric(namespace = "process")]
#[metric(init = measured_process::ProcessCollector::for_self())]
process: measured_process::ProcessCollector,
#[metric(namespace = "libmetrics")]
#[metric(init = LibMetrics::new(build_info))]
libmetrics: LibMetrics,
}
#[derive(MetricGroup)]
#[metric(new(build_info: BuildInfo))]
pub struct LibMetrics {
#[metric(init = build_info)]
build_info: BuildInfo,
#[metric(flatten)]
rusage: Rusage,
serve_count: CollectionCounter,
}
fn write_gauge<Enc: Encoding>(
x: i64,
labels: impl LabelGroup,
name: impl MetricNameEncoder,
enc: &mut Enc,
) -> Result<(), Enc::Err> {
enc.write_metric_value(name, labels, MetricValue::Int(x))
}
#[derive(Default)]
struct Rusage;
#[derive(FixedCardinalityLabel, Clone, Copy)]
#[label(singleton = "io_operation")]
enum IoOp {
Read,
Write,
}
impl<T: Encoding> MetricGroup<T> for Rusage
where
GaugeState: MetricEncoding<T>,
{
fn collect_group_into(&self, enc: &mut T) -> Result<(), T::Err> {
const DISK_IO: &MetricName = MetricName::from_str("disk_io_bytes_total");
const MAXRSS: &MetricName = MetricName::from_str("maxrss_kb");
let ru = get_rusage_stats();
enc.write_help(
DISK_IO,
"Bytes written and read from disk, grouped by the operation (read|write)",
)?;
GaugeState::write_type(DISK_IO, enc)?;
write_gauge(ru.ru_inblock * BYTES_IN_BLOCK, IoOp::Read, DISK_IO, enc)?;
write_gauge(ru.ru_oublock * BYTES_IN_BLOCK, IoOp::Write, DISK_IO, enc)?;
enc.write_help(MAXRSS, "Memory usage (Maximum Resident Set Size)")?;
GaugeState::write_type(MAXRSS, enc)?;
write_gauge(ru.ru_maxrss, IoOp::Read, MAXRSS, enc)?;
Ok(())
}
}
#[derive(Default)]
struct CollectionCounter(CounterState);
impl<T: Encoding> MetricFamilyEncoding<T> for CollectionCounter
where
CounterState: MetricEncoding<T>,
{
fn collect_family_into(
&self,
name: impl measured::metric::name::MetricNameEncoder,
enc: &mut T,
) -> Result<(), T::Err> {
self.0.inc();
enc.write_help(&name, "Number of metric requests made")?;
self.0.collect_into(&(), NoLabels, name, enc)
}
}
pub fn set_build_info_metric(revision: &str, build_tag: &str) { pub fn set_build_info_metric(revision: &str, build_tag: &str) {
let metric = register_int_gauge_vec!( let metric = register_int_gauge_vec!(
"libmetrics_build_info", "libmetrics_build_info",
@@ -237,7 +105,6 @@ pub fn set_build_info_metric(revision: &str, build_tag: &str) {
.expect("Failed to register build info metric"); .expect("Failed to register build info metric");
metric.with_label_values(&[revision, build_tag]).set(1); metric.with_label_values(&[revision, build_tag]).set(1);
} }
const BYTES_IN_BLOCK: i64 = 512;
// Records I/O stats in a "cross-platform" way. // Records I/O stats in a "cross-platform" way.
// Compiles both on macOS and Linux, but current macOS implementation always returns 0 as values for I/O stats. // Compiles both on macOS and Linux, but current macOS implementation always returns 0 as values for I/O stats.
@@ -250,6 +117,7 @@ const BYTES_IN_BLOCK: i64 = 512;
fn update_rusage_metrics() { fn update_rusage_metrics() {
let rusage_stats = get_rusage_stats(); let rusage_stats = get_rusage_stats();
const BYTES_IN_BLOCK: i64 = 512;
DISK_IO_BYTES DISK_IO_BYTES
.with_label_values(&["read"]) .with_label_values(&["read"])
.set(rusage_stats.ru_inblock * BYTES_IN_BLOCK); .set(rusage_stats.ru_inblock * BYTES_IN_BLOCK);
@@ -283,7 +151,6 @@ macro_rules! register_int_counter_pair_vec {
} }
}}; }};
} }
/// Create an [`IntCounterPair`] and registers to default registry. /// Create an [`IntCounterPair`] and registers to default registry.
#[macro_export(local_inner_macros)] #[macro_export(local_inner_macros)]
macro_rules! register_int_counter_pair { macro_rules! register_int_counter_pair {
@@ -321,10 +188,7 @@ impl<P: Atomic> GenericCounterPairVec<P> {
/// ///
/// An error is returned if the number of label values is not the same as the /// An error is returned if the number of label values is not the same as the
/// number of VariableLabels in Desc. /// number of VariableLabels in Desc.
pub fn get_metric_with_label_values( pub fn get_metric_with_label_values(&self, vals: &[&str]) -> Result<GenericCounterPair<P>> {
&self,
vals: &[&str],
) -> prometheus::Result<GenericCounterPair<P>> {
Ok(GenericCounterPair { Ok(GenericCounterPair {
inc: self.inc.get_metric_with_label_values(vals)?, inc: self.inc.get_metric_with_label_values(vals)?,
dec: self.dec.get_metric_with_label_values(vals)?, dec: self.dec.get_metric_with_label_values(vals)?,
@@ -337,7 +201,7 @@ impl<P: Atomic> GenericCounterPairVec<P> {
self.get_metric_with_label_values(vals).unwrap() self.get_metric_with_label_values(vals).unwrap()
} }
pub fn remove_label_values(&self, res: &mut [prometheus::Result<()>; 2], vals: &[&str]) { pub fn remove_label_values(&self, res: &mut [Result<()>; 2], vals: &[&str]) {
res[0] = self.inc.remove_label_values(vals); res[0] = self.inc.remove_label_values(vals);
res[1] = self.dec.remove_label_values(vals); res[1] = self.dec.remove_label_values(vals);
} }
@@ -421,171 +285,3 @@ pub type IntCounterPair = GenericCounterPair<AtomicU64>;
/// A guard for [`IntCounterPair`] that will decrement the gauge on drop /// A guard for [`IntCounterPair`] that will decrement the gauge on drop
pub type IntCounterPairGuard = GenericCounterPairGuard<AtomicU64>; pub type IntCounterPairGuard = GenericCounterPairGuard<AtomicU64>;
pub trait CounterPairAssoc {
const INC_NAME: &'static MetricName;
const DEC_NAME: &'static MetricName;
const INC_HELP: &'static str;
const DEC_HELP: &'static str;
type LabelGroupSet: LabelGroupSet;
}
pub struct CounterPairVec<A: CounterPairAssoc> {
vec: measured::metric::MetricVec<MeasuredCounterPairState, A::LabelGroupSet>,
}
impl<A: CounterPairAssoc> Default for CounterPairVec<A>
where
A::LabelGroupSet: Default,
{
fn default() -> Self {
Self {
vec: Default::default(),
}
}
}
impl<A: CounterPairAssoc> CounterPairVec<A> {
pub fn guard(
&self,
labels: <A::LabelGroupSet as LabelGroupSet>::Group<'_>,
) -> MeasuredCounterPairGuard<'_, A> {
let id = self.vec.with_labels(labels);
self.vec.get_metric(id).inc.inc();
MeasuredCounterPairGuard { vec: &self.vec, id }
}
pub fn inc(&self, labels: <A::LabelGroupSet as LabelGroupSet>::Group<'_>) {
let id = self.vec.with_labels(labels);
self.vec.get_metric(id).inc.inc();
}
pub fn dec(&self, labels: <A::LabelGroupSet as LabelGroupSet>::Group<'_>) {
let id = self.vec.with_labels(labels);
self.vec.get_metric(id).dec.inc();
}
pub fn remove_metric(
&self,
labels: <A::LabelGroupSet as LabelGroupSet>::Group<'_>,
) -> Option<MeasuredCounterPairState> {
let id = self.vec.with_labels(labels);
self.vec.remove_metric(id)
}
}
impl<T, A> ::measured::metric::group::MetricGroup<T> for CounterPairVec<A>
where
T: ::measured::metric::group::Encoding,
A: CounterPairAssoc,
::measured::metric::counter::CounterState: ::measured::metric::MetricEncoding<T>,
{
fn collect_group_into(&self, enc: &mut T) -> Result<(), T::Err> {
// write decrement first to avoid a race condition where inc - dec < 0
T::write_help(enc, A::DEC_NAME, A::DEC_HELP)?;
self.vec
.collect_family_into(A::DEC_NAME, &mut Dec(&mut *enc))?;
T::write_help(enc, A::INC_NAME, A::INC_HELP)?;
self.vec
.collect_family_into(A::INC_NAME, &mut Inc(&mut *enc))?;
Ok(())
}
}
#[derive(MetricGroup, Default)]
pub struct MeasuredCounterPairState {
pub inc: CounterState,
pub dec: CounterState,
}
impl measured::metric::MetricType for MeasuredCounterPairState {
type Metadata = ();
}
pub struct MeasuredCounterPairGuard<'a, A: CounterPairAssoc> {
vec: &'a measured::metric::MetricVec<MeasuredCounterPairState, A::LabelGroupSet>,
id: measured::metric::LabelId<A::LabelGroupSet>,
}
impl<A: CounterPairAssoc> Drop for MeasuredCounterPairGuard<'_, A> {
fn drop(&mut self) {
self.vec.get_metric(self.id).dec.inc();
}
}
/// [`MetricEncoding`] for [`MeasuredCounterPairState`] that only writes the inc counter to the inner encoder.
struct Inc<T>(T);
/// [`MetricEncoding`] for [`MeasuredCounterPairState`] that only writes the dec counter to the inner encoder.
struct Dec<T>(T);
impl<T: Encoding> Encoding for Inc<T> {
type Err = T::Err;
fn write_help(&mut self, name: impl MetricNameEncoder, help: &str) -> Result<(), Self::Err> {
self.0.write_help(name, help)
}
fn write_metric_value(
&mut self,
name: impl MetricNameEncoder,
labels: impl LabelGroup,
value: MetricValue,
) -> Result<(), Self::Err> {
self.0.write_metric_value(name, labels, value)
}
}
impl<T: Encoding> MetricEncoding<Inc<T>> for MeasuredCounterPairState
where
CounterState: MetricEncoding<T>,
{
fn write_type(name: impl MetricNameEncoder, enc: &mut Inc<T>) -> Result<(), T::Err> {
CounterState::write_type(name, &mut enc.0)
}
fn collect_into(
&self,
metadata: &(),
labels: impl LabelGroup,
name: impl MetricNameEncoder,
enc: &mut Inc<T>,
) -> Result<(), T::Err> {
self.inc.collect_into(metadata, labels, name, &mut enc.0)
}
}
impl<T: Encoding> Encoding for Dec<T> {
type Err = T::Err;
fn write_help(&mut self, name: impl MetricNameEncoder, help: &str) -> Result<(), Self::Err> {
self.0.write_help(name, help)
}
fn write_metric_value(
&mut self,
name: impl MetricNameEncoder,
labels: impl LabelGroup,
value: MetricValue,
) -> Result<(), Self::Err> {
self.0.write_metric_value(name, labels, value)
}
}
/// Write the dec counter to the encoder
impl<T: Encoding> MetricEncoding<Dec<T>> for MeasuredCounterPairState
where
CounterState: MetricEncoding<T>,
{
fn write_type(name: impl MetricNameEncoder, enc: &mut Dec<T>) -> Result<(), T::Err> {
CounterState::write_type(name, &mut enc.0)
}
fn collect_into(
&self,
metadata: &(),
labels: impl LabelGroup,
name: impl MetricNameEncoder,
enc: &mut Dec<T>,
) -> Result<(), T::Err> {
self.dec.collect_into(metadata, labels, name, &mut enc.0)
}
}

View File

@@ -20,7 +20,6 @@ use utils::{
history_buffer::HistoryBufferWithDropCounter, history_buffer::HistoryBufferWithDropCounter,
id::{NodeId, TenantId, TimelineId}, id::{NodeId, TenantId, TimelineId},
lsn::Lsn, lsn::Lsn,
serde_system_time,
}; };
use crate::controller_api::PlacementPolicy; use crate::controller_api::PlacementPolicy;
@@ -747,18 +746,10 @@ pub struct TimelineGcRequest {
pub gc_horizon: Option<u64>, pub gc_horizon: Option<u64>,
} }
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct WalRedoManagerProcessStatus {
pub pid: u32,
/// The strum-generated `into::<&'static str>()` for `pageserver::walredo::ProcessKind`.
/// `ProcessKind` are a transitory thing, so, they have no enum representation in `pageserver_api`.
pub kind: Cow<'static, str>,
}
#[derive(Debug, Clone, Serialize, Deserialize)] #[derive(Debug, Clone, Serialize, Deserialize)]
pub struct WalRedoManagerStatus { pub struct WalRedoManagerStatus {
pub last_redo_at: Option<chrono::DateTime<chrono::Utc>>, pub last_redo_at: Option<chrono::DateTime<chrono::Utc>>,
pub process: Option<WalRedoManagerProcessStatus>, pub pid: Option<u32>,
} }
/// The progress of a secondary tenant is mostly useful when doing a long running download: e.g. initiating /// The progress of a secondary tenant is mostly useful when doing a long running download: e.g. initiating
@@ -767,7 +758,11 @@ pub struct WalRedoManagerStatus {
#[derive(Default, Debug, Serialize, Deserialize, Clone)] #[derive(Default, Debug, Serialize, Deserialize, Clone)]
pub struct SecondaryProgress { pub struct SecondaryProgress {
/// The remote storage LastModified time of the heatmap object we last downloaded. /// The remote storage LastModified time of the heatmap object we last downloaded.
pub heatmap_mtime: Option<serde_system_time::SystemTime>, #[serde(
serialize_with = "opt_ser_rfc3339_millis",
deserialize_with = "opt_deser_rfc3339_millis"
)]
pub heatmap_mtime: Option<SystemTime>,
/// The number of layers currently on-disk /// The number of layers currently on-disk
pub layers_downloaded: usize, pub layers_downloaded: usize,
@@ -780,6 +775,29 @@ pub struct SecondaryProgress {
pub bytes_total: u64, pub bytes_total: u64,
} }
fn opt_ser_rfc3339_millis<S: serde::Serializer>(
ts: &Option<SystemTime>,
serializer: S,
) -> Result<S::Ok, S::Error> {
match ts {
Some(ts) => serializer.collect_str(&humantime::format_rfc3339_millis(*ts)),
None => serializer.serialize_none(),
}
}
fn opt_deser_rfc3339_millis<'de, D>(deserializer: D) -> Result<Option<SystemTime>, D::Error>
where
D: serde::de::Deserializer<'de>,
{
let s: Option<String> = serde::de::Deserialize::deserialize(deserializer)?;
match s {
None => Ok(None),
Some(s) => humantime::parse_rfc3339(&s)
.map_err(serde::de::Error::custom)
.map(Some),
}
}
pub mod virtual_file { pub mod virtual_file {
#[derive( #[derive(
Copy, Copy,

View File

@@ -1,4 +1,4 @@
use utils::serde_system_time::SystemTime; use std::time::SystemTime;
/// Pageserver current utilization and scoring for how good candidate the pageserver would be for /// Pageserver current utilization and scoring for how good candidate the pageserver would be for
/// the next tenant. /// the next tenant.
@@ -21,9 +21,28 @@ pub struct PageserverUtilization {
/// When was this snapshot captured, pageserver local time. /// When was this snapshot captured, pageserver local time.
/// ///
/// Use millis to give confidence that the value is regenerated often enough. /// Use millis to give confidence that the value is regenerated often enough.
#[serde(
serialize_with = "ser_rfc3339_millis",
deserialize_with = "deser_rfc3339_millis"
)]
pub captured_at: SystemTime, pub captured_at: SystemTime,
} }
fn ser_rfc3339_millis<S: serde::Serializer>(
ts: &SystemTime,
serializer: S,
) -> Result<S::Ok, S::Error> {
serializer.collect_str(&humantime::format_rfc3339_millis(*ts))
}
fn deser_rfc3339_millis<'de, D>(deserializer: D) -> Result<SystemTime, D::Error>
where
D: serde::de::Deserializer<'de>,
{
let s: String = serde::de::Deserialize::deserialize(deserializer)?;
humantime::parse_rfc3339(&s).map_err(serde::de::Error::custom)
}
/// openapi knows only `format: int64`, so avoid outputting a non-parseable value by generated clients. /// openapi knows only `format: int64`, so avoid outputting a non-parseable value by generated clients.
/// ///
/// Instead of newtype, use this because a newtype would get require handling deserializing values /// Instead of newtype, use this because a newtype would get require handling deserializing values
@@ -50,9 +69,7 @@ mod tests {
disk_usage_bytes: u64::MAX, disk_usage_bytes: u64::MAX,
free_space_bytes: 0, free_space_bytes: 0,
utilization_score: u64::MAX, utilization_score: u64::MAX,
captured_at: SystemTime( captured_at: SystemTime::UNIX_EPOCH + Duration::from_secs(1708509779),
std::time::SystemTime::UNIX_EPOCH + Duration::from_secs(1708509779),
),
}; };
let s = serde_json::to_string(&doc).unwrap(); let s = serde_json::to_string(&doc).unwrap();

View File

@@ -8,89 +8,12 @@ use hex::FromHex;
use serde::{Deserialize, Serialize}; use serde::{Deserialize, Serialize};
use utils::id::TenantId; use utils::id::TenantId;
/// See docs/rfcs/031-sharding-static.md for an overview of sharding.
///
/// This module contains a variety of types used to represent the concept of sharding
/// a Neon tenant across multiple physical shards. Since there are quite a few of these,
/// we provide an summary here.
///
/// Types used to describe shards:
/// - [`ShardCount`] describes how many shards make up a tenant, plus the magic `unsharded` value
/// which identifies a tenant which is not shard-aware. This means its storage paths do not include
/// a shard suffix.
/// - [`ShardNumber`] is simply the zero-based index of a shard within a tenant.
/// - [`ShardIndex`] is the 2-tuple of `ShardCount` and `ShardNumber`, it's just like a `TenantShardId`
/// without the tenant ID. This is useful for things that are implicitly scoped to a particular
/// tenant, such as layer files.
/// - [`ShardIdentity`]` is the full description of a particular shard's parameters, in sufficient
/// detail to convert a [`Key`] to a [`ShardNumber`] when deciding where to write/read.
/// - The [`ShardSlug`] is a terse formatter for ShardCount and ShardNumber, written as
/// four hex digits. An unsharded tenant is `0000`.
/// - [`TenantShardId`] is the unique ID of a particular shard within a particular tenant
///
/// Types used to describe the parameters for data distribution in a sharded tenant:
/// - [`ShardStripeSize`] controls how long contiguous runs of [`Key`]s (stripes) are when distributed across
/// multiple shards. Its value is given in 8kiB pages.
/// - [`ShardLayout`] describes the data distribution scheme, and at time of writing is
/// always zero: this is provided for future upgrades that might introduce different
/// data distribution schemes.
///
/// Examples:
/// - A legacy unsharded tenant has one shard with ShardCount(0), ShardNumber(0), and its slug is 0000
/// - A single sharded tenant has one shard with ShardCount(1), ShardNumber(0), and its slug is 0001
/// - In a tenant with 4 shards, each shard has ShardCount(N), ShardNumber(i) where i in 0..N-1 (inclusive),
/// and their slugs are 0004, 0104, 0204, and 0304.
#[derive(Ord, PartialOrd, Eq, PartialEq, Clone, Copy, Serialize, Deserialize, Debug, Hash)] #[derive(Ord, PartialOrd, Eq, PartialEq, Clone, Copy, Serialize, Deserialize, Debug, Hash)]
pub struct ShardNumber(pub u8); pub struct ShardNumber(pub u8);
#[derive(Ord, PartialOrd, Eq, PartialEq, Clone, Copy, Serialize, Deserialize, Debug, Hash)] #[derive(Ord, PartialOrd, Eq, PartialEq, Clone, Copy, Serialize, Deserialize, Debug, Hash)]
pub struct ShardCount(u8); pub struct ShardCount(u8);
/// Combination of ShardNumber and ShardCount. For use within the context of a particular tenant,
/// when we need to know which shard we're dealing with, but do not need to know the full
/// ShardIdentity (because we won't be doing any page->shard mapping), and do not need to know
/// the fully qualified TenantShardId.
#[derive(Eq, PartialEq, PartialOrd, Ord, Clone, Copy, Hash)]
pub struct ShardIndex {
pub shard_number: ShardNumber,
pub shard_count: ShardCount,
}
/// The ShardIdentity contains enough information to map a [`Key`] to a [`ShardNumber`],
/// and to check whether that [`ShardNumber`] is the same as the current shard.
#[derive(Clone, Copy, Serialize, Deserialize, Eq, PartialEq, Debug)]
pub struct ShardIdentity {
pub number: ShardNumber,
pub count: ShardCount,
pub stripe_size: ShardStripeSize,
layout: ShardLayout,
}
/// Formatting helper, for generating the `shard_id` label in traces.
struct ShardSlug<'a>(&'a TenantShardId);
/// TenantShardId globally identifies a particular shard in a particular tenant.
///
/// These are written as `<TenantId>-<ShardSlug>`, for example:
/// # The second shard in a two-shard tenant
/// 072f1291a5310026820b2fe4b2968934-0102
///
/// If the `ShardCount` is _unsharded_, the `TenantShardId` is written without
/// a shard suffix and is equivalent to the encoding of a `TenantId`: this enables
/// an unsharded [`TenantShardId`] to be used interchangably with a [`TenantId`].
///
/// The human-readable encoding of an unsharded TenantShardId, such as used in API URLs,
/// is both forward and backward compatible with TenantId: a legacy TenantId can be
/// decoded as a TenantShardId, and when re-encoded it will be parseable
/// as a TenantId.
#[derive(Eq, PartialEq, PartialOrd, Ord, Clone, Copy, Hash)]
pub struct TenantShardId {
pub tenant_id: TenantId,
pub shard_number: ShardNumber,
pub shard_count: ShardCount,
}
impl ShardCount { impl ShardCount {
pub const MAX: Self = Self(u8::MAX); pub const MAX: Self = Self(u8::MAX);
@@ -115,7 +38,6 @@ impl ShardCount {
self.0 self.0
} }
///
pub fn is_unsharded(&self) -> bool { pub fn is_unsharded(&self) -> bool {
self.0 == 0 self.0 == 0
} }
@@ -131,6 +53,33 @@ impl ShardNumber {
pub const MAX: Self = Self(u8::MAX); pub const MAX: Self = Self(u8::MAX);
} }
/// TenantShardId identify the units of work for the Pageserver.
///
/// These are written as `<tenant_id>-<shard number><shard-count>`, for example:
///
/// # The second shard in a two-shard tenant
/// 072f1291a5310026820b2fe4b2968934-0102
///
/// Historically, tenants could not have multiple shards, and were identified
/// by TenantId. To support this, TenantShardId has a special legacy
/// mode where `shard_count` is equal to zero: this represents a single-sharded
/// tenant which should be written as a TenantId with no suffix.
///
/// The human-readable encoding of TenantShardId, such as used in API URLs,
/// is both forward and backward compatible: a legacy TenantId can be
/// decoded as a TenantShardId, and when re-encoded it will be parseable
/// as a TenantId.
///
/// Note that the binary encoding is _not_ backward compatible, because
/// at the time sharding is introduced, there are no existing binary structures
/// containing TenantId that we need to handle.
#[derive(Eq, PartialEq, PartialOrd, Ord, Clone, Copy, Hash)]
pub struct TenantShardId {
pub tenant_id: TenantId,
pub shard_number: ShardNumber,
pub shard_count: ShardCount,
}
impl TenantShardId { impl TenantShardId {
pub fn unsharded(tenant_id: TenantId) -> Self { pub fn unsharded(tenant_id: TenantId) -> Self {
Self { Self {
@@ -162,13 +111,10 @@ impl TenantShardId {
} }
/// Convenience for code that has special behavior on the 0th shard. /// Convenience for code that has special behavior on the 0th shard.
pub fn is_shard_zero(&self) -> bool { pub fn is_zero(&self) -> bool {
self.shard_number == ShardNumber(0) self.shard_number == ShardNumber(0)
} }
/// The "unsharded" value is distinct from simply having a single shard: it represents
/// a tenant which is not shard-aware at all, and whose storage paths will not include
/// a shard suffix.
pub fn is_unsharded(&self) -> bool { pub fn is_unsharded(&self) -> bool {
self.shard_number == ShardNumber(0) && self.shard_count.is_unsharded() self.shard_number == ShardNumber(0) && self.shard_count.is_unsharded()
} }
@@ -204,6 +150,9 @@ impl TenantShardId {
} }
} }
/// Formatting helper
struct ShardSlug<'a>(&'a TenantShardId);
impl<'a> std::fmt::Display for ShardSlug<'a> { impl<'a> std::fmt::Display for ShardSlug<'a> {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
write!( write!(
@@ -273,6 +222,16 @@ impl From<[u8; 18]> for TenantShardId {
} }
} }
/// For use within the context of a particular tenant, when we need to know which
/// shard we're dealing with, but do not need to know the full ShardIdentity (because
/// we won't be doing any page->shard mapping), and do not need to know the fully qualified
/// TenantShardId.
#[derive(Eq, PartialEq, PartialOrd, Ord, Clone, Copy, Hash)]
pub struct ShardIndex {
pub shard_number: ShardNumber,
pub shard_count: ShardCount,
}
impl ShardIndex { impl ShardIndex {
pub fn new(number: ShardNumber, count: ShardCount) -> Self { pub fn new(number: ShardNumber, count: ShardCount) -> Self {
Self { Self {
@@ -287,9 +246,6 @@ impl ShardIndex {
} }
} }
/// The "unsharded" value is distinct from simply having a single shard: it represents
/// a tenant which is not shard-aware at all, and whose storage paths will not include
/// a shard suffix.
pub fn is_unsharded(&self) -> bool { pub fn is_unsharded(&self) -> bool {
self.shard_number == ShardNumber(0) && self.shard_count == ShardCount(0) self.shard_number == ShardNumber(0) && self.shard_count == ShardCount(0)
} }
@@ -357,8 +313,6 @@ impl Serialize for TenantShardId {
if serializer.is_human_readable() { if serializer.is_human_readable() {
serializer.collect_str(self) serializer.collect_str(self)
} else { } else {
// Note: while human encoding of [`TenantShardId`] is backward and forward
// compatible, this binary encoding is not.
let mut packed: [u8; 18] = [0; 18]; let mut packed: [u8; 18] = [0; 18];
packed[0..16].clone_from_slice(&self.tenant_id.as_arr()); packed[0..16].clone_from_slice(&self.tenant_id.as_arr());
packed[16] = self.shard_number.0; packed[16] = self.shard_number.0;
@@ -436,6 +390,16 @@ const LAYOUT_BROKEN: ShardLayout = ShardLayout(255);
/// Default stripe size in pages: 256MiB divided by 8kiB page size. /// Default stripe size in pages: 256MiB divided by 8kiB page size.
const DEFAULT_STRIPE_SIZE: ShardStripeSize = ShardStripeSize(256 * 1024 / 8); const DEFAULT_STRIPE_SIZE: ShardStripeSize = ShardStripeSize(256 * 1024 / 8);
/// The ShardIdentity contains the information needed for one member of map
/// to resolve a key to a shard, and then check whether that shard is ==self.
#[derive(Clone, Copy, Serialize, Deserialize, Eq, PartialEq, Debug)]
pub struct ShardIdentity {
pub number: ShardNumber,
pub count: ShardCount,
pub stripe_size: ShardStripeSize,
layout: ShardLayout,
}
#[derive(thiserror::Error, Debug, PartialEq, Eq)] #[derive(thiserror::Error, Debug, PartialEq, Eq)]
pub enum ShardConfigError { pub enum ShardConfigError {
#[error("Invalid shard count")] #[error("Invalid shard count")]
@@ -475,9 +439,6 @@ impl ShardIdentity {
} }
} }
/// The "unsharded" value is distinct from simply having a single shard: it represents
/// a tenant which is not shard-aware at all, and whose storage paths will not include
/// a shard suffix.
pub fn is_unsharded(&self) -> bool { pub fn is_unsharded(&self) -> bool {
self.number == ShardNumber(0) && self.count == ShardCount(0) self.number == ShardNumber(0) && self.count == ShardCount(0)
} }
@@ -526,8 +487,6 @@ impl ShardIdentity {
} }
/// Return true if the key should be ingested by this shard /// Return true if the key should be ingested by this shard
///
/// Shards must ingest _at least_ keys which return true from this check.
pub fn is_key_local(&self, key: &Key) -> bool { pub fn is_key_local(&self, key: &Key) -> bool {
assert!(!self.is_broken()); assert!(!self.is_broken());
if self.count < ShardCount(2) || (key_is_shard0(key) && self.number == ShardNumber(0)) { if self.count < ShardCount(2) || (key_is_shard0(key) && self.number == ShardNumber(0)) {
@@ -538,9 +497,7 @@ impl ShardIdentity {
} }
/// Return true if the key should be discarded if found in this shard's /// Return true if the key should be discarded if found in this shard's
/// data store, e.g. during compaction after a split. /// data store, e.g. during compaction after a split
///
/// Shards _may_ drop keys which return false here, but are not obliged to.
pub fn is_key_disposable(&self, key: &Key) -> bool { pub fn is_key_disposable(&self, key: &Key) -> bool {
if key_is_shard0(key) { if key_is_shard0(key) {
// Q: Why can't we dispose of shard0 content if we're not shard 0? // Q: Why can't we dispose of shard0 content if we're not shard 0?
@@ -566,7 +523,7 @@ impl ShardIdentity {
/// Convenience for checking if this identity is the 0th shard in a tenant, /// Convenience for checking if this identity is the 0th shard in a tenant,
/// for special cases on shard 0 such as ingesting relation sizes. /// for special cases on shard 0 such as ingesting relation sizes.
pub fn is_shard_zero(&self) -> bool { pub fn is_zero(&self) -> bool {
self.number == ShardNumber(0) self.number == ShardNumber(0)
} }
} }

View File

@@ -22,7 +22,6 @@ camino.workspace = true
chrono.workspace = true chrono.workspace = true
heapless.workspace = true heapless.workspace = true
hex = { workspace = true, features = ["serde"] } hex = { workspace = true, features = ["serde"] }
humantime.workspace = true
hyper = { workspace = true, features = ["full"] } hyper = { workspace = true, features = ["full"] }
fail.workspace = true fail.workspace = true
futures = { workspace = true} futures = { workspace = true}

View File

@@ -1,21 +0,0 @@
//! Wrapper around `std::env::var` for parsing environment variables.
use std::{fmt::Display, str::FromStr};
pub fn var<V, E>(varname: &str) -> Option<V>
where
V: FromStr<Err = E>,
E: Display,
{
match std::env::var(varname) {
Ok(s) => Some(
s.parse()
.map_err(|e| format!("failed to parse env var {varname}: {e:#}"))
.unwrap(),
),
Err(std::env::VarError::NotPresent) => None,
Err(std::env::VarError::NotUnicode(_)) => {
panic!("env var {varname} is not unicode")
}
}
}

View File

@@ -63,7 +63,6 @@ pub mod measured_stream;
pub mod serde_percent; pub mod serde_percent;
pub mod serde_regex; pub mod serde_regex;
pub mod serde_system_time;
pub mod pageserver_feedback; pub mod pageserver_feedback;
@@ -90,10 +89,6 @@ pub mod yielding_loop;
pub mod zstd; pub mod zstd;
pub mod env;
pub mod poison;
/// This is a shortcut to embed git sha into binaries and avoid copying the same build script to all packages /// This is a shortcut to embed git sha into binaries and avoid copying the same build script to all packages
/// ///
/// we have several cases: /// we have several cases:

View File

@@ -1,121 +0,0 @@
//! Protect a piece of state from reuse after it is left in an inconsistent state.
//!
//! # Example
//!
//! ```
//! # tokio_test::block_on(async {
//! use utils::poison::Poison;
//! use std::time::Duration;
//!
//! struct State {
//! clean: bool,
//! }
//! let state = tokio::sync::Mutex::new(Poison::new("mystate", State { clean: true }));
//!
//! let mut mutex_guard = state.lock().await;
//! let mut poison_guard = mutex_guard.check_and_arm()?;
//! let state = poison_guard.data_mut();
//! state.clean = false;
//! // If we get cancelled at this await point, subsequent check_and_arm() calls will fail.
//! tokio::time::sleep(Duration::from_secs(10)).await;
//! state.clean = true;
//! poison_guard.disarm();
//! # Ok::<(), utils::poison::Error>(())
//! # });
//! ```
use tracing::warn;
pub struct Poison<T> {
what: &'static str,
state: State,
data: T,
}
#[derive(Clone, Copy)]
enum State {
Clean,
Armed,
Poisoned { at: chrono::DateTime<chrono::Utc> },
}
impl<T> Poison<T> {
/// We log `what` `warning!` level if the [`Guard`] gets dropped without being [`Guard::disarm`]ed.
pub fn new(what: &'static str, data: T) -> Self {
Self {
what,
state: State::Clean,
data,
}
}
/// Check for poisoning and return a [`Guard`] that provides access to the wrapped state.
pub fn check_and_arm(&mut self) -> Result<Guard<T>, Error> {
match self.state {
State::Clean => {
self.state = State::Armed;
Ok(Guard(self))
}
State::Armed => unreachable!("transient state"),
State::Poisoned { at } => Err(Error::Poisoned {
what: self.what,
at,
}),
}
}
}
/// Use [`Self::data`] and [`Self::data_mut`] to access the wrapped state.
/// Once modifications are done, use [`Self::disarm`].
/// If [`Guard`] gets dropped instead of calling [`Self::disarm`], the state is poisoned
/// and subsequent calls to [`Poison::check_and_arm`] will fail with an error.
pub struct Guard<'a, T>(&'a mut Poison<T>);
impl<'a, T> Guard<'a, T> {
pub fn data(&self) -> &T {
&self.0.data
}
pub fn data_mut(&mut self) -> &mut T {
&mut self.0.data
}
pub fn disarm(self) {
match self.0.state {
State::Clean => unreachable!("we set it to Armed in check_and_arm()"),
State::Armed => {
self.0.state = State::Clean;
}
State::Poisoned { at } => {
unreachable!("we fail check_and_arm() if it's in that state: {at}")
}
}
}
}
impl<'a, T> Drop for Guard<'a, T> {
fn drop(&mut self) {
match self.0.state {
State::Clean => {
// set by disarm()
}
State::Armed => {
// still armed => poison it
let at = chrono::Utc::now();
self.0.state = State::Poisoned { at };
warn!(at=?at, "poisoning {}", self.0.what);
}
State::Poisoned { at } => {
unreachable!("we fail check_and_arm() if it's in that state: {at}")
}
}
}
}
#[derive(thiserror::Error, Debug)]
pub enum Error {
#[error("poisoned at {at}: {what}")]
Poisoned {
what: &'static str,
at: chrono::DateTime<chrono::Utc>,
},
}

View File

@@ -1,55 +0,0 @@
//! A `serde::{Deserialize,Serialize}` type for SystemTime with RFC3339 format and millisecond precision.
#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, serde::Serialize, serde::Deserialize)]
#[serde(transparent)]
pub struct SystemTime(
#[serde(
deserialize_with = "deser_rfc3339_millis",
serialize_with = "ser_rfc3339_millis"
)]
pub std::time::SystemTime,
);
fn ser_rfc3339_millis<S: serde::ser::Serializer>(
ts: &std::time::SystemTime,
serializer: S,
) -> Result<S::Ok, S::Error> {
serializer.collect_str(&humantime::format_rfc3339_millis(*ts))
}
fn deser_rfc3339_millis<'de, D>(deserializer: D) -> Result<std::time::SystemTime, D::Error>
where
D: serde::de::Deserializer<'de>,
{
let s: String = serde::de::Deserialize::deserialize(deserializer)?;
humantime::parse_rfc3339(&s).map_err(serde::de::Error::custom)
}
#[cfg(test)]
mod tests {
use super::*;
/// Helper function to make a SystemTime have millisecond precision by truncating additional nanoseconds.
fn to_millisecond_precision(time: SystemTime) -> SystemTime {
match time.0.duration_since(std::time::SystemTime::UNIX_EPOCH) {
Ok(duration) => {
let total_millis = duration.as_secs() * 1_000 + u64::from(duration.subsec_millis());
SystemTime(
std::time::SystemTime::UNIX_EPOCH
+ std::time::Duration::from_millis(total_millis),
)
}
Err(_) => time,
}
}
#[test]
fn test_serialize_deserialize() {
let input = SystemTime(std::time::SystemTime::now());
let expected_serialized = format!("\"{}\"", humantime::format_rfc3339_millis(input.0));
let serialized = serde_json::to_string(&input).unwrap();
assert_eq!(expected_serialized, serialized);
let deserialized: SystemTime = serde_json::from_str(&expected_serialized).unwrap();
assert_eq!(to_millisecond_precision(input), deserialized);
}
}

View File

@@ -192,14 +192,6 @@ impl<T> OnceCell<T> {
} }
} }
/// Like [`Guard::take_and_deinit`], but will return `None` if this OnceCell was never
/// initialized.
pub fn take_and_deinit(&mut self) -> Option<(T, InitPermit)> {
let inner = self.inner.get_mut().unwrap();
inner.take_and_deinit()
}
/// Return the number of [`Self::get_or_init`] calls waiting for initialization to complete. /// Return the number of [`Self::get_or_init`] calls waiting for initialization to complete.
pub fn initializer_count(&self) -> usize { pub fn initializer_count(&self) -> usize {
self.initializers.load(Ordering::Relaxed) self.initializers.load(Ordering::Relaxed)
@@ -254,23 +246,15 @@ impl<'a, T> Guard<'a, T> {
/// The permit will be on a semaphore part of the new internal value, and any following /// The permit will be on a semaphore part of the new internal value, and any following
/// [`OnceCell::get_or_init`] will wait on it to complete. /// [`OnceCell::get_or_init`] will wait on it to complete.
pub fn take_and_deinit(mut self) -> (T, InitPermit) { pub fn take_and_deinit(mut self) -> (T, InitPermit) {
self.0
.take_and_deinit()
.expect("guard is not created unless value has been initialized")
}
}
impl<T> Inner<T> {
pub fn take_and_deinit(&mut self) -> Option<(T, InitPermit)> {
let value = self.value.take()?;
let mut swapped = Inner::default(); let mut swapped = Inner::default();
let sem = swapped.init_semaphore.clone(); let sem = swapped.init_semaphore.clone();
// acquire and forget right away, moving the control over to InitPermit // acquire and forget right away, moving the control over to InitPermit
sem.try_acquire().expect("we just created this").forget(); sem.try_acquire().expect("we just created this").forget();
let permit = InitPermit(sem); std::mem::swap(&mut *self.0, &mut swapped);
std::mem::swap(self, &mut swapped); swapped
Some((value, permit)) .value
.map(|v| (v, InitPermit(sem)))
.expect("guard is not created unless value has been initialized")
} }
} }
@@ -279,13 +263,6 @@ impl<T> Inner<T> {
/// On drop, this type will return the permit. /// On drop, this type will return the permit.
pub struct InitPermit(Arc<tokio::sync::Semaphore>); pub struct InitPermit(Arc<tokio::sync::Semaphore>);
impl std::fmt::Debug for InitPermit {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
let ptr = Arc::as_ptr(&self.0) as *const ();
f.debug_tuple("InitPermit").field(&ptr).finish()
}
}
impl Drop for InitPermit { impl Drop for InitPermit {
fn drop(&mut self) { fn drop(&mut self) {
assert_eq!( assert_eq!(
@@ -582,22 +559,4 @@ mod tests {
assert_eq!(*target.get().unwrap(), 11); assert_eq!(*target.get().unwrap(), 11);
} }
#[tokio::test]
async fn take_and_deinit_on_mut() {
use std::convert::Infallible;
let mut target = OnceCell::<u32>::default();
assert!(target.take_and_deinit().is_none());
target
.get_or_init(|permit| async move { Ok::<_, Infallible>((42, permit)) })
.await
.unwrap();
let again = target.take_and_deinit();
assert!(matches!(again, Some((42, _))), "{again:?}");
assert!(target.take_and_deinit().is_none());
}
} }

View File

@@ -27,50 +27,30 @@
//! //!
//! # Reference Numbers //! # Reference Numbers
//! //!
//! 2024-04-15 on i3en.3xlarge //! 2024-04-04 on i3en.3xlarge
//! //!
//! ```text //! ```text
//! async-short/1 time: [24.584 µs 24.737 µs 24.922 µs] //! short/1 time: [25.925 µs 26.060 µs 26.209 µs]
//! async-short/2 time: [33.479 µs 33.660 µs 33.888 µs] //! short/2 time: [31.277 µs 31.483 µs 31.722 µs]
//! async-short/4 time: [42.713 µs 43.046 µs 43.440 µs] //! short/4 time: [45.496 µs 45.831 µs 46.182 µs]
//! async-short/8 time: [71.814 µs 72.478 µs 73.240 µs] //! short/8 time: [84.298 µs 84.920 µs 85.566 µs]
//! async-short/16 time: [132.73 µs 134.45 µs 136.22 µs] //! short/16 time: [185.04 µs 186.41 µs 187.88 µs]
//! async-short/32 time: [258.31 µs 260.73 µs 263.27 µs] //! short/32 time: [385.01 µs 386.77 µs 388.70 µs]
//! async-short/64 time: [511.61 µs 514.44 µs 517.51 µs] //! short/64 time: [770.24 µs 773.04 µs 776.04 µs]
//! async-short/128 time: [992.64 µs 998.23 µs 1.0042 ms] //! short/128 time: [1.5017 ms 1.5064 ms 1.5113 ms]
//! async-medium/1 time: [110.11 µs 110.50 µs 110.96 µs] //! medium/1 time: [106.65 µs 107.20 µs 107.85 µs]
//! async-medium/2 time: [153.06 µs 153.85 µs 154.99 µs] //! medium/2 time: [153.28 µs 154.24 µs 155.56 µs]
//! async-medium/4 time: [317.51 µs 319.92 µs 322.85 µs] //! medium/4 time: [325.67 µs 327.01 µs 328.71 µs]
//! async-medium/8 time: [638.30 µs 644.68 µs 652.12 µs] //! medium/8 time: [646.82 µs 650.17 µs 653.91 µs]
//! async-medium/16 time: [1.2651 ms 1.2773 ms 1.2914 ms] //! medium/16 time: [1.2645 ms 1.2701 ms 1.2762 ms]
//! async-medium/32 time: [2.5117 ms 2.5410 ms 2.5720 ms] //! medium/32 time: [2.4409 ms 2.4550 ms 2.4692 ms]
//! async-medium/64 time: [4.8088 ms 4.8555 ms 4.9047 ms] //! medium/64 time: [4.6814 ms 4.7114 ms 4.7408 ms]
//! async-medium/128 time: [8.8311 ms 8.9849 ms 9.1263 ms] //! medium/128 time: [8.7790 ms 8.9037 ms 9.0282 ms]
//! sync-short/1 time: [25.503 µs 25.626 µs 25.771 µs]
//! sync-short/2 time: [30.850 µs 31.013 µs 31.208 µs]
//! sync-short/4 time: [45.543 µs 45.856 µs 46.193 µs]
//! sync-short/8 time: [84.114 µs 84.639 µs 85.220 µs]
//! sync-short/16 time: [185.22 µs 186.15 µs 187.13 µs]
//! sync-short/32 time: [377.43 µs 378.87 µs 380.46 µs]
//! sync-short/64 time: [756.49 µs 759.04 µs 761.70 µs]
//! sync-short/128 time: [1.4825 ms 1.4874 ms 1.4923 ms]
//! sync-medium/1 time: [105.66 µs 106.01 µs 106.43 µs]
//! sync-medium/2 time: [153.10 µs 153.84 µs 154.72 µs]
//! sync-medium/4 time: [327.13 µs 329.44 µs 332.27 µs]
//! sync-medium/8 time: [654.26 µs 658.73 µs 663.63 µs]
//! sync-medium/16 time: [1.2682 ms 1.2748 ms 1.2816 ms]
//! sync-medium/32 time: [2.4456 ms 2.4595 ms 2.4731 ms]
//! sync-medium/64 time: [4.6523 ms 4.6890 ms 4.7256 ms]
//! sync-medium/128 time: [8.7215 ms 8.8323 ms 8.9344 ms]
//! ``` //! ```
use bytes::{Buf, Bytes}; use bytes::{Buf, Bytes};
use criterion::{BenchmarkId, Criterion}; use criterion::{BenchmarkId, Criterion};
use pageserver::{ use pageserver::{config::PageServerConf, walrecord::NeonWalRecord, walredo::PostgresRedoManager};
config::PageServerConf,
walrecord::NeonWalRecord,
walredo::{PostgresRedoManager, ProcessKind},
};
use pageserver_api::{key::Key, shard::TenantShardId}; use pageserver_api::{key::Key, shard::TenantShardId};
use std::{ use std::{
sync::Arc, sync::Arc,
@@ -80,39 +60,33 @@ use tokio::{sync::Barrier, task::JoinSet};
use utils::{id::TenantId, lsn::Lsn}; use utils::{id::TenantId, lsn::Lsn};
fn bench(c: &mut Criterion) { fn bench(c: &mut Criterion) {
for process_kind in &[ProcessKind::Async, ProcessKind::Sync] { {
{ let nclients = [1, 2, 4, 8, 16, 32, 64, 128];
let nclients = [1, 2, 4, 8, 16, 32, 64, 128]; for nclients in nclients {
for nclients in nclients { let mut group = c.benchmark_group("short");
let mut group = c.benchmark_group(format!("{process_kind}-short")); group.bench_with_input(
group.bench_with_input( BenchmarkId::from_parameter(nclients),
BenchmarkId::from_parameter(nclients), &nclients,
&nclients, |b, nclients| {
|b, nclients| { let redo_work = Arc::new(Request::short_input());
let redo_work = Arc::new(Request::short_input()); b.iter_custom(|iters| bench_impl(Arc::clone(&redo_work), iters, *nclients));
b.iter_custom(|iters| { },
bench_impl(*process_kind, Arc::clone(&redo_work), iters, *nclients) );
});
},
);
}
} }
}
{ {
let nclients = [1, 2, 4, 8, 16, 32, 64, 128]; let nclients = [1, 2, 4, 8, 16, 32, 64, 128];
for nclients in nclients { for nclients in nclients {
let mut group = c.benchmark_group(format!("{process_kind}-medium")); let mut group = c.benchmark_group("medium");
group.bench_with_input( group.bench_with_input(
BenchmarkId::from_parameter(nclients), BenchmarkId::from_parameter(nclients),
&nclients, &nclients,
|b, nclients| { |b, nclients| {
let redo_work = Arc::new(Request::medium_input()); let redo_work = Arc::new(Request::medium_input());
b.iter_custom(|iters| { b.iter_custom(|iters| bench_impl(Arc::clone(&redo_work), iters, *nclients));
bench_impl(*process_kind, Arc::clone(&redo_work), iters, *nclients) },
}); );
},
);
}
} }
} }
} }
@@ -120,16 +94,10 @@ criterion::criterion_group!(benches, bench);
criterion::criterion_main!(benches); criterion::criterion_main!(benches);
// Returns the sum of each client's wall-clock time spent executing their share of the n_redos. // Returns the sum of each client's wall-clock time spent executing their share of the n_redos.
fn bench_impl( fn bench_impl(redo_work: Arc<Request>, n_redos: u64, nclients: u64) -> Duration {
process_kind: ProcessKind,
redo_work: Arc<Request>,
n_redos: u64,
nclients: u64,
) -> Duration {
let repo_dir = camino_tempfile::tempdir_in(env!("CARGO_TARGET_TMPDIR")).unwrap(); let repo_dir = camino_tempfile::tempdir_in(env!("CARGO_TARGET_TMPDIR")).unwrap();
let mut conf = PageServerConf::dummy_conf(repo_dir.path().to_path_buf()); let conf = PageServerConf::dummy_conf(repo_dir.path().to_path_buf());
conf.walredo_process_kind = process_kind;
let conf = Box::leak(Box::new(conf)); let conf = Box::leak(Box::new(conf));
let tenant_shard_id = TenantShardId::unsharded(TenantId::generate()); let tenant_shard_id = TenantShardId::unsharded(TenantId::generate());
@@ -145,40 +113,25 @@ fn bench_impl(
let manager = PostgresRedoManager::new(conf, tenant_shard_id); let manager = PostgresRedoManager::new(conf, tenant_shard_id);
let manager = Arc::new(manager); let manager = Arc::new(manager);
// divide the amount of work equally among the clients.
let nredos_per_client = n_redos / nclients;
for _ in 0..nclients { for _ in 0..nclients {
rt.block_on(async { rt.block_on(async {
tasks.spawn(client( tasks.spawn(client(
Arc::clone(&manager), Arc::clone(&manager),
Arc::clone(&start), Arc::clone(&start),
Arc::clone(&redo_work), Arc::clone(&redo_work),
nredos_per_client, // divide the amount of work equally among the clients
n_redos / nclients,
)) ))
}); });
} }
let elapsed = rt.block_on(async move { rt.block_on(async move {
let mut total_wallclock_time = Duration::ZERO; let mut total_wallclock_time = std::time::Duration::from_millis(0);
while let Some(res) = tasks.join_next().await { while let Some(res) = tasks.join_next().await {
total_wallclock_time += res.unwrap(); total_wallclock_time += res.unwrap();
} }
total_wallclock_time total_wallclock_time
}); })
// consistency check to ensure process kind setting worked
if nredos_per_client > 0 {
assert_eq!(
manager
.status()
.process
.map(|p| p.kind)
.expect("the benchmark work causes a walredo process to be spawned"),
std::borrow::Cow::Borrowed(process_kind.into())
);
}
elapsed
} }
async fn client( async fn client(

View File

@@ -11,6 +11,7 @@ default = []
anyhow.workspace = true anyhow.workspace = true
async-compression.workspace = true async-compression.workspace = true
async-stream.workspace = true async-stream.workspace = true
async-trait.workspace = true
byteorder.workspace = true byteorder.workspace = true
bytes.workspace = true bytes.workspace = true
chrono = { workspace = true, features = ["serde"] } chrono = { workspace = true, features = ["serde"] }

View File

@@ -180,7 +180,7 @@ where
match top.deref_mut() { match top.deref_mut() {
LazyLoadLayer::Unloaded(ref mut l) => { LazyLoadLayer::Unloaded(ref mut l) => {
let fut = l.load_keys(this.ctx); let fut = l.load_keys(this.ctx);
this.load_future.set(Some(Box::pin(fut))); this.load_future.set(Some(fut));
continue; continue;
} }
LazyLoadLayer::Loaded(ref mut entries) => { LazyLoadLayer::Loaded(ref mut entries) => {

View File

@@ -3,6 +3,7 @@
//! //!
//! All the heavy lifting is done by the create_image and create_delta //! All the heavy lifting is done by the create_image and create_delta
//! functions that the implementor provides. //! functions that the implementor provides.
use async_trait::async_trait;
use futures::Future; use futures::Future;
use pageserver_api::{key::Key, keyspace::key_range_size}; use pageserver_api::{key::Key, keyspace::key_range_size};
use std::ops::Range; use std::ops::Range;
@@ -140,16 +141,18 @@ pub trait CompactionLayer<K: CompactionKey + ?Sized> {
fn is_delta(&self) -> bool; fn is_delta(&self) -> bool;
} }
#[async_trait]
pub trait CompactionDeltaLayer<E: CompactionJobExecutor + ?Sized>: CompactionLayer<E::Key> { pub trait CompactionDeltaLayer<E: CompactionJobExecutor + ?Sized>: CompactionLayer<E::Key> {
type DeltaEntry<'a>: CompactionDeltaEntry<'a, E::Key> type DeltaEntry<'a>: CompactionDeltaEntry<'a, E::Key>
where where
Self: 'a; Self: 'a;
/// Return all keys in this delta layer. /// Return all keys in this delta layer.
fn load_keys<'a>( async fn load_keys<'a>(
&self, &self,
ctx: &E::RequestContext, ctx: &E::RequestContext,
) -> impl Future<Output = anyhow::Result<Vec<Self::DeltaEntry<'_>>>> + Send; ) -> anyhow::Result<Vec<Self::DeltaEntry<'_>>>;
} }
pub trait CompactionImageLayer<E: CompactionJobExecutor + ?Sized>: CompactionLayer<E::Key> {} pub trait CompactionImageLayer<E: CompactionJobExecutor + ?Sized>: CompactionLayer<E::Key> {}

View File

@@ -2,6 +2,7 @@ mod draw;
use draw::{LayerTraceEvent, LayerTraceFile, LayerTraceOp}; use draw::{LayerTraceEvent, LayerTraceFile, LayerTraceOp};
use async_trait::async_trait;
use futures::StreamExt; use futures::StreamExt;
use rand::Rng; use rand::Rng;
use tracing::info; use tracing::info;
@@ -138,6 +139,7 @@ impl interface::CompactionLayer<Key> for Arc<MockDeltaLayer> {
} }
} }
#[async_trait]
impl interface::CompactionDeltaLayer<MockTimeline> for Arc<MockDeltaLayer> { impl interface::CompactionDeltaLayer<MockTimeline> for Arc<MockDeltaLayer> {
type DeltaEntry<'a> = MockRecord; type DeltaEntry<'a> = MockRecord;

View File

@@ -12,14 +12,9 @@ bytes.workspace = true
camino.workspace = true camino.workspace = true
clap = { workspace = true, features = ["string"] } clap = { workspace = true, features = ["string"] }
git-version.workspace = true git-version.workspace = true
humantime.workspace = true
pageserver = { path = ".." } pageserver = { path = ".." }
pageserver_api.workspace = true
remote_storage = { path = "../../libs/remote_storage" }
postgres_ffi.workspace = true postgres_ffi.workspace = true
tokio.workspace = true tokio.workspace = true
tokio-util.workspace = true
toml_edit.workspace = true
utils.workspace = true utils.workspace = true
svg_fmt.workspace = true svg_fmt.workspace = true
workspace_hack.workspace = true workspace_hack.workspace = true

View File

@@ -9,11 +9,6 @@ mod index_part;
mod layer_map_analyzer; mod layer_map_analyzer;
mod layers; mod layers;
use std::{
str::FromStr,
time::{Duration, SystemTime},
};
use camino::{Utf8Path, Utf8PathBuf}; use camino::{Utf8Path, Utf8PathBuf};
use clap::{Parser, Subcommand}; use clap::{Parser, Subcommand};
use index_part::IndexPartCmd; use index_part::IndexPartCmd;
@@ -25,16 +20,8 @@ use pageserver::{
tenant::{dump_layerfile_from_path, metadata::TimelineMetadata}, tenant::{dump_layerfile_from_path, metadata::TimelineMetadata},
virtual_file, virtual_file,
}; };
use pageserver_api::shard::TenantShardId;
use postgres_ffi::ControlFileData; use postgres_ffi::ControlFileData;
use remote_storage::{RemotePath, RemoteStorageConfig}; use utils::{lsn::Lsn, project_git_version};
use tokio_util::sync::CancellationToken;
use utils::{
id::TimelineId,
logging::{self, LogFormat, TracingErrorLayerEnablement},
lsn::Lsn,
project_git_version,
};
project_git_version!(GIT_VERSION); project_git_version!(GIT_VERSION);
@@ -56,7 +43,6 @@ enum Commands {
#[command(subcommand)] #[command(subcommand)]
IndexPart(IndexPartCmd), IndexPart(IndexPartCmd),
PrintLayerFile(PrintLayerFileCmd), PrintLayerFile(PrintLayerFileCmd),
TimeTravelRemotePrefix(TimeTravelRemotePrefixCmd),
DrawTimeline {}, DrawTimeline {},
AnalyzeLayerMap(AnalyzeLayerMapCmd), AnalyzeLayerMap(AnalyzeLayerMapCmd),
#[command(subcommand)] #[command(subcommand)]
@@ -82,26 +68,6 @@ struct PrintLayerFileCmd {
path: Utf8PathBuf, path: Utf8PathBuf,
} }
/// Roll back the time for the specified prefix using S3 history.
///
/// The command is fairly low level and powerful. Validation is only very light,
/// so it is more powerful, and thus potentially more dangerous.
#[derive(Parser)]
struct TimeTravelRemotePrefixCmd {
/// A configuration string for the remote_storage configuration.
///
/// Example: `remote_storage = { bucket_name = "aws-storage-bucket-name", bucket_region = "us-east-2" }`
config_toml_str: String,
/// remote prefix to time travel recover. For safety reasons, we require it to contain
/// a timeline or tenant ID in the prefix.
prefix: String,
/// Timestamp to travel to. Given in format like `2024-01-20T10:45:45Z`. Assumes UTC and second accuracy.
travel_to: String,
/// Timestamp of the start of the operation, must be after any changes we want to roll back and after.
/// You can use a few seconds before invoking the command. Same format as `travel_to`.
done_if_after: Option<String>,
}
#[derive(Parser)] #[derive(Parser)]
struct AnalyzeLayerMapCmd { struct AnalyzeLayerMapCmd {
/// Pageserver data path /// Pageserver data path
@@ -112,14 +78,6 @@ struct AnalyzeLayerMapCmd {
#[tokio::main] #[tokio::main]
async fn main() -> anyhow::Result<()> { async fn main() -> anyhow::Result<()> {
logging::init(
LogFormat::Plain,
TracingErrorLayerEnablement::EnableWithRustLogFilter,
logging::Output::Stdout,
)?;
logging::replace_panic_hook_with_tracing_panic_hook().forget();
let cli = CliOpts::parse(); let cli = CliOpts::parse();
match cli.command { match cli.command {
@@ -147,42 +105,6 @@ async fn main() -> anyhow::Result<()> {
print_layerfile(&cmd.path).await?; print_layerfile(&cmd.path).await?;
} }
} }
Commands::TimeTravelRemotePrefix(cmd) => {
let timestamp = humantime::parse_rfc3339(&cmd.travel_to)
.map_err(|_e| anyhow::anyhow!("Invalid time for travel_to: '{}'", cmd.travel_to))?;
let done_if_after = if let Some(done_if_after) = &cmd.done_if_after {
humantime::parse_rfc3339(done_if_after).map_err(|_e| {
anyhow::anyhow!("Invalid time for done_if_after: '{}'", done_if_after)
})?
} else {
const SAFETY_MARGIN: Duration = Duration::from_secs(3);
tokio::time::sleep(SAFETY_MARGIN).await;
// Convert to string representation and back to get rid of sub-second values
let done_if_after = SystemTime::now();
tokio::time::sleep(SAFETY_MARGIN).await;
done_if_after
};
let timestamp = strip_subsecond(timestamp);
let done_if_after = strip_subsecond(done_if_after);
let Some(prefix) = validate_prefix(&cmd.prefix) else {
println!("specified prefix '{}' failed validation", cmd.prefix);
return Ok(());
};
let toml_document = toml_edit::Document::from_str(&cmd.config_toml_str)?;
let toml_item = toml_document
.get("remote_storage")
.expect("need remote_storage");
let config = RemoteStorageConfig::from_toml(toml_item)?.expect("incomplete config");
let storage = remote_storage::GenericRemoteStorage::from_config(&config);
let cancel = CancellationToken::new();
storage
.unwrap()
.time_travel_recover(Some(&prefix), timestamp, done_if_after, &cancel)
.await?;
}
}; };
Ok(()) Ok(())
} }
@@ -263,89 +185,3 @@ fn handle_metadata(
Ok(()) Ok(())
} }
/// Ensures that the given S3 prefix is sufficiently constrained.
/// The command is very risky already and we don't want to expose something
/// that allows usually unintentional and quite catastrophic time travel of
/// an entire bucket, which would be a major catastrophy and away
/// by only one character change (similar to "rm -r /home /username/foobar").
fn validate_prefix(prefix: &str) -> Option<RemotePath> {
if prefix.is_empty() {
// Empty prefix means we want to specify the *whole* bucket
return None;
}
let components = prefix.split('/').collect::<Vec<_>>();
let (last, components) = {
let last = components.last()?;
if last.is_empty() {
(
components.iter().nth_back(1)?,
&components[..(components.len() - 1)],
)
} else {
(last, &components[..])
}
};
'valid: {
if let Ok(_timeline_id) = TimelineId::from_str(last) {
// Ends in either a tenant or timeline ID
break 'valid;
}
if *last == "timelines" {
if let Some(before_last) = components.iter().nth_back(1) {
if let Ok(_tenant_id) = TenantShardId::from_str(before_last) {
// Has a valid tenant id
break 'valid;
}
}
}
return None;
}
RemotePath::from_string(prefix).ok()
}
fn strip_subsecond(timestamp: SystemTime) -> SystemTime {
let ts_str = humantime::format_rfc3339_seconds(timestamp).to_string();
humantime::parse_rfc3339(&ts_str).expect("can't parse just created timestamp")
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_validate_prefix() {
assert_eq!(validate_prefix(""), None);
assert_eq!(validate_prefix("/"), None);
#[track_caller]
fn assert_valid(prefix: &str) {
let remote_path = RemotePath::from_string(prefix).unwrap();
assert_eq!(validate_prefix(prefix), Some(remote_path));
}
assert_valid("wal/3aa8fcc61f6d357410b7de754b1d9001/641e5342083b2235ee3deb8066819683/");
// Path is not relative but absolute
assert_eq!(
validate_prefix(
"/wal/3aa8fcc61f6d357410b7de754b1d9001/641e5342083b2235ee3deb8066819683/"
),
None
);
assert_valid("wal/3aa8fcc61f6d357410b7de754b1d9001/");
// Partial tenant IDs should be invalid, S3 will match all tenants with the specific ID prefix
assert_eq!(validate_prefix("wal/3aa8fcc61f6d357410b7d"), None);
assert_eq!(validate_prefix("wal"), None);
assert_eq!(validate_prefix("/wal/"), None);
assert_valid("pageserver/v1/tenants/3aa8fcc61f6d357410b7de754b1d9001");
// Partial tenant ID
assert_eq!(
validate_prefix("pageserver/v1/tenants/3aa8fcc61f6d357410b"),
None
);
assert_valid("pageserver/v1/tenants/3aa8fcc61f6d357410b7de754b1d9001/timelines");
assert_valid("pageserver/v1/tenants/3aa8fcc61f6d357410b7de754b1d9001-0004/timelines");
assert_valid("pageserver/v1/tenants/3aa8fcc61f6d357410b7de754b1d9001/timelines/");
assert_valid("pageserver/v1/tenants/3aa8fcc61f6d357410b7de754b1d9001/timelines/641e5342083b2235ee3deb8066819683");
assert_eq!(validate_prefix("pageserver/v1/tenants/"), None);
}
}

View File

@@ -285,7 +285,6 @@ fn start_pageserver(
)) ))
.unwrap(); .unwrap();
pageserver::preinitialize_metrics(); pageserver::preinitialize_metrics();
pageserver::metrics::wal_redo::set_process_kind_metric(conf.walredo_process_kind);
// If any failpoints were set from FAILPOINTS environment variable, // If any failpoints were set from FAILPOINTS environment variable,
// print them to the log for debugging purposes // print them to the log for debugging purposes

View File

@@ -97,8 +97,6 @@ pub mod defaults {
pub const DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB: usize = 0; pub const DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB: usize = 0;
pub const DEFAULT_WALREDO_PROCESS_KIND: &str = "sync";
/// ///
/// Default built-in configuration file. /// Default built-in configuration file.
/// ///
@@ -142,8 +140,6 @@ pub mod defaults {
#validate_vectored_get = '{DEFAULT_VALIDATE_VECTORED_GET}' #validate_vectored_get = '{DEFAULT_VALIDATE_VECTORED_GET}'
#walredo_process_kind = '{DEFAULT_WALREDO_PROCESS_KIND}'
[tenant_config] [tenant_config]
#checkpoint_distance = {DEFAULT_CHECKPOINT_DISTANCE} # in bytes #checkpoint_distance = {DEFAULT_CHECKPOINT_DISTANCE} # in bytes
#checkpoint_timeout = {DEFAULT_CHECKPOINT_TIMEOUT} #checkpoint_timeout = {DEFAULT_CHECKPOINT_TIMEOUT}
@@ -294,8 +290,6 @@ pub struct PageServerConf {
/// ///
/// Setting this to zero disables limits on total ephemeral layer size. /// Setting this to zero disables limits on total ephemeral layer size.
pub ephemeral_bytes_per_memory_kb: usize, pub ephemeral_bytes_per_memory_kb: usize,
pub walredo_process_kind: crate::walredo::ProcessKind,
} }
/// We do not want to store this in a PageServerConf because the latter may be logged /// We do not want to store this in a PageServerConf because the latter may be logged
@@ -419,8 +413,6 @@ struct PageServerConfigBuilder {
validate_vectored_get: BuilderValue<bool>, validate_vectored_get: BuilderValue<bool>,
ephemeral_bytes_per_memory_kb: BuilderValue<usize>, ephemeral_bytes_per_memory_kb: BuilderValue<usize>,
walredo_process_kind: BuilderValue<crate::walredo::ProcessKind>,
} }
impl PageServerConfigBuilder { impl PageServerConfigBuilder {
@@ -508,8 +500,6 @@ impl PageServerConfigBuilder {
)), )),
validate_vectored_get: Set(DEFAULT_VALIDATE_VECTORED_GET), validate_vectored_get: Set(DEFAULT_VALIDATE_VECTORED_GET),
ephemeral_bytes_per_memory_kb: Set(DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB), ephemeral_bytes_per_memory_kb: Set(DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB),
walredo_process_kind: Set(DEFAULT_WALREDO_PROCESS_KIND.parse().unwrap()),
} }
} }
} }
@@ -693,10 +683,6 @@ impl PageServerConfigBuilder {
self.ephemeral_bytes_per_memory_kb = BuilderValue::Set(value); self.ephemeral_bytes_per_memory_kb = BuilderValue::Set(value);
} }
pub fn get_walredo_process_kind(&mut self, value: crate::walredo::ProcessKind) {
self.walredo_process_kind = BuilderValue::Set(value);
}
pub fn build(self) -> anyhow::Result<PageServerConf> { pub fn build(self) -> anyhow::Result<PageServerConf> {
let default = Self::default_values(); let default = Self::default_values();
@@ -753,7 +739,6 @@ impl PageServerConfigBuilder {
max_vectored_read_bytes, max_vectored_read_bytes,
validate_vectored_get, validate_vectored_get,
ephemeral_bytes_per_memory_kb, ephemeral_bytes_per_memory_kb,
walredo_process_kind,
} }
CUSTOM LOGIC CUSTOM LOGIC
{ {
@@ -1047,9 +1032,6 @@ impl PageServerConf {
"ephemeral_bytes_per_memory_kb" => { "ephemeral_bytes_per_memory_kb" => {
builder.get_ephemeral_bytes_per_memory_kb(parse_toml_u64("ephemeral_bytes_per_memory_kb", item)? as usize) builder.get_ephemeral_bytes_per_memory_kb(parse_toml_u64("ephemeral_bytes_per_memory_kb", item)? as usize)
} }
"walredo_process_kind" => {
builder.get_walredo_process_kind(parse_toml_from_str("walredo_process_kind", item)?)
}
_ => bail!("unrecognized pageserver option '{key}'"), _ => bail!("unrecognized pageserver option '{key}'"),
} }
} }
@@ -1132,7 +1114,6 @@ impl PageServerConf {
), ),
validate_vectored_get: defaults::DEFAULT_VALIDATE_VECTORED_GET, validate_vectored_get: defaults::DEFAULT_VALIDATE_VECTORED_GET,
ephemeral_bytes_per_memory_kb: defaults::DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB, ephemeral_bytes_per_memory_kb: defaults::DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB,
walredo_process_kind: defaults::DEFAULT_WALREDO_PROCESS_KIND.parse().unwrap(),
} }
} }
} }
@@ -1370,8 +1351,7 @@ background_task_maximum_delay = '334 s'
.expect("Invalid default constant") .expect("Invalid default constant")
), ),
validate_vectored_get: defaults::DEFAULT_VALIDATE_VECTORED_GET, validate_vectored_get: defaults::DEFAULT_VALIDATE_VECTORED_GET,
ephemeral_bytes_per_memory_kb: defaults::DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB, ephemeral_bytes_per_memory_kb: defaults::DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB
walredo_process_kind: defaults::DEFAULT_WALREDO_PROCESS_KIND.parse().unwrap(),
}, },
"Correct defaults should be used when no config values are provided" "Correct defaults should be used when no config values are provided"
); );
@@ -1443,8 +1423,7 @@ background_task_maximum_delay = '334 s'
.expect("Invalid default constant") .expect("Invalid default constant")
), ),
validate_vectored_get: defaults::DEFAULT_VALIDATE_VECTORED_GET, validate_vectored_get: defaults::DEFAULT_VALIDATE_VECTORED_GET,
ephemeral_bytes_per_memory_kb: defaults::DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB, ephemeral_bytes_per_memory_kb: defaults::DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB
walredo_process_kind: defaults::DEFAULT_WALREDO_PROCESS_KIND.parse().unwrap(),
}, },
"Should be able to parse all basic config values correctly" "Should be able to parse all basic config values correctly"
); );

View File

@@ -304,7 +304,7 @@ async fn calculate_synthetic_size_worker(
continue; continue;
} }
if !tenant_shard_id.is_shard_zero() { if !tenant_shard_id.is_zero() {
// We only send consumption metrics from shard 0, so don't waste time calculating // We only send consumption metrics from shard 0, so don't waste time calculating
// synthetic size on other shards. // synthetic size on other shards.
continue; continue;

View File

@@ -199,7 +199,7 @@ pub(super) async fn collect_all_metrics(
}; };
let tenants = futures::stream::iter(tenants).filter_map(|(id, state, _)| async move { let tenants = futures::stream::iter(tenants).filter_map(|(id, state, _)| async move {
if state != TenantState::Active || !id.is_shard_zero() { if state != TenantState::Active || !id.is_zero() {
None None
} else { } else {
tenant_manager tenant_manager

View File

@@ -58,6 +58,24 @@ paths:
responses: responses:
"200": "200":
description: The reload completed successfully. description: The reload completed successfully.
"401":
description: Unauthorized Error
content:
application/json:
schema:
$ref: "#/components/schemas/UnauthorizedError"
"403":
description: Forbidden Error
content:
application/json:
schema:
$ref: "#/components/schemas/ForbiddenError"
"500":
description: Generic operation error (also hits if no keys were found)
content:
application/json:
schema:
$ref: "#/components/schemas/Error"
/v1/tenant/{tenant_id}: /v1/tenant/{tenant_id}:
parameters: parameters:
@@ -75,14 +93,62 @@ paths:
application/json: application/json:
schema: schema:
$ref: "#/components/schemas/TenantInfo" $ref: "#/components/schemas/TenantInfo"
"400":
description: Error when no tenant id found in path or no timeline id
content:
application/json:
schema:
$ref: "#/components/schemas/Error"
"401":
description: Unauthorized Error
content:
application/json:
schema:
$ref: "#/components/schemas/UnauthorizedError"
"403":
description: Forbidden Error
content:
application/json:
schema:
$ref: "#/components/schemas/ForbiddenError"
"500":
description: Generic operation error
content:
application/json:
schema:
$ref: "#/components/schemas/Error"
"503":
description: Temporarily unavailable, please retry.
content:
application/json:
schema:
$ref: "#/components/schemas/ServiceUnavailableError"
delete: delete:
description: | description: |
Attempts to delete specified tenant. 500, 503 and 409 errors should be retried until 404 is retrieved. Attempts to delete specified tenant. 500, 503 and 409 errors should be retried until 404 is retrieved.
404 means that deletion successfully finished" 404 means that deletion successfully finished"
responses: responses:
"400":
description: Error when no tenant id found in path
content:
application/json:
schema:
$ref: "#/components/schemas/Error"
"401":
description: Unauthorized Error
content:
application/json:
schema:
$ref: "#/components/schemas/UnauthorizedError"
"403":
description: Forbidden Error
content:
application/json:
schema:
$ref: "#/components/schemas/ForbiddenError"
"404": "404":
description: Tenant not found. This is the success path. description: Tenant not found
content: content:
application/json: application/json:
schema: schema:
@@ -99,6 +165,18 @@ paths:
application/json: application/json:
schema: schema:
$ref: "#/components/schemas/PreconditionFailedError" $ref: "#/components/schemas/PreconditionFailedError"
"500":
description: Generic operation error
content:
application/json:
schema:
$ref: "#/components/schemas/Error"
"503":
description: Temporarily unavailable, please retry.
content:
application/json:
schema:
$ref: "#/components/schemas/ServiceUnavailableError"
/v1/tenant/{tenant_id}/time_travel_remote_storage: /v1/tenant/{tenant_id}/time_travel_remote_storage:
parameters: parameters:
@@ -128,6 +206,36 @@ paths:
application/json: application/json:
schema: schema:
type: string type: string
"400":
description: Error when no tenant id found in path or invalid timestamp
content:
application/json:
schema:
$ref: "#/components/schemas/Error"
"401":
description: Unauthorized Error
content:
application/json:
schema:
$ref: "#/components/schemas/UnauthorizedError"
"403":
description: Forbidden Error
content:
application/json:
schema:
$ref: "#/components/schemas/ForbiddenError"
"500":
description: Generic operation error
content:
application/json:
schema:
$ref: "#/components/schemas/Error"
"503":
description: Temporarily unavailable, please retry.
content:
application/json:
schema:
$ref: "#/components/schemas/ServiceUnavailableError"
/v1/tenant/{tenant_id}/timeline: /v1/tenant/{tenant_id}/timeline:
parameters: parameters:
@@ -147,6 +255,36 @@ paths:
type: array type: array
items: items:
$ref: "#/components/schemas/TimelineInfo" $ref: "#/components/schemas/TimelineInfo"
"400":
description: Error when no tenant id found in path
content:
application/json:
schema:
$ref: "#/components/schemas/Error"
"401":
description: Unauthorized Error
content:
application/json:
schema:
$ref: "#/components/schemas/UnauthorizedError"
"403":
description: Forbidden Error
content:
application/json:
schema:
$ref: "#/components/schemas/ForbiddenError"
"500":
description: Generic operation error
content:
application/json:
schema:
$ref: "#/components/schemas/Error"
"503":
description: Temporarily unavailable, please retry.
content:
application/json:
schema:
$ref: "#/components/schemas/ServiceUnavailableError"
/v1/tenant/{tenant_id}/timeline/{timeline_id}: /v1/tenant/{tenant_id}/timeline/{timeline_id}:
@@ -171,12 +309,60 @@ paths:
application/json: application/json:
schema: schema:
$ref: "#/components/schemas/TimelineInfo" $ref: "#/components/schemas/TimelineInfo"
"400":
description: Error when no tenant id found in path or no timeline id
content:
application/json:
schema:
$ref: "#/components/schemas/Error"
"401":
description: Unauthorized Error
content:
application/json:
schema:
$ref: "#/components/schemas/UnauthorizedError"
"403":
description: Forbidden Error
content:
application/json:
schema:
$ref: "#/components/schemas/ForbiddenError"
"500":
description: Generic operation error
content:
application/json:
schema:
$ref: "#/components/schemas/Error"
"503":
description: Temporarily unavailable, please retry.
content:
application/json:
schema:
$ref: "#/components/schemas/ServiceUnavailableError"
delete: delete:
description: "Attempts to delete specified timeline. 500 and 409 errors should be retried" description: "Attempts to delete specified timeline. 500 and 409 errors should be retried"
responses: responses:
"400":
description: Error when no tenant id found in path or no timeline id
content:
application/json:
schema:
$ref: "#/components/schemas/Error"
"401":
description: Unauthorized Error
content:
application/json:
schema:
$ref: "#/components/schemas/UnauthorizedError"
"403":
description: Forbidden Error
content:
application/json:
schema:
$ref: "#/components/schemas/ForbiddenError"
"404": "404":
description: Timeline not found. This is the success path. description: Timeline not found
content: content:
application/json: application/json:
schema: schema:
@@ -193,6 +379,18 @@ paths:
application/json: application/json:
schema: schema:
$ref: "#/components/schemas/PreconditionFailedError" $ref: "#/components/schemas/PreconditionFailedError"
"500":
description: Generic operation error
content:
application/json:
schema:
$ref: "#/components/schemas/Error"
"503":
description: Temporarily unavailable, please retry.
content:
application/json:
schema:
$ref: "#/components/schemas/ServiceUnavailableError"
/v1/tenant/{tenant_id}/timeline/{timeline_id}/get_timestamp_of_lsn: /v1/tenant/{tenant_id}/timeline/{timeline_id}/get_timestamp_of_lsn:
parameters: parameters:
@@ -225,6 +423,36 @@ paths:
schema: schema:
type: string type: string
format: date-time format: date-time
"400":
description: Error when no tenant id found in path, no timeline id or invalid timestamp
content:
application/json:
schema:
$ref: "#/components/schemas/Error"
"401":
description: Unauthorized Error
content:
application/json:
schema:
$ref: "#/components/schemas/UnauthorizedError"
"403":
description: Forbidden Error
content:
application/json:
schema:
$ref: "#/components/schemas/ForbiddenError"
"404":
description: Timeline not found, or there is no timestamp information for the given lsn
content:
application/json:
schema:
$ref: "#/components/schemas/NotFoundError"
"500":
description: Generic operation error
content:
application/json:
schema:
$ref: "#/components/schemas/Error"
/v1/tenant/{tenant_id}/timeline/{timeline_id}/get_lsn_by_timestamp: /v1/tenant/{tenant_id}/timeline/{timeline_id}/get_lsn_by_timestamp:
parameters: parameters:
@@ -256,6 +484,36 @@ paths:
application/json: application/json:
schema: schema:
$ref: "#/components/schemas/LsnByTimestampResponse" $ref: "#/components/schemas/LsnByTimestampResponse"
"400":
description: Error when no tenant id found in path, no timeline id or invalid timestamp
content:
application/json:
schema:
$ref: "#/components/schemas/Error"
"401":
description: Unauthorized Error
content:
application/json:
schema:
$ref: "#/components/schemas/UnauthorizedError"
"403":
description: Forbidden Error
content:
application/json:
schema:
$ref: "#/components/schemas/ForbiddenError"
"500":
description: Generic operation error
content:
application/json:
schema:
$ref: "#/components/schemas/Error"
"503":
description: Temporarily unavailable, please retry.
content:
application/json:
schema:
$ref: "#/components/schemas/ServiceUnavailableError"
/v1/tenant/{tenant_id}/timeline/{timeline_id}/do_gc: /v1/tenant/{tenant_id}/timeline/{timeline_id}/do_gc:
parameters: parameters:
@@ -279,6 +537,36 @@ paths:
application/json: application/json:
schema: schema:
type: string type: string
"400":
description: Error when no tenant id found in path, no timeline id or invalid timestamp
content:
application/json:
schema:
$ref: "#/components/schemas/Error"
"401":
description: Unauthorized Error
content:
application/json:
schema:
$ref: "#/components/schemas/UnauthorizedError"
"403":
description: Forbidden Error
content:
application/json:
schema:
$ref: "#/components/schemas/ForbiddenError"
"500":
description: Generic operation error
content:
application/json:
schema:
$ref: "#/components/schemas/Error"
"503":
description: Temporarily unavailable, please retry.
content:
application/json:
schema:
$ref: "#/components/schemas/ServiceUnavailableError"
/v1/tenant/{tenant_shard_id}/location_config: /v1/tenant/{tenant_shard_id}/location_config:
parameters: parameters:
- name: tenant_shard_id - name: tenant_shard_id
@@ -340,6 +628,24 @@ paths:
application/json: application/json:
schema: schema:
$ref: "#/components/schemas/TenantLocationConfigResponse" $ref: "#/components/schemas/TenantLocationConfigResponse"
"503":
description: Tenant's state cannot be changed right now. Wait a few seconds and retry.
content:
application/json:
schema:
$ref: "#/components/schemas/Error"
"401":
description: Unauthorized Error
content:
application/json:
schema:
$ref: "#/components/schemas/UnauthorizedError"
"403":
description: Forbidden Error
content:
application/json:
schema:
$ref: "#/components/schemas/ForbiddenError"
"409": "409":
description: | description: |
The tenant is already known to Pageserver in some way, The tenant is already known to Pageserver in some way,
@@ -356,6 +662,12 @@ paths:
application/json: application/json:
schema: schema:
$ref: "#/components/schemas/ConflictError" $ref: "#/components/schemas/ConflictError"
"500":
description: Generic operation error
content:
application/json:
schema:
$ref: "#/components/schemas/Error"
/v1/tenant/{tenant_id}/ignore: /v1/tenant/{tenant_id}/ignore:
parameters: parameters:
- name: tenant_id - name: tenant_id
@@ -372,6 +684,36 @@ paths:
responses: responses:
"200": "200":
description: Tenant ignored description: Tenant ignored
"400":
description: Error when no tenant id found in path parameters
content:
application/json:
schema:
$ref: "#/components/schemas/Error"
"401":
description: Unauthorized Error
content:
application/json:
schema:
$ref: "#/components/schemas/UnauthorizedError"
"403":
description: Forbidden Error
content:
application/json:
schema:
$ref: "#/components/schemas/ForbiddenError"
"500":
description: Generic operation error
content:
application/json:
schema:
$ref: "#/components/schemas/Error"
"503":
description: Temporarily unavailable, please retry.
content:
application/json:
schema:
$ref: "#/components/schemas/ServiceUnavailableError"
/v1/tenant/{tenant_id}/load: /v1/tenant/{tenant_id}/load:
@@ -398,6 +740,36 @@ paths:
responses: responses:
"202": "202":
description: Tenant scheduled to load successfully description: Tenant scheduled to load successfully
"400":
description: Error when no tenant id found in path parameters
content:
application/json:
schema:
$ref: "#/components/schemas/Error"
"401":
description: Unauthorized Error
content:
application/json:
schema:
$ref: "#/components/schemas/UnauthorizedError"
"403":
description: Forbidden Error
content:
application/json:
schema:
$ref: "#/components/schemas/ForbiddenError"
"500":
description: Generic operation error
content:
application/json:
schema:
$ref: "#/components/schemas/Error"
"503":
description: Temporarily unavailable, please retry.
content:
application/json:
schema:
$ref: "#/components/schemas/ServiceUnavailableError"
/v1/tenant/{tenant_id}/{timeline_id}/preserve_initdb_archive: /v1/tenant/{tenant_id}/{timeline_id}/preserve_initdb_archive:
parameters: parameters:
@@ -418,6 +790,37 @@ paths:
responses: responses:
"202": "202":
description: Tenant scheduled to load successfully description: Tenant scheduled to load successfully
"404":
description: No tenant or timeline found for the specified ids
content:
application/json:
schema:
$ref: "#/components/schemas/Error"
"401":
description: Unauthorized Error
content:
application/json:
schema:
$ref: "#/components/schemas/UnauthorizedError"
"403":
description: Forbidden Error
content:
application/json:
schema:
$ref: "#/components/schemas/ForbiddenError"
"500":
description: Generic operation error
content:
application/json:
schema:
$ref: "#/components/schemas/Error"
"503":
description: Temporarily unavailable, please retry.
content:
application/json:
schema:
$ref: "#/components/schemas/ServiceUnavailableError"
/v1/tenant/{tenant_id}/synthetic_size: /v1/tenant/{tenant_id}/synthetic_size:
parameters: parameters:
@@ -436,8 +839,31 @@ paths:
application/json: application/json:
schema: schema:
$ref: "#/components/schemas/SyntheticSizeResponse" $ref: "#/components/schemas/SyntheticSizeResponse"
"401":
description: Unauthorized Error
content:
application/json:
schema:
$ref: "#/components/schemas/UnauthorizedError"
"403":
description: Forbidden Error
content:
application/json:
schema:
$ref: "#/components/schemas/ForbiddenError"
"500":
description: Generic operation error
content:
application/json:
schema:
$ref: "#/components/schemas/Error"
"503":
description: Temporarily unavailable, please retry.
content:
application/json:
schema:
$ref: "#/components/schemas/ServiceUnavailableError"
# This route has no handler. TODO: remove?
/v1/tenant/{tenant_id}/size: /v1/tenant/{tenant_id}/size:
parameters: parameters:
- name: tenant_id - name: tenant_id
@@ -519,6 +945,18 @@ paths:
responses: responses:
"200": "200":
description: Success description: Success
"500":
description: Generic operation error
content:
application/json:
schema:
$ref: "#/components/schemas/Error"
"503":
description: Temporarily unavailable, please retry.
content:
application/json:
schema:
$ref: "#/components/schemas/ServiceUnavailableError"
/v1/tenant/{tenant_shard_id}/secondary/download: /v1/tenant/{tenant_shard_id}/secondary/download:
parameters: parameters:
@@ -549,6 +987,20 @@ paths:
application/json: application/json:
schema: schema:
$ref: "#/components/schemas/SecondaryProgress" $ref: "#/components/schemas/SecondaryProgress"
"500":
description: Generic operation error
content:
application/json:
schema:
$ref: "#/components/schemas/Error"
"503":
description: Temporarily unavailable, please retry.
content:
application/json:
schema:
$ref: "#/components/schemas/ServiceUnavailableError"
/v1/tenant/{tenant_id}/timeline/: /v1/tenant/{tenant_id}/timeline/:
parameters: parameters:
@@ -591,6 +1043,24 @@ paths:
application/json: application/json:
schema: schema:
$ref: "#/components/schemas/TimelineInfo" $ref: "#/components/schemas/TimelineInfo"
"400":
description: Malformed timeline create request
content:
application/json:
schema:
$ref: "#/components/schemas/Error"
"401":
description: Unauthorized Error
content:
application/json:
schema:
$ref: "#/components/schemas/UnauthorizedError"
"403":
description: Forbidden Error
content:
application/json:
schema:
$ref: "#/components/schemas/ForbiddenError"
"406": "406":
description: Permanently unsatisfiable request, don't retry. description: Permanently unsatisfiable request, don't retry.
content: content:
@@ -609,6 +1079,18 @@ paths:
application/json: application/json:
schema: schema:
$ref: "#/components/schemas/Error" $ref: "#/components/schemas/Error"
"500":
description: Generic operation error
content:
application/json:
schema:
$ref: "#/components/schemas/Error"
"503":
description: Temporarily unavailable, please retry.
content:
application/json:
schema:
$ref: "#/components/schemas/ServiceUnavailableError"
/v1/tenant/: /v1/tenant/:
get: get:
@@ -622,6 +1104,30 @@ paths:
type: array type: array
items: items:
$ref: "#/components/schemas/TenantInfo" $ref: "#/components/schemas/TenantInfo"
"401":
description: Unauthorized Error
content:
application/json:
schema:
$ref: "#/components/schemas/UnauthorizedError"
"403":
description: Forbidden Error
content:
application/json:
schema:
$ref: "#/components/schemas/ForbiddenError"
"500":
description: Generic operation error
content:
application/json:
schema:
$ref: "#/components/schemas/Error"
"503":
description: Temporarily unavailable, please retry.
content:
application/json:
schema:
$ref: "#/components/schemas/ServiceUnavailableError"
post: post:
description: | description: |
@@ -642,12 +1148,43 @@ paths:
application/json: application/json:
schema: schema:
type: string type: string
"400":
description: Malformed tenant create request
content:
application/json:
schema:
$ref: "#/components/schemas/Error"
"401":
description: Unauthorized Error
content:
application/json:
schema:
$ref: "#/components/schemas/UnauthorizedError"
"403":
description: Forbidden Error
content:
application/json:
schema:
$ref: "#/components/schemas/ForbiddenError"
"409": "409":
description: Tenant already exists, creation skipped description: Tenant already exists, creation skipped
content: content:
application/json: application/json:
schema: schema:
$ref: "#/components/schemas/ConflictError" $ref: "#/components/schemas/ConflictError"
"500":
description: Generic operation error
content:
application/json:
schema:
$ref: "#/components/schemas/Error"
"503":
description: Temporarily unavailable, please retry.
content:
application/json:
schema:
$ref: "#/components/schemas/ServiceUnavailableError"
/v1/tenant/config: /v1/tenant/config:
put: put:
@@ -669,6 +1206,36 @@ paths:
type: array type: array
items: items:
$ref: "#/components/schemas/TenantInfo" $ref: "#/components/schemas/TenantInfo"
"400":
description: Malformed tenant config request
content:
application/json:
schema:
$ref: "#/components/schemas/Error"
"401":
description: Unauthorized Error
content:
application/json:
schema:
$ref: "#/components/schemas/UnauthorizedError"
"403":
description: Forbidden Error
content:
application/json:
schema:
$ref: "#/components/schemas/ForbiddenError"
"500":
description: Generic operation error
content:
application/json:
schema:
$ref: "#/components/schemas/Error"
"503":
description: Temporarily unavailable, please retry.
content:
application/json:
schema:
$ref: "#/components/schemas/ServiceUnavailableError"
/v1/tenant/{tenant_id}/config/: /v1/tenant/{tenant_id}/config/:
parameters: parameters:
@@ -688,6 +1255,42 @@ paths:
application/json: application/json:
schema: schema:
$ref: "#/components/schemas/TenantConfigResponse" $ref: "#/components/schemas/TenantConfigResponse"
"400":
description: Malformed get tenanant config request
content:
application/json:
schema:
$ref: "#/components/schemas/Error"
"401":
description: Unauthorized Error
content:
application/json:
schema:
$ref: "#/components/schemas/UnauthorizedError"
"403":
description: Forbidden Error
content:
application/json:
schema:
$ref: "#/components/schemas/ForbiddenError"
"404":
description: Tenand or timeline were not found
content:
application/json:
schema:
$ref: "#/components/schemas/NotFoundError"
"500":
description: Generic operation error
content:
application/json:
schema:
$ref: "#/components/schemas/Error"
"503":
description: Temporarily unavailable, please retry.
content:
application/json:
schema:
$ref: "#/components/schemas/ServiceUnavailableError"
/v1/utilization: /v1/utilization:
get: get:
@@ -701,6 +1304,12 @@ paths:
application/json: application/json:
schema: schema:
$ref: "#/components/schemas/PageserverUtilization" $ref: "#/components/schemas/PageserverUtilization"
"500":
description: Generic operation error
content:
application/json:
schema:
$ref: "#/components/schemas/Error"
components: components:
securitySchemes: securitySchemes:

View File

@@ -457,12 +457,8 @@ async fn reload_auth_validation_keys_handler(
json_response(StatusCode::OK, ()) json_response(StatusCode::OK, ())
} }
Err(e) => { Err(e) => {
let err_msg = "Error reloading public keys";
warn!("Error reloading public keys from {key_path:?}: {e:}"); warn!("Error reloading public keys from {key_path:?}: {e:}");
json_response( json_response(StatusCode::INTERNAL_SERVER_ERROR, ())
StatusCode::INTERNAL_SERVER_ERROR,
HttpErrorBody::from_msg(err_msg.to_string()),
)
} }
} }
} }
@@ -700,7 +696,7 @@ async fn get_lsn_by_timestamp_handler(
check_permission(&request, Some(tenant_shard_id.tenant_id))?; check_permission(&request, Some(tenant_shard_id.tenant_id))?;
let state = get_state(&request); let state = get_state(&request);
if !tenant_shard_id.is_shard_zero() { if !tenant_shard_id.is_zero() {
// Requires SLRU contents, which are only stored on shard zero // Requires SLRU contents, which are only stored on shard zero
return Err(ApiError::BadRequest(anyhow!( return Err(ApiError::BadRequest(anyhow!(
"Size calculations are only available on shard zero" "Size calculations are only available on shard zero"
@@ -751,7 +747,7 @@ async fn get_timestamp_of_lsn_handler(
check_permission(&request, Some(tenant_shard_id.tenant_id))?; check_permission(&request, Some(tenant_shard_id.tenant_id))?;
let state = get_state(&request); let state = get_state(&request);
if !tenant_shard_id.is_shard_zero() { if !tenant_shard_id.is_zero() {
// Requires SLRU contents, which are only stored on shard zero // Requires SLRU contents, which are only stored on shard zero
return Err(ApiError::BadRequest(anyhow!( return Err(ApiError::BadRequest(anyhow!(
"Size calculations are only available on shard zero" "Size calculations are only available on shard zero"
@@ -776,9 +772,7 @@ async fn get_timestamp_of_lsn_handler(
let time = format_rfc3339(postgres_ffi::from_pg_timestamp(time)).to_string(); let time = format_rfc3339(postgres_ffi::from_pg_timestamp(time)).to_string();
json_response(StatusCode::OK, time) json_response(StatusCode::OK, time)
} }
None => Err(ApiError::NotFound( None => json_response(StatusCode::NOT_FOUND, ()),
anyhow::anyhow!("Timestamp for lsn {} not found", lsn).into(),
)),
} }
} }
@@ -1092,7 +1086,7 @@ async fn tenant_size_handler(
let headers = request.headers(); let headers = request.headers();
let state = get_state(&request); let state = get_state(&request);
if !tenant_shard_id.is_shard_zero() { if !tenant_shard_id.is_zero() {
return Err(ApiError::BadRequest(anyhow!( return Err(ApiError::BadRequest(anyhow!(
"Size calculations are only available on shard zero" "Size calculations are only available on shard zero"
))); )));

View File

@@ -1518,8 +1518,7 @@ pub(crate) struct SecondaryModeMetrics {
pub(crate) download_heatmap: IntCounter, pub(crate) download_heatmap: IntCounter,
pub(crate) download_layer: IntCounter, pub(crate) download_layer: IntCounter,
} }
pub(crate) static SECONDARY_MODE: Lazy<SecondaryModeMetrics> = Lazy::new(|| { pub(crate) static SECONDARY_MODE: Lazy<SecondaryModeMetrics> = Lazy::new(|| SecondaryModeMetrics {
SecondaryModeMetrics {
upload_heatmap: register_int_counter!( upload_heatmap: register_int_counter!(
"pageserver_secondary_upload_heatmap", "pageserver_secondary_upload_heatmap",
"Number of heatmaps written to remote storage by attached tenants" "Number of heatmaps written to remote storage by attached tenants"
@@ -1537,7 +1536,7 @@ pub(crate) static SECONDARY_MODE: Lazy<SecondaryModeMetrics> = Lazy::new(|| {
.expect("failed to define a metric"), .expect("failed to define a metric"),
download_heatmap: register_int_counter!( download_heatmap: register_int_counter!(
"pageserver_secondary_download_heatmap", "pageserver_secondary_download_heatmap",
"Number of downloads of heatmaps by secondary mode locations, including when it hasn't changed" "Number of downloads of heatmaps by secondary mode locations"
) )
.expect("failed to define a metric"), .expect("failed to define a metric"),
download_layer: register_int_counter!( download_layer: register_int_counter!(
@@ -1545,7 +1544,6 @@ pub(crate) static SECONDARY_MODE: Lazy<SecondaryModeMetrics> = Lazy::new(|| {
"Number of downloads of layers by secondary mode locations" "Number of downloads of layers by secondary mode locations"
) )
.expect("failed to define a metric"), .expect("failed to define a metric"),
}
}); });
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
@@ -1821,29 +1819,6 @@ impl Default for WalRedoProcessCounters {
pub(crate) static WAL_REDO_PROCESS_COUNTERS: Lazy<WalRedoProcessCounters> = pub(crate) static WAL_REDO_PROCESS_COUNTERS: Lazy<WalRedoProcessCounters> =
Lazy::new(WalRedoProcessCounters::default); Lazy::new(WalRedoProcessCounters::default);
#[cfg(not(test))]
pub mod wal_redo {
use super::*;
static PROCESS_KIND: Lazy<std::sync::Mutex<UIntGaugeVec>> = Lazy::new(|| {
std::sync::Mutex::new(
register_uint_gauge_vec!(
"pageserver_wal_redo_process_kind",
"The configured process kind for walredo",
&["kind"],
)
.unwrap(),
)
});
pub fn set_process_kind_metric(kind: crate::walredo::ProcessKind) {
// use guard to avoid races around the next two steps
let guard = PROCESS_KIND.lock().unwrap();
guard.reset();
guard.with_label_values(&[&format!("{kind}")]).set(1);
}
}
/// Similar to `prometheus::HistogramTimer` but does not record on drop. /// Similar to `prometheus::HistogramTimer` but does not record on drop.
pub(crate) struct StorageTimeMetricsTimer { pub(crate) struct StorageTimeMetricsTimer {
metrics: StorageTimeMetrics, metrics: StorageTimeMetrics,
@@ -2114,7 +2089,7 @@ impl TimelineMetrics {
pub(crate) fn remove_tenant_metrics(tenant_shard_id: &TenantShardId) { pub(crate) fn remove_tenant_metrics(tenant_shard_id: &TenantShardId) {
// Only shard zero deals in synthetic sizes // Only shard zero deals in synthetic sizes
if tenant_shard_id.is_shard_zero() { if tenant_shard_id.is_zero() {
let tid = tenant_shard_id.tenant_id.to_string(); let tid = tenant_shard_id.tenant_id.to_string();
let _ = TENANT_SYNTHETIC_SIZE_METRIC.remove_label_values(&[&tid]); let _ = TENANT_SYNTHETIC_SIZE_METRIC.remove_label_values(&[&tid]);
} }
@@ -2125,7 +2100,6 @@ pub(crate) fn remove_tenant_metrics(tenant_shard_id: &TenantShardId) {
use futures::Future; use futures::Future;
use pin_project_lite::pin_project; use pin_project_lite::pin_project;
use std::collections::HashMap; use std::collections::HashMap;
use std::num::NonZeroUsize;
use std::pin::Pin; use std::pin::Pin;
use std::sync::{Arc, Mutex}; use std::sync::{Arc, Mutex};
use std::task::{Context, Poll}; use std::task::{Context, Poll};
@@ -2695,26 +2669,6 @@ pub(crate) mod disk_usage_based_eviction {
pub(crate) static METRICS: Lazy<Metrics> = Lazy::new(Metrics::default); pub(crate) static METRICS: Lazy<Metrics> = Lazy::new(Metrics::default);
} }
static TOKIO_EXECUTOR_THREAD_COUNT: Lazy<UIntGaugeVec> = Lazy::new(|| {
register_uint_gauge_vec!(
"pageserver_tokio_executor_thread_configured_count",
"Total number of configued tokio executor threads in the process.
The `setup` label denotes whether we're running with multiple runtimes or a single runtime.",
&["setup"],
)
.unwrap()
});
pub(crate) fn set_tokio_runtime_setup(setup: &str, num_threads: NonZeroUsize) {
static SERIALIZE: std::sync::Mutex<()> = std::sync::Mutex::new(());
let _guard = SERIALIZE.lock().unwrap();
TOKIO_EXECUTOR_THREAD_COUNT.reset();
TOKIO_EXECUTOR_THREAD_COUNT
.get_metric_with_label_values(&[setup])
.unwrap()
.set(u64::try_from(num_threads.get()).unwrap());
}
pub fn preinitialize_metrics() { pub fn preinitialize_metrics() {
// Python tests need these and on some we do alerting. // Python tests need these and on some we do alerting.
// //

View File

@@ -33,52 +33,6 @@ impl Value {
} }
} }
#[cfg(test)]
#[derive(Debug, PartialEq)]
pub(crate) enum InvalidInput {
TooShortValue,
TooShortPostgresRecord,
}
/// We could have a ValueRef where everything is `serde(borrow)`. Before implementing that, lets
/// use this type for querying if a slice looks some particular way.
#[cfg(test)]
pub(crate) struct ValueBytes;
#[cfg(test)]
impl ValueBytes {
pub(crate) fn will_init(raw: &[u8]) -> Result<bool, InvalidInput> {
if raw.len() < 12 {
return Err(InvalidInput::TooShortValue);
}
let value_discriminator = &raw[0..4];
if value_discriminator == [0, 0, 0, 0] {
// Value::Image always initializes
return Ok(true);
}
if value_discriminator != [0, 0, 0, 1] {
// not a Value::WalRecord(..)
return Ok(false);
}
let walrecord_discriminator = &raw[4..8];
if walrecord_discriminator != [0, 0, 0, 0] {
// only NeonWalRecord::Postgres can have will_init
return Ok(false);
}
if raw.len() < 17 {
return Err(InvalidInput::TooShortPostgresRecord);
}
Ok(raw[8] == 1)
}
}
#[cfg(test)] #[cfg(test)]
mod test { mod test {
use super::*; use super::*;
@@ -116,8 +70,6 @@ mod test {
]; ];
roundtrip!(image, expected); roundtrip!(image, expected);
assert!(ValueBytes::will_init(&expected).unwrap());
} }
#[test] #[test]
@@ -141,96 +93,6 @@ mod test {
]; ];
roundtrip!(rec, expected); roundtrip!(rec, expected);
assert!(ValueBytes::will_init(&expected).unwrap());
}
#[test]
fn bytes_inspection_too_short_image() {
let rec = Value::Image(Bytes::from_static(b""));
#[rustfmt::skip]
let expected = [
// top level discriminator of 4 bytes
0x00, 0x00, 0x00, 0x00,
// 8 byte length
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
];
roundtrip!(rec, expected);
assert!(ValueBytes::will_init(&expected).unwrap());
assert_eq!(expected.len(), 12);
for len in 0..12 {
assert_eq!(
ValueBytes::will_init(&expected[..len]).unwrap_err(),
InvalidInput::TooShortValue
);
}
}
#[test]
fn bytes_inspection_too_short_postgres_record() {
let rec = NeonWalRecord::Postgres {
will_init: false,
rec: Bytes::from_static(b""),
};
let rec = Value::WalRecord(rec);
#[rustfmt::skip]
let expected = [
// flattened discriminator of total 8 bytes
0x00, 0x00, 0x00, 0x01,
0x00, 0x00, 0x00, 0x00,
// will_init
0x00,
// 8 byte length
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
];
roundtrip!(rec, expected);
assert!(!ValueBytes::will_init(&expected).unwrap());
assert_eq!(expected.len(), 17);
for len in 12..17 {
assert_eq!(
ValueBytes::will_init(&expected[..len]).unwrap_err(),
InvalidInput::TooShortPostgresRecord
)
}
for len in 0..12 {
assert_eq!(
ValueBytes::will_init(&expected[..len]).unwrap_err(),
InvalidInput::TooShortValue
)
}
}
#[test]
fn clear_visibility_map_flags_example() {
let rec = NeonWalRecord::ClearVisibilityMapFlags {
new_heap_blkno: Some(0x11),
old_heap_blkno: None,
flags: 0x03,
};
let rec = Value::WalRecord(rec);
#[rustfmt::skip]
let expected = [
// discriminators
0x00, 0x00, 0x00, 0x01,
0x00, 0x00, 0x00, 0x01,
// Some == 1 followed by 4 bytes
0x01, 0x00, 0x00, 0x00, 0x11,
// None == 0
0x00,
// flags
0x03
];
roundtrip!(rec, expected);
assert!(!ValueBytes::will_init(&expected).unwrap());
} }
} }

View File

@@ -33,14 +33,13 @@
use std::collections::HashMap; use std::collections::HashMap;
use std::fmt; use std::fmt;
use std::future::Future; use std::future::Future;
use std::num::NonZeroUsize;
use std::panic::AssertUnwindSafe; use std::panic::AssertUnwindSafe;
use std::str::FromStr;
use std::sync::atomic::{AtomicU64, Ordering}; use std::sync::atomic::{AtomicU64, Ordering};
use std::sync::{Arc, Mutex}; use std::sync::{Arc, Mutex};
use futures::FutureExt; use futures::FutureExt;
use pageserver_api::shard::TenantShardId; use pageserver_api::shard::TenantShardId;
use tokio::runtime::Runtime;
use tokio::task::JoinHandle; use tokio::task::JoinHandle;
use tokio::task_local; use tokio::task_local;
use tokio_util::sync::CancellationToken; use tokio_util::sync::CancellationToken;
@@ -49,11 +48,8 @@ use tracing::{debug, error, info, warn};
use once_cell::sync::Lazy; use once_cell::sync::Lazy;
use utils::env;
use utils::id::TimelineId; use utils::id::TimelineId;
use crate::metrics::set_tokio_runtime_setup;
// //
// There are four runtimes: // There are four runtimes:
// //
@@ -102,119 +98,52 @@ use crate::metrics::set_tokio_runtime_setup;
// other operations, if the upload tasks e.g. get blocked on locks. It shouldn't // other operations, if the upload tasks e.g. get blocked on locks. It shouldn't
// happen, but still. // happen, but still.
// //
pub static COMPUTE_REQUEST_RUNTIME: Lazy<Runtime> = Lazy::new(|| {
tokio::runtime::Builder::new_multi_thread()
.thread_name("compute request worker")
.enable_all()
.build()
.expect("Failed to create compute request runtime")
});
pub(crate) static TOKIO_WORKER_THREADS: Lazy<NonZeroUsize> = Lazy::new(|| { pub static MGMT_REQUEST_RUNTIME: Lazy<Runtime> = Lazy::new(|| {
tokio::runtime::Builder::new_multi_thread()
.thread_name("mgmt request worker")
.enable_all()
.build()
.expect("Failed to create mgmt request runtime")
});
pub static WALRECEIVER_RUNTIME: Lazy<Runtime> = Lazy::new(|| {
tokio::runtime::Builder::new_multi_thread()
.thread_name("walreceiver worker")
.enable_all()
.build()
.expect("Failed to create walreceiver runtime")
});
pub static BACKGROUND_RUNTIME: Lazy<Runtime> = Lazy::new(|| {
tokio::runtime::Builder::new_multi_thread()
.thread_name("background op worker")
// if you change the number of worker threads please change the constant below
.enable_all()
.build()
.expect("Failed to create background op runtime")
});
pub(crate) static BACKGROUND_RUNTIME_WORKER_THREADS: Lazy<usize> = Lazy::new(|| {
// force init and thus panics
let _ = BACKGROUND_RUNTIME.handle();
// replicates tokio-1.28.1::loom::sys::num_cpus which is not available publicly // replicates tokio-1.28.1::loom::sys::num_cpus which is not available publicly
// tokio would had already panicked for parsing errors or NotUnicode // tokio would had already panicked for parsing errors or NotUnicode
// //
// this will be wrong if any of the runtimes gets their worker threads configured to something // this will be wrong if any of the runtimes gets their worker threads configured to something
// else, but that has not been needed in a long time. // else, but that has not been needed in a long time.
NonZeroUsize::new( std::env::var("TOKIO_WORKER_THREADS")
std::env::var("TOKIO_WORKER_THREADS") .map(|s| s.parse::<usize>().unwrap())
.map(|s| s.parse::<usize>().unwrap()) .unwrap_or_else(|_e| usize::max(2, num_cpus::get()))
.unwrap_or_else(|_e| usize::max(2, num_cpus::get())),
)
.expect("the max() ensures that this is not zero")
}); });
enum TokioRuntimeMode {
SingleThreaded,
MultiThreaded { num_workers: NonZeroUsize },
}
impl FromStr for TokioRuntimeMode {
type Err = String;
fn from_str(s: &str) -> Result<Self, Self::Err> {
match s {
"current_thread" => Ok(TokioRuntimeMode::SingleThreaded),
s => match s.strip_prefix("multi_thread:") {
Some("default") => Ok(TokioRuntimeMode::MultiThreaded {
num_workers: *TOKIO_WORKER_THREADS,
}),
Some(suffix) => {
let num_workers = suffix.parse::<NonZeroUsize>().map_err(|e| {
format!(
"invalid number of multi-threaded runtime workers ({suffix:?}): {e}",
)
})?;
Ok(TokioRuntimeMode::MultiThreaded { num_workers })
}
None => Err(format!("invalid runtime config: {s:?}")),
},
}
}
}
static ONE_RUNTIME: Lazy<Option<tokio::runtime::Runtime>> = Lazy::new(|| {
let thread_name = "pageserver-tokio";
let Some(mode) = env::var("NEON_PAGESERVER_USE_ONE_RUNTIME") else {
// If the env var is not set, leave this static as None.
set_tokio_runtime_setup(
"multiple-runtimes",
NUM_MULTIPLE_RUNTIMES
.checked_mul(*TOKIO_WORKER_THREADS)
.unwrap(),
);
return None;
};
Some(match mode {
TokioRuntimeMode::SingleThreaded => {
set_tokio_runtime_setup("one-runtime-single-threaded", NonZeroUsize::new(1).unwrap());
tokio::runtime::Builder::new_current_thread()
.thread_name(thread_name)
.enable_all()
.build()
.expect("failed to create one single runtime")
}
TokioRuntimeMode::MultiThreaded { num_workers } => {
set_tokio_runtime_setup("one-runtime-multi-threaded", num_workers);
tokio::runtime::Builder::new_multi_thread()
.thread_name(thread_name)
.enable_all()
.worker_threads(num_workers.get())
.build()
.expect("failed to create one multi-threaded runtime")
}
})
});
/// Declare a lazy static variable named `$varname` that will resolve
/// to a tokio runtime handle. If the env var `NEON_PAGESERVER_USE_ONE_RUNTIME`
/// is set, this will resolve to `ONE_RUNTIME`. Otherwise, the macro invocation
/// declares a separate runtime and the lazy static variable `$varname`
/// will resolve to that separate runtime.
///
/// The result is is that `$varname.spawn()` will use `ONE_RUNTIME` if
/// `NEON_PAGESERVER_USE_ONE_RUNTIME` is set, and will use the separate runtime
/// otherwise.
macro_rules! pageserver_runtime {
($varname:ident, $name:literal) => {
pub static $varname: Lazy<&'static tokio::runtime::Runtime> = Lazy::new(|| {
if let Some(runtime) = &*ONE_RUNTIME {
return runtime;
}
static RUNTIME: Lazy<tokio::runtime::Runtime> = Lazy::new(|| {
tokio::runtime::Builder::new_multi_thread()
.thread_name($name)
.worker_threads(TOKIO_WORKER_THREADS.get())
.enable_all()
.build()
.expect(std::concat!("Failed to create runtime ", $name))
});
&*RUNTIME
});
};
}
pageserver_runtime!(COMPUTE_REQUEST_RUNTIME, "compute request worker");
pageserver_runtime!(MGMT_REQUEST_RUNTIME, "mgmt request worker");
pageserver_runtime!(WALRECEIVER_RUNTIME, "walreceiver worker");
pageserver_runtime!(BACKGROUND_RUNTIME, "background op worker");
// Bump this number when adding a new pageserver_runtime!
// SAFETY: it's obviously correct
const NUM_MULTIPLE_RUNTIMES: NonZeroUsize = unsafe { NonZeroUsize::new_unchecked(4) };
#[derive(Debug, Clone, Copy)] #[derive(Debug, Clone, Copy)]
pub struct PageserverTaskId(u64); pub struct PageserverTaskId(u64);

View File

@@ -386,7 +386,7 @@ impl WalRedoManager {
pub(crate) fn status(&self) -> Option<WalRedoManagerStatus> { pub(crate) fn status(&self) -> Option<WalRedoManagerStatus> {
match self { match self {
WalRedoManager::Prod(m) => Some(m.status()), WalRedoManager::Prod(m) => m.status(),
#[cfg(test)] #[cfg(test)]
WalRedoManager::Test(_) => None, WalRedoManager::Test(_) => None,
} }
@@ -3190,7 +3190,7 @@ impl Tenant {
run_initdb(self.conf, &pgdata_path, pg_version, &self.cancel).await?; run_initdb(self.conf, &pgdata_path, pg_version, &self.cancel).await?;
// Upload the created data dir to S3 // Upload the created data dir to S3
if self.tenant_shard_id().is_shard_zero() { if self.tenant_shard_id().is_zero() {
self.upload_initdb(&timelines_path, &pgdata_path, &timeline_id) self.upload_initdb(&timelines_path, &pgdata_path, &timeline_id)
.await?; .await?;
} }
@@ -3437,7 +3437,7 @@ impl Tenant {
.store(size, Ordering::Relaxed); .store(size, Ordering::Relaxed);
// Only shard zero should be calculating synthetic sizes // Only shard zero should be calculating synthetic sizes
debug_assert!(self.shard_identity.is_shard_zero()); debug_assert!(self.shard_identity.is_zero());
TENANT_SYNTHETIC_SIZE_METRIC TENANT_SYNTHETIC_SIZE_METRIC
.get_metric_with_label_values(&[&self.tenant_shard_id.tenant_id.to_string()]) .get_metric_with_label_values(&[&self.tenant_shard_id.tenant_id.to_string()])
@@ -3848,8 +3848,6 @@ pub(crate) mod harness {
#[cfg(test)] #[cfg(test)]
mod tests { mod tests {
use std::collections::BTreeMap;
use super::*; use super::*;
use crate::keyspace::KeySpaceAccum; use crate::keyspace::KeySpaceAccum;
use crate::repository::{Key, Value}; use crate::repository::{Key, Value};
@@ -3860,7 +3858,7 @@ mod tests {
use hex_literal::hex; use hex_literal::hex;
use pageserver_api::keyspace::KeySpace; use pageserver_api::keyspace::KeySpace;
use rand::{thread_rng, Rng}; use rand::{thread_rng, Rng};
use tests::timeline::{GetVectoredError, ShutdownMode}; use tests::timeline::ShutdownMode;
static TEST_KEY: Lazy<Key> = static TEST_KEY: Lazy<Key> =
Lazy::new(|| Key::from_slice(&hex!("010000000033333333444444445500000001"))); Lazy::new(|| Key::from_slice(&hex!("010000000033333333444444445500000001")));
@@ -4796,166 +4794,6 @@ mod tests {
Ok(()) Ok(())
} }
// Test that vectored get descends into ancestor timelines correctly and
// does not return an image that's newer than requested.
//
// The diagram below ilustrates an interesting case. We have a parent timeline
// (top of the Lsn range) and a child timeline. The request key cannot be reconstructed
// from the child timeline, so the parent timeline must be visited. When advacing into
// the child timeline, the read path needs to remember what the requested Lsn was in
// order to avoid returning an image that's too new. The test below constructs such
// a timeline setup and does a few queries around the Lsn of each page image.
// ```
// LSN
// ^
// |
// |
// 500 | --------------------------------------> branch point
// 400 | X
// 300 | X
// 200 | --------------------------------------> requested lsn
// 100 | X
// |---------------------------------------> Key
// |
// ------> requested key
//
// Legend:
// * X - page images
// ```
#[tokio::test]
async fn test_get_vectored_ancestor_descent() -> anyhow::Result<()> {
let harness = TenantHarness::create("test_get_vectored_on_lsn_axis")?;
let (tenant, ctx) = harness.load().await;
let start_key = Key::from_hex("010000000033333333444444445500000000").unwrap();
let end_key = start_key.add(1000);
let child_gap_at_key = start_key.add(500);
let mut parent_gap_lsns: BTreeMap<Lsn, String> = BTreeMap::new();
let mut current_lsn = Lsn(0x10);
let timeline_id = TimelineId::generate();
let parent_timeline = tenant
.create_test_timeline(timeline_id, current_lsn, DEFAULT_PG_VERSION, &ctx)
.await?;
current_lsn += 0x100;
for _ in 0..3 {
let mut key = start_key;
while key < end_key {
current_lsn += 0x10;
let image_value = format!("{} at {}", child_gap_at_key, current_lsn);
let mut writer = parent_timeline.writer().await;
writer
.put(
key,
current_lsn,
&Value::Image(test_img(&image_value)),
&ctx,
)
.await?;
writer.finish_write(current_lsn);
if key == child_gap_at_key {
parent_gap_lsns.insert(current_lsn, image_value);
}
key = key.next();
}
parent_timeline.freeze_and_flush().await?;
}
let child_timeline_id = TimelineId::generate();
let child_timeline = tenant
.branch_timeline_test(&parent_timeline, child_timeline_id, Some(current_lsn), &ctx)
.await?;
let mut key = start_key;
while key < end_key {
if key == child_gap_at_key {
key = key.next();
continue;
}
current_lsn += 0x10;
let mut writer = child_timeline.writer().await;
writer
.put(
key,
current_lsn,
&Value::Image(test_img(&format!("{} at {}", key, current_lsn))),
&ctx,
)
.await?;
writer.finish_write(current_lsn);
key = key.next();
}
child_timeline.freeze_and_flush().await?;
let lsn_offsets: [i64; 5] = [-10, -1, 0, 1, 10];
let mut query_lsns = Vec::new();
for image_lsn in parent_gap_lsns.keys().rev() {
for offset in lsn_offsets {
query_lsns.push(Lsn(image_lsn
.0
.checked_add_signed(offset)
.expect("Shouldn't overflow")));
}
}
for query_lsn in query_lsns {
let results = child_timeline
.get_vectored_impl(
KeySpace {
ranges: vec![child_gap_at_key..child_gap_at_key.next()],
},
query_lsn,
&ctx,
)
.await;
let expected_item = parent_gap_lsns
.iter()
.rev()
.find(|(lsn, _)| **lsn <= query_lsn);
info!(
"Doing vectored read at LSN {}. Expecting image to be: {:?}",
query_lsn, expected_item
);
match expected_item {
Some((_, img_value)) => {
let key_results = results.expect("No vectored get error expected");
let key_result = &key_results[&child_gap_at_key];
let returned_img = key_result
.as_ref()
.expect("No page reconstruct error expected");
info!(
"Vectored read at LSN {} returned image {}",
query_lsn,
std::str::from_utf8(returned_img)?
);
assert_eq!(*returned_img, test_img(img_value));
}
None => {
assert!(matches!(results, Err(GetVectoredError::MissingKey(_))));
}
}
}
Ok(())
}
#[tokio::test] #[tokio::test]
async fn test_random_updates() -> anyhow::Result<()> { async fn test_random_updates() -> anyhow::Result<()> {
let harness = TenantHarness::create("test_random_updates")?; let harness = TenantHarness::create("test_random_updates")?;

View File

@@ -436,11 +436,6 @@ impl DeleteTenantFlow {
.await .await
} }
/// Check whether background deletion of this tenant is currently in progress
pub(crate) fn is_in_progress(tenant: &Tenant) -> bool {
tenant.delete_progress.try_lock().is_err()
}
async fn prepare( async fn prepare(
tenant: &Arc<Tenant>, tenant: &Arc<Tenant>,
) -> Result<tokio::sync::OwnedMutexGuard<Self>, DeleteTenantError> { ) -> Result<tokio::sync::OwnedMutexGuard<Self>, DeleteTenantError> {

View File

@@ -678,19 +678,12 @@ pub async fn init_tenant_mgr(
} }
} }
} }
LocationMode::Secondary(secondary_conf) => { LocationMode::Secondary(secondary_conf) => TenantSlot::Secondary(SecondaryTenant::new(
info!( tenant_shard_id,
tenant_id = %tenant_shard_id.tenant_id, shard_identity,
shard_id = %tenant_shard_id.shard_slug(), location_conf.tenant_conf,
"Starting secondary tenant" &secondary_conf,
); )),
TenantSlot::Secondary(SecondaryTenant::new(
tenant_shard_id,
shard_identity,
location_conf.tenant_conf,
&secondary_conf,
))
}
}; };
tenants.insert(tenant_shard_id, slot); tenants.insert(tenant_shard_id, slot);
@@ -1417,15 +1410,9 @@ impl TenantManager {
match tenant.current_state() { match tenant.current_state() {
TenantState::Broken { .. } | TenantState::Stopping { .. } => { TenantState::Broken { .. } | TenantState::Stopping { .. } => {
// If deletion is already in progress, return success (the semantics of this // If a tenant is broken or stopping, DeleteTenantFlow can
// function are to rerturn success afterr deletion is spawned in background). // handle it: broken tenants proceed to delete, stopping tenants
// Otherwise fall through and let [`DeleteTenantFlow`] handle this state. // are checked for deletion already in progress.
if DeleteTenantFlow::is_in_progress(&tenant) {
// The `delete_progress` lock is held: deletion is already happening
// in the bacckground
slot_guard.revert();
return Ok(());
}
} }
_ => { _ => {
tenant tenant

View File

@@ -167,7 +167,7 @@ pub(crate) async fn time_travel_recover_tenant(
let warn_after = 3; let warn_after = 3;
let max_attempts = 10; let max_attempts = 10;
let mut prefixes = Vec::with_capacity(2); let mut prefixes = Vec::with_capacity(2);
if tenant_shard_id.is_shard_zero() { if tenant_shard_id.is_zero() {
// Also recover the unsharded prefix for a shard of zero: // Also recover the unsharded prefix for a shard of zero:
// - if the tenant is totally unsharded, the unsharded prefix contains all the data // - if the tenant is totally unsharded, the unsharded prefix contains all the data
// - if the tenant is sharded, we still want to recover the initdb data, but we only // - if the tenant is sharded, we still want to recover the initdb data, but we only

View File

@@ -51,7 +51,7 @@ use tokio_util::sync::CancellationToken;
use tracing::{info_span, instrument, warn, Instrument}; use tracing::{info_span, instrument, warn, Instrument};
use utils::{ use utils::{
backoff, completion::Barrier, crashsafe::path_with_suffix_extension, failpoint_support, fs_ext, backoff, completion::Barrier, crashsafe::path_with_suffix_extension, failpoint_support, fs_ext,
id::TimelineId, serde_system_time, id::TimelineId,
}; };
use super::{ use super::{
@@ -312,7 +312,7 @@ impl JobGenerator<PendingDownload, RunningDownload, CompleteDownload, DownloadCo
(detail.last_download, detail.next_download.unwrap()) (detail.last_download, detail.next_download.unwrap())
}; };
if now > next_download { if now < next_download {
Some(PendingDownload { Some(PendingDownload {
secondary_state: secondary_tenant, secondary_state: secondary_tenant,
last_download, last_download,
@@ -591,7 +591,7 @@ impl<'a> TenantDownloader<'a> {
let mut progress = SecondaryProgress { let mut progress = SecondaryProgress {
layers_total: heatmap_stats.layers, layers_total: heatmap_stats.layers,
bytes_total: heatmap_stats.bytes, bytes_total: heatmap_stats.bytes,
heatmap_mtime: Some(serde_system_time::SystemTime(heatmap_mtime)), heatmap_mtime: Some(heatmap_mtime),
layers_downloaded: 0, layers_downloaded: 0,
bytes_downloaded: 0, bytes_downloaded: 0,
}; };
@@ -647,12 +647,6 @@ impl<'a> TenantDownloader<'a> {
progress.bytes_downloaded += layer_byte_count; progress.bytes_downloaded += layer_byte_count;
progress.layers_downloaded += layer_count; progress.layers_downloaded += layer_count;
} }
for delete_timeline in &delete_timelines {
// We haven't removed from disk yet, but optimistically remove from in-memory state: if removal
// from disk fails that will be a fatal error.
detail.timelines.remove(delete_timeline);
}
} }
// Execute accumulated deletions // Execute accumulated deletions
@@ -716,14 +710,13 @@ impl<'a> TenantDownloader<'a> {
.await .await
.map_err(UpdateError::from)?; .map_err(UpdateError::from)?;
SECONDARY_MODE.download_heatmap.inc();
if Some(&download.etag) == prev_etag { if Some(&download.etag) == prev_etag {
Ok(HeatMapDownload::Unmodified) Ok(HeatMapDownload::Unmodified)
} else { } else {
let mut heatmap_bytes = Vec::new(); let mut heatmap_bytes = Vec::new();
let mut body = tokio_util::io::StreamReader::new(download.download_stream); let mut body = tokio_util::io::StreamReader::new(download.download_stream);
let _size = tokio::io::copy_buf(&mut body, &mut heatmap_bytes).await?; let _size = tokio::io::copy_buf(&mut body, &mut heatmap_bytes).await?;
SECONDARY_MODE.download_heatmap.inc();
Ok(HeatMapDownload::Modified(HeatMapModified { Ok(HeatMapDownload::Modified(HeatMapModified {
etag: download.etag, etag: download.etag,
last_modified: download.last_modified, last_modified: download.last_modified,

View File

@@ -20,8 +20,8 @@
//! 000000067F000032BE0000400000000020B6-000000067F000032BE0000400000000030B6__000000578C6B29-0000000057A50051 //! 000000067F000032BE0000400000000020B6-000000067F000032BE0000400000000030B6__000000578C6B29-0000000057A50051
//! ``` //! ```
//! //!
//! Every delta file consists of three parts: "summary", "values", and //! Every delta file consists of three parts: "summary", "index", and
//! "index". The summary is a fixed size header at the beginning of the file, //! "values". The summary is a fixed size header at the beginning of the file,
//! and it contains basic information about the layer, and offsets to the other //! and it contains basic information about the layer, and offsets to the other
//! parts. The "index" is a B-tree, mapping from Key and LSN to an offset in the //! parts. The "index" is a B-tree, mapping from Key and LSN to an offset in the
//! "values" part. The actual page images and WAL records are stored in the //! "values" part. The actual page images and WAL records are stored in the
@@ -863,7 +863,7 @@ impl DeltaLayerInner {
.into(), .into(),
); );
let data_end_offset = self.index_start_offset(); let data_end_offset = self.index_start_blk as u64 * PAGE_SZ as u64;
let reads = Self::plan_reads( let reads = Self::plan_reads(
keyspace, keyspace,
@@ -939,7 +939,7 @@ impl DeltaLayerInner {
} }
if !range_end_handled { if !range_end_handled {
tracing::debug!("Handling range end fallback at {}", data_end_offset); tracing::info!("Handling range end fallback at {}", data_end_offset);
planner.handle_range_end(data_end_offset); planner.handle_range_end(data_end_offset);
} }
} }
@@ -1103,195 +1103,11 @@ impl DeltaLayerInner {
if let Some(last) = all_keys.last_mut() { if let Some(last) = all_keys.last_mut() {
// Last key occupies all space till end of value storage, // Last key occupies all space till end of value storage,
// which corresponds to beginning of the index // which corresponds to beginning of the index
last.size = self.index_start_offset() - last.size; last.size = self.index_start_blk as u64 * PAGE_SZ as u64 - last.size;
} }
Ok(all_keys) Ok(all_keys)
} }
/// Using the given writer, write out a truncated version, where LSNs higher than the
/// truncate_at are missing.
#[cfg(test)]
pub(super) async fn copy_prefix(
&self,
writer: &mut DeltaLayerWriter,
truncate_at: Lsn,
ctx: &RequestContext,
) -> anyhow::Result<()> {
use crate::tenant::vectored_blob_io::{
BlobMeta, VectoredReadBuilder, VectoredReadExtended,
};
use futures::stream::TryStreamExt;
#[derive(Debug)]
enum Item {
Actual(Key, Lsn, BlobRef),
Sentinel,
}
impl From<Item> for Option<(Key, Lsn, BlobRef)> {
fn from(value: Item) -> Self {
match value {
Item::Actual(key, lsn, blob) => Some((key, lsn, blob)),
Item::Sentinel => None,
}
}
}
impl Item {
fn offset(&self) -> Option<BlobRef> {
match self {
Item::Actual(_, _, blob) => Some(*blob),
Item::Sentinel => None,
}
}
fn is_last(&self) -> bool {
matches!(self, Item::Sentinel)
}
}
let block_reader = FileBlockReader::new(&self.file, self.file_id);
let tree_reader = DiskBtreeReader::<_, DELTA_KEY_SIZE>::new(
self.index_start_blk,
self.index_root_blk,
block_reader,
);
let stream = self.stream_index_forwards(&tree_reader, &[0u8; DELTA_KEY_SIZE], ctx);
let stream = stream.map_ok(|(key, lsn, pos)| Item::Actual(key, lsn, pos));
// put in a sentinel value for getting the end offset for last item, and not having to
// repeat the whole read part
let stream = stream.chain(futures::stream::once(futures::future::ready(Ok(
Item::Sentinel,
))));
let mut stream = std::pin::pin!(stream);
let mut prev: Option<(Key, Lsn, BlobRef)> = None;
let mut read_builder: Option<VectoredReadBuilder> = None;
let max_read_size = self
.max_vectored_read_bytes
.map(|x| x.0.get())
.unwrap_or(8192);
let mut buffer = Some(BytesMut::with_capacity(max_read_size));
// FIXME: buffering of DeltaLayerWriter
let mut per_blob_copy = Vec::new();
while let Some(item) = stream.try_next().await? {
tracing::debug!(?item, "popped");
let offset = item
.offset()
.unwrap_or(BlobRef::new(self.index_start_offset(), false));
let actionable = if let Some((key, lsn, start_offset)) = prev.take() {
let end_offset = offset;
Some((BlobMeta { key, lsn }, start_offset..end_offset))
} else {
None
};
let is_last = item.is_last();
prev = Option::from(item);
let actionable = actionable.filter(|x| x.0.lsn < truncate_at);
let builder = if let Some((meta, offsets)) = actionable {
// extend or create a new builder
if read_builder
.as_mut()
.map(|x| x.extend(offsets.start.pos(), offsets.end.pos(), meta))
.unwrap_or(VectoredReadExtended::No)
== VectoredReadExtended::Yes
{
None
} else {
read_builder.replace(VectoredReadBuilder::new(
offsets.start.pos(),
offsets.end.pos(),
meta,
max_read_size,
))
}
} else {
// nothing to do, except perhaps flush any existing for the last element
None
};
// flush the possible older builder and also the new one if the item was the last one
let builders = builder.into_iter();
let builders = if is_last {
builders.chain(read_builder.take())
} else {
builders.chain(None)
};
for builder in builders {
let read = builder.build();
let reader = VectoredBlobReader::new(&self.file);
let mut buf = buffer.take().unwrap();
buf.clear();
buf.reserve(read.size());
let res = reader.read_blobs(&read, buf).await?;
for blob in res.blobs {
let key = blob.meta.key;
let lsn = blob.meta.lsn;
let data = &res.buf[blob.start..blob.end];
#[cfg(debug_assertions)]
Value::des(data)
.with_context(|| {
format!(
"blob failed to deserialize for {}@{}, {}..{}: {:?}",
blob.meta.key,
blob.meta.lsn,
blob.start,
blob.end,
utils::Hex(data)
)
})
.unwrap();
// is it an image or will_init walrecord?
// FIXME: this could be handled by threading the BlobRef to the
// VectoredReadBuilder
let will_init = crate::repository::ValueBytes::will_init(data)
.inspect_err(|_e| {
#[cfg(feature = "testing")]
tracing::error!(data=?utils::Hex(data), err=?_e, "failed to parse will_init out of serialized value");
})
.unwrap_or(false);
per_blob_copy.clear();
per_blob_copy.extend_from_slice(data);
let (tmp, res) = writer
.put_value_bytes(key, lsn, std::mem::take(&mut per_blob_copy), will_init)
.await;
per_blob_copy = tmp;
res?;
}
buffer = Some(res.buf);
}
}
assert!(
read_builder.is_none(),
"with the sentinel above loop should had handled all"
);
Ok(())
}
pub(super) async fn dump(&self, ctx: &RequestContext) -> anyhow::Result<()> { pub(super) async fn dump(&self, ctx: &RequestContext) -> anyhow::Result<()> {
println!( println!(
"index_start_blk: {}, root {}", "index_start_blk: {}, root {}",
@@ -1361,44 +1177,6 @@ impl DeltaLayerInner {
Ok(()) Ok(())
} }
#[cfg(test)]
fn stream_index_forwards<'a, R>(
&'a self,
reader: &'a DiskBtreeReader<R, DELTA_KEY_SIZE>,
start: &'a [u8; DELTA_KEY_SIZE],
ctx: &'a RequestContext,
) -> impl futures::stream::Stream<
Item = Result<(Key, Lsn, BlobRef), crate::tenant::disk_btree::DiskBtreeError>,
> + 'a
where
R: BlockReader,
{
use futures::stream::TryStreamExt;
let stream = reader.get_stream_from(start, ctx);
stream.map_ok(|(key, value)| {
let key = DeltaKey::from_slice(&key);
let (key, lsn) = (key.key(), key.lsn());
let offset = BlobRef(value);
(key, lsn, offset)
})
}
/// The file offset to the first block of index.
///
/// The file structure is summary, values, and index. We often need this for the size of last blob.
fn index_start_offset(&self) -> u64 {
let offset = self.index_start_blk as u64 * PAGE_SZ as u64;
let bref = BlobRef(offset);
tracing::debug!(
index_start_blk = self.index_start_blk,
offset,
pos = bref.pos(),
"index_start_offset"
);
offset
}
} }
/// A set of data associated with a delta layer key and its value /// A set of data associated with a delta layer key and its value
@@ -1760,7 +1538,7 @@ mod test {
let resident = writer.finish(entries_meta.key_range.end, &timeline).await?; let resident = writer.finish(entries_meta.key_range.end, &timeline).await?;
let inner = resident.as_delta(&ctx).await?; let inner = resident.get_inner_delta(&ctx).await?;
let file_size = inner.file.metadata().await?.len(); let file_size = inner.file.metadata().await?.len();
tracing::info!( tracing::info!(
@@ -1816,217 +1594,4 @@ mod test {
Ok(()) Ok(())
} }
#[tokio::test]
async fn copy_delta_prefix_smoke() {
use crate::walrecord::NeonWalRecord;
use bytes::Bytes;
let h = crate::tenant::harness::TenantHarness::create("truncate_delta_smoke").unwrap();
let (tenant, ctx) = h.load().await;
let ctx = &ctx;
let timeline = tenant
.create_test_timeline(TimelineId::generate(), Lsn(0x10), 14, ctx)
.await
.unwrap();
let initdb_layer = timeline
.layers
.read()
.await
.likely_resident_layers()
.next()
.unwrap();
{
let mut writer = timeline.writer().await;
let data = [
(0x20, 12, Value::Image(Bytes::from_static(b"foobar"))),
(
0x30,
12,
Value::WalRecord(NeonWalRecord::Postgres {
will_init: false,
rec: Bytes::from_static(b"1"),
}),
),
(
0x40,
12,
Value::WalRecord(NeonWalRecord::Postgres {
will_init: true,
rec: Bytes::from_static(b"2"),
}),
),
// build an oversized value so we cannot extend and existing read over
// this
(
0x50,
12,
Value::WalRecord(NeonWalRecord::Postgres {
will_init: true,
rec: {
let mut buf =
vec![0u8; tenant.conf.max_vectored_read_bytes.0.get() + 1024];
buf.iter_mut()
.enumerate()
.for_each(|(i, slot)| *slot = (i % 256) as u8);
Bytes::from(buf)
},
}),
),
// because the oversized read cannot be extended further, we are sure to exercise the
// builder created on the last round with this:
(
0x60,
12,
Value::WalRecord(NeonWalRecord::Postgres {
will_init: true,
rec: Bytes::from_static(b"3"),
}),
),
(
0x60,
9,
Value::Image(Bytes::from_static(b"something for a different key")),
),
];
let mut last_lsn = None;
for (lsn, key, value) in data {
let key = Key::from_i128(key);
writer.put(key, Lsn(lsn), &value, ctx).await.unwrap();
last_lsn = Some(lsn);
}
writer.finish_write(Lsn(last_lsn.unwrap()));
}
timeline.freeze_and_flush().await.unwrap();
let new_layer = timeline
.layers
.read()
.await
.likely_resident_layers()
.find(|x| x != &initdb_layer)
.unwrap();
// create a copy for the timeline, so we don't overwrite the file
let branch = tenant
.branch_timeline_test(&timeline, TimelineId::generate(), None, ctx)
.await
.unwrap();
assert_eq!(branch.get_ancestor_lsn(), Lsn(0x60));
// truncating at 0x61 gives us a full copy, otherwise just go backwards until there's just
// a single key
for truncate_at in [0x61, 0x51, 0x41, 0x31, 0x21] {
let truncate_at = Lsn(truncate_at);
let mut writer = DeltaLayerWriter::new(
tenant.conf,
branch.timeline_id,
tenant.tenant_shard_id,
Key::MIN,
Lsn(0x11)..truncate_at,
)
.await
.unwrap();
let new_layer = new_layer.download_and_keep_resident().await.unwrap();
new_layer
.copy_delta_prefix(&mut writer, truncate_at, ctx)
.await
.unwrap();
let copied_layer = writer.finish(Key::MAX, &branch).await.unwrap();
copied_layer.as_delta(ctx).await.unwrap();
assert_keys_and_values_eq(
new_layer.as_delta(ctx).await.unwrap(),
copied_layer.as_delta(ctx).await.unwrap(),
truncate_at,
ctx,
)
.await;
}
}
async fn assert_keys_and_values_eq(
source: &DeltaLayerInner,
truncated: &DeltaLayerInner,
truncated_at: Lsn,
ctx: &RequestContext,
) {
use futures::future::ready;
use futures::stream::TryStreamExt;
let start_key = [0u8; DELTA_KEY_SIZE];
let source_reader = FileBlockReader::new(&source.file, source.file_id);
let source_tree = DiskBtreeReader::<_, DELTA_KEY_SIZE>::new(
source.index_start_blk,
source.index_root_blk,
&source_reader,
);
let source_stream = source.stream_index_forwards(&source_tree, &start_key, ctx);
let source_stream = source_stream.filter(|res| match res {
Ok((_, lsn, _)) => ready(lsn < &truncated_at),
_ => ready(true),
});
let mut source_stream = std::pin::pin!(source_stream);
let truncated_reader = FileBlockReader::new(&truncated.file, truncated.file_id);
let truncated_tree = DiskBtreeReader::<_, DELTA_KEY_SIZE>::new(
truncated.index_start_blk,
truncated.index_root_blk,
&truncated_reader,
);
let truncated_stream = truncated.stream_index_forwards(&truncated_tree, &start_key, ctx);
let mut truncated_stream = std::pin::pin!(truncated_stream);
let mut scratch_left = Vec::new();
let mut scratch_right = Vec::new();
loop {
let (src, truncated) = (source_stream.try_next(), truncated_stream.try_next());
let (src, truncated) = tokio::try_join!(src, truncated).unwrap();
if src.is_none() {
assert!(truncated.is_none());
break;
}
let (src, truncated) = (src.unwrap(), truncated.unwrap());
// because we've filtered the source with Lsn, we should always have the same keys from both.
assert_eq!(src.0, truncated.0);
assert_eq!(src.1, truncated.1);
// if this is needed for something else, just drop this assert.
assert!(
src.2.pos() >= truncated.2.pos(),
"value position should not go backwards {} vs. {}",
src.2.pos(),
truncated.2.pos()
);
scratch_left.clear();
let src_cursor = source_reader.block_cursor();
let left = src_cursor.read_blob_into_buf(src.2.pos(), &mut scratch_left, ctx);
scratch_right.clear();
let trunc_cursor = truncated_reader.block_cursor();
let right = trunc_cursor.read_blob_into_buf(truncated.2.pos(), &mut scratch_right, ctx);
tokio::try_join!(left, right).unwrap();
assert_eq!(utils::Hex(&scratch_left), utils::Hex(&scratch_right));
}
}
} }

View File

@@ -19,7 +19,6 @@ use pageserver_api::models::InMemoryLayerInfo;
use pageserver_api::shard::TenantShardId; use pageserver_api::shard::TenantShardId;
use std::collections::{BinaryHeap, HashMap, HashSet}; use std::collections::{BinaryHeap, HashMap, HashSet};
use std::sync::{Arc, OnceLock}; use std::sync::{Arc, OnceLock};
use std::time::Instant;
use tracing::*; use tracing::*;
use utils::{bin_ser::BeSer, id::TimelineId, lsn::Lsn, vec_map::VecMap}; use utils::{bin_ser::BeSer, id::TimelineId, lsn::Lsn, vec_map::VecMap};
// avoid binding to Write (conflicts with std::io::Write) // avoid binding to Write (conflicts with std::io::Write)
@@ -54,8 +53,6 @@ pub struct InMemoryLayer {
/// Writes are only allowed when this is `None`. /// Writes are only allowed when this is `None`.
end_lsn: OnceLock<Lsn>, end_lsn: OnceLock<Lsn>,
opened_at: Instant,
/// The above fields never change, except for `end_lsn`, which is only set once. /// The above fields never change, except for `end_lsn`, which is only set once.
/// All other changing parts are in `inner`, and protected by a mutex. /// All other changing parts are in `inner`, and protected by a mutex.
inner: RwLock<InMemoryLayerInner>, inner: RwLock<InMemoryLayerInner>,
@@ -463,7 +460,6 @@ impl InMemoryLayer {
tenant_shard_id, tenant_shard_id,
start_lsn, start_lsn,
end_lsn: OnceLock::new(), end_lsn: OnceLock::new(),
opened_at: Instant::now(),
inner: RwLock::new(InMemoryLayerInner { inner: RwLock::new(InMemoryLayerInner {
index: HashMap::new(), index: HashMap::new(),
file, file,
@@ -524,10 +520,6 @@ impl InMemoryLayer {
Ok(()) Ok(())
} }
pub(crate) fn get_opened_at(&self) -> Instant {
self.opened_at
}
pub(crate) async fn tick(&self) -> Option<u64> { pub(crate) async fn tick(&self) -> Option<u64> {
let mut inner = self.inner.write().await; let mut inner = self.inner.write().await;
let size = inner.file.len(); let size = inner.file.len();

View File

@@ -116,12 +116,6 @@ impl AsLayerDesc for Layer {
} }
} }
impl PartialEq for Layer {
fn eq(&self, other: &Self) -> bool {
Arc::as_ptr(&self.0) == Arc::as_ptr(&other.0)
}
}
impl Layer { impl Layer {
/// Creates a layer value for a file we know to not be resident. /// Creates a layer value for a file we know to not be resident.
pub(crate) fn for_evicted( pub(crate) fn for_evicted(
@@ -610,17 +604,9 @@ enum Status {
impl Drop for LayerInner { impl Drop for LayerInner {
fn drop(&mut self) { fn drop(&mut self) {
// if there was a pending eviction, mark it cancelled here to balance metrics
if let Some((ResidentOrWantedEvicted::WantedEvicted(..), _)) = self.inner.take_and_deinit()
{
// eviction has already been started
LAYER_IMPL_METRICS.inc_eviction_cancelled(EvictionCancelled::LayerGone);
// eviction request is intentionally not honored as no one is present to wait for it
// and we could be delaying shutdown for nothing.
}
if !*self.wanted_deleted.get_mut() { if !*self.wanted_deleted.get_mut() {
// should we try to evict if the last wish was for eviction? seems more like a hazard
// than a clear win.
return; return;
} }
@@ -1566,8 +1552,8 @@ impl Drop for DownloadedLayer {
if let Some(owner) = self.owner.upgrade() { if let Some(owner) = self.owner.upgrade() {
owner.on_downloaded_layer_drop(self.version); owner.on_downloaded_layer_drop(self.version);
} else { } else {
// Layer::drop will handle cancelling the eviction; because of drop order and // no need to do anything, we are shutting down
// `DownloadedLayer` never leaking, we cannot know here if eviction was requested. LAYER_IMPL_METRICS.inc_eviction_cancelled(EvictionCancelled::LayerGone);
} }
} }
} }
@@ -1766,28 +1752,6 @@ impl ResidentLayer {
} }
} }
/// FIXME: truncate is bad name because we are not truncating anything, but copying the
/// filtered parts.
#[cfg(test)]
pub(super) async fn copy_delta_prefix(
&self,
writer: &mut super::delta_layer::DeltaLayerWriter,
truncate_at: Lsn,
ctx: &RequestContext,
) -> anyhow::Result<()> {
use LayerKind::*;
let owner = &self.owner.0;
match self.downloaded.get(owner, ctx).await? {
Delta(ref d) => d
.copy_prefix(writer, truncate_at, ctx)
.await
.with_context(|| format!("truncate {self}")),
Image(_) => anyhow::bail!(format!("cannot truncate image layer {self}")),
}
}
pub(crate) fn local_path(&self) -> &Utf8Path { pub(crate) fn local_path(&self) -> &Utf8Path {
&self.owner.0.path &self.owner.0.path
} }
@@ -1797,14 +1761,14 @@ impl ResidentLayer {
} }
#[cfg(test)] #[cfg(test)]
pub(crate) async fn as_delta( pub(crate) async fn get_inner_delta<'a>(
&self, &'a self,
ctx: &RequestContext, ctx: &RequestContext,
) -> anyhow::Result<&delta_layer::DeltaLayerInner> { ) -> anyhow::Result<&'a delta_layer::DeltaLayerInner> {
use LayerKind::*; let owner = &self.owner.0;
match self.downloaded.get(&self.owner.0, ctx).await? { match self.downloaded.get(owner, ctx).await? {
Delta(ref d) => Ok(d), LayerKind::Delta(d) => Ok(d),
Image(_) => Err(anyhow::anyhow!("image layer")), LayerKind::Image(_) => Err(anyhow::anyhow!("Expected a delta layer")),
} }
} }
} }

View File

@@ -721,103 +721,6 @@ async fn evict_and_wait_does_not_wait_for_download() {
layer.evict_and_wait(FOREVER).await.unwrap(); layer.evict_and_wait(FOREVER).await.unwrap();
} }
/// Asserts that there is no miscalculation when Layer is dropped while it is being kept resident,
/// which is the last value.
///
/// Also checks that the same does not happen on a non-evicted layer (regression test).
#[tokio::test(start_paused = true)]
async fn eviction_cancellation_on_drop() {
use crate::repository::Value;
use bytes::Bytes;
// this is the runtime on which Layer spawns the blocking tasks on
let handle = tokio::runtime::Handle::current();
let h = TenantHarness::create("eviction_cancellation_on_drop").unwrap();
utils::logging::replace_panic_hook_with_tracing_panic_hook().forget();
let (tenant, ctx) = h.load().await;
let timeline = tenant
.create_test_timeline(TimelineId::generate(), Lsn(0x10), 14, &ctx)
.await
.unwrap();
{
// create_test_timeline wrote us one layer, write another
let mut writer = timeline.writer().await;
writer
.put(
Key::from_i128(5),
Lsn(0x20),
&Value::Image(Bytes::from_static(b"this does not matter either")),
&ctx,
)
.await
.unwrap();
writer.finish_write(Lsn(0x20));
}
timeline.freeze_and_flush().await.unwrap();
// wait for the upload to complete so our Arc::strong_count assertion holds
timeline
.remote_client
.as_ref()
.unwrap()
.wait_completion()
.await
.unwrap();
let (evicted_layer, not_evicted) = {
let mut layers = {
let mut guard = timeline.layers.write().await;
let layers = guard.likely_resident_layers().collect::<Vec<_>>();
// remove the layers from layermap
guard.finish_gc_timeline(&layers);
layers
};
assert_eq!(layers.len(), 2);
(layers.pop().unwrap(), layers.pop().unwrap())
};
let victims = [(evicted_layer, true), (not_evicted, false)];
for (victim, evict) in victims {
let resident = victim.keep_resident().await.unwrap();
drop(victim);
assert_eq!(Arc::strong_count(&resident.owner.0), 1);
if evict {
let evict_and_wait = resident.owner.evict_and_wait(FOREVER);
// drive the future to await on the status channel, and then drop it
tokio::time::timeout(ADVANCE, evict_and_wait)
.await
.expect_err("should had been a timeout since we are holding the layer resident");
}
// 1 == we only evict one of the layers
assert_eq!(1, LAYER_IMPL_METRICS.started_evictions.get());
drop(resident);
// run any spawned
tokio::time::sleep(ADVANCE).await;
SpawnBlockingPoolHelper::consume_and_release_all_of_spawn_blocking_threads(&handle).await;
assert_eq!(
1,
LAYER_IMPL_METRICS.cancelled_evictions[EvictionCancelled::LayerGone].get()
);
}
}
#[test] #[test]
fn layer_size() { fn layer_size() {
assert_eq!(std::mem::size_of::<LayerAccessStats>(), 2040); assert_eq!(std::mem::size_of::<LayerAccessStats>(), 2040);

View File

@@ -18,7 +18,7 @@ use utils::{backoff, completion};
static CONCURRENT_BACKGROUND_TASKS: once_cell::sync::Lazy<tokio::sync::Semaphore> = static CONCURRENT_BACKGROUND_TASKS: once_cell::sync::Lazy<tokio::sync::Semaphore> =
once_cell::sync::Lazy::new(|| { once_cell::sync::Lazy::new(|| {
let total_threads = task_mgr::TOKIO_WORKER_THREADS.get(); let total_threads = *task_mgr::BACKGROUND_RUNTIME_WORKER_THREADS;
let permits = usize::max( let permits = usize::max(
1, 1,
// while a lot of the work is done on spawn_blocking, we still do // while a lot of the work is done on spawn_blocking, we still do
@@ -72,7 +72,6 @@ pub(crate) async fn concurrent_background_tasks_rate_limit_permit(
loop_kind == BackgroundLoopKind::InitialLogicalSizeCalculation loop_kind == BackgroundLoopKind::InitialLogicalSizeCalculation
); );
// TODO: assert that we run on BACKGROUND_RUNTIME; requires tokio_unstable Handle::id();
match CONCURRENT_BACKGROUND_TASKS.acquire().await { match CONCURRENT_BACKGROUND_TASKS.acquire().await {
Ok(permit) => permit, Ok(permit) => permit,
Err(_closed) => unreachable!("we never close the semaphore"), Err(_closed) => unreachable!("we never close the semaphore"),

View File

@@ -1257,7 +1257,7 @@ impl Timeline {
checkpoint_distance, checkpoint_distance,
self.get_last_record_lsn(), self.get_last_record_lsn(),
self.last_freeze_at.load(), self.last_freeze_at.load(),
open_layer.get_opened_at(), *self.last_freeze_ts.read().unwrap(),
) { ) {
match open_layer.info() { match open_layer.info() {
InMemoryLayerInfo::Frozen { lsn_start, lsn_end } => { InMemoryLayerInfo::Frozen { lsn_start, lsn_end } => {
@@ -1344,7 +1344,7 @@ impl Timeline {
background_jobs_can_start: Option<&completion::Barrier>, background_jobs_can_start: Option<&completion::Barrier>,
ctx: &RequestContext, ctx: &RequestContext,
) { ) {
if self.tenant_shard_id.is_shard_zero() { if self.tenant_shard_id.is_zero() {
// Logical size is only maintained accurately on shard zero. // Logical size is only maintained accurately on shard zero.
self.spawn_initial_logical_size_computation_task(ctx); self.spawn_initial_logical_size_computation_task(ctx);
} }
@@ -1622,7 +1622,7 @@ impl Timeline {
checkpoint_distance: u64, checkpoint_distance: u64,
projected_lsn: Lsn, projected_lsn: Lsn,
last_freeze_at: Lsn, last_freeze_at: Lsn,
opened_at: Instant, last_freeze_ts: Instant,
) -> bool { ) -> bool {
let distance = projected_lsn.widening_sub(last_freeze_at); let distance = projected_lsn.widening_sub(last_freeze_at);
@@ -1648,13 +1648,13 @@ impl Timeline {
); );
true true
} else if distance > 0 && opened_at.elapsed() >= self.get_checkpoint_timeout() { } else if distance > 0 && last_freeze_ts.elapsed() >= self.get_checkpoint_timeout() {
info!( info!(
"Will roll layer at {} with layer size {} due to time since first write to the layer ({:?})", "Will roll layer at {} with layer size {} due to time since last flush ({:?})",
projected_lsn, projected_lsn,
layer_size, layer_size,
opened_at.elapsed() last_freeze_ts.elapsed()
); );
true true
} else { } else {
@@ -2237,7 +2237,7 @@ impl Timeline {
priority: GetLogicalSizePriority, priority: GetLogicalSizePriority,
ctx: &RequestContext, ctx: &RequestContext,
) -> logical_size::CurrentLogicalSize { ) -> logical_size::CurrentLogicalSize {
if !self.tenant_shard_id.is_shard_zero() { if !self.tenant_shard_id.is_zero() {
// Logical size is only accurately maintained on shard zero: when called elsewhere, for example // Logical size is only accurately maintained on shard zero: when called elsewhere, for example
// when HTTP API is serving a GET for timeline zero, return zero // when HTTP API is serving a GET for timeline zero, return zero
return logical_size::CurrentLogicalSize::Approximate(logical_size::Approximate::zero()); return logical_size::CurrentLogicalSize::Approximate(logical_size::Approximate::zero());
@@ -2533,7 +2533,7 @@ impl Timeline {
crate::span::debug_assert_current_span_has_tenant_and_timeline_id(); crate::span::debug_assert_current_span_has_tenant_and_timeline_id();
// We should never be calculating logical sizes on shard !=0, because these shards do not have // We should never be calculating logical sizes on shard !=0, because these shards do not have
// accurate relation sizes, and they do not emit consumption metrics. // accurate relation sizes, and they do not emit consumption metrics.
debug_assert!(self.tenant_shard_id.is_shard_zero()); debug_assert!(self.tenant_shard_id.is_zero());
let guard = self let guard = self
.gate .gate
@@ -2968,8 +2968,7 @@ impl Timeline {
break; break;
} }
// Take the min to avoid reconstructing a page with data newer than request Lsn. cont_lsn = Lsn(timeline.ancestor_lsn.0 + 1);
cont_lsn = std::cmp::min(Lsn(request_lsn.0 + 1), Lsn(timeline.ancestor_lsn.0 + 1));
timeline_owned = timeline timeline_owned = timeline
.get_ready_ancestor_timeline(ctx) .get_ready_ancestor_timeline(ctx)
.await .await
@@ -4704,16 +4703,23 @@ struct TimelineWriterState {
max_lsn: Option<Lsn>, max_lsn: Option<Lsn>,
// Cached details of the last freeze. Avoids going trough the atomic/lock on every put. // Cached details of the last freeze. Avoids going trough the atomic/lock on every put.
cached_last_freeze_at: Lsn, cached_last_freeze_at: Lsn,
cached_last_freeze_ts: Instant,
} }
impl TimelineWriterState { impl TimelineWriterState {
fn new(open_layer: Arc<InMemoryLayer>, current_size: u64, last_freeze_at: Lsn) -> Self { fn new(
open_layer: Arc<InMemoryLayer>,
current_size: u64,
last_freeze_at: Lsn,
last_freeze_ts: Instant,
) -> Self {
Self { Self {
open_layer, open_layer,
current_size, current_size,
prev_lsn: None, prev_lsn: None,
max_lsn: None, max_lsn: None,
cached_last_freeze_at: last_freeze_at, cached_last_freeze_at: last_freeze_at,
cached_last_freeze_ts: last_freeze_ts,
} }
} }
} }
@@ -4812,10 +4818,12 @@ impl<'a> TimelineWriter<'a> {
let initial_size = layer.size().await?; let initial_size = layer.size().await?;
let last_freeze_at = self.last_freeze_at.load(); let last_freeze_at = self.last_freeze_at.load();
let last_freeze_ts = *self.last_freeze_ts.read().unwrap();
self.write_guard.replace(TimelineWriterState::new( self.write_guard.replace(TimelineWriterState::new(
layer, layer,
initial_size, initial_size,
last_freeze_at, last_freeze_at,
last_freeze_ts,
)); ));
Ok(()) Ok(())
@@ -4862,7 +4870,7 @@ impl<'a> TimelineWriter<'a> {
self.get_checkpoint_distance(), self.get_checkpoint_distance(),
lsn, lsn,
state.cached_last_freeze_at, state.cached_last_freeze_at,
state.open_layer.get_opened_at(), state.cached_last_freeze_ts,
) { ) {
OpenLayerAction::Roll OpenLayerAction::Roll
} else { } else {

View File

@@ -12,6 +12,7 @@ use super::layer_manager::LayerManager;
use super::{CompactFlags, DurationRecorder, RecordedDuration, Timeline}; use super::{CompactFlags, DurationRecorder, RecordedDuration, Timeline};
use anyhow::{anyhow, Context}; use anyhow::{anyhow, Context};
use async_trait::async_trait;
use enumset::EnumSet; use enumset::EnumSet;
use fail::fail_point; use fail::fail_point;
use itertools::Itertools; use itertools::Itertools;
@@ -1121,6 +1122,7 @@ impl CompactionLayer<Key> for ResidentDeltaLayer {
} }
} }
#[async_trait]
impl CompactionDeltaLayer<TimelineAdaptor> for ResidentDeltaLayer { impl CompactionDeltaLayer<TimelineAdaptor> for ResidentDeltaLayer {
type DeltaEntry<'a> = DeltaEntry<'a>; type DeltaEntry<'a> = DeltaEntry<'a>;

View File

@@ -378,7 +378,7 @@ impl Timeline {
gate: &GateGuard, gate: &GateGuard,
ctx: &RequestContext, ctx: &RequestContext,
) -> ControlFlow<()> { ) -> ControlFlow<()> {
if !self.tenant_shard_id.is_shard_zero() { if !self.tenant_shard_id.is_zero() {
// Shards !=0 do not maintain accurate relation sizes, and do not need to calculate logical size // Shards !=0 do not maintain accurate relation sizes, and do not need to calculate logical size
// for consumption metrics (consumption metrics are only sent from shard 0). We may therefore // for consumption metrics (consumption metrics are only sent from shard 0). We may therefore
// skip imitating logical size accesses for eviction purposes. // skip imitating logical size accesses for eviction purposes.

View File

@@ -427,7 +427,7 @@ pub(super) async fn handle_walreceiver_connection(
// Send the replication feedback message. // Send the replication feedback message.
// Regular standby_status_update fields are put into this message. // Regular standby_status_update fields are put into this message.
let current_timeline_size = if timeline.tenant_shard_id.is_shard_zero() { let current_timeline_size = if timeline.tenant_shard_id.is_zero() {
timeline timeline
.get_current_logical_size( .get_current_logical_size(
crate::tenant::timeline::GetLogicalSizePriority::User, crate::tenant::timeline::GetLogicalSizePriority::User,

View File

@@ -61,18 +61,18 @@ pub struct VectoredRead {
} }
impl VectoredRead { impl VectoredRead {
pub(crate) fn size(&self) -> usize { pub fn size(&self) -> usize {
(self.end - self.start) as usize (self.end - self.start) as usize
} }
} }
#[derive(Eq, PartialEq)] #[derive(Eq, PartialEq)]
pub(crate) enum VectoredReadExtended { enum VectoredReadExtended {
Yes, Yes,
No, No,
} }
pub(crate) struct VectoredReadBuilder { struct VectoredReadBuilder {
start: u64, start: u64,
end: u64, end: u64,
blobs_at: VecMap<u64, BlobMeta>, blobs_at: VecMap<u64, BlobMeta>,
@@ -80,17 +80,7 @@ pub(crate) struct VectoredReadBuilder {
} }
impl VectoredReadBuilder { impl VectoredReadBuilder {
/// Start building a new vectored read. fn new(start_offset: u64, end_offset: u64, meta: BlobMeta, max_read_size: usize) -> Self {
///
/// Note that by design, this does not check against reading more than `max_read_size` to
/// support reading larger blobs than the configuration value. The builder will be single use
/// however after that.
pub(crate) fn new(
start_offset: u64,
end_offset: u64,
meta: BlobMeta,
max_read_size: usize,
) -> Self {
let mut blobs_at = VecMap::default(); let mut blobs_at = VecMap::default();
blobs_at blobs_at
.append(start_offset, meta) .append(start_offset, meta)
@@ -107,8 +97,7 @@ impl VectoredReadBuilder {
/// Attempt to extend the current read with a new blob if the start /// Attempt to extend the current read with a new blob if the start
/// offset matches with the current end of the vectored read /// offset matches with the current end of the vectored read
/// and the resuting size is below the max read size /// and the resuting size is below the max read size
pub(crate) fn extend(&mut self, start: u64, end: u64, meta: BlobMeta) -> VectoredReadExtended { fn extend(&mut self, start: u64, end: u64, meta: BlobMeta) -> VectoredReadExtended {
tracing::trace!(start, end, "trying to extend");
let size = (end - start) as usize; let size = (end - start) as usize;
if self.end == start && self.size() + size <= self.max_read_size { if self.end == start && self.size() + size <= self.max_read_size {
self.end = end; self.end = end;
@@ -122,11 +111,11 @@ impl VectoredReadBuilder {
VectoredReadExtended::No VectoredReadExtended::No
} }
pub(crate) fn size(&self) -> usize { fn size(&self) -> usize {
(self.end - self.start) as usize (self.end - self.start) as usize
} }
pub(crate) fn build(self) -> VectoredRead { fn build(self) -> VectoredRead {
VectoredRead { VectoredRead {
start: self.start, start: self.start,
end: self.end, end: self.end,

View File

@@ -41,7 +41,7 @@ pub(crate) fn regenerate(tenants_path: &Path) -> anyhow::Result<PageserverUtiliz
// //
// note that u64::MAX will be output as i64::MAX as u64, but that should not matter // note that u64::MAX will be output as i64::MAX as u64, but that should not matter
utilization_score: u64::MAX, utilization_score: u64::MAX,
captured_at: utils::serde_system_time::SystemTime(captured_at), captured_at,
}; };
// TODO: make utilization_score into a metric // TODO: make utilization_score into a metric

View File

@@ -403,7 +403,7 @@ impl WalIngest {
); );
if !key_is_local { if !key_is_local {
if self.shard.is_shard_zero() { if self.shard.is_zero() {
// Shard 0 tracks relation sizes. Although we will not store this block, we will observe // Shard 0 tracks relation sizes. Although we will not store this block, we will observe
// its blkno in case it implicitly extends a relation. // its blkno in case it implicitly extends a relation.
self.observe_decoded_block(modification, blk, ctx).await?; self.observe_decoded_block(modification, blk, ctx).await?;

View File

@@ -55,7 +55,6 @@ impl NeonWalRecord {
/// Does replaying this WAL record initialize the page from scratch, or does /// Does replaying this WAL record initialize the page from scratch, or does
/// it need to be applied over the previous image of the page? /// it need to be applied over the previous image of the page?
pub fn will_init(&self) -> bool { pub fn will_init(&self) -> bool {
// If you change this function, you'll also need to change ValueBytes::will_init
match self { match self {
NeonWalRecord::Postgres { will_init, rec: _ } => *will_init, NeonWalRecord::Postgres { will_init, rec: _ } => *will_init,

View File

@@ -20,7 +20,6 @@
/// Process lifecycle and abstracction for the IPC protocol. /// Process lifecycle and abstracction for the IPC protocol.
mod process; mod process;
pub use process::Kind as ProcessKind;
/// Code to apply [`NeonWalRecord`]s. /// Code to apply [`NeonWalRecord`]s.
pub(crate) mod apply_neon; pub(crate) mod apply_neon;
@@ -35,7 +34,7 @@ use crate::walrecord::NeonWalRecord;
use anyhow::Context; use anyhow::Context;
use bytes::{Bytes, BytesMut}; use bytes::{Bytes, BytesMut};
use pageserver_api::key::key_to_rel_block; use pageserver_api::key::key_to_rel_block;
use pageserver_api::models::{WalRedoManagerProcessStatus, WalRedoManagerStatus}; use pageserver_api::models::WalRedoManagerStatus;
use pageserver_api::shard::TenantShardId; use pageserver_api::shard::TenantShardId;
use std::sync::Arc; use std::sync::Arc;
use std::time::Duration; use std::time::Duration;
@@ -55,7 +54,7 @@ pub struct PostgresRedoManager {
tenant_shard_id: TenantShardId, tenant_shard_id: TenantShardId,
conf: &'static PageServerConf, conf: &'static PageServerConf,
last_redo_at: std::sync::Mutex<Option<Instant>>, last_redo_at: std::sync::Mutex<Option<Instant>>,
/// The current [`process::Process`] that is used by new redo requests. /// The current [`process::WalRedoProcess`] that is used by new redo requests.
/// We use [`heavier_once_cell`] for coalescing the spawning, but the redo /// We use [`heavier_once_cell`] for coalescing the spawning, but the redo
/// requests don't use the [`heavier_once_cell::Guard`] to keep ahold of the /// requests don't use the [`heavier_once_cell::Guard`] to keep ahold of the
/// their process object; we use [`Arc::clone`] for that. /// their process object; we use [`Arc::clone`] for that.
@@ -67,7 +66,7 @@ pub struct PostgresRedoManager {
/// still be using the old redo process. But, those other tasks will most likely /// still be using the old redo process. But, those other tasks will most likely
/// encounter an error as well, and errors are an unexpected condition anyway. /// encounter an error as well, and errors are an unexpected condition anyway.
/// So, probably we could get rid of the `Arc` in the future. /// So, probably we could get rid of the `Arc` in the future.
redo_process: heavier_once_cell::OnceCell<Arc<process::Process>>, redo_process: heavier_once_cell::OnceCell<Arc<process::WalRedoProcess>>,
} }
/// ///
@@ -140,8 +139,8 @@ impl PostgresRedoManager {
} }
} }
pub fn status(&self) -> WalRedoManagerStatus { pub(crate) fn status(&self) -> Option<WalRedoManagerStatus> {
WalRedoManagerStatus { Some(WalRedoManagerStatus {
last_redo_at: { last_redo_at: {
let at = *self.last_redo_at.lock().unwrap(); let at = *self.last_redo_at.lock().unwrap();
at.and_then(|at| { at.and_then(|at| {
@@ -150,14 +149,8 @@ impl PostgresRedoManager {
chrono::Utc::now().checked_sub_signed(chrono::Duration::from_std(age).ok()?) chrono::Utc::now().checked_sub_signed(chrono::Duration::from_std(age).ok()?)
}) })
}, },
process: self pid: self.redo_process.get().map(|p| p.id()),
.redo_process })
.get()
.map(|p| WalRedoManagerProcessStatus {
pid: p.id(),
kind: std::borrow::Cow::Borrowed(p.kind().into()),
}),
}
} }
} }
@@ -215,33 +208,37 @@ impl PostgresRedoManager {
const MAX_RETRY_ATTEMPTS: u32 = 1; const MAX_RETRY_ATTEMPTS: u32 = 1;
let mut n_attempts = 0u32; let mut n_attempts = 0u32;
loop { loop {
let proc: Arc<process::Process> = match self.redo_process.get_or_init_detached().await { let proc: Arc<process::WalRedoProcess> =
Ok(guard) => Arc::clone(&guard), match self.redo_process.get_or_init_detached().await {
Err(permit) => { Ok(guard) => Arc::clone(&guard),
// don't hold poison_guard, the launch code can bail Err(permit) => {
let start = Instant::now(); // don't hold poison_guard, the launch code can bail
let proc = Arc::new( let start = Instant::now();
process::Process::launch(self.conf, self.tenant_shard_id, pg_version) let proc = Arc::new(
process::WalRedoProcess::launch(
self.conf,
self.tenant_shard_id,
pg_version,
)
.context("launch walredo process")?, .context("launch walredo process")?,
); );
let duration = start.elapsed(); let duration = start.elapsed();
WAL_REDO_PROCESS_LAUNCH_DURATION_HISTOGRAM.observe(duration.as_secs_f64()); WAL_REDO_PROCESS_LAUNCH_DURATION_HISTOGRAM.observe(duration.as_secs_f64());
info!( info!(
duration_ms = duration.as_millis(), duration_ms = duration.as_millis(),
pid = proc.id(), pid = proc.id(),
"launched walredo process" "launched walredo process"
); );
self.redo_process.set(Arc::clone(&proc), permit); self.redo_process.set(Arc::clone(&proc), permit);
proc proc
} }
}; };
let started_at = std::time::Instant::now(); let started_at = std::time::Instant::now();
// Relational WAL records are applied using wal-redo-postgres // Relational WAL records are applied using wal-redo-postgres
let result = proc let result = proc
.apply_wal_records(rel, blknum, &base_img, records, wal_redo_timeout) .apply_wal_records(rel, blknum, &base_img, records, wal_redo_timeout)
.await
.context("apply_wal_records"); .context("apply_wal_records");
let duration = started_at.elapsed(); let duration = started_at.elapsed();

View File

@@ -1,67 +1,186 @@
use std::time::Duration; use self::no_leak_child::NoLeakChild;
use crate::{
config::PageServerConf,
metrics::{WalRedoKillCause, WAL_REDO_PROCESS_COUNTERS, WAL_REDO_RECORD_COUNTER},
walrecord::NeonWalRecord,
};
use anyhow::Context;
use bytes::Bytes; use bytes::Bytes;
use nix::poll::{PollFd, PollFlags};
use pageserver_api::{reltag::RelTag, shard::TenantShardId}; use pageserver_api::{reltag::RelTag, shard::TenantShardId};
use utils::lsn::Lsn; use postgres_ffi::BLCKSZ;
use std::os::fd::AsRawFd;
use crate::{config::PageServerConf, walrecord::NeonWalRecord}; #[cfg(feature = "testing")]
use std::sync::atomic::AtomicUsize;
use std::{
collections::VecDeque,
io::{Read, Write},
process::{ChildStdin, ChildStdout, Command, Stdio},
sync::{Mutex, MutexGuard},
time::Duration,
};
use tracing::{debug, error, instrument, Instrument};
use utils::{lsn::Lsn, nonblock::set_nonblock};
mod no_leak_child; mod no_leak_child;
/// The IPC protocol that pageserver and walredo process speak over their shared pipe. /// The IPC protocol that pageserver and walredo process speak over their shared pipe.
mod protocol; mod protocol;
mod process_impl { pub struct WalRedoProcess {
pub(super) mod process_async; #[allow(dead_code)]
pub(super) mod process_std; conf: &'static PageServerConf,
tenant_shard_id: TenantShardId,
// Some() on construction, only becomes None on Drop.
child: Option<NoLeakChild>,
stdout: Mutex<ProcessOutput>,
stdin: Mutex<ProcessInput>,
/// Counter to separate same sized walredo inputs failing at the same millisecond.
#[cfg(feature = "testing")]
dump_sequence: AtomicUsize,
} }
#[derive( struct ProcessInput {
Clone, stdin: ChildStdin,
Copy, n_requests: usize,
Debug,
PartialEq,
Eq,
strum_macros::EnumString,
strum_macros::Display,
strum_macros::IntoStaticStr,
serde_with::DeserializeFromStr,
serde_with::SerializeDisplay,
)]
#[strum(serialize_all = "kebab-case")]
#[repr(u8)]
pub enum Kind {
Sync,
Async,
} }
pub(crate) enum Process { struct ProcessOutput {
Sync(process_impl::process_std::WalRedoProcess), stdout: ChildStdout,
Async(process_impl::process_async::WalRedoProcess), pending_responses: VecDeque<Option<Bytes>>,
n_processed_responses: usize,
} }
impl Process { impl WalRedoProcess {
#[inline(always)] //
pub fn launch( // Start postgres binary in special WAL redo mode.
//
#[instrument(skip_all,fields(pg_version=pg_version))]
pub(crate) fn launch(
conf: &'static PageServerConf, conf: &'static PageServerConf,
tenant_shard_id: TenantShardId, tenant_shard_id: TenantShardId,
pg_version: u32, pg_version: u32,
) -> anyhow::Result<Self> { ) -> anyhow::Result<Self> {
Ok(match conf.walredo_process_kind { crate::span::debug_assert_current_span_has_tenant_id();
Kind::Sync => Self::Sync(process_impl::process_std::WalRedoProcess::launch(
conf, let pg_bin_dir_path = conf.pg_bin_dir(pg_version).context("pg_bin_dir")?; // TODO these should be infallible.
tenant_shard_id, let pg_lib_dir_path = conf.pg_lib_dir(pg_version).context("pg_lib_dir")?;
pg_version,
)?), use no_leak_child::NoLeakChildCommandExt;
Kind::Async => Self::Async(process_impl::process_async::WalRedoProcess::launch( // Start postgres itself
conf, let child = Command::new(pg_bin_dir_path.join("postgres"))
tenant_shard_id, // the first arg must be --wal-redo so the child process enters into walredo mode
pg_version, .arg("--wal-redo")
)?), // the child doesn't process this arg, but, having it in the argv helps indentify the
// walredo process for a particular tenant when debugging a pagserver
.args(["--tenant-shard-id", &format!("{tenant_shard_id}")])
.stdin(Stdio::piped())
.stderr(Stdio::piped())
.stdout(Stdio::piped())
.env_clear()
.env("LD_LIBRARY_PATH", &pg_lib_dir_path)
.env("DYLD_LIBRARY_PATH", &pg_lib_dir_path)
// NB: The redo process is not trusted after we sent it the first
// walredo work. Before that, it is trusted. Specifically, we trust
// it to
// 1. close all file descriptors except stdin, stdout, stderr because
// pageserver might not be 100% diligent in setting FD_CLOEXEC on all
// the files it opens, and
// 2. to use seccomp to sandbox itself before processing the first
// walredo request.
.spawn_no_leak_child(tenant_shard_id)
.context("spawn process")?;
WAL_REDO_PROCESS_COUNTERS.started.inc();
let mut child = scopeguard::guard(child, |child| {
error!("killing wal-redo-postgres process due to a problem during launch");
child.kill_and_wait(WalRedoKillCause::Startup);
});
let stdin = child.stdin.take().unwrap();
let stdout = child.stdout.take().unwrap();
let stderr = child.stderr.take().unwrap();
let stderr = tokio::process::ChildStderr::from_std(stderr)
.context("convert to tokio::ChildStderr")?;
macro_rules! set_nonblock_or_log_err {
($file:ident) => {{
let res = set_nonblock($file.as_raw_fd());
if let Err(e) = &res {
error!(error = %e, file = stringify!($file), pid = child.id(), "set_nonblock failed");
}
res
}};
}
set_nonblock_or_log_err!(stdin)?;
set_nonblock_or_log_err!(stdout)?;
// all fallible operations post-spawn are complete, so get rid of the guard
let child = scopeguard::ScopeGuard::into_inner(child);
tokio::spawn(
async move {
scopeguard::defer! {
debug!("wal-redo-postgres stderr_logger_task finished");
crate::metrics::WAL_REDO_PROCESS_COUNTERS.active_stderr_logger_tasks_finished.inc();
}
debug!("wal-redo-postgres stderr_logger_task started");
crate::metrics::WAL_REDO_PROCESS_COUNTERS.active_stderr_logger_tasks_started.inc();
use tokio::io::AsyncBufReadExt;
let mut stderr_lines = tokio::io::BufReader::new(stderr);
let mut buf = Vec::new();
let res = loop {
buf.clear();
// TODO we don't trust the process to cap its stderr length.
// Currently it can do unbounded Vec allocation.
match stderr_lines.read_until(b'\n', &mut buf).await {
Ok(0) => break Ok(()), // eof
Ok(num_bytes) => {
let output = String::from_utf8_lossy(&buf[..num_bytes]);
error!(%output, "received output");
}
Err(e) => {
break Err(e);
}
}
};
match res {
Ok(()) => (),
Err(e) => {
error!(error=?e, "failed to read from walredo stderr");
}
}
}.instrument(tracing::info_span!(parent: None, "wal-redo-postgres-stderr", pid = child.id(), tenant_id = %tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug(), %pg_version))
);
Ok(Self {
conf,
tenant_shard_id,
child: Some(child),
stdin: Mutex::new(ProcessInput {
stdin,
n_requests: 0,
}),
stdout: Mutex::new(ProcessOutput {
stdout,
pending_responses: VecDeque::new(),
n_processed_responses: 0,
}),
#[cfg(feature = "testing")]
dump_sequence: AtomicUsize::default(),
}) })
} }
#[inline(always)] pub(crate) fn id(&self) -> u32 {
pub(crate) async fn apply_wal_records( self.child
.as_ref()
.expect("must not call this during Drop")
.id()
}
// Apply given WAL records ('records') over an old page image. Returns
// new page image.
//
#[instrument(skip_all, fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug(), pid=%self.id()))]
pub(crate) fn apply_wal_records(
&self, &self,
rel: RelTag, rel: RelTag,
blknum: u32, blknum: u32,
@@ -69,29 +188,221 @@ impl Process {
records: &[(Lsn, NeonWalRecord)], records: &[(Lsn, NeonWalRecord)],
wal_redo_timeout: Duration, wal_redo_timeout: Duration,
) -> anyhow::Result<Bytes> { ) -> anyhow::Result<Bytes> {
match self { let tag = protocol::BufferTag { rel, blknum };
Process::Sync(p) => { let input = self.stdin.lock().unwrap();
p.apply_wal_records(rel, blknum, base_img, records, wal_redo_timeout)
.await // Serialize all the messages to send the WAL redo process first.
//
// This could be problematic if there are millions of records to replay,
// but in practice the number of records is usually so small that it doesn't
// matter, and it's better to keep this code simple.
//
// Most requests start with a before-image with BLCKSZ bytes, followed by
// by some other WAL records. Start with a buffer that can hold that
// comfortably.
let mut writebuf: Vec<u8> = Vec::with_capacity((BLCKSZ as usize) * 3);
protocol::build_begin_redo_for_block_msg(tag, &mut writebuf);
if let Some(img) = base_img {
protocol::build_push_page_msg(tag, img, &mut writebuf);
}
for (lsn, rec) in records.iter() {
if let NeonWalRecord::Postgres {
will_init: _,
rec: postgres_rec,
} = rec
{
protocol::build_apply_record_msg(*lsn, postgres_rec, &mut writebuf);
} else {
anyhow::bail!("tried to pass neon wal record to postgres WAL redo");
} }
Process::Async(p) => { }
p.apply_wal_records(rel, blknum, base_img, records, wal_redo_timeout) protocol::build_get_page_msg(tag, &mut writebuf);
.await WAL_REDO_RECORD_COUNTER.inc_by(records.len() as u64);
let res = self.apply_wal_records0(&writebuf, input, wal_redo_timeout);
if res.is_err() {
// not all of these can be caused by this particular input, however these are so rare
// in tests so capture all.
self.record_and_log(&writebuf);
}
res
}
fn apply_wal_records0(
&self,
writebuf: &[u8],
input: MutexGuard<ProcessInput>,
wal_redo_timeout: Duration,
) -> anyhow::Result<Bytes> {
let mut proc = { input }; // TODO: remove this legacy rename, but this keep the patch small.
let mut nwrite = 0usize;
while nwrite < writebuf.len() {
let mut stdin_pollfds = [PollFd::new(&proc.stdin, PollFlags::POLLOUT)];
let n = loop {
match nix::poll::poll(&mut stdin_pollfds[..], wal_redo_timeout.as_millis() as i32) {
Err(nix::errno::Errno::EINTR) => continue,
res => break res,
}
}?;
if n == 0 {
anyhow::bail!("WAL redo timed out");
} }
// If 'stdin' is writeable, do write.
let in_revents = stdin_pollfds[0].revents().unwrap();
if in_revents & (PollFlags::POLLERR | PollFlags::POLLOUT) != PollFlags::empty() {
nwrite += proc.stdin.write(&writebuf[nwrite..])?;
}
if in_revents.contains(PollFlags::POLLHUP) {
// We still have more data to write, but the process closed the pipe.
anyhow::bail!("WAL redo process closed its stdin unexpectedly");
}
}
let request_no = proc.n_requests;
proc.n_requests += 1;
drop(proc);
// To improve walredo performance we separate sending requests and receiving
// responses. Them are protected by different mutexes (output and input).
// If thread T1, T2, T3 send requests D1, D2, D3 to walredo process
// then there is not warranty that T1 will first granted output mutex lock.
// To address this issue we maintain number of sent requests, number of processed
// responses and ring buffer with pending responses. After sending response
// (under input mutex), threads remembers request number. Then it releases
// input mutex, locks output mutex and fetch in ring buffer all responses until
// its stored request number. The it takes correspondent element from
// pending responses ring buffer and truncate all empty elements from the front,
// advancing processed responses number.
let mut output = self.stdout.lock().unwrap();
let n_processed_responses = output.n_processed_responses;
while n_processed_responses + output.pending_responses.len() <= request_no {
// We expect the WAL redo process to respond with an 8k page image. We read it
// into this buffer.
let mut resultbuf = vec![0; BLCKSZ.into()];
let mut nresult: usize = 0; // # of bytes read into 'resultbuf' so far
while nresult < BLCKSZ.into() {
let mut stdout_pollfds = [PollFd::new(&output.stdout, PollFlags::POLLIN)];
// We do two things simultaneously: reading response from stdout
// and forward any logging information that the child writes to its stderr to the page server's log.
let n = loop {
match nix::poll::poll(
&mut stdout_pollfds[..],
wal_redo_timeout.as_millis() as i32,
) {
Err(nix::errno::Errno::EINTR) => continue,
res => break res,
}
}?;
if n == 0 {
anyhow::bail!("WAL redo timed out");
}
// If we have some data in stdout, read it to the result buffer.
let out_revents = stdout_pollfds[0].revents().unwrap();
if out_revents & (PollFlags::POLLERR | PollFlags::POLLIN) != PollFlags::empty() {
nresult += output.stdout.read(&mut resultbuf[nresult..])?;
}
if out_revents.contains(PollFlags::POLLHUP) {
anyhow::bail!("WAL redo process closed its stdout unexpectedly");
}
}
output
.pending_responses
.push_back(Some(Bytes::from(resultbuf)));
}
// Replace our request's response with None in `pending_responses`.
// Then make space in the ring buffer by clearing out any seqence of contiguous
// `None`'s from the front of `pending_responses`.
// NB: We can't pop_front() because other requests' responses because another
// requester might have grabbed the output mutex before us:
// T1: grab input mutex
// T1: send request_no 23
// T1: release input mutex
// T2: grab input mutex
// T2: send request_no 24
// T2: release input mutex
// T2: grab output mutex
// T2: n_processed_responses + output.pending_responses.len() <= request_no
// 23 0 24
// T2: enters poll loop that reads stdout
// T2: put response for 23 into pending_responses
// T2: put response for 24 into pending_resposnes
// pending_responses now looks like this: Front Some(response_23) Some(response_24) Back
// T2: takes its response_24
// pending_responses now looks like this: Front Some(response_23) None Back
// T2: does the while loop below
// pending_responses now looks like this: Front Some(response_23) None Back
// T2: releases output mutex
// T1: grabs output mutex
// T1: n_processed_responses + output.pending_responses.len() > request_no
// 23 2 23
// T1: skips poll loop that reads stdout
// T1: takes its response_23
// pending_responses now looks like this: Front None None Back
// T2: does the while loop below
// pending_responses now looks like this: Front Back
// n_processed_responses now has value 25
let res = output.pending_responses[request_no - n_processed_responses]
.take()
.expect("we own this request_no, nobody else is supposed to take it");
while let Some(front) = output.pending_responses.front() {
if front.is_none() {
output.pending_responses.pop_front();
output.n_processed_responses += 1;
} else {
break;
}
}
Ok(res)
}
#[cfg(feature = "testing")]
fn record_and_log(&self, writebuf: &[u8]) {
use std::sync::atomic::Ordering;
let millis = std::time::SystemTime::now()
.duration_since(std::time::SystemTime::UNIX_EPOCH)
.unwrap()
.as_millis();
let seq = self.dump_sequence.fetch_add(1, Ordering::Relaxed);
// these files will be collected to an allure report
let filename = format!("walredo-{millis}-{}-{seq}.walredo", writebuf.len());
let path = self.conf.tenant_path(&self.tenant_shard_id).join(&filename);
let res = std::fs::OpenOptions::new()
.write(true)
.create_new(true)
.read(true)
.open(path)
.and_then(|mut f| f.write_all(writebuf));
// trip up allowed_errors
if let Err(e) = res {
tracing::error!(target=%filename, length=writebuf.len(), "failed to write out the walredo errored input: {e}");
} else {
tracing::error!(filename, "erroring walredo input saved");
} }
} }
pub(crate) fn id(&self) -> u32 { #[cfg(not(feature = "testing"))]
match self { fn record_and_log(&self, _: &[u8]) {}
Process::Sync(p) => p.id(), }
Process::Async(p) => p.id(),
}
}
pub(crate) fn kind(&self) -> Kind { impl Drop for WalRedoProcess {
match self { fn drop(&mut self) {
Process::Sync(_) => Kind::Sync, self.child
Process::Async(_) => Kind::Async, .take()
} .expect("we only do this once")
.kill_and_wait(WalRedoKillCause::WalRedoProcessDrop);
// no way to wait for stderr_logger_task from Drop because that is async only
} }
} }

View File

@@ -1,374 +0,0 @@
use self::no_leak_child::NoLeakChild;
use crate::{
config::PageServerConf,
metrics::{WalRedoKillCause, WAL_REDO_PROCESS_COUNTERS, WAL_REDO_RECORD_COUNTER},
walrecord::NeonWalRecord,
walredo::process::{no_leak_child, protocol},
};
use anyhow::Context;
use bytes::Bytes;
use pageserver_api::{reltag::RelTag, shard::TenantShardId};
use postgres_ffi::BLCKSZ;
#[cfg(feature = "testing")]
use std::sync::atomic::AtomicUsize;
use std::{
collections::VecDeque,
process::{Command, Stdio},
time::Duration,
};
use tokio::io::{AsyncReadExt, AsyncWriteExt};
use tracing::{debug, error, instrument, Instrument};
use utils::{lsn::Lsn, poison::Poison};
pub struct WalRedoProcess {
#[allow(dead_code)]
conf: &'static PageServerConf,
tenant_shard_id: TenantShardId,
// Some() on construction, only becomes None on Drop.
child: Option<NoLeakChild>,
stdout: tokio::sync::Mutex<Poison<ProcessOutput>>,
stdin: tokio::sync::Mutex<Poison<ProcessInput>>,
/// Counter to separate same sized walredo inputs failing at the same millisecond.
#[cfg(feature = "testing")]
dump_sequence: AtomicUsize,
}
struct ProcessInput {
stdin: tokio::process::ChildStdin,
n_requests: usize,
}
struct ProcessOutput {
stdout: tokio::process::ChildStdout,
pending_responses: VecDeque<Option<Bytes>>,
n_processed_responses: usize,
}
impl WalRedoProcess {
//
// Start postgres binary in special WAL redo mode.
//
#[instrument(skip_all,fields(pg_version=pg_version))]
pub(crate) fn launch(
conf: &'static PageServerConf,
tenant_shard_id: TenantShardId,
pg_version: u32,
) -> anyhow::Result<Self> {
crate::span::debug_assert_current_span_has_tenant_id();
let pg_bin_dir_path = conf.pg_bin_dir(pg_version).context("pg_bin_dir")?; // TODO these should be infallible.
let pg_lib_dir_path = conf.pg_lib_dir(pg_version).context("pg_lib_dir")?;
use no_leak_child::NoLeakChildCommandExt;
// Start postgres itself
let child = Command::new(pg_bin_dir_path.join("postgres"))
// the first arg must be --wal-redo so the child process enters into walredo mode
.arg("--wal-redo")
// the child doesn't process this arg, but, having it in the argv helps indentify the
// walredo process for a particular tenant when debugging a pagserver
.args(["--tenant-shard-id", &format!("{tenant_shard_id}")])
.stdin(Stdio::piped())
.stderr(Stdio::piped())
.stdout(Stdio::piped())
.env_clear()
.env("LD_LIBRARY_PATH", &pg_lib_dir_path)
.env("DYLD_LIBRARY_PATH", &pg_lib_dir_path)
// NB: The redo process is not trusted after we sent it the first
// walredo work. Before that, it is trusted. Specifically, we trust
// it to
// 1. close all file descriptors except stdin, stdout, stderr because
// pageserver might not be 100% diligent in setting FD_CLOEXEC on all
// the files it opens, and
// 2. to use seccomp to sandbox itself before processing the first
// walredo request.
.spawn_no_leak_child(tenant_shard_id)
.context("spawn process")?;
WAL_REDO_PROCESS_COUNTERS.started.inc();
let mut child = scopeguard::guard(child, |child| {
error!("killing wal-redo-postgres process due to a problem during launch");
child.kill_and_wait(WalRedoKillCause::Startup);
});
let stdin = child.stdin.take().unwrap();
let stdout = child.stdout.take().unwrap();
let stderr = child.stderr.take().unwrap();
let stderr = tokio::process::ChildStderr::from_std(stderr)
.context("convert to tokio::ChildStderr")?;
let stdin =
tokio::process::ChildStdin::from_std(stdin).context("convert to tokio::ChildStdin")?;
let stdout = tokio::process::ChildStdout::from_std(stdout)
.context("convert to tokio::ChildStdout")?;
// all fallible operations post-spawn are complete, so get rid of the guard
let child = scopeguard::ScopeGuard::into_inner(child);
tokio::spawn(
async move {
scopeguard::defer! {
debug!("wal-redo-postgres stderr_logger_task finished");
crate::metrics::WAL_REDO_PROCESS_COUNTERS.active_stderr_logger_tasks_finished.inc();
}
debug!("wal-redo-postgres stderr_logger_task started");
crate::metrics::WAL_REDO_PROCESS_COUNTERS.active_stderr_logger_tasks_started.inc();
use tokio::io::AsyncBufReadExt;
let mut stderr_lines = tokio::io::BufReader::new(stderr);
let mut buf = Vec::new();
let res = loop {
buf.clear();
// TODO we don't trust the process to cap its stderr length.
// Currently it can do unbounded Vec allocation.
match stderr_lines.read_until(b'\n', &mut buf).await {
Ok(0) => break Ok(()), // eof
Ok(num_bytes) => {
let output = String::from_utf8_lossy(&buf[..num_bytes]);
error!(%output, "received output");
}
Err(e) => {
break Err(e);
}
}
};
match res {
Ok(()) => (),
Err(e) => {
error!(error=?e, "failed to read from walredo stderr");
}
}
}.instrument(tracing::info_span!(parent: None, "wal-redo-postgres-stderr", pid = child.id(), tenant_id = %tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug(), %pg_version))
);
Ok(Self {
conf,
tenant_shard_id,
child: Some(child),
stdin: tokio::sync::Mutex::new(Poison::new(
"stdin",
ProcessInput {
stdin,
n_requests: 0,
},
)),
stdout: tokio::sync::Mutex::new(Poison::new(
"stdout",
ProcessOutput {
stdout,
pending_responses: VecDeque::new(),
n_processed_responses: 0,
},
)),
#[cfg(feature = "testing")]
dump_sequence: AtomicUsize::default(),
})
}
pub(crate) fn id(&self) -> u32 {
self.child
.as_ref()
.expect("must not call this during Drop")
.id()
}
/// Apply given WAL records ('records') over an old page image. Returns
/// new page image.
///
/// # Cancel-Safety
///
/// Cancellation safe.
#[instrument(skip_all, fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug(), pid=%self.id()))]
pub(crate) async fn apply_wal_records(
&self,
rel: RelTag,
blknum: u32,
base_img: &Option<Bytes>,
records: &[(Lsn, NeonWalRecord)],
wal_redo_timeout: Duration,
) -> anyhow::Result<Bytes> {
let tag = protocol::BufferTag { rel, blknum };
// Serialize all the messages to send the WAL redo process first.
//
// This could be problematic if there are millions of records to replay,
// but in practice the number of records is usually so small that it doesn't
// matter, and it's better to keep this code simple.
//
// Most requests start with a before-image with BLCKSZ bytes, followed by
// by some other WAL records. Start with a buffer that can hold that
// comfortably.
let mut writebuf: Vec<u8> = Vec::with_capacity((BLCKSZ as usize) * 3);
protocol::build_begin_redo_for_block_msg(tag, &mut writebuf);
if let Some(img) = base_img {
protocol::build_push_page_msg(tag, img, &mut writebuf);
}
for (lsn, rec) in records.iter() {
if let NeonWalRecord::Postgres {
will_init: _,
rec: postgres_rec,
} = rec
{
protocol::build_apply_record_msg(*lsn, postgres_rec, &mut writebuf);
} else {
anyhow::bail!("tried to pass neon wal record to postgres WAL redo");
}
}
protocol::build_get_page_msg(tag, &mut writebuf);
WAL_REDO_RECORD_COUNTER.inc_by(records.len() as u64);
let Ok(res) =
tokio::time::timeout(wal_redo_timeout, self.apply_wal_records0(&writebuf)).await
else {
anyhow::bail!("WAL redo timed out");
};
if res.is_err() {
// not all of these can be caused by this particular input, however these are so rare
// in tests so capture all.
self.record_and_log(&writebuf);
}
res
}
/// Sends the pre-serialized request in `writebuf` to the walredo process and
/// returns the resulting 8KiB page image.
///
/// # Cancel-Safety
///
/// When not polled to completion (e.g. because in `tokio::select!` another
/// branch becomes ready before this future), concurrent and subsequent
/// calls may fail due to [`utils::poison::Poison::check_and_arm`] calls.
/// Dispose of this process instance and create a new one.
async fn apply_wal_records0(&self, writebuf: &[u8]) -> anyhow::Result<Bytes> {
    // Send the request under the stdin lock and grab our request number.
    // The poison guard ensures a cancellation mid-write permanently fails
    // this process instance instead of leaving a half-written request in
    // the pipe.
    let request_no = {
        let mut lock_guard = self.stdin.lock().await;
        let mut poison_guard = lock_guard.check_and_arm()?;
        let input = poison_guard.data_mut();
        input
            .stdin
            .write_all(writebuf)
            .await
            .context("write to walredo stdin")?;
        let request_no = input.n_requests;
        input.n_requests += 1;
        poison_guard.disarm();
        request_no
    };
    // To improve walredo performance we separate sending requests and receiving
    // responses. They are protected by different mutexes (output and input).
    // If threads T1, T2, T3 send requests D1, D2, D3 to the walredo process,
    // there is no guarantee that T1 will be granted the output mutex lock first.
    // To address this issue we maintain the number of sent requests, the number
    // of processed responses and a ring buffer of pending responses. After
    // sending a request (under the input mutex), a thread remembers its request
    // number. Then it releases the input mutex, locks the output mutex and
    // fetches into the ring buffer all responses up to its stored request
    // number. Then it takes the corresponding element from the pending
    // responses ring buffer and truncates all empty elements from the front,
    // advancing the processed responses number.
    let mut lock_guard = self.stdout.lock().await;
    let mut poison_guard = lock_guard.check_and_arm()?;
    let output = poison_guard.data_mut();
    let n_processed_responses = output.n_processed_responses;
    while n_processed_responses + output.pending_responses.len() <= request_no {
        // We expect the WAL redo process to respond with an 8k page image. We read it
        // into this buffer.
        let mut resultbuf = vec![0; BLCKSZ.into()];
        output
            .stdout
            .read_exact(&mut resultbuf)
            .await
            .context("read walredo stdout")?;
        output
            .pending_responses
            .push_back(Some(Bytes::from(resultbuf)));
    }
    // Replace our request's response with None in `pending_responses`.
    // Then make space in the ring buffer by clearing out any sequence of contiguous
    // `None`'s from the front of `pending_responses`.
    // NB: We can't pop_front() other requests' responses because another
    // requester might have grabbed the output mutex before us:
    // T1: grab input mutex
    // T1: send request_no 23
    // T1: release input mutex
    // T2: grab input mutex
    // T2: send request_no 24
    // T2: release input mutex
    // T2: grab output mutex
    // T2: n_processed_responses + output.pending_responses.len() <= request_no
    //            23                                0                   24
    // T2: enters poll loop that reads stdout
    // T2: put response for 23 into pending_responses
    // T2: put response for 24 into pending_responses
    //     pending_responses now looks like this: Front Some(response_23) Some(response_24) Back
    // T2: takes its response_24
    //     pending_responses now looks like this: Front Some(response_23) None Back
    // T2: does the while loop below
    //     pending_responses now looks like this: Front Some(response_23) None Back
    // T2: releases output mutex
    // T1: grabs output mutex
    // T1: n_processed_responses + output.pending_responses.len() > request_no
    //            23                                2                   23
    // T1: skips poll loop that reads stdout
    // T1: takes its response_23
    //     pending_responses now looks like this: Front None None Back
    // T1: does the while loop below
    //     pending_responses now looks like this: Front Back
    //     n_processed_responses now has value 25
    let res = output.pending_responses[request_no - n_processed_responses]
        .take()
        .expect("we own this request_no, nobody else is supposed to take it");
    while let Some(front) = output.pending_responses.front() {
        if front.is_none() {
            output.pending_responses.pop_front();
            output.n_processed_responses += 1;
        } else {
            break;
        }
    }
    poison_guard.disarm();
    Ok(res)
}
/// Dumps the failing walredo input to a file in the tenant directory so it
/// can be collected into a test (allure) report. Testing builds only.
#[cfg(feature = "testing")]
fn record_and_log(&self, writebuf: &[u8]) {
    use std::io::Write;
    use std::sync::atomic::Ordering;

    let now_ms = std::time::SystemTime::now()
        .duration_since(std::time::SystemTime::UNIX_EPOCH)
        .unwrap()
        .as_millis();
    // dump_sequence disambiguates same-sized inputs failing in the same ms
    let seq = self.dump_sequence.fetch_add(1, Ordering::Relaxed);
    // these files will be collected to an allure report
    let filename = format!("walredo-{}-{}-{}.walredo", now_ms, writebuf.len(), seq);
    let path = self.conf.tenant_path(&self.tenant_shard_id).join(&filename);
    let write_res = std::fs::OpenOptions::new()
        .write(true)
        .create_new(true)
        .read(true)
        .open(path)
        .and_then(|mut f| f.write_all(writebuf));
    // trip up allowed_errors
    match write_res {
        Err(e) => {
            tracing::error!(target=%filename, length=writebuf.len(), "failed to write out the walredo errored input: {e}");
        }
        Ok(()) => {
            tracing::error!(filename, "erroring walredo input saved");
        }
    }
}
/// No-op outside of testing builds: failing walredo inputs are only dumped
/// to disk when the "testing" feature is enabled.
#[cfg(not(feature = "testing"))]
fn record_and_log(&self, _: &[u8]) {}
}
impl Drop for WalRedoProcess {
    fn drop(&mut self) {
        // `child` is Some() from construction until this point, so the
        // expect() documents an invariant rather than a reachable panic.
        self.child
            .take()
            .expect("we only do this once")
            .kill_and_wait(WalRedoKillCause::WalRedoProcessDrop);
        // no way to wait for stderr_logger_task from Drop because that is async only
    }
}

View File

@@ -1,405 +0,0 @@
use self::no_leak_child::NoLeakChild;
use crate::{
config::PageServerConf,
metrics::{WalRedoKillCause, WAL_REDO_PROCESS_COUNTERS, WAL_REDO_RECORD_COUNTER},
walrecord::NeonWalRecord,
walredo::process::{no_leak_child, protocol},
};
use anyhow::Context;
use bytes::Bytes;
use nix::poll::{PollFd, PollFlags};
use pageserver_api::{reltag::RelTag, shard::TenantShardId};
use postgres_ffi::BLCKSZ;
use std::os::fd::AsRawFd;
#[cfg(feature = "testing")]
use std::sync::atomic::AtomicUsize;
use std::{
collections::VecDeque,
io::{Read, Write},
process::{ChildStdin, ChildStdout, Command, Stdio},
sync::{Mutex, MutexGuard},
time::Duration,
};
use tracing::{debug, error, instrument, Instrument};
use utils::{lsn::Lsn, nonblock::set_nonblock};
/// Handle to a running `postgres --wal-redo` child process plus the state
/// needed to multiplex concurrent requests over its stdin/stdout pipes.
pub struct WalRedoProcess {
    #[allow(dead_code)]
    conf: &'static PageServerConf,
    tenant_shard_id: TenantShardId,
    // Some() on construction, only becomes None on Drop.
    child: Option<NoLeakChild>,
    // Read side: ring buffer of responses shared by concurrent requesters.
    stdout: Mutex<ProcessOutput>,
    // Write side: the pipe plus the running request counter.
    stdin: Mutex<ProcessInput>,
    /// Counter to separate same sized walredo inputs failing at the same millisecond.
    #[cfg(feature = "testing")]
    dump_sequence: AtomicUsize,
}
/// State guarded by the stdin mutex: the child's stdin pipe and the number of
/// requests sent so far (used to pair each request with its response).
struct ProcessInput {
    stdin: ChildStdin,
    n_requests: usize,
}
/// State guarded by the stdout mutex: the child's stdout pipe, the ring
/// buffer of responses not yet claimed by their requester, and the count of
/// responses already consumed and trimmed from the front of the ring.
struct ProcessOutput {
    stdout: ChildStdout,
    pending_responses: VecDeque<Option<Bytes>>,
    n_processed_responses: usize,
}
impl WalRedoProcess {
/// Starts the postgres binary in special WAL redo mode.
///
/// Spawns the child with piped stdin/stdout/stderr, makes stdin/stdout
/// non-blocking, and spawns a background task that forwards the child's
/// stderr lines to the pageserver log.
#[instrument(skip_all,fields(pg_version=pg_version))]
pub(crate) fn launch(
    conf: &'static PageServerConf,
    tenant_shard_id: TenantShardId,
    pg_version: u32,
) -> anyhow::Result<Self> {
    crate::span::debug_assert_current_span_has_tenant_id();
    let pg_bin_dir_path = conf.pg_bin_dir(pg_version).context("pg_bin_dir")?; // TODO these should be infallible.
    let pg_lib_dir_path = conf.pg_lib_dir(pg_version).context("pg_lib_dir")?;
    use no_leak_child::NoLeakChildCommandExt;
    // Start postgres itself
    let child = Command::new(pg_bin_dir_path.join("postgres"))
        // the first arg must be --wal-redo so the child process enters into walredo mode
        .arg("--wal-redo")
        // the child doesn't process this arg, but, having it in the argv helps identify the
        // walredo process for a particular tenant when debugging a pageserver
        .args(["--tenant-shard-id", &format!("{tenant_shard_id}")])
        .stdin(Stdio::piped())
        .stderr(Stdio::piped())
        .stdout(Stdio::piped())
        .env_clear()
        .env("LD_LIBRARY_PATH", &pg_lib_dir_path)
        .env("DYLD_LIBRARY_PATH", &pg_lib_dir_path)
        // NB: The redo process is not trusted after we sent it the first
        // walredo work. Before that, it is trusted. Specifically, we trust
        // it to
        // 1. close all file descriptors except stdin, stdout, stderr because
        //    pageserver might not be 100% diligent in setting FD_CLOEXEC on all
        //    the files it opens, and
        // 2. to use seccomp to sandbox itself before processing the first
        //    walredo request.
        .spawn_no_leak_child(tenant_shard_id)
        .context("spawn process")?;
    WAL_REDO_PROCESS_COUNTERS.started.inc();
    // Until launch completes, make sure a failure kills the child rather
    // than leaking it.
    let mut child = scopeguard::guard(child, |child| {
        error!("killing wal-redo-postgres process due to a problem during launch");
        child.kill_and_wait(WalRedoKillCause::Startup);
    });
    let stdin = child.stdin.take().unwrap();
    let stdout = child.stdout.take().unwrap();
    let stderr = child.stderr.take().unwrap();
    let stderr = tokio::process::ChildStderr::from_std(stderr)
        .context("convert to tokio::ChildStderr")?;
    // Logs the failing fd before propagating the error, which makes launch
    // failures easier to debug.
    macro_rules! set_nonblock_or_log_err {
        ($file:ident) => {{
            let res = set_nonblock($file.as_raw_fd());
            if let Err(e) = &res {
                error!(error = %e, file = stringify!($file), pid = child.id(), "set_nonblock failed");
            }
            res
        }};
    }
    set_nonblock_or_log_err!(stdin)?;
    set_nonblock_or_log_err!(stdout)?;
    // all fallible operations post-spawn are complete, so get rid of the guard
    let child = scopeguard::ScopeGuard::into_inner(child);
    tokio::spawn(
        async move {
            scopeguard::defer! {
                debug!("wal-redo-postgres stderr_logger_task finished");
                crate::metrics::WAL_REDO_PROCESS_COUNTERS.active_stderr_logger_tasks_finished.inc();
            }
            debug!("wal-redo-postgres stderr_logger_task started");
            crate::metrics::WAL_REDO_PROCESS_COUNTERS.active_stderr_logger_tasks_started.inc();
            use tokio::io::AsyncBufReadExt;
            let mut stderr_lines = tokio::io::BufReader::new(stderr);
            let mut buf = Vec::new();
            let res = loop {
                buf.clear();
                // TODO we don't trust the process to cap its stderr length.
                // Currently it can do unbounded Vec allocation.
                match stderr_lines.read_until(b'\n', &mut buf).await {
                    Ok(0) => break Ok(()), // eof
                    Ok(num_bytes) => {
                        let output = String::from_utf8_lossy(&buf[..num_bytes]);
                        error!(%output, "received output");
                    }
                    Err(e) => {
                        break Err(e);
                    }
                }
            };
            match res {
                Ok(()) => (),
                Err(e) => {
                    error!(error=?e, "failed to read from walredo stderr");
                }
            }
        }.instrument(tracing::info_span!(parent: None, "wal-redo-postgres-stderr", pid = child.id(), tenant_id = %tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug(), %pg_version))
    );
    Ok(Self {
        conf,
        tenant_shard_id,
        child: Some(child),
        stdin: Mutex::new(ProcessInput {
            stdin,
            n_requests: 0,
        }),
        stdout: Mutex::new(ProcessOutput {
            stdout,
            pending_responses: VecDeque::new(),
            n_processed_responses: 0,
        }),
        #[cfg(feature = "testing")]
        dump_sequence: AtomicUsize::default(),
    })
}
/// Returns the OS pid of the walredo child process.
///
/// Must not be called during/after Drop, which takes `child`.
pub(crate) fn id(&self) -> u32 {
    self.child
        .as_ref()
        .expect("must not call this during Drop")
        .id()
}
/// Apply given WAL records ('records') over an old page image. Returns
/// new page image.
#[instrument(skip_all, fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug(), pid=%self.id()))]
pub(crate) async fn apply_wal_records(
    &self,
    rel: RelTag,
    blknum: u32,
    base_img: &Option<Bytes>,
    records: &[(Lsn, NeonWalRecord)],
    wal_redo_timeout: Duration,
) -> anyhow::Result<Bytes> {
    let tag = protocol::BufferTag { rel, blknum };
    // Take the stdin lock up front; it is handed to apply_wal_records0,
    // which serializes all writers to the child's stdin.
    let input = self.stdin.lock().unwrap();
    // Serialize all the messages to send the WAL redo process first.
    //
    // This could be problematic if there are millions of records to replay,
    // but in practice the number of records is usually so small that it doesn't
    // matter, and it's better to keep this code simple.
    //
    // Most requests start with a before-image with BLCKSZ bytes, followed
    // by some other WAL records. Start with a buffer that can hold that
    // comfortably.
    let mut writebuf: Vec<u8> = Vec::with_capacity((BLCKSZ as usize) * 3);
    protocol::build_begin_redo_for_block_msg(tag, &mut writebuf);
    if let Some(img) = base_img {
        protocol::build_push_page_msg(tag, img, &mut writebuf);
    }
    for (lsn, rec) in records.iter() {
        if let NeonWalRecord::Postgres {
            will_init: _,
            rec: postgres_rec,
        } = rec
        {
            protocol::build_apply_record_msg(*lsn, postgres_rec, &mut writebuf);
        } else {
            // Neon-specific records cannot be replayed by the Postgres
            // walredo process.
            anyhow::bail!("tried to pass neon wal record to postgres WAL redo");
        }
    }
    protocol::build_get_page_msg(tag, &mut writebuf);
    WAL_REDO_RECORD_COUNTER.inc_by(records.len() as u64);
    let res = self.apply_wal_records0(&writebuf, input, wal_redo_timeout);
    if res.is_err() {
        // not all of these can be caused by this particular input, however these are so rare
        // in tests so capture all.
        self.record_and_log(&writebuf);
    }
    res
}
/// Writes the serialized request in `writebuf` to the walredo process'
/// stdin using non-blocking I/O with poll(2) timeouts, then reads back the
/// 8KiB page image, coordinating with concurrent requesters through the
/// pending responses ring buffer.
fn apply_wal_records0(
    &self,
    writebuf: &[u8],
    input: MutexGuard<ProcessInput>,
    wal_redo_timeout: Duration,
) -> anyhow::Result<Bytes> {
    let mut proc = { input }; // TODO: remove this legacy rename, but this keep the patch small.
    let mut nwrite = 0usize;
    while nwrite < writebuf.len() {
        let mut stdin_pollfds = [PollFd::new(&proc.stdin, PollFlags::POLLOUT)];
        // retry poll() on EINTR so signals don't spuriously fail the redo
        let n = loop {
            match nix::poll::poll(&mut stdin_pollfds[..], wal_redo_timeout.as_millis() as i32) {
                Err(nix::errno::Errno::EINTR) => continue,
                res => break res,
            }
        }?;
        if n == 0 {
            anyhow::bail!("WAL redo timed out");
        }
        // If 'stdin' is writeable, do write.
        let in_revents = stdin_pollfds[0].revents().unwrap();
        if in_revents & (PollFlags::POLLERR | PollFlags::POLLOUT) != PollFlags::empty() {
            nwrite += proc.stdin.write(&writebuf[nwrite..])?;
        }
        if in_revents.contains(PollFlags::POLLHUP) {
            // We still have more data to write, but the process closed the pipe.
            anyhow::bail!("WAL redo process closed its stdin unexpectedly");
        }
    }
    let request_no = proc.n_requests;
    proc.n_requests += 1;
    drop(proc);
    // To improve walredo performance we separate sending requests and receiving
    // responses. They are protected by different mutexes (output and input).
    // If threads T1, T2, T3 send requests D1, D2, D3 to the walredo process,
    // there is no guarantee that T1 will be granted the output mutex lock first.
    // To address this issue we maintain the number of sent requests, the number
    // of processed responses and a ring buffer of pending responses. After
    // sending a request (under the input mutex), a thread remembers its request
    // number. Then it releases the input mutex, locks the output mutex and
    // fetches into the ring buffer all responses up to its stored request
    // number. Then it takes the corresponding element from the pending
    // responses ring buffer and truncates all empty elements from the front,
    // advancing the processed responses number.
    let mut output = self.stdout.lock().unwrap();
    let n_processed_responses = output.n_processed_responses;
    while n_processed_responses + output.pending_responses.len() <= request_no {
        // We expect the WAL redo process to respond with an 8k page image. We read it
        // into this buffer.
        let mut resultbuf = vec![0; BLCKSZ.into()];
        let mut nresult: usize = 0; // # of bytes read into 'resultbuf' so far
        while nresult < BLCKSZ.into() {
            let mut stdout_pollfds = [PollFd::new(&output.stdout, PollFlags::POLLIN)];
            // We do two things simultaneously: reading response from stdout
            // and forward any logging information that the child writes to its stderr to the page server's log.
            let n = loop {
                match nix::poll::poll(
                    &mut stdout_pollfds[..],
                    wal_redo_timeout.as_millis() as i32,
                ) {
                    Err(nix::errno::Errno::EINTR) => continue,
                    res => break res,
                }
            }?;
            if n == 0 {
                anyhow::bail!("WAL redo timed out");
            }
            // If we have some data in stdout, read it to the result buffer.
            let out_revents = stdout_pollfds[0].revents().unwrap();
            if out_revents & (PollFlags::POLLERR | PollFlags::POLLIN) != PollFlags::empty() {
                nresult += output.stdout.read(&mut resultbuf[nresult..])?;
            }
            if out_revents.contains(PollFlags::POLLHUP) {
                anyhow::bail!("WAL redo process closed its stdout unexpectedly");
            }
        }
        output
            .pending_responses
            .push_back(Some(Bytes::from(resultbuf)));
    }
    // Replace our request's response with None in `pending_responses`.
    // Then make space in the ring buffer by clearing out any sequence of contiguous
    // `None`'s from the front of `pending_responses`.
    // NB: We can't pop_front() other requests' responses because another
    // requester might have grabbed the output mutex before us:
    // T1: grab input mutex
    // T1: send request_no 23
    // T1: release input mutex
    // T2: grab input mutex
    // T2: send request_no 24
    // T2: release input mutex
    // T2: grab output mutex
    // T2: n_processed_responses + output.pending_responses.len() <= request_no
    //            23                                0                   24
    // T2: enters poll loop that reads stdout
    // T2: put response for 23 into pending_responses
    // T2: put response for 24 into pending_responses
    //     pending_responses now looks like this: Front Some(response_23) Some(response_24) Back
    // T2: takes its response_24
    //     pending_responses now looks like this: Front Some(response_23) None Back
    // T2: does the while loop below
    //     pending_responses now looks like this: Front Some(response_23) None Back
    // T2: releases output mutex
    // T1: grabs output mutex
    // T1: n_processed_responses + output.pending_responses.len() > request_no
    //            23                                2                   23
    // T1: skips poll loop that reads stdout
    // T1: takes its response_23
    //     pending_responses now looks like this: Front None None Back
    // T1: does the while loop below
    //     pending_responses now looks like this: Front Back
    //     n_processed_responses now has value 25
    let res = output.pending_responses[request_no - n_processed_responses]
        .take()
        .expect("we own this request_no, nobody else is supposed to take it");
    while let Some(front) = output.pending_responses.front() {
        if front.is_none() {
            output.pending_responses.pop_front();
            output.n_processed_responses += 1;
        } else {
            break;
        }
    }
    Ok(res)
}
/// Dumps the failing walredo input to a file in the tenant directory so it
/// can be collected into a test (allure) report. Testing builds only.
#[cfg(feature = "testing")]
fn record_and_log(&self, writebuf: &[u8]) {
    use std::sync::atomic::Ordering;
    let millis = std::time::SystemTime::now()
        .duration_since(std::time::SystemTime::UNIX_EPOCH)
        .unwrap()
        .as_millis();
    // dump_sequence disambiguates same-sized inputs failing in the same ms
    let seq = self.dump_sequence.fetch_add(1, Ordering::Relaxed);
    // these files will be collected to an allure report
    let filename = format!("walredo-{millis}-{}-{seq}.walredo", writebuf.len());
    let path = self.conf.tenant_path(&self.tenant_shard_id).join(&filename);
    let res = std::fs::OpenOptions::new()
        .write(true)
        .create_new(true)
        .read(true)
        .open(path)
        .and_then(|mut f| f.write_all(writebuf));
    // trip up allowed_errors
    if let Err(e) = res {
        tracing::error!(target=%filename, length=writebuf.len(), "failed to write out the walredo errored input: {e}");
    } else {
        tracing::error!(filename, "erroring walredo input saved");
    }
}
/// No-op outside of testing builds: failing walredo inputs are only dumped
/// to disk when the "testing" feature is enabled.
#[cfg(not(feature = "testing"))]
fn record_and_log(&self, _: &[u8]) {}
}
impl Drop for WalRedoProcess {
    fn drop(&mut self) {
        // `child` is Some() from construction until this point, so the
        // expect() documents an invariant rather than a reachable panic.
        self.child
            .take()
            .expect("we only do this once")
            .kill_and_wait(WalRedoKillCause::WalRedoProcessDrop);
        // no way to wait for stderr_logger_task from Drop because that is async only
    }
}

175
poetry.lock generated
View File

@@ -1,88 +1,88 @@
# This file is automatically @generated by Poetry 1.8.2 and should not be changed by hand. # This file is automatically @generated by Poetry 1.7.1 and should not be changed by hand.
[[package]] [[package]]
name = "aiohttp" name = "aiohttp"
version = "3.9.4" version = "3.9.2"
description = "Async http client/server framework (asyncio)" description = "Async http client/server framework (asyncio)"
optional = false optional = false
python-versions = ">=3.8" python-versions = ">=3.8"
files = [ files = [
{file = "aiohttp-3.9.4-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:76d32588ef7e4a3f3adff1956a0ba96faabbdee58f2407c122dd45aa6e34f372"}, {file = "aiohttp-3.9.2-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:772fbe371788e61c58d6d3d904268e48a594ba866804d08c995ad71b144f94cb"},
{file = "aiohttp-3.9.4-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:56181093c10dbc6ceb8a29dfeea1e815e1dfdc020169203d87fd8d37616f73f9"}, {file = "aiohttp-3.9.2-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:edd4f1af2253f227ae311ab3d403d0c506c9b4410c7fc8d9573dec6d9740369f"},
{file = "aiohttp-3.9.4-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:c7a5b676d3c65e88b3aca41816bf72831898fcd73f0cbb2680e9d88e819d1e4d"}, {file = "aiohttp-3.9.2-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:cfee9287778399fdef6f8a11c9e425e1cb13cc9920fd3a3df8f122500978292b"},
{file = "aiohttp-3.9.4-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d1df528a85fb404899d4207a8d9934cfd6be626e30e5d3a5544a83dbae6d8a7e"}, {file = "aiohttp-3.9.2-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3cc158466f6a980a6095ee55174d1de5730ad7dec251be655d9a6a9dd7ea1ff9"},
{file = "aiohttp-3.9.4-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:f595db1bceabd71c82e92df212dd9525a8a2c6947d39e3c994c4f27d2fe15b11"}, {file = "aiohttp-3.9.2-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:54ec82f45d57c9a65a1ead3953b51c704f9587440e6682f689da97f3e8defa35"},
{file = "aiohttp-3.9.4-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:9c0b09d76e5a4caac3d27752027fbd43dc987b95f3748fad2b924a03fe8632ad"}, {file = "aiohttp-3.9.2-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:abeb813a18eb387f0d835ef51f88568540ad0325807a77a6e501fed4610f864e"},
{file = "aiohttp-3.9.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:689eb4356649ec9535b3686200b231876fb4cab4aca54e3bece71d37f50c1d13"}, {file = "aiohttp-3.9.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:cc91d07280d7d169f3a0f9179d8babd0ee05c79d4d891447629ff0d7d8089ec2"},
{file = "aiohttp-3.9.4-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:a3666cf4182efdb44d73602379a66f5fdfd5da0db5e4520f0ac0dcca644a3497"}, {file = "aiohttp-3.9.2-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:b65e861f4bebfb660f7f0f40fa3eb9f2ab9af10647d05dac824390e7af8f75b7"},
{file = "aiohttp-3.9.4-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:b65b0f8747b013570eea2f75726046fa54fa8e0c5db60f3b98dd5d161052004a"}, {file = "aiohttp-3.9.2-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:04fd8ffd2be73d42bcf55fd78cde7958eeee6d4d8f73c3846b7cba491ecdb570"},
{file = "aiohttp-3.9.4-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:a1885d2470955f70dfdd33a02e1749613c5a9c5ab855f6db38e0b9389453dce7"}, {file = "aiohttp-3.9.2-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:3d8d962b439a859b3ded9a1e111a4615357b01620a546bc601f25b0211f2da81"},
{file = "aiohttp-3.9.4-cp310-cp310-musllinux_1_1_ppc64le.whl", hash = "sha256:0593822dcdb9483d41f12041ff7c90d4d1033ec0e880bcfaf102919b715f47f1"}, {file = "aiohttp-3.9.2-cp310-cp310-musllinux_1_1_ppc64le.whl", hash = "sha256:8ceb658afd12b27552597cf9a65d9807d58aef45adbb58616cdd5ad4c258c39e"},
{file = "aiohttp-3.9.4-cp310-cp310-musllinux_1_1_s390x.whl", hash = "sha256:47f6eb74e1ecb5e19a78f4a4228aa24df7fbab3b62d4a625d3f41194a08bd54f"}, {file = "aiohttp-3.9.2-cp310-cp310-musllinux_1_1_s390x.whl", hash = "sha256:0e4ee4df741670560b1bc393672035418bf9063718fee05e1796bf867e995fad"},
{file = "aiohttp-3.9.4-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:c8b04a3dbd54de6ccb7604242fe3ad67f2f3ca558f2d33fe19d4b08d90701a89"}, {file = "aiohttp-3.9.2-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:2dec87a556f300d3211decf018bfd263424f0690fcca00de94a837949fbcea02"},
{file = "aiohttp-3.9.4-cp310-cp310-win32.whl", hash = "sha256:8a78dfb198a328bfb38e4308ca8167028920fb747ddcf086ce706fbdd23b2926"}, {file = "aiohttp-3.9.2-cp310-cp310-win32.whl", hash = "sha256:3e1a800f988ce7c4917f34096f81585a73dbf65b5c39618b37926b1238cf9bc4"},
{file = "aiohttp-3.9.4-cp310-cp310-win_amd64.whl", hash = "sha256:e78da6b55275987cbc89141a1d8e75f5070e577c482dd48bd9123a76a96f0bbb"}, {file = "aiohttp-3.9.2-cp310-cp310-win_amd64.whl", hash = "sha256:ea510718a41b95c236c992b89fdfc3d04cc7ca60281f93aaada497c2b4e05c46"},
{file = "aiohttp-3.9.4-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:c111b3c69060d2bafc446917534150fd049e7aedd6cbf21ba526a5a97b4402a5"}, {file = "aiohttp-3.9.2-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:6aaa6f99256dd1b5756a50891a20f0d252bd7bdb0854c5d440edab4495c9f973"},
{file = "aiohttp-3.9.4-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:efbdd51872cf170093998c87ccdf3cb5993add3559341a8e5708bcb311934c94"}, {file = "aiohttp-3.9.2-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:a27d8c70ad87bcfce2e97488652075a9bdd5b70093f50b10ae051dfe5e6baf37"},
{file = "aiohttp-3.9.4-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:7bfdb41dc6e85d8535b00d73947548a748e9534e8e4fddd2638109ff3fb081df"}, {file = "aiohttp-3.9.2-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:54287bcb74d21715ac8382e9de146d9442b5f133d9babb7e5d9e453faadd005e"},
{file = "aiohttp-3.9.4-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2bd9d334412961125e9f68d5b73c1d0ab9ea3f74a58a475e6b119f5293eee7ba"}, {file = "aiohttp-3.9.2-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5bb3d05569aa83011fcb346b5266e00b04180105fcacc63743fc2e4a1862a891"},
{file = "aiohttp-3.9.4-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:35d78076736f4a668d57ade00c65d30a8ce28719d8a42471b2a06ccd1a2e3063"}, {file = "aiohttp-3.9.2-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:c8534e7d69bb8e8d134fe2be9890d1b863518582f30c9874ed7ed12e48abe3c4"},
{file = "aiohttp-3.9.4-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:824dff4f9f4d0f59d0fa3577932ee9a20e09edec8a2f813e1d6b9f89ced8293f"}, {file = "aiohttp-3.9.2-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:4bd9d5b989d57b41e4ff56ab250c5ddf259f32db17159cce630fd543376bd96b"},
{file = "aiohttp-3.9.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:52b8b4e06fc15519019e128abedaeb56412b106ab88b3c452188ca47a25c4093"}, {file = "aiohttp-3.9.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:fa6904088e6642609981f919ba775838ebf7df7fe64998b1a954fb411ffb4663"},
{file = "aiohttp-3.9.4-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:eae569fb1e7559d4f3919965617bb39f9e753967fae55ce13454bec2d1c54f09"}, {file = "aiohttp-3.9.2-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:bda42eb410be91b349fb4ee3a23a30ee301c391e503996a638d05659d76ea4c2"},
{file = "aiohttp-3.9.4-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:69b97aa5792428f321f72aeb2f118e56893371f27e0b7d05750bcad06fc42ca1"}, {file = "aiohttp-3.9.2-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:193cc1ccd69d819562cc7f345c815a6fc51d223b2ef22f23c1a0f67a88de9a72"},
{file = "aiohttp-3.9.4-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:4d79aad0ad4b980663316f26d9a492e8fab2af77c69c0f33780a56843ad2f89e"}, {file = "aiohttp-3.9.2-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:b9f1cb839b621f84a5b006848e336cf1496688059d2408e617af33e3470ba204"},
{file = "aiohttp-3.9.4-cp311-cp311-musllinux_1_1_ppc64le.whl", hash = "sha256:d6577140cd7db19e430661e4b2653680194ea8c22c994bc65b7a19d8ec834403"}, {file = "aiohttp-3.9.2-cp311-cp311-musllinux_1_1_ppc64le.whl", hash = "sha256:d22a0931848b8c7a023c695fa2057c6aaac19085f257d48baa24455e67df97ec"},
{file = "aiohttp-3.9.4-cp311-cp311-musllinux_1_1_s390x.whl", hash = "sha256:9860d455847cd98eb67897f5957b7cd69fbcb436dd3f06099230f16a66e66f79"}, {file = "aiohttp-3.9.2-cp311-cp311-musllinux_1_1_s390x.whl", hash = "sha256:4112d8ba61fbd0abd5d43a9cb312214565b446d926e282a6d7da3f5a5aa71d36"},
{file = "aiohttp-3.9.4-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:69ff36d3f8f5652994e08bd22f093e11cfd0444cea310f92e01b45a4e46b624e"}, {file = "aiohttp-3.9.2-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:c4ad4241b52bb2eb7a4d2bde060d31c2b255b8c6597dd8deac2f039168d14fd7"},
{file = "aiohttp-3.9.4-cp311-cp311-win32.whl", hash = "sha256:e27d3b5ed2c2013bce66ad67ee57cbf614288bda8cdf426c8d8fe548316f1b5f"}, {file = "aiohttp-3.9.2-cp311-cp311-win32.whl", hash = "sha256:ee2661a3f5b529f4fc8a8ffee9f736ae054adfb353a0d2f78218be90617194b3"},
{file = "aiohttp-3.9.4-cp311-cp311-win_amd64.whl", hash = "sha256:d6a67e26daa686a6fbdb600a9af8619c80a332556245fa8e86c747d226ab1a1e"}, {file = "aiohttp-3.9.2-cp311-cp311-win_amd64.whl", hash = "sha256:4deae2c165a5db1ed97df2868ef31ca3cc999988812e82386d22937d9d6fed52"},
{file = "aiohttp-3.9.4-cp312-cp312-macosx_10_9_universal2.whl", hash = "sha256:c5ff8ff44825736a4065d8544b43b43ee4c6dd1530f3a08e6c0578a813b0aa35"}, {file = "aiohttp-3.9.2-cp312-cp312-macosx_10_9_universal2.whl", hash = "sha256:6f4cdba12539215aaecf3c310ce9d067b0081a0795dd8a8805fdb67a65c0572a"},
{file = "aiohttp-3.9.4-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:d12a244627eba4e9dc52cbf924edef905ddd6cafc6513849b4876076a6f38b0e"}, {file = "aiohttp-3.9.2-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:84e843b33d5460a5c501c05539809ff3aee07436296ff9fbc4d327e32aa3a326"},
{file = "aiohttp-3.9.4-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:dcad56c8d8348e7e468899d2fb3b309b9bc59d94e6db08710555f7436156097f"}, {file = "aiohttp-3.9.2-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:8008d0f451d66140a5aa1c17e3eedc9d56e14207568cd42072c9d6b92bf19b52"},
{file = "aiohttp-3.9.4-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4f7e69a7fd4b5ce419238388e55abd220336bd32212c673ceabc57ccf3d05b55"}, {file = "aiohttp-3.9.2-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:61c47ab8ef629793c086378b1df93d18438612d3ed60dca76c3422f4fbafa792"},
{file = "aiohttp-3.9.4-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:c4870cb049f10d7680c239b55428916d84158798eb8f353e74fa2c98980dcc0b"}, {file = "aiohttp-3.9.2-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:bc71f748e12284312f140eaa6599a520389273174b42c345d13c7e07792f4f57"},
{file = "aiohttp-3.9.4-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:3b2feaf1b7031ede1bc0880cec4b0776fd347259a723d625357bb4b82f62687b"}, {file = "aiohttp-3.9.2-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:a1c3a4d0ab2f75f22ec80bca62385db2e8810ee12efa8c9e92efea45c1849133"},
{file = "aiohttp-3.9.4-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:939393e8c3f0a5bcd33ef7ace67680c318dc2ae406f15e381c0054dd658397de"}, {file = "aiohttp-3.9.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9a87aa0b13bbee025faa59fa58861303c2b064b9855d4c0e45ec70182bbeba1b"},
{file = "aiohttp-3.9.4-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:7d2334e387b2adcc944680bebcf412743f2caf4eeebd550f67249c1c3696be04"}, {file = "aiohttp-3.9.2-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:e2cc0d04688b9f4a7854c56c18aa7af9e5b0a87a28f934e2e596ba7e14783192"},
{file = "aiohttp-3.9.4-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:e0198ea897680e480845ec0ffc5a14e8b694e25b3f104f63676d55bf76a82f1a"}, {file = "aiohttp-3.9.2-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:1956e3ac376b1711c1533266dec4efd485f821d84c13ce1217d53e42c9e65f08"},
{file = "aiohttp-3.9.4-cp312-cp312-musllinux_1_1_i686.whl", hash = "sha256:e40d2cd22914d67c84824045861a5bb0fb46586b15dfe4f046c7495bf08306b2"}, {file = "aiohttp-3.9.2-cp312-cp312-musllinux_1_1_i686.whl", hash = "sha256:114da29f39eccd71b93a0fcacff178749a5c3559009b4a4498c2c173a6d74dff"},
{file = "aiohttp-3.9.4-cp312-cp312-musllinux_1_1_ppc64le.whl", hash = "sha256:aba80e77c227f4234aa34a5ff2b6ff30c5d6a827a91d22ff6b999de9175d71bd"}, {file = "aiohttp-3.9.2-cp312-cp312-musllinux_1_1_ppc64le.whl", hash = "sha256:3f17999ae3927d8a9a823a1283b201344a0627272f92d4f3e3a4efe276972fe8"},
{file = "aiohttp-3.9.4-cp312-cp312-musllinux_1_1_s390x.whl", hash = "sha256:fb68dc73bc8ac322d2e392a59a9e396c4f35cb6fdbdd749e139d1d6c985f2527"}, {file = "aiohttp-3.9.2-cp312-cp312-musllinux_1_1_s390x.whl", hash = "sha256:f31df6a32217a34ae2f813b152a6f348154f948c83213b690e59d9e84020925c"},
{file = "aiohttp-3.9.4-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:f3460a92638dce7e47062cf088d6e7663adb135e936cb117be88d5e6c48c9d53"}, {file = "aiohttp-3.9.2-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:7a75307ffe31329928a8d47eae0692192327c599113d41b278d4c12b54e1bd11"},
{file = "aiohttp-3.9.4-cp312-cp312-win32.whl", hash = "sha256:32dc814ddbb254f6170bca198fe307920f6c1308a5492f049f7f63554b88ef36"}, {file = "aiohttp-3.9.2-cp312-cp312-win32.whl", hash = "sha256:972b63d589ff8f305463593050a31b5ce91638918da38139b9d8deaba9e0fed7"},
{file = "aiohttp-3.9.4-cp312-cp312-win_amd64.whl", hash = "sha256:63f41a909d182d2b78fe3abef557fcc14da50c7852f70ae3be60e83ff64edba5"}, {file = "aiohttp-3.9.2-cp312-cp312-win_amd64.whl", hash = "sha256:200dc0246f0cb5405c80d18ac905c8350179c063ea1587580e3335bfc243ba6a"},
{file = "aiohttp-3.9.4-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:c3770365675f6be220032f6609a8fbad994d6dcf3ef7dbcf295c7ee70884c9af"}, {file = "aiohttp-3.9.2-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:158564d0d1020e0d3fe919a81d97aadad35171e13e7b425b244ad4337fc6793a"},
{file = "aiohttp-3.9.4-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:305edae1dea368ce09bcb858cf5a63a064f3bff4767dec6fa60a0cc0e805a1d3"}, {file = "aiohttp-3.9.2-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:da1346cd0ccb395f0ed16b113ebb626fa43b7b07fd7344fce33e7a4f04a8897a"},
{file = "aiohttp-3.9.4-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:6f121900131d116e4a93b55ab0d12ad72573f967b100e49086e496a9b24523ea"}, {file = "aiohttp-3.9.2-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:eaa9256de26ea0334ffa25f1913ae15a51e35c529a1ed9af8e6286dd44312554"},
{file = "aiohttp-3.9.4-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b71e614c1ae35c3d62a293b19eface83d5e4d194e3eb2fabb10059d33e6e8cbf"}, {file = "aiohttp-3.9.2-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1543e7fb00214fb4ccead42e6a7d86f3bb7c34751ec7c605cca7388e525fd0b4"},
{file = "aiohttp-3.9.4-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:419f009fa4cfde4d16a7fc070d64f36d70a8d35a90d71aa27670bba2be4fd039"}, {file = "aiohttp-3.9.2-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:186e94570433a004e05f31f632726ae0f2c9dee4762a9ce915769ce9c0a23d89"},
{file = "aiohttp-3.9.4-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:7b39476ee69cfe64061fd77a73bf692c40021f8547cda617a3466530ef63f947"}, {file = "aiohttp-3.9.2-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:d52d20832ac1560f4510d68e7ba8befbc801a2b77df12bd0cd2bcf3b049e52a4"},
{file = "aiohttp-3.9.4-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b33f34c9c7decdb2ab99c74be6443942b730b56d9c5ee48fb7df2c86492f293c"}, {file = "aiohttp-3.9.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1c45e4e815ac6af3b72ca2bde9b608d2571737bb1e2d42299fc1ffdf60f6f9a1"},
{file = "aiohttp-3.9.4-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:c78700130ce2dcebb1a8103202ae795be2fa8c9351d0dd22338fe3dac74847d9"}, {file = "aiohttp-3.9.2-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:aa906b9bdfd4a7972dd0628dbbd6413d2062df5b431194486a78f0d2ae87bd55"},
{file = "aiohttp-3.9.4-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:268ba22d917655d1259af2d5659072b7dc11b4e1dc2cb9662fdd867d75afc6a4"}, {file = "aiohttp-3.9.2-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:68bbee9e17d66f17bb0010aa15a22c6eb28583edcc8b3212e2b8e3f77f3ebe2a"},
{file = "aiohttp-3.9.4-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:17e7c051f53a0d2ebf33013a9cbf020bb4e098c4bc5bce6f7b0c962108d97eab"}, {file = "aiohttp-3.9.2-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:4c189b64bd6d9a403a1a3f86a3ab3acbc3dc41a68f73a268a4f683f89a4dec1f"},
{file = "aiohttp-3.9.4-cp38-cp38-musllinux_1_1_ppc64le.whl", hash = "sha256:7be99f4abb008cb38e144f85f515598f4c2c8932bf11b65add0ff59c9c876d99"}, {file = "aiohttp-3.9.2-cp38-cp38-musllinux_1_1_ppc64le.whl", hash = "sha256:8a7876f794523123bca6d44bfecd89c9fec9ec897a25f3dd202ee7fc5c6525b7"},
{file = "aiohttp-3.9.4-cp38-cp38-musllinux_1_1_s390x.whl", hash = "sha256:d58a54d6ff08d2547656356eea8572b224e6f9bbc0cf55fa9966bcaac4ddfb10"}, {file = "aiohttp-3.9.2-cp38-cp38-musllinux_1_1_s390x.whl", hash = "sha256:d23fba734e3dd7b1d679b9473129cd52e4ec0e65a4512b488981a56420e708db"},
{file = "aiohttp-3.9.4-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:7673a76772bda15d0d10d1aa881b7911d0580c980dbd16e59d7ba1422b2d83cd"}, {file = "aiohttp-3.9.2-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:b141753be581fab842a25cb319f79536d19c2a51995d7d8b29ee290169868eab"},
{file = "aiohttp-3.9.4-cp38-cp38-win32.whl", hash = "sha256:e4370dda04dc8951012f30e1ce7956a0a226ac0714a7b6c389fb2f43f22a250e"}, {file = "aiohttp-3.9.2-cp38-cp38-win32.whl", hash = "sha256:103daf41ff3b53ba6fa09ad410793e2e76c9d0269151812e5aba4b9dd674a7e8"},
{file = "aiohttp-3.9.4-cp38-cp38-win_amd64.whl", hash = "sha256:eb30c4510a691bb87081192a394fb661860e75ca3896c01c6d186febe7c88530"}, {file = "aiohttp-3.9.2-cp38-cp38-win_amd64.whl", hash = "sha256:328918a6c2835861ff7afa8c6d2c70c35fdaf996205d5932351bdd952f33fa2f"},
{file = "aiohttp-3.9.4-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:84e90494db7df3be5e056f91412f9fa9e611fbe8ce4aaef70647297f5943b276"}, {file = "aiohttp-3.9.2-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:5264d7327c9464786f74e4ec9342afbbb6ee70dfbb2ec9e3dfce7a54c8043aa3"},
{file = "aiohttp-3.9.4-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:7d4845f8501ab28ebfdbeab980a50a273b415cf69e96e4e674d43d86a464df9d"}, {file = "aiohttp-3.9.2-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:07205ae0015e05c78b3288c1517afa000823a678a41594b3fdc870878d645305"},
{file = "aiohttp-3.9.4-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:69046cd9a2a17245c4ce3c1f1a4ff8c70c7701ef222fce3d1d8435f09042bba1"}, {file = "aiohttp-3.9.2-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:ae0a1e638cffc3ec4d4784b8b4fd1cf28968febc4bd2718ffa25b99b96a741bd"},
{file = "aiohttp-3.9.4-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8b73a06bafc8dcc508420db43b4dd5850e41e69de99009d0351c4f3007960019"}, {file = "aiohttp-3.9.2-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d43302a30ba1166325974858e6ef31727a23bdd12db40e725bec0f759abce505"},
{file = "aiohttp-3.9.4-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:418bb0038dfafeac923823c2e63226179976c76f981a2aaad0ad5d51f2229bca"}, {file = "aiohttp-3.9.2-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:16a967685907003765855999af11a79b24e70b34dc710f77a38d21cd9fc4f5fe"},
{file = "aiohttp-3.9.4-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:71a8f241456b6c2668374d5d28398f8e8cdae4cce568aaea54e0f39359cd928d"}, {file = "aiohttp-3.9.2-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:6fa3ee92cd441d5c2d07ca88d7a9cef50f7ec975f0117cd0c62018022a184308"},
{file = "aiohttp-3.9.4-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:935c369bf8acc2dc26f6eeb5222768aa7c62917c3554f7215f2ead7386b33748"}, {file = "aiohttp-3.9.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0b500c5ad9c07639d48615a770f49618130e61be36608fc9bc2d9bae31732b8f"},
{file = "aiohttp-3.9.4-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:74e4e48c8752d14ecfb36d2ebb3d76d614320570e14de0a3aa7a726ff150a03c"}, {file = "aiohttp-3.9.2-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:c07327b368745b1ce2393ae9e1aafed7073d9199e1dcba14e035cc646c7941bf"},
{file = "aiohttp-3.9.4-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:916b0417aeddf2c8c61291238ce25286f391a6acb6f28005dd9ce282bd6311b6"}, {file = "aiohttp-3.9.2-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:cc7d6502c23a0ec109687bf31909b3fb7b196faf198f8cff68c81b49eb316ea9"},
{file = "aiohttp-3.9.4-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:9b6787b6d0b3518b2ee4cbeadd24a507756ee703adbac1ab6dc7c4434b8c572a"}, {file = "aiohttp-3.9.2-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:07be2be7071723c3509ab5c08108d3a74f2181d4964e869f2504aaab68f8d3e8"},
{file = "aiohttp-3.9.4-cp39-cp39-musllinux_1_1_ppc64le.whl", hash = "sha256:221204dbda5ef350e8db6287937621cf75e85778b296c9c52260b522231940ed"}, {file = "aiohttp-3.9.2-cp39-cp39-musllinux_1_1_ppc64le.whl", hash = "sha256:122468f6fee5fcbe67cb07014a08c195b3d4c41ff71e7b5160a7bcc41d585a5f"},
{file = "aiohttp-3.9.4-cp39-cp39-musllinux_1_1_s390x.whl", hash = "sha256:10afd99b8251022ddf81eaed1d90f5a988e349ee7d779eb429fb07b670751e8c"}, {file = "aiohttp-3.9.2-cp39-cp39-musllinux_1_1_s390x.whl", hash = "sha256:00a9abcea793c81e7f8778ca195a1714a64f6d7436c4c0bb168ad2a212627000"},
{file = "aiohttp-3.9.4-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:2506d9f7a9b91033201be9ffe7d89c6a54150b0578803cce5cb84a943d075bc3"}, {file = "aiohttp-3.9.2-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:7a9825fdd64ecac5c670234d80bb52bdcaa4139d1f839165f548208b3779c6c6"},
{file = "aiohttp-3.9.4-cp39-cp39-win32.whl", hash = "sha256:e571fdd9efd65e86c6af2f332e0e95dad259bfe6beb5d15b3c3eca3a6eb5d87b"}, {file = "aiohttp-3.9.2-cp39-cp39-win32.whl", hash = "sha256:5422cd9a4a00f24c7244e1b15aa9b87935c85fb6a00c8ac9b2527b38627a9211"},
{file = "aiohttp-3.9.4-cp39-cp39-win_amd64.whl", hash = "sha256:7d29dd5319d20aa3b7749719ac9685fbd926f71ac8c77b2477272725f882072d"}, {file = "aiohttp-3.9.2-cp39-cp39-win_amd64.whl", hash = "sha256:7d579dcd5d82a86a46f725458418458fa43686f6a7b252f2966d359033ffc8ab"},
{file = "aiohttp-3.9.4.tar.gz", hash = "sha256:6ff71ede6d9a5a58cfb7b6fffc83ab5d4a63138276c771ac91ceaaddf5459644"}, {file = "aiohttp-3.9.2.tar.gz", hash = "sha256:b0ad0a5e86ce73f5368a164c10ada10504bf91869c05ab75d982c6048217fbf7"},
] ]
[package.dependencies] [package.dependencies]
@@ -1191,13 +1191,13 @@ files = [
[[package]] [[package]]
name = "idna" name = "idna"
version = "3.7" version = "3.3"
description = "Internationalized Domain Names in Applications (IDNA)" description = "Internationalized Domain Names in Applications (IDNA)"
optional = false optional = false
python-versions = ">=3.5" python-versions = ">=3.5"
files = [ files = [
{file = "idna-3.7-py3-none-any.whl", hash = "sha256:82fee1fc78add43492d3a1898bfa6d8a904cc97d8427f683ed8e798d07761aa0"}, {file = "idna-3.3-py3-none-any.whl", hash = "sha256:84d9dd047ffa80596e0f246e2eab0b391788b0503584e8945f2368256d2735ff"},
{file = "idna-3.7.tar.gz", hash = "sha256:028ff3aadf0609c1fd278d8ea3089299412a7a8b9bd005dd08b9f8285bcb5cfc"}, {file = "idna-3.3.tar.gz", hash = "sha256:9d643ff0a55b762d5cdb124b8eaa99c66322e2157b69160bc32796e824360e6d"},
] ]
[[package]] [[package]]
@@ -2182,7 +2182,6 @@ files = [
{file = "PyYAML-6.0.1-cp311-cp311-win_amd64.whl", hash = "sha256:bf07ee2fef7014951eeb99f56f39c9bb4af143d8aa3c21b1677805985307da34"}, {file = "PyYAML-6.0.1-cp311-cp311-win_amd64.whl", hash = "sha256:bf07ee2fef7014951eeb99f56f39c9bb4af143d8aa3c21b1677805985307da34"},
{file = "PyYAML-6.0.1-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:855fb52b0dc35af121542a76b9a84f8d1cd886ea97c84703eaa6d88e37a2ad28"}, {file = "PyYAML-6.0.1-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:855fb52b0dc35af121542a76b9a84f8d1cd886ea97c84703eaa6d88e37a2ad28"},
{file = "PyYAML-6.0.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:40df9b996c2b73138957fe23a16a4f0ba614f4c0efce1e9406a184b6d07fa3a9"}, {file = "PyYAML-6.0.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:40df9b996c2b73138957fe23a16a4f0ba614f4c0efce1e9406a184b6d07fa3a9"},
{file = "PyYAML-6.0.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a08c6f0fe150303c1c6b71ebcd7213c2858041a7e01975da3a99aed1e7a378ef"},
{file = "PyYAML-6.0.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6c22bec3fbe2524cde73d7ada88f6566758a8f7227bfbf93a408a9d86bcc12a0"}, {file = "PyYAML-6.0.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6c22bec3fbe2524cde73d7ada88f6566758a8f7227bfbf93a408a9d86bcc12a0"},
{file = "PyYAML-6.0.1-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:8d4e9c88387b0f5c7d5f281e55304de64cf7f9c0021a3525bd3b1c542da3b0e4"}, {file = "PyYAML-6.0.1-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:8d4e9c88387b0f5c7d5f281e55304de64cf7f9c0021a3525bd3b1c542da3b0e4"},
{file = "PyYAML-6.0.1-cp312-cp312-win32.whl", hash = "sha256:d483d2cdf104e7c9fa60c544d92981f12ad66a457afae824d146093b8c294c54"}, {file = "PyYAML-6.0.1-cp312-cp312-win32.whl", hash = "sha256:d483d2cdf104e7c9fa60c544d92981f12ad66a457afae824d146093b8c294c54"},
@@ -2653,16 +2652,6 @@ files = [
{file = "wrapt-1.14.1-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:8ad85f7f4e20964db4daadcab70b47ab05c7c1cf2a7c1e51087bfaa83831854c"}, {file = "wrapt-1.14.1-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:8ad85f7f4e20964db4daadcab70b47ab05c7c1cf2a7c1e51087bfaa83831854c"},
{file = "wrapt-1.14.1-cp310-cp310-win32.whl", hash = "sha256:a9a52172be0b5aae932bef82a79ec0a0ce87288c7d132946d645eba03f0ad8a8"}, {file = "wrapt-1.14.1-cp310-cp310-win32.whl", hash = "sha256:a9a52172be0b5aae932bef82a79ec0a0ce87288c7d132946d645eba03f0ad8a8"},
{file = "wrapt-1.14.1-cp310-cp310-win_amd64.whl", hash = "sha256:6d323e1554b3d22cfc03cd3243b5bb815a51f5249fdcbb86fda4bf62bab9e164"}, {file = "wrapt-1.14.1-cp310-cp310-win_amd64.whl", hash = "sha256:6d323e1554b3d22cfc03cd3243b5bb815a51f5249fdcbb86fda4bf62bab9e164"},
{file = "wrapt-1.14.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:ecee4132c6cd2ce5308e21672015ddfed1ff975ad0ac8d27168ea82e71413f55"},
{file = "wrapt-1.14.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:2020f391008ef874c6d9e208b24f28e31bcb85ccff4f335f15a3251d222b92d9"},
{file = "wrapt-1.14.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2feecf86e1f7a86517cab34ae6c2f081fd2d0dac860cb0c0ded96d799d20b335"},
{file = "wrapt-1.14.1-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:240b1686f38ae665d1b15475966fe0472f78e71b1b4903c143a842659c8e4cb9"},
{file = "wrapt-1.14.1-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a9008dad07d71f68487c91e96579c8567c98ca4c3881b9b113bc7b33e9fd78b8"},
{file = "wrapt-1.14.1-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:6447e9f3ba72f8e2b985a1da758767698efa72723d5b59accefd716e9e8272bf"},
{file = "wrapt-1.14.1-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:acae32e13a4153809db37405f5eba5bac5fbe2e2ba61ab227926a22901051c0a"},
{file = "wrapt-1.14.1-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:49ef582b7a1152ae2766557f0550a9fcbf7bbd76f43fbdc94dd3bf07cc7168be"},
{file = "wrapt-1.14.1-cp311-cp311-win32.whl", hash = "sha256:358fe87cc899c6bb0ddc185bf3dbfa4ba646f05b1b0b9b5a27c2cb92c2cea204"},
{file = "wrapt-1.14.1-cp311-cp311-win_amd64.whl", hash = "sha256:26046cd03936ae745a502abf44dac702a5e6880b2b01c29aea8ddf3353b68224"},
{file = "wrapt-1.14.1-cp35-cp35m-manylinux1_i686.whl", hash = "sha256:43ca3bbbe97af00f49efb06e352eae40434ca9d915906f77def219b88e85d907"}, {file = "wrapt-1.14.1-cp35-cp35m-manylinux1_i686.whl", hash = "sha256:43ca3bbbe97af00f49efb06e352eae40434ca9d915906f77def219b88e85d907"},
{file = "wrapt-1.14.1-cp35-cp35m-manylinux1_x86_64.whl", hash = "sha256:6b1a564e6cb69922c7fe3a678b9f9a3c54e72b469875aa8018f18b4d1dd1adf3"}, {file = "wrapt-1.14.1-cp35-cp35m-manylinux1_x86_64.whl", hash = "sha256:6b1a564e6cb69922c7fe3a678b9f9a3c54e72b469875aa8018f18b4d1dd1adf3"},
{file = "wrapt-1.14.1-cp35-cp35m-manylinux2010_i686.whl", hash = "sha256:00b6d4ea20a906c0ca56d84f93065b398ab74b927a7a3dbd470f6fc503f95dc3"}, {file = "wrapt-1.14.1-cp35-cp35m-manylinux2010_i686.whl", hash = "sha256:00b6d4ea20a906c0ca56d84f93065b398ab74b927a7a3dbd470f6fc503f95dc3"},
@@ -2900,4 +2889,4 @@ cffi = ["cffi (>=1.11)"]
[metadata] [metadata]
lock-version = "2.0" lock-version = "2.0"
python-versions = "^3.9" python-versions = "^3.9"
content-hash = "b3452b50901123fd5f2c385ce8a0c1c492296393b8a7926a322b6df0ea3ac572" content-hash = "df7161da4fdc3cba0a445176fc9dda2a0e8a53e13a7aa8a864385ca259381b41"

View File

@@ -12,7 +12,6 @@ testing = []
anyhow.workspace = true anyhow.workspace = true
async-compression.workspace = true async-compression.workspace = true
async-trait.workspace = true async-trait.workspace = true
atomic-take.workspace = true
aws-config.workspace = true aws-config.workspace = true
aws-sdk-iam.workspace = true aws-sdk-iam.workspace = true
aws-sigv4.workspace = true aws-sigv4.workspace = true
@@ -21,7 +20,6 @@ base64.workspace = true
bstr.workspace = true bstr.workspace = true
bytes = { workspace = true, features = ["serde"] } bytes = { workspace = true, features = ["serde"] }
camino.workspace = true camino.workspace = true
camino-tempfile.workspace = true
chrono.workspace = true chrono.workspace = true
clap.workspace = true clap.workspace = true
consumption_metrics.workspace = true consumption_metrics.workspace = true
@@ -38,14 +36,10 @@ http.workspace = true
humantime.workspace = true humantime.workspace = true
hyper-tungstenite.workspace = true hyper-tungstenite.workspace = true
hyper.workspace = true hyper.workspace = true
hyper1 = { package = "hyper", version = "1.2", features = ["server"] }
hyper-util = { version = "0.1", features = ["server", "http1", "http2", "tokio"] }
http-body-util = { version = "0.1" }
ipnet.workspace = true ipnet.workspace = true
itertools.workspace = true itertools.workspace = true
lasso = { workspace = true, features = ["multi-threaded"] } lasso = { workspace = true, features = ["multi-threaded"] }
md5.workspace = true md5.workspace = true
measured = { workspace = true, features = ["lasso"] }
metrics.workspace = true metrics.workspace = true
once_cell.workspace = true once_cell.workspace = true
opentelemetry.workspace = true opentelemetry.workspace = true
@@ -79,7 +73,7 @@ subtle.workspace = true
sync_wrapper.workspace = true sync_wrapper.workspace = true
task-local-extensions.workspace = true task-local-extensions.workspace = true
thiserror.workspace = true thiserror.workspace = true
tikv-jemallocator = { workspace = true, features = ["profiling"] } tikv-jemallocator.workspace = true
tikv-jemalloc-ctl = { workspace = true, features = ["use_std"] } tikv-jemalloc-ctl = { workspace = true, features = ["use_std"] }
tokio-postgres.workspace = true tokio-postgres.workspace = true
tokio-rustls.workspace = true tokio-rustls.workspace = true
@@ -103,6 +97,7 @@ redis.workspace = true
workspace_hack.workspace = true workspace_hack.workspace = true
[dev-dependencies] [dev-dependencies]
camino-tempfile.workspace = true
fallible-iterator.workspace = true fallible-iterator.workspace = true
rcgen.workspace = true rcgen.workspace = true
rstest.workspace = true rstest.workspace = true

View File

@@ -2,15 +2,8 @@ mod classic;
mod hacks; mod hacks;
mod link; mod link;
use std::net::IpAddr;
use std::sync::Arc;
use std::time::Duration;
use ipnet::{Ipv4Net, Ipv6Net};
pub use link::LinkAuthError; pub use link::LinkAuthError;
use tokio::io::{AsyncRead, AsyncWrite};
use tokio_postgres::config::AuthKeys; use tokio_postgres::config::AuthKeys;
use tracing::{info, warn};
use crate::auth::credentials::check_peer_addr_is_in_list; use crate::auth::credentials::check_peer_addr_is_in_list;
use crate::auth::validate_password_and_exchange; use crate::auth::validate_password_and_exchange;
@@ -20,10 +13,9 @@ use crate::console::provider::{CachedRoleSecret, ConsoleBackend};
use crate::console::{AuthSecret, NodeInfo}; use crate::console::{AuthSecret, NodeInfo};
use crate::context::RequestMonitoring; use crate::context::RequestMonitoring;
use crate::intern::EndpointIdInt; use crate::intern::EndpointIdInt;
use crate::metrics::Metrics; use crate::metrics::{AUTH_RATE_LIMIT_HITS, ENDPOINTS_AUTH_RATE_LIMITED};
use crate::proxy::connect_compute::ComputeConnectBackend; use crate::proxy::connect_compute::ComputeConnectBackend;
use crate::proxy::NeonOptions; use crate::proxy::NeonOptions;
use crate::rate_limiter::{BucketRateLimiter, RateBucketInfo};
use crate::stream::Stream; use crate::stream::Stream;
use crate::{ use crate::{
auth::{self, ComputeUserInfoMaybeEndpoint}, auth::{self, ComputeUserInfoMaybeEndpoint},
@@ -35,7 +27,10 @@ use crate::{
}, },
stream, url, stream, url,
}; };
use crate::{scram, EndpointCacheKey, EndpointId, Normalize, RoleName}; use crate::{scram, EndpointCacheKey, EndpointId, RoleName};
use std::sync::Arc;
use tokio::io::{AsyncRead, AsyncWrite};
use tracing::{info, warn};
/// Alternative to [`std::borrow::Cow`] but doesn't need `T: ToOwned` as we don't need that functionality /// Alternative to [`std::borrow::Cow`] but doesn't need `T: ToOwned` as we don't need that functionality
pub enum MaybeOwned<'a, T> { pub enum MaybeOwned<'a, T> {
@@ -181,51 +176,17 @@ impl TryFrom<ComputeUserInfoMaybeEndpoint> for ComputeUserInfo {
} }
} }
#[derive(PartialEq, PartialOrd, Hash, Eq, Ord, Debug, Copy, Clone)]
pub struct MaskedIp(IpAddr);
impl MaskedIp {
fn new(value: IpAddr, prefix: u8) -> Self {
match value {
IpAddr::V4(v4) => Self(IpAddr::V4(
Ipv4Net::new(v4, prefix).map_or(v4, |x| x.trunc().addr()),
)),
IpAddr::V6(v6) => Self(IpAddr::V6(
Ipv6Net::new(v6, prefix).map_or(v6, |x| x.trunc().addr()),
)),
}
}
}
// This can't be just per IP because that would limit some PaaS that share IP addresses
pub type AuthRateLimiter = BucketRateLimiter<(EndpointIdInt, MaskedIp)>;
impl RateBucketInfo {
/// All of these are per endpoint-maskedip pair.
/// Context: 4096 rounds of pbkdf2 take about 1ms of cpu time to execute (1 milli-cpu-second or 1mcpus).
///
/// First bucket: 1000mcpus total per endpoint-ip pair
/// * 4096000 requests per second with 1 hash rounds.
/// * 1000 requests per second with 4096 hash rounds.
/// * 6.8 requests per second with 600000 hash rounds.
pub const DEFAULT_AUTH_SET: [Self; 3] = [
Self::new(1000 * 4096, Duration::from_secs(1)),
Self::new(600 * 4096, Duration::from_secs(60)),
Self::new(300 * 4096, Duration::from_secs(600)),
];
}
impl AuthenticationConfig { impl AuthenticationConfig {
pub fn check_rate_limit( pub fn check_rate_limit(
&self, &self,
ctx: &mut RequestMonitoring, ctx: &mut RequestMonitoring,
config: &AuthenticationConfig,
secret: AuthSecret, secret: AuthSecret,
endpoint: &EndpointId, endpoint: &EndpointId,
is_cleartext: bool, is_cleartext: bool,
) -> auth::Result<AuthSecret> { ) -> auth::Result<AuthSecret> {
// we have validated the endpoint exists, so let's intern it. // we have validated the endpoint exists, so let's intern it.
let endpoint_int = EndpointIdInt::from(endpoint.normalize()); let endpoint_int = EndpointIdInt::from(endpoint);
// only count the full hash count if password hack or websocket flow. // only count the full hash count if password hack or websocket flow.
// in other words, if proxy needs to run the hashing // in other words, if proxy needs to run the hashing
@@ -240,25 +201,17 @@ impl AuthenticationConfig {
1 1
}; };
let limit_not_exceeded = self.rate_limiter.check( let limit_not_exceeded = self
( .rate_limiter
endpoint_int, .check((endpoint_int, ctx.peer_addr), password_weight);
MaskedIp::new(ctx.peer_addr, config.rate_limit_ip_subnet),
),
password_weight,
);
if !limit_not_exceeded { if !limit_not_exceeded {
warn!( warn!(
enabled = self.rate_limiter_enabled, enabled = self.rate_limiter_enabled,
"rate limiting authentication" "rate limiting authentication"
); );
Metrics::get().proxy.requests_auth_rate_limits_total.inc(); AUTH_RATE_LIMIT_HITS.inc();
Metrics::get() ENDPOINTS_AUTH_RATE_LIMITED.measure(endpoint);
.proxy
.endpoints_auth_rate_limits
.get_metric()
.measure(endpoint);
if self.rate_limiter_enabled { if self.rate_limiter_enabled {
return Err(auth::AuthError::too_many_connections()); return Err(auth::AuthError::too_many_connections());
@@ -314,7 +267,6 @@ async fn auth_quirks(
let secret = match secret { let secret = match secret {
Some(secret) => config.check_rate_limit( Some(secret) => config.check_rate_limit(
ctx, ctx,
config,
secret, secret,
&info.endpoint, &info.endpoint,
unauthenticated_password.is_some() || allow_cleartext, unauthenticated_password.is_some() || allow_cleartext,
@@ -517,7 +469,7 @@ impl ComputeConnectBackend for BackendType<'_, ComputeCredentials, &()> {
#[cfg(test)] #[cfg(test)]
mod tests { mod tests {
use std::{net::IpAddr, sync::Arc, time::Duration}; use std::sync::Arc;
use bytes::BytesMut; use bytes::BytesMut;
use fallible_iterator::FallibleIterator; use fallible_iterator::FallibleIterator;
@@ -530,7 +482,7 @@ mod tests {
use tokio::io::{AsyncRead, AsyncReadExt, AsyncWriteExt}; use tokio::io::{AsyncRead, AsyncReadExt, AsyncWriteExt};
use crate::{ use crate::{
auth::{backend::MaskedIp, ComputeUserInfoMaybeEndpoint, IpPattern}, auth::{ComputeUserInfoMaybeEndpoint, IpPattern},
config::AuthenticationConfig, config::AuthenticationConfig,
console::{ console::{
self, self,
@@ -539,12 +491,12 @@ mod tests {
}, },
context::RequestMonitoring, context::RequestMonitoring,
proxy::NeonOptions, proxy::NeonOptions,
rate_limiter::RateBucketInfo, rate_limiter::{AuthRateLimiter, RateBucketInfo},
scram::ServerSecret, scram::ServerSecret,
stream::{PqStream, Stream}, stream::{PqStream, Stream},
}; };
use super::{auth_quirks, AuthRateLimiter}; use super::auth_quirks;
struct Auth { struct Auth {
ips: Vec<IpPattern>, ips: Vec<IpPattern>,
@@ -585,7 +537,6 @@ mod tests {
scram_protocol_timeout: std::time::Duration::from_secs(5), scram_protocol_timeout: std::time::Duration::from_secs(5),
rate_limiter_enabled: true, rate_limiter_enabled: true,
rate_limiter: AuthRateLimiter::new(&RateBucketInfo::DEFAULT_AUTH_SET), rate_limiter: AuthRateLimiter::new(&RateBucketInfo::DEFAULT_AUTH_SET),
rate_limit_ip_subnet: 64,
}); });
async fn read_message(r: &mut (impl AsyncRead + Unpin), b: &mut BytesMut) -> PgMessage { async fn read_message(r: &mut (impl AsyncRead + Unpin), b: &mut BytesMut) -> PgMessage {
@@ -597,51 +548,6 @@ mod tests {
} }
} }
#[test]
fn masked_ip() {
let ip_a = IpAddr::V4([127, 0, 0, 1].into());
let ip_b = IpAddr::V4([127, 0, 0, 2].into());
let ip_c = IpAddr::V4([192, 168, 1, 101].into());
let ip_d = IpAddr::V4([192, 168, 1, 102].into());
let ip_e = IpAddr::V6("abcd:abcd:abcd:abcd:abcd:abcd:abcd:abcd".parse().unwrap());
let ip_f = IpAddr::V6("abcd:abcd:abcd:abcd:1234:abcd:abcd:abcd".parse().unwrap());
assert_ne!(MaskedIp::new(ip_a, 64), MaskedIp::new(ip_b, 64));
assert_ne!(MaskedIp::new(ip_a, 32), MaskedIp::new(ip_b, 32));
assert_eq!(MaskedIp::new(ip_a, 30), MaskedIp::new(ip_b, 30));
assert_eq!(MaskedIp::new(ip_c, 30), MaskedIp::new(ip_d, 30));
assert_ne!(MaskedIp::new(ip_e, 128), MaskedIp::new(ip_f, 128));
assert_eq!(MaskedIp::new(ip_e, 64), MaskedIp::new(ip_f, 64));
}
#[test]
fn test_default_auth_rate_limit_set() {
// these values used to exceed u32::MAX
assert_eq!(
RateBucketInfo::DEFAULT_AUTH_SET,
[
RateBucketInfo {
interval: Duration::from_secs(1),
max_rpi: 1000 * 4096,
},
RateBucketInfo {
interval: Duration::from_secs(60),
max_rpi: 600 * 4096 * 60,
},
RateBucketInfo {
interval: Duration::from_secs(600),
max_rpi: 300 * 4096 * 600,
}
]
);
for x in RateBucketInfo::DEFAULT_AUTH_SET {
let y = x.to_string().parse().unwrap();
assert_eq!(x, y);
}
}
#[tokio::test] #[tokio::test]
async fn auth_quirks_scram() { async fn auth_quirks_scram() {
let (mut client, server) = tokio::io::duplex(1024); let (mut client, server) = tokio::io::duplex(1024);

View File

@@ -4,7 +4,7 @@ use crate::{
auth::password_hack::parse_endpoint_param, auth::password_hack::parse_endpoint_param,
context::RequestMonitoring, context::RequestMonitoring,
error::{ReportableError, UserFacingError}, error::{ReportableError, UserFacingError},
metrics::{Metrics, SniKind}, metrics::NUM_CONNECTION_ACCEPTED_BY_SNI,
proxy::NeonOptions, proxy::NeonOptions,
serverless::SERVERLESS_DRIVER_SNI, serverless::SERVERLESS_DRIVER_SNI,
EndpointId, RoleName, EndpointId, RoleName,
@@ -144,22 +144,21 @@ impl ComputeUserInfoMaybeEndpoint {
ctx.set_endpoint_id(ep.clone()); ctx.set_endpoint_id(ep.clone());
} }
let metrics = Metrics::get();
info!(%user, "credentials"); info!(%user, "credentials");
if sni.is_some() { if sni.is_some() {
info!("Connection with sni"); info!("Connection with sni");
metrics.proxy.accepted_connections_by_sni.inc(SniKind::Sni); NUM_CONNECTION_ACCEPTED_BY_SNI
.with_label_values(&["sni"])
.inc();
} else if endpoint.is_some() { } else if endpoint.is_some() {
metrics NUM_CONNECTION_ACCEPTED_BY_SNI
.proxy .with_label_values(&["no_sni"])
.accepted_connections_by_sni .inc();
.inc(SniKind::NoSni);
info!("Connection without sni"); info!("Connection without sni");
} else { } else {
metrics NUM_CONNECTION_ACCEPTED_BY_SNI
.proxy .with_label_values(&["password_hack"])
.accepted_connections_by_sni .inc();
.inc(SniKind::PasswordHack);
info!("Connection with password hack"); info!("Connection with password hack");
} }

View File

@@ -9,13 +9,15 @@ use futures::future::Either;
use itertools::Itertools; use itertools::Itertools;
use proxy::config::TlsServerEndPoint; use proxy::config::TlsServerEndPoint;
use proxy::context::RequestMonitoring; use proxy::context::RequestMonitoring;
use proxy::proxy::{copy_bidirectional_client_compute, run_until_cancelled}; use proxy::proxy::run_until_cancelled;
use proxy::{BranchId, EndpointId, ProjectId};
use rustls::pki_types::PrivateKeyDer; use rustls::pki_types::PrivateKeyDer;
use tokio::net::TcpListener; use tokio::net::TcpListener;
use anyhow::{anyhow, bail, ensure, Context}; use anyhow::{anyhow, bail, ensure, Context};
use clap::Arg; use clap::Arg;
use futures::TryFutureExt; use futures::TryFutureExt;
use proxy::console::messages::MetricsAuxInfo;
use proxy::stream::{PqStream, Stream}; use proxy::stream::{PqStream, Stream};
use tokio::io::{AsyncRead, AsyncWrite}; use tokio::io::{AsyncRead, AsyncWrite};
@@ -174,12 +176,7 @@ async fn task_main(
.context("failed to set socket option")?; .context("failed to set socket option")?;
info!(%peer_addr, "serving"); info!(%peer_addr, "serving");
let ctx = RequestMonitoring::new( let ctx = RequestMonitoring::new(session_id, peer_addr.ip(), "sni_router", "sni");
session_id,
peer_addr.ip(),
proxy::metrics::Protocol::SniRouter,
"sni",
);
handle_client(ctx, dest_suffix, tls_config, tls_server_end_point, socket).await handle_client(ctx, dest_suffix, tls_config, tls_server_end_point, socket).await
} }
.unwrap_or_else(|e| { .unwrap_or_else(|e| {
@@ -202,7 +199,6 @@ async fn task_main(
const ERR_INSECURE_CONNECTION: &str = "connection is insecure (try using `sslmode=require`)"; const ERR_INSECURE_CONNECTION: &str = "connection is insecure (try using `sslmode=require`)";
async fn ssl_handshake<S: AsyncRead + AsyncWrite + Unpin>( async fn ssl_handshake<S: AsyncRead + AsyncWrite + Unpin>(
ctx: &mut RequestMonitoring,
raw_stream: S, raw_stream: S,
tls_config: Arc<rustls::ServerConfig>, tls_config: Arc<rustls::ServerConfig>,
tls_server_end_point: TlsServerEndPoint, tls_server_end_point: TlsServerEndPoint,
@@ -232,10 +228,7 @@ async fn ssl_handshake<S: AsyncRead + AsyncWrite + Unpin>(
} }
Ok(Stream::Tls { Ok(Stream::Tls {
tls: Box::new( tls: Box::new(raw.upgrade(tls_config).await?),
raw.upgrade(tls_config, !ctx.has_private_peer_addr())
.await?,
),
tls_server_end_point, tls_server_end_point,
}) })
} }
@@ -258,7 +251,7 @@ async fn handle_client(
tls_server_end_point: TlsServerEndPoint, tls_server_end_point: TlsServerEndPoint,
stream: impl AsyncRead + AsyncWrite + Unpin, stream: impl AsyncRead + AsyncWrite + Unpin,
) -> anyhow::Result<()> { ) -> anyhow::Result<()> {
let mut tls_stream = ssl_handshake(&mut ctx, stream, tls_config, tls_server_end_point).await?; let tls_stream = ssl_handshake(stream, tls_config, tls_server_end_point).await?;
// Cut off first part of the SNI domain // Cut off first part of the SNI domain
// We receive required destination details in the format of // We receive required destination details in the format of
@@ -275,15 +268,18 @@ async fn handle_client(
info!("destination: {}", destination); info!("destination: {}", destination);
let mut client = tokio::net::TcpStream::connect(destination).await?; let client = tokio::net::TcpStream::connect(destination).await?;
let metrics_aux: MetricsAuxInfo = MetricsAuxInfo {
endpoint_id: (&EndpointId::from("")).into(),
project_id: (&ProjectId::from("")).into(),
branch_id: (&BranchId::from("")).into(),
cold_start_info: proxy::console::messages::ColdStartInfo::Unknown,
};
// doesn't yet matter as pg-sni-router doesn't report analytics logs // doesn't yet matter as pg-sni-router doesn't report analytics logs
ctx.set_success(); ctx.set_success();
ctx.log(); ctx.log();
// Starting from here we only proxy the client's traffic. proxy::proxy::passthrough::proxy_pass(tls_stream, client, metrics_aux).await
info!("performing the proxy pass...");
let _ = copy_bidirectional_client_compute(&mut tls_stream, &mut client).await?;
Ok(())
} }

View File

@@ -7,7 +7,6 @@ use aws_config::provider_config::ProviderConfig;
use aws_config::web_identity_token::WebIdentityTokenCredentialsProvider; use aws_config::web_identity_token::WebIdentityTokenCredentialsProvider;
use futures::future::Either; use futures::future::Either;
use proxy::auth; use proxy::auth;
use proxy::auth::backend::AuthRateLimiter;
use proxy::auth::backend::MaybeOwned; use proxy::auth::backend::MaybeOwned;
use proxy::cancellation::CancelMap; use proxy::cancellation::CancelMap;
use proxy::cancellation::CancellationHandler; use proxy::cancellation::CancellationHandler;
@@ -19,10 +18,11 @@ use proxy::config::ProjectInfoCacheOptions;
use proxy::console; use proxy::console;
use proxy::context::parquet::ParquetUploadArgs; use proxy::context::parquet::ParquetUploadArgs;
use proxy::http; use proxy::http;
use proxy::http::health_server::AppMetrics; use proxy::metrics::NUM_CANCELLATION_REQUESTS_SOURCE_FROM_CLIENT;
use proxy::metrics::Metrics; use proxy::rate_limiter::AuthRateLimiter;
use proxy::rate_limiter::EndpointRateLimiter; use proxy::rate_limiter::EndpointRateLimiter;
use proxy::rate_limiter::RateBucketInfo; use proxy::rate_limiter::RateBucketInfo;
use proxy::rate_limiter::RateLimiterConfig;
use proxy::redis::cancellation_publisher::RedisPublisherClient; use proxy::redis::cancellation_publisher::RedisPublisherClient;
use proxy::redis::connection_with_credentials_provider::ConnectionWithCredentialsProvider; use proxy::redis::connection_with_credentials_provider::ConnectionWithCredentialsProvider;
use proxy::redis::elasticache; use proxy::redis::elasticache;
@@ -42,7 +42,6 @@ use tokio::task::JoinSet;
use tokio_util::sync::CancellationToken; use tokio_util::sync::CancellationToken;
use tracing::info; use tracing::info;
use tracing::warn; use tracing::warn;
use tracing::Instrument;
use utils::{project_build_tag, project_git_version, sentry_init::init_sentry}; use utils::{project_build_tag, project_git_version, sentry_init::init_sentry};
project_git_version!(GIT_VERSION); project_git_version!(GIT_VERSION);
@@ -132,8 +131,14 @@ struct ProxyCliArgs {
#[clap(long, default_value_t = false, value_parser = clap::builder::BoolishValueParser::new(), action = clap::ArgAction::Set)] #[clap(long, default_value_t = false, value_parser = clap::builder::BoolishValueParser::new(), action = clap::ArgAction::Set)]
require_client_ip: bool, require_client_ip: bool,
/// Disable dynamic rate limiter and store the metrics to ensure its production behaviour. /// Disable dynamic rate limiter and store the metrics to ensure its production behaviour.
#[clap(long, default_value_t = true, value_parser = clap::builder::BoolishValueParser::new(), action = clap::ArgAction::Set)] #[clap(long, default_value_t = false, value_parser = clap::builder::BoolishValueParser::new(), action = clap::ArgAction::Set)]
disable_dynamic_rate_limiter: bool, disable_dynamic_rate_limiter: bool,
/// Rate limit algorithm. Makes sense only if `disable_rate_limiter` is `false`.
#[clap(value_enum, long, default_value_t = proxy::rate_limiter::RateLimitAlgorithm::Aimd)]
rate_limit_algorithm: proxy::rate_limiter::RateLimitAlgorithm,
/// Timeout for rate limiter. If it didn't manage to aquire a permit in this time, it will return an error.
#[clap(long, default_value = "15s", value_parser = humantime::parse_duration)]
rate_limiter_timeout: tokio::time::Duration,
/// Endpoint rate limiter max number of requests per second. /// Endpoint rate limiter max number of requests per second.
/// ///
/// Provided in the form '<Requests Per Second>@<Bucket Duration Size>'. /// Provided in the form '<Requests Per Second>@<Bucket Duration Size>'.
@@ -146,12 +151,14 @@ struct ProxyCliArgs {
/// Authentication rate limiter max number of hashes per second. /// Authentication rate limiter max number of hashes per second.
#[clap(long, default_values_t = RateBucketInfo::DEFAULT_AUTH_SET)] #[clap(long, default_values_t = RateBucketInfo::DEFAULT_AUTH_SET)]
auth_rate_limit: Vec<RateBucketInfo>, auth_rate_limit: Vec<RateBucketInfo>,
/// The IP subnet to use when considering whether two IP addresses are considered the same.
#[clap(long, default_value_t = 64)]
auth_rate_limit_ip_subnet: u8,
/// Redis rate limiter max number of requests per second. /// Redis rate limiter max number of requests per second.
#[clap(long, default_values_t = RateBucketInfo::DEFAULT_ENDPOINT_SET)] #[clap(long, default_values_t = RateBucketInfo::DEFAULT_ENDPOINT_SET)]
redis_rps_limit: Vec<RateBucketInfo>, redis_rps_limit: Vec<RateBucketInfo>,
/// Initial limit for dynamic rate limiter. Makes sense only if `rate_limit_algorithm` is *not* `None`.
#[clap(long, default_value_t = 100)]
initial_limit: usize,
#[clap(flatten)]
aimd_config: proxy::rate_limiter::AimdConfig,
/// cache for `allowed_ips` (use `size=0` to disable) /// cache for `allowed_ips` (use `size=0` to disable)
#[clap(long, default_value = config::CacheOptions::CACHE_DEFAULT_OPTIONS)] #[clap(long, default_value = config::CacheOptions::CACHE_DEFAULT_OPTIONS)]
allowed_ips_cache: String, allowed_ips_cache: String,
@@ -182,9 +189,7 @@ struct ProxyCliArgs {
/// cache for `project_info` (use `size=0` to disable) /// cache for `project_info` (use `size=0` to disable)
#[clap(long, default_value = config::ProjectInfoCacheOptions::CACHE_DEFAULT_OPTIONS)] #[clap(long, default_value = config::ProjectInfoCacheOptions::CACHE_DEFAULT_OPTIONS)]
project_info_cache: String, project_info_cache: String,
/// cache for all valid endpoints
#[clap(long, default_value = config::EndpointCacheConfig::CACHE_DEFAULT_OPTIONS)]
endpoint_cache_config: String,
#[clap(flatten)] #[clap(flatten)]
parquet_upload: ParquetUploadArgs, parquet_upload: ParquetUploadArgs,
@@ -244,18 +249,14 @@ async fn main() -> anyhow::Result<()> {
info!("Version: {GIT_VERSION}"); info!("Version: {GIT_VERSION}");
info!("Build_tag: {BUILD_TAG}"); info!("Build_tag: {BUILD_TAG}");
let neon_metrics = ::metrics::NeonMetrics::new(::metrics::BuildInfo { ::metrics::set_build_info_metric(GIT_VERSION, BUILD_TAG);
revision: GIT_VERSION,
build_tag: BUILD_TAG,
});
let jemalloc = match proxy::jemalloc::MetricRecorder::new() { match proxy::jemalloc::MetricRecorder::new(prometheus::default_registry()) {
Ok(t) => Some(t), Ok(t) => {
Err(e) => { t.start();
tracing::error!(error = ?e, "could not start jemalloc metrics loop");
None
} }
}; Err(e) => tracing::error!(error = ?e, "could not start jemalloc metrics loop"),
}
let args = ProxyCliArgs::parse(); let args = ProxyCliArgs::parse();
let config = build_config(&args)?; let config = build_config(&args)?;
@@ -295,27 +296,27 @@ async fn main() -> anyhow::Result<()> {
), ),
aws_credentials_provider, aws_credentials_provider,
)); ));
let regional_redis_client = match (args.redis_host, args.redis_port) { let redis_notifications_client =
(Some(host), Some(port)) => Some( match (args.redis_notifications, (args.redis_host, args.redis_port)) {
ConnectionWithCredentialsProvider::new_with_credentials_provider( (Some(url), _) => {
host, info!("Starting redis notifications listener ({url})");
port, Some(ConnectionWithCredentialsProvider::new_with_static_credentials(url))
elasticache_credentials_provider.clone(), }
(None, (Some(host), Some(port))) => Some(
ConnectionWithCredentialsProvider::new_with_credentials_provider(
host,
port,
elasticache_credentials_provider.clone(),
),
), ),
), (None, (None, None)) => {
(None, None) => { warn!("Redis is disabled");
warn!("Redis events from console are disabled"); None
None }
} _ => {
_ => { bail!("redis-host and redis-port must be specified together");
bail!("redis-host and redis-port must be specified together"); }
} };
};
let redis_notifications_client = if let Some(url) = args.redis_notifications {
Some(ConnectionWithCredentialsProvider::new_with_static_credentials(url))
} else {
regional_redis_client.clone()
};
// Check that we can bind to address before further initialization // Check that we can bind to address before further initialization
let http_address: SocketAddr = args.http.parse()?; let http_address: SocketAddr = args.http.parse()?;
@@ -331,9 +332,11 @@ async fn main() -> anyhow::Result<()> {
let proxy_listener = TcpListener::bind(proxy_address).await?; let proxy_listener = TcpListener::bind(proxy_address).await?;
let cancellation_token = CancellationToken::new(); let cancellation_token = CancellationToken::new();
let endpoint_rate_limiter = Arc::new(EndpointRateLimiter::new(&config.endpoint_rps_limit));
let cancel_map = CancelMap::default(); let cancel_map = CancelMap::default();
let redis_publisher = match &regional_redis_client { // let redis_notifications_client = redis_notifications_client.map(|x| Box::leak(Box::new(x)));
let redis_publisher = match &redis_notifications_client {
Some(redis_publisher) => Some(Arc::new(Mutex::new(RedisPublisherClient::new( Some(redis_publisher) => Some(Arc::new(Mutex::new(RedisPublisherClient::new(
redis_publisher.clone(), redis_publisher.clone(),
args.region.clone(), args.region.clone(),
@@ -346,7 +349,7 @@ async fn main() -> anyhow::Result<()> {
>::new( >::new(
cancel_map.clone(), cancel_map.clone(),
redis_publisher, redis_publisher,
proxy::metrics::CancellationSource::FromClient, NUM_CANCELLATION_REQUESTS_SOURCE_FROM_CLIENT,
)); ));
// client facing tasks. these will exit on error or on cancellation // client facing tasks. these will exit on error or on cancellation
@@ -356,6 +359,7 @@ async fn main() -> anyhow::Result<()> {
config, config,
proxy_listener, proxy_listener,
cancellation_token.clone(), cancellation_token.clone(),
endpoint_rate_limiter.clone(),
cancellation_handler.clone(), cancellation_handler.clone(),
)); ));
@@ -370,6 +374,7 @@ async fn main() -> anyhow::Result<()> {
config, config,
serverless_listener, serverless_listener,
cancellation_token.clone(), cancellation_token.clone(),
endpoint_rate_limiter.clone(),
cancellation_handler.clone(), cancellation_handler.clone(),
)); ));
} }
@@ -382,14 +387,7 @@ async fn main() -> anyhow::Result<()> {
// maintenance tasks. these never return unless there's an error // maintenance tasks. these never return unless there's an error
let mut maintenance_tasks = JoinSet::new(); let mut maintenance_tasks = JoinSet::new();
maintenance_tasks.spawn(proxy::handle_signals(cancellation_token.clone())); maintenance_tasks.spawn(proxy::handle_signals(cancellation_token.clone()));
maintenance_tasks.spawn(http::health_server::task_main( maintenance_tasks.spawn(http::health_server::task_main(http_listener));
http_listener,
AppMetrics {
jemalloc,
neon_metrics,
proxy: proxy::metrics::Metrics::get(),
},
));
maintenance_tasks.spawn(console::mgmt::task_main(mgmt_listener)); maintenance_tasks.spawn(console::mgmt::task_main(mgmt_listener));
if let Some(metrics_config) = &config.metric_collection { if let Some(metrics_config) = &config.metric_collection {
@@ -406,19 +404,13 @@ async fn main() -> anyhow::Result<()> {
if let Some(redis_notifications_client) = redis_notifications_client { if let Some(redis_notifications_client) = redis_notifications_client {
let cache = api.caches.project_info.clone(); let cache = api.caches.project_info.clone();
maintenance_tasks.spawn(notifications::task_main( maintenance_tasks.spawn(notifications::task_main(
redis_notifications_client, redis_notifications_client.clone(),
cache.clone(), cache.clone(),
cancel_map.clone(), cancel_map.clone(),
args.region.clone(), args.region.clone(),
)); ));
maintenance_tasks.spawn(async move { cache.clone().gc_worker().await }); maintenance_tasks.spawn(async move { cache.clone().gc_worker().await });
} }
if let Some(regional_redis_client) = regional_redis_client {
let cache = api.caches.endpoints_cache.clone();
let con = regional_redis_client;
let span = tracing::info_span!("endpoints_cache");
maintenance_tasks.spawn(async move { cache.do_read(con).await }.instrument(span));
}
} }
} }
@@ -484,27 +476,27 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> {
and metric-collection-interval must be specified" and metric-collection-interval must be specified"
), ),
}; };
if !args.disable_dynamic_rate_limiter { let rate_limiter_config = RateLimiterConfig {
bail!("dynamic rate limiter should be disabled"); disable: args.disable_dynamic_rate_limiter,
} algorithm: args.rate_limit_algorithm,
timeout: args.rate_limiter_timeout,
initial_limit: args.initial_limit,
aimd_config: Some(args.aimd_config),
};
let auth_backend = match &args.auth_backend { let auth_backend = match &args.auth_backend {
AuthBackend::Console => { AuthBackend::Console => {
let wake_compute_cache_config: CacheOptions = args.wake_compute_cache.parse()?; let wake_compute_cache_config: CacheOptions = args.wake_compute_cache.parse()?;
let project_info_cache_config: ProjectInfoCacheOptions = let project_info_cache_config: ProjectInfoCacheOptions =
args.project_info_cache.parse()?; args.project_info_cache.parse()?;
let endpoint_cache_config: config::EndpointCacheConfig =
args.endpoint_cache_config.parse()?;
info!("Using NodeInfoCache (wake_compute) with options={wake_compute_cache_config:?}"); info!("Using NodeInfoCache (wake_compute) with options={wake_compute_cache_config:?}");
info!( info!(
"Using AllowedIpsCache (wake_compute) with options={project_info_cache_config:?}" "Using AllowedIpsCache (wake_compute) with options={project_info_cache_config:?}"
); );
info!("Using EndpointCacheConfig with options={endpoint_cache_config:?}");
let caches = Box::leak(Box::new(console::caches::ApiCaches::new( let caches = Box::leak(Box::new(console::caches::ApiCaches::new(
wake_compute_cache_config, wake_compute_cache_config,
project_info_cache_config, project_info_cache_config,
endpoint_cache_config,
))); )));
let config::WakeComputeLockOptions { let config::WakeComputeLockOptions {
@@ -515,26 +507,15 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> {
} = args.wake_compute_lock.parse()?; } = args.wake_compute_lock.parse()?;
info!(permits, shards, ?epoch, "Using NodeLocks (wake_compute)"); info!(permits, shards, ?epoch, "Using NodeLocks (wake_compute)");
let locks = Box::leak(Box::new( let locks = Box::leak(Box::new(
console::locks::ApiLocks::new( console::locks::ApiLocks::new("wake_compute_lock", permits, shards, timeout)
"wake_compute_lock", .unwrap(),
permits,
shards,
timeout,
epoch,
&Metrics::get().wake_compute_lock,
)
.unwrap(),
)); ));
tokio::spawn(locks.garbage_collect_worker()); tokio::spawn(locks.garbage_collect_worker(epoch));
let url = args.auth_endpoint.parse()?; let url = args.auth_endpoint.parse()?;
let endpoint = http::Endpoint::new(url, http::new_client()); let endpoint = http::Endpoint::new(url, http::new_client(rate_limiter_config));
let mut endpoint_rps_limit = args.endpoint_rps_limit.clone(); let api = console::provider::neon::Api::new(endpoint, caches, locks);
RateBucketInfo::validate(&mut endpoint_rps_limit)?;
let endpoint_rate_limiter = Arc::new(EndpointRateLimiter::new(endpoint_rps_limit));
let api =
console::provider::neon::Api::new(endpoint, caches, locks, endpoint_rate_limiter);
let api = console::provider::ConsoleBackend::Console(api); let api = console::provider::ConsoleBackend::Console(api);
auth::BackendType::Console(MaybeOwned::Owned(api), ()) auth::BackendType::Console(MaybeOwned::Owned(api), ())
} }
@@ -565,9 +546,10 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> {
scram_protocol_timeout: args.scram_protocol_timeout, scram_protocol_timeout: args.scram_protocol_timeout,
rate_limiter_enabled: args.auth_rate_limit_enabled, rate_limiter_enabled: args.auth_rate_limit_enabled,
rate_limiter: AuthRateLimiter::new(args.auth_rate_limit.clone()), rate_limiter: AuthRateLimiter::new(args.auth_rate_limit.clone()),
rate_limit_ip_subnet: args.auth_rate_limit_ip_subnet,
}; };
let mut endpoint_rps_limit = args.endpoint_rps_limit.clone();
RateBucketInfo::validate(&mut endpoint_rps_limit)?;
let mut redis_rps_limit = args.redis_rps_limit.clone(); let mut redis_rps_limit = args.redis_rps_limit.clone();
RateBucketInfo::validate(&mut redis_rps_limit)?; RateBucketInfo::validate(&mut redis_rps_limit)?;
@@ -580,6 +562,7 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> {
authentication_config, authentication_config,
require_client_ip: args.require_client_ip, require_client_ip: args.require_client_ip,
disable_ip_check_for_http: args.disable_ip_check_for_http, disable_ip_check_for_http: args.disable_ip_check_for_http,
endpoint_rps_limit,
redis_rps_limit, redis_rps_limit,
handshake_timeout: args.handshake_timeout, handshake_timeout: args.handshake_timeout,
region: args.region.clone(), region: args.region.clone(),

View File

@@ -1,5 +1,4 @@
pub mod common; pub mod common;
pub mod endpoints;
pub mod project_info; pub mod project_info;
mod timed_lru; mod timed_lru;

View File

@@ -1,227 +0,0 @@
use std::{
convert::Infallible,
sync::{
atomic::{AtomicBool, Ordering},
Arc,
},
};
use dashmap::DashSet;
use redis::{
streams::{StreamReadOptions, StreamReadReply},
AsyncCommands, FromRedisValue, Value,
};
use serde::Deserialize;
use tokio::sync::Mutex;
use tracing::info;
use crate::{
config::EndpointCacheConfig,
context::RequestMonitoring,
intern::{BranchIdInt, EndpointIdInt, ProjectIdInt},
metrics::{Metrics, RedisErrors},
rate_limiter::GlobalRateLimiter,
redis::connection_with_credentials_provider::ConnectionWithCredentialsProvider,
EndpointId,
};
#[derive(Deserialize, Debug, Clone)]
pub struct ControlPlaneEventKey {
endpoint_created: Option<EndpointCreated>,
branch_created: Option<BranchCreated>,
project_created: Option<ProjectCreated>,
}
#[derive(Deserialize, Debug, Clone)]
struct EndpointCreated {
endpoint_id: String,
}
#[derive(Deserialize, Debug, Clone)]
struct BranchCreated {
branch_id: String,
}
#[derive(Deserialize, Debug, Clone)]
struct ProjectCreated {
project_id: String,
}
pub struct EndpointsCache {
config: EndpointCacheConfig,
endpoints: DashSet<EndpointIdInt>,
branches: DashSet<BranchIdInt>,
projects: DashSet<ProjectIdInt>,
ready: AtomicBool,
limiter: Arc<Mutex<GlobalRateLimiter>>,
}
impl EndpointsCache {
pub fn new(config: EndpointCacheConfig) -> Self {
Self {
limiter: Arc::new(Mutex::new(GlobalRateLimiter::new(
config.limiter_info.clone(),
))),
config,
endpoints: DashSet::new(),
branches: DashSet::new(),
projects: DashSet::new(),
ready: AtomicBool::new(false),
}
}
pub async fn is_valid(&self, ctx: &mut RequestMonitoring, endpoint: &EndpointId) -> bool {
if !self.ready.load(Ordering::Acquire) {
return true;
}
let rejected = self.should_reject(endpoint);
ctx.set_rejected(rejected);
info!(?rejected, "check endpoint is valid, disabled cache");
// If cache is disabled, just collect the metrics and return or
// If the limiter allows, we don't need to check the cache.
if self.config.disable_cache || self.limiter.lock().await.check() {
return true;
}
!rejected
}
fn should_reject(&self, endpoint: &EndpointId) -> bool {
if endpoint.is_endpoint() {
!self.endpoints.contains(&EndpointIdInt::from(endpoint))
} else if endpoint.is_branch() {
!self
.branches
.contains(&BranchIdInt::from(&endpoint.as_branch()))
} else {
!self
.projects
.contains(&ProjectIdInt::from(&endpoint.as_project()))
}
}
fn insert_event(&self, key: ControlPlaneEventKey) {
// Do not do normalization here, we expect the events to be normalized.
if let Some(endpoint_created) = key.endpoint_created {
self.endpoints
.insert(EndpointIdInt::from(&endpoint_created.endpoint_id.into()));
}
if let Some(branch_created) = key.branch_created {
self.branches
.insert(BranchIdInt::from(&branch_created.branch_id.into()));
}
if let Some(project_created) = key.project_created {
self.projects
.insert(ProjectIdInt::from(&project_created.project_id.into()));
}
}
pub async fn do_read(
&self,
mut con: ConnectionWithCredentialsProvider,
) -> anyhow::Result<Infallible> {
let mut last_id = "0-0".to_string();
loop {
self.ready.store(false, Ordering::Release);
if let Err(e) = con.connect().await {
tracing::error!("error connecting to redis: {:?}", e);
continue;
}
if let Err(e) = self.read_from_stream(&mut con, &mut last_id).await {
tracing::error!("error reading from redis: {:?}", e);
}
tokio::time::sleep(self.config.retry_interval).await;
}
}
async fn read_from_stream(
&self,
con: &mut ConnectionWithCredentialsProvider,
last_id: &mut String,
) -> anyhow::Result<()> {
tracing::info!("reading endpoints/branches/projects from redis");
self.batch_read(
con,
StreamReadOptions::default().count(self.config.initial_batch_size),
last_id,
true,
)
.await?;
tracing::info!("ready to filter user requests");
self.ready.store(true, Ordering::Release);
self.batch_read(
con,
StreamReadOptions::default()
.count(self.config.default_batch_size)
.block(self.config.xread_timeout.as_millis() as usize),
last_id,
false,
)
.await
}
fn parse_key_value(value: &Value) -> anyhow::Result<ControlPlaneEventKey> {
let s: String = FromRedisValue::from_redis_value(value)?;
Ok(serde_json::from_str(&s)?)
}
async fn batch_read(
&self,
conn: &mut ConnectionWithCredentialsProvider,
opts: StreamReadOptions,
last_id: &mut String,
return_when_finish: bool,
) -> anyhow::Result<()> {
let mut total: usize = 0;
loop {
let mut res: StreamReadReply = conn
.xread_options(&[&self.config.stream_name], &[last_id.as_str()], &opts)
.await?;
if res.keys.is_empty() {
if return_when_finish {
if total != 0 {
break;
}
anyhow::bail!(
"Redis stream {} is empty, cannot be used to filter endpoints",
self.config.stream_name
);
}
// If we are not returning when finish, we should wait for more data.
continue;
}
if res.keys.len() != 1 {
anyhow::bail!("Cannot read from redis stream {}", self.config.stream_name);
}
let res = res.keys.pop().expect("Checked length above");
let len = res.ids.len();
for x in res.ids {
total += 1;
for (_, v) in x.map {
let key = match Self::parse_key_value(&v) {
Ok(x) => x,
Err(e) => {
Metrics::get().proxy.redis_errors_total.inc(RedisErrors {
channel: &self.config.stream_name,
});
tracing::error!("error parsing value {v:?}: {e:?}");
continue;
}
};
self.insert_event(key);
}
if total.is_power_of_two() {
tracing::debug!("endpoints read {}", total);
}
*last_id = x.id;
}
if return_when_finish && len <= self.config.default_batch_size {
break;
}
}
tracing::info!("read {} endpoints/branches/projects from redis", total);
Ok(())
}
}
#[cfg(test)]
mod tests {
use super::ControlPlaneEventKey;
#[test]
fn test() {
let s = "{\"branch_created\":null,\"endpoint_created\":{\"endpoint_id\":\"ep-rapid-thunder-w0qqw2q9\"},\"project_created\":null,\"type\":\"endpoint_created\"}";
let _: ControlPlaneEventKey = serde_json::from_str(s).unwrap();
}
}

View File

@@ -10,7 +10,7 @@ use uuid::Uuid;
use crate::{ use crate::{
error::ReportableError, error::ReportableError,
metrics::{CancellationRequest, CancellationSource, Metrics}, metrics::NUM_CANCELLATION_REQUESTS,
redis::cancellation_publisher::{ redis::cancellation_publisher::{
CancellationPublisher, CancellationPublisherMut, RedisPublisherClient, CancellationPublisher, CancellationPublisherMut, RedisPublisherClient,
}, },
@@ -28,7 +28,7 @@ pub struct CancellationHandler<P> {
client: P, client: P,
/// This field used for the monitoring purposes. /// This field used for the monitoring purposes.
/// Represents the source of the cancellation request. /// Represents the source of the cancellation request.
from: CancellationSource, from: &'static str,
} }
#[derive(Debug, Error)] #[derive(Debug, Error)]
@@ -89,13 +89,9 @@ impl<P: CancellationPublisher> CancellationHandler<P> {
// NB: we should immediately release the lock after cloning the token. // NB: we should immediately release the lock after cloning the token.
let Some(cancel_closure) = self.map.get(&key).and_then(|x| x.clone()) else { let Some(cancel_closure) = self.map.get(&key).and_then(|x| x.clone()) else {
tracing::warn!("query cancellation key not found: {key}"); tracing::warn!("query cancellation key not found: {key}");
Metrics::get() NUM_CANCELLATION_REQUESTS
.proxy .with_label_values(&[self.from, "not_found"])
.cancellation_requests_total .inc();
.inc(CancellationRequest {
source: self.from,
kind: crate::metrics::CancellationOutcome::NotFound,
});
match self.client.try_publish(key, session_id).await { match self.client.try_publish(key, session_id).await {
Ok(()) => {} // do nothing Ok(()) => {} // do nothing
Err(e) => { Err(e) => {
@@ -107,13 +103,9 @@ impl<P: CancellationPublisher> CancellationHandler<P> {
} }
return Ok(()); return Ok(());
}; };
Metrics::get() NUM_CANCELLATION_REQUESTS
.proxy .with_label_values(&[self.from, "found"])
.cancellation_requests_total .inc();
.inc(CancellationRequest {
source: self.from,
kind: crate::metrics::CancellationOutcome::Found,
});
info!("cancelling query per user's request using key {key}"); info!("cancelling query per user's request using key {key}");
cancel_closure.try_cancel_query().await cancel_closure.try_cancel_query().await
} }
@@ -130,7 +122,7 @@ impl<P: CancellationPublisher> CancellationHandler<P> {
} }
impl CancellationHandler<()> { impl CancellationHandler<()> {
pub fn new(map: CancelMap, from: CancellationSource) -> Self { pub fn new(map: CancelMap, from: &'static str) -> Self {
Self { Self {
map, map,
client: (), client: (),
@@ -140,7 +132,7 @@ impl CancellationHandler<()> {
} }
impl<P: CancellationPublisherMut> CancellationHandler<Option<Arc<Mutex<P>>>> { impl<P: CancellationPublisherMut> CancellationHandler<Option<Arc<Mutex<P>>>> {
pub fn new(map: CancelMap, client: Option<Arc<Mutex<P>>>, from: CancellationSource) -> Self { pub fn new(map: CancelMap, client: Option<Arc<Mutex<P>>>, from: &'static str) -> Self {
Self { map, client, from } Self { map, client, from }
} }
} }
@@ -200,13 +192,15 @@ impl<P> Drop for Session<P> {
#[cfg(test)] #[cfg(test)]
mod tests { mod tests {
use crate::metrics::NUM_CANCELLATION_REQUESTS_SOURCE_FROM_REDIS;
use super::*; use super::*;
#[tokio::test] #[tokio::test]
async fn check_session_drop() -> anyhow::Result<()> { async fn check_session_drop() -> anyhow::Result<()> {
let cancellation_handler = Arc::new(CancellationHandler::<()>::new( let cancellation_handler = Arc::new(CancellationHandler::<()>::new(
CancelMap::default(), CancelMap::default(),
CancellationSource::FromRedis, NUM_CANCELLATION_REQUESTS_SOURCE_FROM_REDIS,
)); ));
let session = cancellation_handler.clone().get_session(); let session = cancellation_handler.clone().get_session();
@@ -220,7 +214,7 @@ mod tests {
#[tokio::test] #[tokio::test]
async fn cancel_session_noop_regression() { async fn cancel_session_noop_regression() {
let handler = CancellationHandler::<()>::new(Default::default(), CancellationSource::Local); let handler = CancellationHandler::<()>::new(Default::default(), "local");
handler handler
.cancel_session( .cancel_session(
CancelKeyData { CancelKeyData {

View File

@@ -4,11 +4,12 @@ use crate::{
console::{errors::WakeComputeError, messages::MetricsAuxInfo}, console::{errors::WakeComputeError, messages::MetricsAuxInfo},
context::RequestMonitoring, context::RequestMonitoring,
error::{ReportableError, UserFacingError}, error::{ReportableError, UserFacingError},
metrics::{Metrics, NumDbConnectionsGuard}, metrics::NUM_DB_CONNECTIONS_GAUGE,
proxy::neon_option, proxy::neon_option,
}; };
use futures::{FutureExt, TryFutureExt}; use futures::{FutureExt, TryFutureExt};
use itertools::Itertools; use itertools::Itertools;
use metrics::IntCounterPairGuard;
use pq_proto::StartupMessageParams; use pq_proto::StartupMessageParams;
use std::{io, net::SocketAddr, time::Duration}; use std::{io, net::SocketAddr, time::Duration};
use thiserror::Error; use thiserror::Error;
@@ -248,7 +249,7 @@ pub struct PostgresConnection {
/// Labels for proxy's metrics. /// Labels for proxy's metrics.
pub aux: MetricsAuxInfo, pub aux: MetricsAuxInfo,
_guage: NumDbConnectionsGuard<'static>, _guage: IntCounterPairGuard,
} }
impl ConnCfg { impl ConnCfg {
@@ -294,7 +295,9 @@ impl ConnCfg {
params, params,
cancel_closure, cancel_closure,
aux, aux,
_guage: Metrics::get().proxy.db_connections.guard(ctx.protocol), _guage: NUM_DB_CONNECTIONS_GAUGE
.with_label_values(&[ctx.protocol])
.guard(),
}; };
Ok(connection) Ok(connection)

View File

@@ -1,6 +1,6 @@
use crate::{ use crate::{
auth::{self, backend::AuthRateLimiter}, auth,
rate_limiter::RateBucketInfo, rate_limiter::{AuthRateLimiter, RateBucketInfo},
serverless::GlobalConnPoolOptions, serverless::GlobalConnPoolOptions,
}; };
use anyhow::{bail, ensure, Context, Ok}; use anyhow::{bail, ensure, Context, Ok};
@@ -29,6 +29,7 @@ pub struct ProxyConfig {
pub authentication_config: AuthenticationConfig, pub authentication_config: AuthenticationConfig,
pub require_client_ip: bool, pub require_client_ip: bool,
pub disable_ip_check_for_http: bool, pub disable_ip_check_for_http: bool,
pub endpoint_rps_limit: Vec<RateBucketInfo>,
pub redis_rps_limit: Vec<RateBucketInfo>, pub redis_rps_limit: Vec<RateBucketInfo>,
pub region: String, pub region: String,
pub handshake_timeout: Duration, pub handshake_timeout: Duration,
@@ -57,7 +58,6 @@ pub struct AuthenticationConfig {
pub scram_protocol_timeout: tokio::time::Duration, pub scram_protocol_timeout: tokio::time::Duration,
pub rate_limiter_enabled: bool, pub rate_limiter_enabled: bool,
pub rate_limiter: AuthRateLimiter, pub rate_limiter: AuthRateLimiter,
pub rate_limit_ip_subnet: u8,
} }
impl TlsConfig { impl TlsConfig {
@@ -313,80 +313,6 @@ impl CertResolver {
} }
} }
#[derive(Debug)]
pub struct EndpointCacheConfig {
/// Batch size to receive all endpoints on the startup.
pub initial_batch_size: usize,
/// Batch size to receive endpoints.
pub default_batch_size: usize,
/// Timeouts for the stream read operation.
pub xread_timeout: Duration,
/// Stream name to read from.
pub stream_name: String,
/// Limiter info (to distinguish when to enable cache).
pub limiter_info: Vec<RateBucketInfo>,
/// Disable cache.
/// If true, cache is ignored, but reports all statistics.
pub disable_cache: bool,
/// Retry interval for the stream read operation.
pub retry_interval: Duration,
}
impl EndpointCacheConfig {
/// Default options for [`crate::console::provider::NodeInfoCache`].
/// Notice that by default the limiter is empty, which means that cache is disabled.
pub const CACHE_DEFAULT_OPTIONS: &'static str =
"initial_batch_size=1000,default_batch_size=10,xread_timeout=5m,stream_name=controlPlane,disable_cache=true,limiter_info=1000@1s,retry_interval=1s";
/// Parse cache options passed via cmdline.
/// Example: [`Self::CACHE_DEFAULT_OPTIONS`].
fn parse(options: &str) -> anyhow::Result<Self> {
let mut initial_batch_size = None;
let mut default_batch_size = None;
let mut xread_timeout = None;
let mut stream_name = None;
let mut limiter_info = vec![];
let mut disable_cache = false;
let mut retry_interval = None;
for option in options.split(',') {
let (key, value) = option
.split_once('=')
.with_context(|| format!("bad key-value pair: {option}"))?;
match key {
"initial_batch_size" => initial_batch_size = Some(value.parse()?),
"default_batch_size" => default_batch_size = Some(value.parse()?),
"xread_timeout" => xread_timeout = Some(humantime::parse_duration(value)?),
"stream_name" => stream_name = Some(value.to_string()),
"limiter_info" => limiter_info.push(RateBucketInfo::from_str(value)?),
"disable_cache" => disable_cache = value.parse()?,
"retry_interval" => retry_interval = Some(humantime::parse_duration(value)?),
unknown => bail!("unknown key: {unknown}"),
}
}
RateBucketInfo::validate(&mut limiter_info)?;
Ok(Self {
initial_batch_size: initial_batch_size.context("missing `initial_batch_size`")?,
default_batch_size: default_batch_size.context("missing `default_batch_size`")?,
xread_timeout: xread_timeout.context("missing `xread_timeout`")?,
stream_name: stream_name.context("missing `stream_name`")?,
disable_cache,
limiter_info,
retry_interval: retry_interval.context("missing `retry_interval`")?,
})
}
}
impl FromStr for EndpointCacheConfig {
type Err = anyhow::Error;
fn from_str(options: &str) -> Result<Self, Self::Err> {
let error = || format!("failed to parse endpoint cache options '{options}'");
Self::parse(options).with_context(error)
}
}
#[derive(Debug)] #[derive(Debug)]
pub struct MetricBackupCollectionConfig { pub struct MetricBackupCollectionConfig {
pub interval: Duration, pub interval: Duration,

View File

@@ -1,4 +1,3 @@
use measured::FixedCardinalityLabel;
use serde::{Deserialize, Serialize}; use serde::{Deserialize, Serialize};
use std::fmt; use std::fmt;
@@ -103,7 +102,7 @@ pub struct MetricsAuxInfo {
pub cold_start_info: ColdStartInfo, pub cold_start_info: ColdStartInfo,
} }
#[derive(Debug, Default, Serialize, Deserialize, Clone, Copy, FixedCardinalityLabel)] #[derive(Debug, Default, Serialize, Deserialize, Clone, Copy)]
#[serde(rename_all = "snake_case")] #[serde(rename_all = "snake_case")]
pub enum ColdStartInfo { pub enum ColdStartInfo {
#[default] #[default]
@@ -111,11 +110,9 @@ pub enum ColdStartInfo {
/// Compute was already running /// Compute was already running
Warm, Warm,
#[serde(rename = "pool_hit")] #[serde(rename = "pool_hit")]
#[label(rename = "pool_hit")]
/// Compute was not running but there was an available VM /// Compute was not running but there was an available VM
VmPoolHit, VmPoolHit,
#[serde(rename = "pool_miss")] #[serde(rename = "pool_miss")]
#[label(rename = "pool_miss")]
/// Compute was not running and there were no VMs available /// Compute was not running and there were no VMs available
VmPoolMiss, VmPoolMiss,

View File

@@ -8,12 +8,11 @@ use crate::{
backend::{ComputeCredentialKeys, ComputeUserInfo}, backend::{ComputeCredentialKeys, ComputeUserInfo},
IpPattern, IpPattern,
}, },
cache::{endpoints::EndpointsCache, project_info::ProjectInfoCacheImpl, Cached, TimedLru}, cache::{project_info::ProjectInfoCacheImpl, Cached, TimedLru},
compute, compute,
config::{CacheOptions, EndpointCacheConfig, ProjectInfoCacheOptions}, config::{CacheOptions, ProjectInfoCacheOptions},
context::RequestMonitoring, context::RequestMonitoring,
intern::ProjectIdInt, intern::ProjectIdInt,
metrics::ApiLockMetrics,
scram, EndpointCacheKey, scram, EndpointCacheKey,
}; };
use dashmap::DashMap; use dashmap::DashMap;
@@ -208,9 +207,6 @@ pub mod errors {
#[error(transparent)] #[error(transparent)]
ApiError(ApiError), ApiError(ApiError),
#[error("Too many connections attempts")]
TooManyConnections,
#[error("Timeout waiting to acquire wake compute lock")] #[error("Timeout waiting to acquire wake compute lock")]
TimeoutError, TimeoutError,
} }
@@ -243,8 +239,6 @@ pub mod errors {
// However, API might return a meaningful error. // However, API might return a meaningful error.
ApiError(e) => e.to_string_client(), ApiError(e) => e.to_string_client(),
TooManyConnections => self.to_string(),
TimeoutError => "timeout while acquiring the compute resource lock".to_owned(), TimeoutError => "timeout while acquiring the compute resource lock".to_owned(),
} }
} }
@@ -255,7 +249,6 @@ pub mod errors {
match self { match self {
WakeComputeError::BadComputeAddress(_) => crate::error::ErrorKind::ControlPlane, WakeComputeError::BadComputeAddress(_) => crate::error::ErrorKind::ControlPlane,
WakeComputeError::ApiError(e) => e.get_error_kind(), WakeComputeError::ApiError(e) => e.get_error_kind(),
WakeComputeError::TooManyConnections => crate::error::ErrorKind::RateLimit,
WakeComputeError::TimeoutError => crate::error::ErrorKind::ServiceRateLimit, WakeComputeError::TimeoutError => crate::error::ErrorKind::ServiceRateLimit,
} }
} }
@@ -423,15 +416,12 @@ pub struct ApiCaches {
pub node_info: NodeInfoCache, pub node_info: NodeInfoCache,
/// Cache which stores project_id -> endpoint_ids mapping. /// Cache which stores project_id -> endpoint_ids mapping.
pub project_info: Arc<ProjectInfoCacheImpl>, pub project_info: Arc<ProjectInfoCacheImpl>,
/// List of all valid endpoints.
pub endpoints_cache: Arc<EndpointsCache>,
} }
impl ApiCaches { impl ApiCaches {
pub fn new( pub fn new(
wake_compute_cache_config: CacheOptions, wake_compute_cache_config: CacheOptions,
project_info_cache_config: ProjectInfoCacheOptions, project_info_cache_config: ProjectInfoCacheOptions,
endpoint_cache_config: EndpointCacheConfig,
) -> Self { ) -> Self {
Self { Self {
node_info: NodeInfoCache::new( node_info: NodeInfoCache::new(
@@ -441,7 +431,6 @@ impl ApiCaches {
true, true,
), ),
project_info: Arc::new(ProjectInfoCacheImpl::new(project_info_cache_config)), project_info: Arc::new(ProjectInfoCacheImpl::new(project_info_cache_config)),
endpoints_cache: Arc::new(EndpointsCache::new(endpoint_cache_config)),
} }
} }
} }
@@ -452,8 +441,10 @@ pub struct ApiLocks {
node_locks: DashMap<EndpointCacheKey, Arc<Semaphore>>, node_locks: DashMap<EndpointCacheKey, Arc<Semaphore>>,
permits: usize, permits: usize,
timeout: Duration, timeout: Duration,
epoch: std::time::Duration, registered: prometheus::IntCounter,
metrics: &'static ApiLockMetrics, unregistered: prometheus::IntCounter,
reclamation_lag: prometheus::Histogram,
lock_acquire_lag: prometheus::Histogram,
} }
impl ApiLocks { impl ApiLocks {
@@ -462,16 +453,54 @@ impl ApiLocks {
permits: usize, permits: usize,
shards: usize, shards: usize,
timeout: Duration, timeout: Duration,
epoch: std::time::Duration,
metrics: &'static ApiLockMetrics,
) -> prometheus::Result<Self> { ) -> prometheus::Result<Self> {
let registered = prometheus::IntCounter::with_opts(
prometheus::Opts::new(
"semaphores_registered",
"Number of semaphores registered in this api lock",
)
.namespace(name),
)?;
prometheus::register(Box::new(registered.clone()))?;
let unregistered = prometheus::IntCounter::with_opts(
prometheus::Opts::new(
"semaphores_unregistered",
"Number of semaphores unregistered in this api lock",
)
.namespace(name),
)?;
prometheus::register(Box::new(unregistered.clone()))?;
let reclamation_lag = prometheus::Histogram::with_opts(
prometheus::HistogramOpts::new(
"reclamation_lag_seconds",
"Time it takes to reclaim unused semaphores in the api lock",
)
.namespace(name)
// 1us -> 65ms
// benchmarks on my mac indicate it's usually in the range of 256us and 512us
.buckets(prometheus::exponential_buckets(1e-6, 2.0, 16)?),
)?;
prometheus::register(Box::new(reclamation_lag.clone()))?;
let lock_acquire_lag = prometheus::Histogram::with_opts(
prometheus::HistogramOpts::new(
"semaphore_acquire_seconds",
"Time it takes to reclaim unused semaphores in the api lock",
)
.namespace(name)
// 0.1ms -> 6s
.buckets(prometheus::exponential_buckets(1e-4, 2.0, 16)?),
)?;
prometheus::register(Box::new(lock_acquire_lag.clone()))?;
Ok(Self { Ok(Self {
name, name,
node_locks: DashMap::with_shard_amount(shards), node_locks: DashMap::with_shard_amount(shards),
permits, permits,
timeout, timeout,
epoch, lock_acquire_lag,
metrics, registered,
unregistered,
reclamation_lag,
}) })
} }
@@ -491,7 +520,7 @@ impl ApiLocks {
self.node_locks self.node_locks
.entry(key.clone()) .entry(key.clone())
.or_insert_with(|| { .or_insert_with(|| {
self.metrics.semaphores_registered.inc(); self.registered.inc();
Arc::new(Semaphore::new(self.permits)) Arc::new(Semaphore::new(self.permits))
}) })
.clone() .clone()
@@ -499,21 +528,20 @@ impl ApiLocks {
}; };
let permit = tokio::time::timeout_at(now + self.timeout, semaphore.acquire_owned()).await; let permit = tokio::time::timeout_at(now + self.timeout, semaphore.acquire_owned()).await;
self.metrics self.lock_acquire_lag
.semaphore_acquire_seconds .observe((Instant::now() - now).as_secs_f64());
.observe(now.elapsed().as_secs_f64());
Ok(WakeComputePermit { Ok(WakeComputePermit {
permit: Some(permit??), permit: Some(permit??),
}) })
} }
pub async fn garbage_collect_worker(&self) { pub async fn garbage_collect_worker(&self, epoch: std::time::Duration) {
if self.permits == 0 { if self.permits == 0 {
return; return;
} }
let mut interval =
tokio::time::interval(self.epoch / (self.node_locks.shards().len()) as u32); let mut interval = tokio::time::interval(epoch / (self.node_locks.shards().len()) as u32);
loop { loop {
for (i, shard) in self.node_locks.shards().iter().enumerate() { for (i, shard) in self.node_locks.shards().iter().enumerate() {
interval.tick().await; interval.tick().await;
@@ -526,13 +554,13 @@ impl ApiLocks {
"performing epoch reclamation on api lock" "performing epoch reclamation on api lock"
); );
let mut lock = shard.write(); let mut lock = shard.write();
let timer = self.metrics.reclamation_lag_seconds.start_timer(); let timer = self.reclamation_lag.start_timer();
let count = lock let count = lock
.extract_if(|_, semaphore| Arc::strong_count(semaphore.get_mut()) == 1) .extract_if(|_, semaphore| Arc::strong_count(semaphore.get_mut()) == 1)
.count(); .count();
drop(lock); drop(lock);
self.metrics.semaphores_unregistered.inc_by(count as u64); self.unregistered.inc_by(count as u64);
timer.observe(); timer.observe_duration()
} }
} }
} }

View File

@@ -7,15 +7,13 @@ use super::{
NodeInfo, NodeInfo,
}; };
use crate::{ use crate::{
auth::backend::ComputeUserInfo, auth::backend::ComputeUserInfo, compute, console::messages::ColdStartInfo, http, scram,
compute, };
console::messages::ColdStartInfo, use crate::{
http, cache::Cached,
metrics::{CacheOutcome, Metrics}, context::RequestMonitoring,
rate_limiter::EndpointRateLimiter, metrics::{ALLOWED_IPS_BY_CACHE_OUTCOME, ALLOWED_IPS_NUMBER},
scram, Normalize,
}; };
use crate::{cache::Cached, context::RequestMonitoring};
use futures::TryFutureExt; use futures::TryFutureExt;
use std::sync::Arc; use std::sync::Arc;
use tokio::time::Instant; use tokio::time::Instant;
@@ -25,8 +23,7 @@ use tracing::{error, info, info_span, warn, Instrument};
pub struct Api { pub struct Api {
endpoint: http::Endpoint, endpoint: http::Endpoint,
pub caches: &'static ApiCaches, pub caches: &'static ApiCaches,
pub locks: &'static ApiLocks, locks: &'static ApiLocks,
pub endpoint_rate_limiter: Arc<EndpointRateLimiter>,
jwt: String, jwt: String,
} }
@@ -36,7 +33,6 @@ impl Api {
endpoint: http::Endpoint, endpoint: http::Endpoint,
caches: &'static ApiCaches, caches: &'static ApiCaches,
locks: &'static ApiLocks, locks: &'static ApiLocks,
endpoint_rate_limiter: Arc<EndpointRateLimiter>,
) -> Self { ) -> Self {
let jwt: String = match std::env::var("NEON_PROXY_TO_CONTROLPLANE_TOKEN") { let jwt: String = match std::env::var("NEON_PROXY_TO_CONTROLPLANE_TOKEN") {
Ok(v) => v, Ok(v) => v,
@@ -46,7 +42,6 @@ impl Api {
endpoint, endpoint,
caches, caches,
locks, locks,
endpoint_rate_limiter,
jwt, jwt,
} }
} }
@@ -60,15 +55,6 @@ impl Api {
ctx: &mut RequestMonitoring, ctx: &mut RequestMonitoring,
user_info: &ComputeUserInfo, user_info: &ComputeUserInfo,
) -> Result<AuthInfo, GetAuthInfoError> { ) -> Result<AuthInfo, GetAuthInfoError> {
if !self
.caches
.endpoints_cache
.is_valid(ctx, &user_info.endpoint.normalize())
.await
{
info!("endpoint is not valid, skipping the request");
return Ok(AuthInfo::default());
}
let request_id = ctx.session_id.to_string(); let request_id = ctx.session_id.to_string();
let application_name = ctx.console_application_name(); let application_name = ctx.console_application_name();
async { async {
@@ -95,9 +81,7 @@ impl Api {
Ok(body) => body, Ok(body) => body,
// Error 404 is special: it's ok not to have a secret. // Error 404 is special: it's ok not to have a secret.
Err(e) => match e.http_status_code() { Err(e) => match e.http_status_code() {
Some(http::StatusCode::NOT_FOUND) => { Some(http::StatusCode::NOT_FOUND) => return Ok(AuthInfo::default()),
return Ok(AuthInfo::default());
}
_otherwise => return Err(e.into()), _otherwise => return Err(e.into()),
}, },
}; };
@@ -111,10 +95,7 @@ impl Api {
Some(secret) Some(secret)
}; };
let allowed_ips = body.allowed_ips.unwrap_or_default(); let allowed_ips = body.allowed_ips.unwrap_or_default();
Metrics::get() ALLOWED_IPS_NUMBER.observe(allowed_ips.len() as f64);
.proxy
.allowed_ips_number
.observe(allowed_ips.len() as f64);
Ok(AuthInfo { Ok(AuthInfo {
secret, secret,
allowed_ips, allowed_ips,
@@ -193,27 +174,23 @@ impl super::Api for Api {
ctx: &mut RequestMonitoring, ctx: &mut RequestMonitoring,
user_info: &ComputeUserInfo, user_info: &ComputeUserInfo,
) -> Result<CachedRoleSecret, GetAuthInfoError> { ) -> Result<CachedRoleSecret, GetAuthInfoError> {
let normalized_ep = &user_info.endpoint.normalize(); let ep = &user_info.endpoint;
let user = &user_info.user; let user = &user_info.user;
if let Some(role_secret) = self if let Some(role_secret) = self.caches.project_info.get_role_secret(ep, user) {
.caches
.project_info
.get_role_secret(normalized_ep, user)
{
return Ok(role_secret); return Ok(role_secret);
} }
let auth_info = self.do_get_auth_info(ctx, user_info).await?; let auth_info = self.do_get_auth_info(ctx, user_info).await?;
if let Some(project_id) = auth_info.project_id { if let Some(project_id) = auth_info.project_id {
let normalized_ep_int = normalized_ep.into(); let ep_int = ep.into();
self.caches.project_info.insert_role_secret( self.caches.project_info.insert_role_secret(
project_id, project_id,
normalized_ep_int, ep_int,
user.into(), user.into(),
auth_info.secret.clone(), auth_info.secret.clone(),
); );
self.caches.project_info.insert_allowed_ips( self.caches.project_info.insert_allowed_ips(
project_id, project_id,
normalized_ep_int, ep_int,
Arc::new(auth_info.allowed_ips), Arc::new(auth_info.allowed_ips),
); );
ctx.set_project_id(project_id); ctx.set_project_id(project_id);
@@ -227,34 +204,30 @@ impl super::Api for Api {
ctx: &mut RequestMonitoring, ctx: &mut RequestMonitoring,
user_info: &ComputeUserInfo, user_info: &ComputeUserInfo,
) -> Result<(CachedAllowedIps, Option<CachedRoleSecret>), GetAuthInfoError> { ) -> Result<(CachedAllowedIps, Option<CachedRoleSecret>), GetAuthInfoError> {
let normalized_ep = &user_info.endpoint.normalize(); let ep = &user_info.endpoint;
if let Some(allowed_ips) = self.caches.project_info.get_allowed_ips(normalized_ep) { if let Some(allowed_ips) = self.caches.project_info.get_allowed_ips(ep) {
Metrics::get() ALLOWED_IPS_BY_CACHE_OUTCOME
.proxy .with_label_values(&["hit"])
.allowed_ips_cache_misses .inc();
.inc(CacheOutcome::Hit);
return Ok((allowed_ips, None)); return Ok((allowed_ips, None));
} }
Metrics::get() ALLOWED_IPS_BY_CACHE_OUTCOME
.proxy .with_label_values(&["miss"])
.allowed_ips_cache_misses .inc();
.inc(CacheOutcome::Miss);
let auth_info = self.do_get_auth_info(ctx, user_info).await?; let auth_info = self.do_get_auth_info(ctx, user_info).await?;
let allowed_ips = Arc::new(auth_info.allowed_ips); let allowed_ips = Arc::new(auth_info.allowed_ips);
let user = &user_info.user; let user = &user_info.user;
if let Some(project_id) = auth_info.project_id { if let Some(project_id) = auth_info.project_id {
let normalized_ep_int = normalized_ep.into(); let ep_int = ep.into();
self.caches.project_info.insert_role_secret( self.caches.project_info.insert_role_secret(
project_id, project_id,
normalized_ep_int, ep_int,
user.into(), user.into(),
auth_info.secret.clone(), auth_info.secret.clone(),
); );
self.caches.project_info.insert_allowed_ips( self.caches
project_id, .project_info
normalized_ep_int, .insert_allowed_ips(project_id, ep_int, allowed_ips.clone());
allowed_ips.clone(),
);
ctx.set_project_id(project_id); ctx.set_project_id(project_id);
} }
Ok(( Ok((
@@ -281,14 +254,6 @@ impl super::Api for Api {
return Ok(cached); return Ok(cached);
} }
// check rate limit
if !self
.endpoint_rate_limiter
.check(user_info.endpoint.normalize().into(), 1)
{
return Err(WakeComputeError::TooManyConnections);
}
let permit = self.locks.get_wake_compute_permit(&key).await?; let permit = self.locks.get_wake_compute_permit(&key).await?;
// after getting back a permit - it's possible the cache was filled // after getting back a permit - it's possible the cache was filled

View File

@@ -5,14 +5,14 @@ use once_cell::sync::OnceCell;
use smol_str::SmolStr; use smol_str::SmolStr;
use std::net::IpAddr; use std::net::IpAddr;
use tokio::sync::mpsc; use tokio::sync::mpsc;
use tracing::{field::display, info, info_span, Span}; use tracing::{field::display, info_span, Span};
use uuid::Uuid; use uuid::Uuid;
use crate::{ use crate::{
console::messages::{ColdStartInfo, MetricsAuxInfo}, console::messages::{ColdStartInfo, MetricsAuxInfo},
error::ErrorKind, error::ErrorKind,
intern::{BranchIdInt, ProjectIdInt}, intern::{BranchIdInt, ProjectIdInt},
metrics::{ConnectOutcome, InvalidEndpointsGroup, LatencyTimer, Metrics, Protocol}, metrics::{LatencyTimer, ENDPOINT_ERRORS_BY_KIND, ERROR_BY_KIND},
DbName, EndpointId, RoleName, DbName, EndpointId, RoleName,
}; };
@@ -29,7 +29,7 @@ static LOG_CHAN: OnceCell<mpsc::WeakUnboundedSender<RequestData>> = OnceCell::ne
pub struct RequestMonitoring { pub struct RequestMonitoring {
pub peer_addr: IpAddr, pub peer_addr: IpAddr,
pub session_id: Uuid, pub session_id: Uuid,
pub protocol: Protocol, pub protocol: &'static str,
first_packet: chrono::DateTime<Utc>, first_packet: chrono::DateTime<Utc>,
region: &'static str, region: &'static str,
pub span: Span, pub span: Span,
@@ -50,8 +50,6 @@ pub struct RequestMonitoring {
// This sender is here to keep the request monitoring channel open while requests are taking place. // This sender is here to keep the request monitoring channel open while requests are taking place.
sender: Option<mpsc::UnboundedSender<RequestData>>, sender: Option<mpsc::UnboundedSender<RequestData>>,
pub latency_timer: LatencyTimer, pub latency_timer: LatencyTimer,
// Whether proxy decided that it's not a valid endpoint end rejected it before going to cplane.
rejected: Option<bool>,
} }
#[derive(Clone, Debug)] #[derive(Clone, Debug)]
@@ -67,7 +65,7 @@ impl RequestMonitoring {
pub fn new( pub fn new(
session_id: Uuid, session_id: Uuid,
peer_addr: IpAddr, peer_addr: IpAddr,
protocol: Protocol, protocol: &'static str,
region: &'static str, region: &'static str,
) -> Self { ) -> Self {
let span = info_span!( let span = info_span!(
@@ -76,7 +74,6 @@ impl RequestMonitoring {
?session_id, ?session_id,
%peer_addr, %peer_addr,
ep = tracing::field::Empty, ep = tracing::field::Empty,
role = tracing::field::Empty,
); );
Self { Self {
@@ -96,7 +93,6 @@ impl RequestMonitoring {
error_kind: None, error_kind: None,
auth_method: None, auth_method: None,
success: false, success: false,
rejected: None,
cold_start_info: ColdStartInfo::Unknown, cold_start_info: ColdStartInfo::Unknown,
sender: LOG_CHAN.get().and_then(|tx| tx.upgrade()), sender: LOG_CHAN.get().and_then(|tx| tx.upgrade()),
@@ -106,7 +102,7 @@ impl RequestMonitoring {
#[cfg(test)] #[cfg(test)]
pub fn test() -> Self { pub fn test() -> Self {
RequestMonitoring::new(Uuid::now_v7(), [127, 0, 0, 1].into(), Protocol::Tcp, "test") RequestMonitoring::new(Uuid::now_v7(), [127, 0, 0, 1].into(), "test", "test")
} }
pub fn console_application_name(&self) -> String { pub fn console_application_name(&self) -> String {
@@ -117,10 +113,6 @@ impl RequestMonitoring {
) )
} }
pub fn set_rejected(&mut self, rejected: bool) {
self.rejected = Some(rejected);
}
pub fn set_cold_start_info(&mut self, info: ColdStartInfo) { pub fn set_cold_start_info(&mut self, info: ColdStartInfo) {
self.cold_start_info = info; self.cold_start_info = info;
self.latency_timer.cold_start_info(info); self.latency_timer.cold_start_info(info);
@@ -142,9 +134,9 @@ impl RequestMonitoring {
pub fn set_endpoint_id(&mut self, endpoint_id: EndpointId) { pub fn set_endpoint_id(&mut self, endpoint_id: EndpointId) {
if self.endpoint_id.is_none() { if self.endpoint_id.is_none() {
self.span.record("ep", display(&endpoint_id)); self.span.record("ep", display(&endpoint_id));
let metric = &Metrics::get().proxy.connecting_endpoints; crate::metrics::CONNECTING_ENDPOINTS
let label = metric.with_labels(self.protocol); .with_label_values(&[self.protocol])
metric.get_metric(label).measure(&endpoint_id); .measure(&endpoint_id);
self.endpoint_id = Some(endpoint_id); self.endpoint_id = Some(endpoint_id);
} }
} }
@@ -158,7 +150,6 @@ impl RequestMonitoring {
} }
pub fn set_user(&mut self, user: RoleName) { pub fn set_user(&mut self, user: RoleName) {
self.span.record("role", display(&user));
self.user = Some(user); self.user = Some(user);
} }
@@ -166,22 +157,14 @@ impl RequestMonitoring {
self.auth_method = Some(auth_method); self.auth_method = Some(auth_method);
} }
pub fn has_private_peer_addr(&self) -> bool {
match self.peer_addr {
IpAddr::V4(ip) => ip.is_private(),
_ => false,
}
}
pub fn set_error_kind(&mut self, kind: ErrorKind) { pub fn set_error_kind(&mut self, kind: ErrorKind) {
// Do not record errors from the private address to metrics. ERROR_BY_KIND
if !self.has_private_peer_addr() { .with_label_values(&[kind.to_metric_label()])
Metrics::get().proxy.errors_total.inc(kind); .inc();
}
if let Some(ep) = &self.endpoint_id { if let Some(ep) = &self.endpoint_id {
let metric = &Metrics::get().proxy.endpoints_affected_by_errors; ENDPOINT_ERRORS_BY_KIND
let label = metric.with_labels(kind); .with_label_values(&[kind.to_metric_label()])
metric.get_metric(label).measure(ep); .measure(ep);
} }
self.error_kind = Some(kind); self.error_kind = Some(kind);
} }
@@ -195,33 +178,6 @@ impl RequestMonitoring {
impl Drop for RequestMonitoring { impl Drop for RequestMonitoring {
fn drop(&mut self) { fn drop(&mut self) {
let outcome = if self.success {
ConnectOutcome::Success
} else {
ConnectOutcome::Failed
};
if let Some(rejected) = self.rejected {
let ep = self
.endpoint_id
.as_ref()
.map(|x| x.as_str())
.unwrap_or_default();
// This makes sense only if cache is disabled
info!(
?outcome,
?rejected,
?ep,
"check endpoint is valid with outcome"
);
Metrics::get()
.proxy
.invalid_endpoints_total
.inc(InvalidEndpointsGroup {
protocol: self.protocol,
rejected: rejected.into(),
outcome,
});
}
if let Some(tx) = self.sender.take() { if let Some(tx) = self.sender.take() {
let _: Result<(), _> = tx.send(RequestData::from(&*self)); let _: Result<(), _> = tx.send(RequestData::from(&*self));
} }

View File

@@ -111,7 +111,7 @@ impl From<&RequestMonitoring> for RequestData {
super::AuthMethod::ScramSha256Plus => "scram_sha_256_plus", super::AuthMethod::ScramSha256Plus => "scram_sha_256_plus",
super::AuthMethod::Cleartext => "cleartext", super::AuthMethod::Cleartext => "cleartext",
}), }),
protocol: value.protocol.as_str(), protocol: value.protocol,
region: value.region, region: value.region,
error: value.error_kind.as_ref().map(|e| e.to_metric_label()), error: value.error_kind.as_ref().map(|e| e.to_metric_label()),
success: value.success, success: value.success,

View File

@@ -1,7 +1,5 @@
use std::{error::Error as StdError, fmt, io}; use std::{error::Error as StdError, fmt, io};
use measured::FixedCardinalityLabel;
/// Upcast (almost) any error into an opaque [`io::Error`]. /// Upcast (almost) any error into an opaque [`io::Error`].
pub fn io_error(e: impl Into<Box<dyn StdError + Send + Sync>>) -> io::Error { pub fn io_error(e: impl Into<Box<dyn StdError + Send + Sync>>) -> io::Error {
io::Error::new(io::ErrorKind::Other, e) io::Error::new(io::ErrorKind::Other, e)
@@ -31,29 +29,24 @@ pub trait UserFacingError: ReportableError {
} }
} }
#[derive(Copy, Clone, Debug, Eq, PartialEq, FixedCardinalityLabel)] #[derive(Copy, Clone, Debug, Eq, PartialEq)]
#[label(singleton = "type")]
pub enum ErrorKind { pub enum ErrorKind {
/// Wrong password, unknown endpoint, protocol violation, etc... /// Wrong password, unknown endpoint, protocol violation, etc...
User, User,
/// Network error between user and proxy. Not necessarily user error /// Network error between user and proxy. Not necessarily user error
#[label(rename = "clientdisconnect")]
ClientDisconnect, ClientDisconnect,
/// Proxy self-imposed user rate limits /// Proxy self-imposed user rate limits
#[label(rename = "ratelimit")]
RateLimit, RateLimit,
/// Proxy self-imposed service-wise rate limits /// Proxy self-imposed service-wise rate limits
#[label(rename = "serviceratelimit")]
ServiceRateLimit, ServiceRateLimit,
/// internal errors /// internal errors
Service, Service,
/// Error communicating with control plane /// Error communicating with control plane
#[label(rename = "controlplane")]
ControlPlane, ControlPlane,
/// Postgres error /// Postgres error

View File

@@ -13,16 +13,13 @@ pub use reqwest_retry::{policies::ExponentialBackoff, RetryTransientMiddleware};
use tokio::time::Instant; use tokio::time::Instant;
use tracing::trace; use tracing::trace;
use crate::{ use crate::{metrics::CONSOLE_REQUEST_LATENCY, rate_limiter, url::ApiUrl};
metrics::{ConsoleRequest, Metrics},
url::ApiUrl,
};
use reqwest_middleware::RequestBuilder; use reqwest_middleware::RequestBuilder;
/// This is the preferred way to create new http clients, /// This is the preferred way to create new http clients,
/// because it takes care of observability (OpenTelemetry). /// because it takes care of observability (OpenTelemetry).
/// We deliberately don't want to replace this with a public static. /// We deliberately don't want to replace this with a public static.
pub fn new_client() -> ClientWithMiddleware { pub fn new_client(rate_limiter_config: rate_limiter::RateLimiterConfig) -> ClientWithMiddleware {
let client = reqwest::ClientBuilder::new() let client = reqwest::ClientBuilder::new()
.dns_resolver(Arc::new(GaiResolver::default())) .dns_resolver(Arc::new(GaiResolver::default()))
.connection_verbose(true) .connection_verbose(true)
@@ -31,6 +28,7 @@ pub fn new_client() -> ClientWithMiddleware {
reqwest_middleware::ClientBuilder::new(client) reqwest_middleware::ClientBuilder::new(client)
.with(reqwest_tracing::TracingMiddleware::default()) .with(reqwest_tracing::TracingMiddleware::default())
.with(rate_limiter::Limiter::new(rate_limiter_config))
.build() .build()
} }
@@ -92,14 +90,13 @@ impl Endpoint {
/// Execute a [request](reqwest::Request). /// Execute a [request](reqwest::Request).
pub async fn execute(&self, request: Request) -> Result<Response, Error> { pub async fn execute(&self, request: Request) -> Result<Response, Error> {
let _timer = Metrics::get() let path = request.url().path().to_string();
.proxy let start = Instant::now();
.console_request_latency let res = self.client.execute(request).await;
.start_timer(ConsoleRequest { CONSOLE_REQUEST_LATENCY
request: request.url().path(), .with_label_values(&[&path])
}); .observe(start.elapsed().as_secs_f64());
res
self.client.execute(request).await
} }
} }

View File

@@ -1,84 +1,30 @@
use anyhow::{anyhow, bail}; use anyhow::{anyhow, bail};
use camino::Utf8PathBuf; use hyper::{Body, Request, Response, StatusCode};
use camino_tempfile::Utf8TempDir; use std::{convert::Infallible, net::TcpListener};
use hyper::{header::CONTENT_TYPE, Body, Request, Response, StatusCode}; use tracing::info;
use measured::{text::BufferedTextEncoder, MetricGroup};
use metrics::NeonMetrics;
use once_cell::sync::Lazy;
use std::{
convert::Infallible,
ffi::CString,
net::TcpListener,
sync::{Arc, Mutex},
};
use tracing::{info, info_span, warn};
use utils::http::{ use utils::http::{
endpoint::{self, request_span}, endpoint::{self, prometheus_metrics_handler, request_span},
error::ApiError, error::ApiError,
json::json_response, json::json_response,
RouterBuilder, RouterService, RouterBuilder, RouterService,
}; };
use crate::jemalloc;
async fn status_handler(_: Request<Body>) -> Result<Response<Body>, ApiError> { async fn status_handler(_: Request<Body>) -> Result<Response<Body>, ApiError> {
json_response(StatusCode::OK, "") json_response(StatusCode::OK, "")
} }
async fn prof_dump(_: Request<Body>) -> Result<Response<Body>, ApiError> { fn make_router() -> RouterBuilder<hyper::Body, ApiError> {
static PROF_MIB: Lazy<jemalloc::dump_mib> = endpoint::make_router()
Lazy::new(|| jemalloc::dump::mib().expect("could not create prof.dump MIB")); .get("/metrics", |r| request_span(r, prometheus_metrics_handler))
static PROF_DIR: Lazy<Utf8TempDir> = .get("/v1/status", status_handler)
Lazy::new(|| camino_tempfile::tempdir().expect("could not create tempdir"));
static PROF_FILE: Lazy<Utf8PathBuf> = Lazy::new(|| PROF_DIR.path().join("prof.dump"));
static PROF_FILE0: Lazy<CString> = Lazy::new(|| CString::new(PROF_FILE.as_str()).unwrap());
static DUMP_LOCK: Mutex<()> = Mutex::new(());
tokio::task::spawn_blocking(|| {
let _guard = DUMP_LOCK.lock();
PROF_MIB
.write(&PROF_FILE0)
.expect("could not trigger prof.dump");
let prof_dump = std::fs::read_to_string(&*PROF_FILE).expect("could not open prof.dump");
Response::new(Body::from(prof_dump))
})
.await
.map_err(|e| ApiError::InternalServerError(e.into()))
} }
fn make_router(metrics: AppMetrics) -> RouterBuilder<hyper::Body, ApiError> { pub async fn task_main(http_listener: TcpListener) -> anyhow::Result<Infallible> {
let state = Arc::new(Mutex::new(PrometheusHandler {
encoder: BufferedTextEncoder::new(),
metrics,
}));
let mut router = endpoint::make_router()
.get("/metrics", move |r| {
let state = state.clone();
request_span(r, move |b| prometheus_metrics_handler(b, state))
})
.get("/v1/status", status_handler);
let prof_enabled = jemalloc::prof::read().unwrap_or_default();
if prof_enabled {
warn!("activating jemalloc profiling");
jemalloc::active::write(true).unwrap();
router = router.get("/v1/jemalloc/prof.dump", prof_dump);
}
router
}
pub async fn task_main(
http_listener: TcpListener,
metrics: AppMetrics,
) -> anyhow::Result<Infallible> {
scopeguard::defer! { scopeguard::defer! {
info!("http has shut down"); info!("http has shut down");
} }
let service = || RouterService::new(make_router(metrics).build()?); let service = || RouterService::new(make_router().build()?);
hyper::Server::from_tcp(http_listener)? hyper::Server::from_tcp(http_listener)?
.serve(service().map_err(|e| anyhow!(e))?) .serve(service().map_err(|e| anyhow!(e))?)
@@ -86,57 +32,3 @@ pub async fn task_main(
bail!("hyper server without shutdown handling cannot shutdown successfully"); bail!("hyper server without shutdown handling cannot shutdown successfully");
} }
struct PrometheusHandler {
encoder: BufferedTextEncoder,
metrics: AppMetrics,
}
#[derive(MetricGroup)]
pub struct AppMetrics {
#[metric(namespace = "jemalloc")]
pub jemalloc: Option<jemalloc::MetricRecorder>,
#[metric(flatten)]
pub neon_metrics: NeonMetrics,
#[metric(flatten)]
pub proxy: &'static crate::metrics::Metrics,
}
async fn prometheus_metrics_handler(
_req: Request<Body>,
state: Arc<Mutex<PrometheusHandler>>,
) -> Result<Response<Body>, ApiError> {
let started_at = std::time::Instant::now();
let span = info_span!("blocking");
let body = tokio::task::spawn_blocking(move || {
let _span = span.entered();
let mut state = state.lock().unwrap();
let PrometheusHandler { encoder, metrics } = &mut *state;
metrics
.collect_group_into(&mut *encoder)
.unwrap_or_else(|infallible| match infallible {});
let body = encoder.finish();
tracing::info!(
bytes = body.len(),
elapsed_ms = started_at.elapsed().as_millis(),
"responded /metrics"
);
body
})
.await
.unwrap();
let response = Response::builder()
.status(200)
.header(CONTENT_TYPE, "text/plain; version=0.0.4")
.body(Body::from(body))
.unwrap();
Ok(response)
}

View File

@@ -160,11 +160,6 @@ impl From<&EndpointId> for EndpointIdInt {
EndpointIdTag::get_interner().get_or_intern(value) EndpointIdTag::get_interner().get_or_intern(value)
} }
} }
impl From<EndpointId> for EndpointIdInt {
fn from(value: EndpointId) -> Self {
EndpointIdTag::get_interner().get_or_intern(&value)
}
}
#[derive(Copy, Clone, Debug, PartialEq, Eq, Hash)] #[derive(Copy, Clone, Debug, PartialEq, Eq, Hash)]
pub struct BranchIdTag; pub struct BranchIdTag;
@@ -180,11 +175,6 @@ impl From<&BranchId> for BranchIdInt {
BranchIdTag::get_interner().get_or_intern(value) BranchIdTag::get_interner().get_or_intern(value)
} }
} }
impl From<BranchId> for BranchIdInt {
fn from(value: BranchId) -> Self {
BranchIdTag::get_interner().get_or_intern(&value)
}
}
#[derive(Copy, Clone, Debug, PartialEq, Eq, Hash)] #[derive(Copy, Clone, Debug, PartialEq, Eq, Hash)]
pub struct ProjectIdTag; pub struct ProjectIdTag;
@@ -200,11 +190,6 @@ impl From<&ProjectId> for ProjectIdInt {
ProjectIdTag::get_interner().get_or_intern(value) ProjectIdTag::get_interner().get_or_intern(value)
} }
} }
impl From<ProjectId> for ProjectIdInt {
fn from(value: ProjectId) -> Self {
ProjectIdTag::get_interner().get_or_intern(&value)
}
}
#[cfg(test)] #[cfg(test)]
mod tests { mod tests {

View File

@@ -1,47 +1,27 @@
use std::{ffi::CStr, marker::PhantomData}; use std::time::Duration;
use measured::{ use metrics::IntGauge;
label::NoLabels, use prometheus::{register_int_gauge_with_registry, Registry};
metric::{ use tikv_jemalloc_ctl::{config, epoch, epoch_mib, stats, version};
gauge::GaugeState, group::Encoding, group::MetricValue, name::MetricNameEncoder,
MetricEncoding, MetricFamilyEncoding, MetricType,
},
text::TextEncoder,
LabelGroup, MetricGroup,
};
use tikv_jemalloc_ctl::{
config, epoch, epoch_mib, raw, stats, version, Access, AsName, MibStr, Name,
};
pub struct MetricRecorder { pub struct MetricRecorder {
epoch: epoch_mib, epoch: epoch_mib,
inner: Metrics, active: stats::active_mib,
} active_gauge: IntGauge,
allocated: stats::allocated_mib,
#[derive(MetricGroup)] allocated_gauge: IntGauge,
struct Metrics { mapped: stats::mapped_mib,
active_bytes: JemallocGaugeFamily<stats::active_mib>, mapped_gauge: IntGauge,
allocated_bytes: JemallocGaugeFamily<stats::allocated_mib>, metadata: stats::metadata_mib,
mapped_bytes: JemallocGaugeFamily<stats::mapped_mib>, metadata_gauge: IntGauge,
metadata_bytes: JemallocGaugeFamily<stats::metadata_mib>, resident: stats::resident_mib,
resident_bytes: JemallocGaugeFamily<stats::resident_mib>, resident_gauge: IntGauge,
retained_bytes: JemallocGaugeFamily<stats::retained_mib>, retained: stats::retained_mib,
} retained_gauge: IntGauge,
impl<Enc: Encoding> MetricGroup<Enc> for MetricRecorder
where
Metrics: MetricGroup<Enc>,
{
fn collect_group_into(&self, enc: &mut Enc) -> Result<(), Enc::Err> {
if self.epoch.advance().is_ok() {
self.inner.collect_group_into(enc)?;
}
Ok(())
}
} }
impl MetricRecorder { impl MetricRecorder {
pub fn new() -> Result<Self, anyhow::Error> { pub fn new(registry: &Registry) -> Result<Self, anyhow::Error> {
tracing::info!( tracing::info!(
config = config::malloc_conf::read()?, config = config::malloc_conf::read()?,
version = version::read()?, version = version::read()?,
@@ -50,125 +30,71 @@ impl MetricRecorder {
Ok(Self { Ok(Self {
epoch: epoch::mib()?, epoch: epoch::mib()?,
inner: Metrics { active: stats::active::mib()?,
active_bytes: JemallocGaugeFamily(stats::active::mib()?), active_gauge: register_int_gauge_with_registry!(
allocated_bytes: JemallocGaugeFamily(stats::allocated::mib()?), "jemalloc_active_bytes",
mapped_bytes: JemallocGaugeFamily(stats::mapped::mib()?), "Total number of bytes in active pages allocated by the process",
metadata_bytes: JemallocGaugeFamily(stats::metadata::mib()?), registry
resident_bytes: JemallocGaugeFamily(stats::resident::mib()?), )?,
retained_bytes: JemallocGaugeFamily(stats::retained::mib()?), allocated: stats::allocated::mib()?,
}, allocated_gauge: register_int_gauge_with_registry!(
"jemalloc_allocated_bytes",
"Total number of bytes allocated by the process",
registry
)?,
mapped: stats::mapped::mib()?,
mapped_gauge: register_int_gauge_with_registry!(
"jemalloc_mapped_bytes",
"Total number of bytes in active extents mapped by the allocator",
registry
)?,
metadata: stats::metadata::mib()?,
metadata_gauge: register_int_gauge_with_registry!(
"jemalloc_metadata_bytes",
"Total number of bytes dedicated to jemalloc metadata",
registry
)?,
resident: stats::resident::mib()?,
resident_gauge: register_int_gauge_with_registry!(
"jemalloc_resident_bytes",
"Total number of bytes in physically resident data pages mapped by the allocator",
registry
)?,
retained: stats::retained::mib()?,
retained_gauge: register_int_gauge_with_registry!(
"jemalloc_retained_bytes",
"Total number of bytes in virtual memory mappings that were retained rather than being returned to the operating system",
registry
)?,
})
}
fn _poll(&self) -> Result<(), anyhow::Error> {
self.epoch.advance()?;
self.active_gauge.set(self.active.read()? as i64);
self.allocated_gauge.set(self.allocated.read()? as i64);
self.mapped_gauge.set(self.mapped.read()? as i64);
self.metadata_gauge.set(self.metadata.read()? as i64);
self.resident_gauge.set(self.resident.read()? as i64);
self.retained_gauge.set(self.retained.read()? as i64);
Ok(())
}
#[inline]
pub fn poll(&self) {
if let Err(error) = self._poll() {
tracing::warn!(%error, "Failed to poll jemalloc stats");
}
}
pub fn start(self) -> tokio::task::JoinHandle<()> {
tokio::task::spawn(async move {
let mut interval = tokio::time::interval(Duration::from_secs(15));
interval.set_missed_tick_behavior(tokio::time::MissedTickBehavior::Skip);
loop {
self.poll();
interval.tick().await;
}
}) })
} }
} }
struct JemallocGauge<T>(PhantomData<T>);
impl<T> Default for JemallocGauge<T> {
fn default() -> Self {
JemallocGauge(PhantomData)
}
}
impl<T> MetricType for JemallocGauge<T> {
type Metadata = T;
}
struct JemallocGaugeFamily<T>(T);
impl<M, T: Encoding> MetricFamilyEncoding<T> for JemallocGaugeFamily<M>
where
JemallocGauge<M>: MetricEncoding<T, Metadata = M>,
{
fn collect_family_into(&self, name: impl MetricNameEncoder, enc: &mut T) -> Result<(), T::Err> {
JemallocGauge::write_type(&name, enc)?;
JemallocGauge(PhantomData).collect_into(&self.0, NoLabels, name, enc)
}
}
macro_rules! jemalloc_gauge {
($stat:ident, $mib:ident) => {
impl<W: std::io::Write> MetricEncoding<TextEncoder<W>> for JemallocGauge<stats::$mib> {
fn write_type(
name: impl MetricNameEncoder,
enc: &mut TextEncoder<W>,
) -> Result<(), std::io::Error> {
GaugeState::write_type(name, enc)
}
fn collect_into(
&self,
mib: &stats::$mib,
labels: impl LabelGroup,
name: impl MetricNameEncoder,
enc: &mut TextEncoder<W>,
) -> Result<(), std::io::Error> {
if let Ok(v) = mib.read() {
enc.write_metric_value(name, labels, MetricValue::Int(v as i64))?;
}
Ok(())
}
}
};
}
jemalloc_gauge!(active, active_mib);
jemalloc_gauge!(allocated, allocated_mib);
jemalloc_gauge!(mapped, mapped_mib);
jemalloc_gauge!(metadata, metadata_mib);
jemalloc_gauge!(resident, resident_mib);
jemalloc_gauge!(retained, retained_mib);
#[allow(non_camel_case_types)]
pub struct dump;
impl dump {
pub fn mib() -> tikv_jemalloc_ctl::Result<dump_mib> {
Ok(dump_mib(b"prof.dump\0".as_slice().name().mib_str()?))
}
}
#[repr(transparent)]
#[derive(Copy, Clone)]
#[allow(non_camel_case_types)]
pub struct dump_mib(pub MibStr<[usize; 2]>);
impl dump_mib {
pub fn write(self, value: &'static CStr) -> tikv_jemalloc_ctl::Result<()> {
// No support for Access<CStr> yet.
// self.0.write(value)
let mib = [self.0[0], self.0[1]];
raw::write_str_mib(&mib, value.to_bytes_with_nul())
}
}
#[allow(non_camel_case_types)]
pub struct active;
impl active {
pub fn name() -> &'static Name {
b"prof.active\0".as_slice().name()
}
}
impl active {
pub fn read() -> tikv_jemalloc_ctl::Result<bool> {
Self::name().read()
}
pub fn write(value: bool) -> tikv_jemalloc_ctl::Result<()> {
Self::name().write(value)
}
}
#[allow(non_camel_case_types)]
pub struct prof;
impl prof {
pub fn name() -> &'static Name {
b"opt.prof\0".as_slice().name()
}
}
impl prof {
pub fn read() -> tikv_jemalloc_ctl::Result<bool> {
Self::name().read()
}
}

View File

@@ -127,24 +127,6 @@ macro_rules! smol_str_wrapper {
}; };
} }
const POOLER_SUFFIX: &str = "-pooler";
pub trait Normalize {
fn normalize(&self) -> Self;
}
impl<S: Clone + AsRef<str> + From<String>> Normalize for S {
fn normalize(&self) -> Self {
if self.as_ref().ends_with(POOLER_SUFFIX) {
let mut s = self.as_ref().to_string();
s.truncate(s.len() - POOLER_SUFFIX.len());
s.into()
} else {
self.clone()
}
}
}
// 90% of role name strings are 20 characters or less. // 90% of role name strings are 20 characters or less.
smol_str_wrapper!(RoleName); smol_str_wrapper!(RoleName);
// 50% of endpoint strings are 23 characters or less. // 50% of endpoint strings are 23 characters or less.
@@ -158,22 +140,3 @@ smol_str_wrapper!(ProjectId);
smol_str_wrapper!(EndpointCacheKey); smol_str_wrapper!(EndpointCacheKey);
smol_str_wrapper!(DbName); smol_str_wrapper!(DbName);
// Endpoints are a bit tricky. Rare they might be branches or projects.
impl EndpointId {
pub fn is_endpoint(&self) -> bool {
self.0.starts_with("ep-")
}
pub fn is_branch(&self) -> bool {
self.0.starts_with("br-")
}
pub fn is_project(&self) -> bool {
!self.is_endpoint() && !self.is_branch()
}
pub fn as_branch(&self) -> BranchId {
BranchId(self.0.clone())
}
pub fn as_project(&self) -> ProjectId {
ProjectId(self.0.clone())
}
}

View File

@@ -1,348 +1,176 @@
use std::sync::OnceLock; use ::metrics::{
exponential_buckets, register_histogram, register_histogram_vec, register_hll_vec,
use lasso::ThreadedRodeo; register_int_counter_pair_vec, register_int_counter_vec, register_int_gauge,
use measured::{ register_int_gauge_vec, Histogram, HistogramVec, HyperLogLogVec, IntCounterPairVec,
label::StaticLabelSet, IntCounterVec, IntGauge, IntGaugeVec,
metric::{histogram::Thresholds, name::MetricName}, };
Counter, CounterVec, FixedCardinalityLabel, Gauge, Histogram, HistogramVec, LabelGroup, use metrics::{
MetricGroup, register_hll, register_int_counter, register_int_counter_pair, HyperLogLog, IntCounter,
IntCounterPair,
}; };
use metrics::{CounterPairAssoc, CounterPairVec, HyperLogLog, HyperLogLogVec};
use once_cell::sync::Lazy;
use tokio::time::{self, Instant}; use tokio::time::{self, Instant};
use crate::console::messages::ColdStartInfo; use crate::console::messages::ColdStartInfo;
#[derive(MetricGroup)] pub static NUM_DB_CONNECTIONS_GAUGE: Lazy<IntCounterPairVec> = Lazy::new(|| {
pub struct Metrics { register_int_counter_pair_vec!(
#[metric(namespace = "proxy")] "proxy_opened_db_connections_total",
pub proxy: ProxyMetrics, "Number of opened connections to a database.",
"proxy_closed_db_connections_total",
"Number of closed connections to a database.",
&["protocol"],
)
.unwrap()
});
#[metric(namespace = "wake_compute_lock")] pub static NUM_CLIENT_CONNECTION_GAUGE: Lazy<IntCounterPairVec> = Lazy::new(|| {
pub wake_compute_lock: ApiLockMetrics, register_int_counter_pair_vec!(
} "proxy_opened_client_connections_total",
"Number of opened connections from a client.",
"proxy_closed_client_connections_total",
"Number of closed connections from a client.",
&["protocol"],
)
.unwrap()
});
impl Metrics { pub static NUM_CONNECTION_REQUESTS_GAUGE: Lazy<IntCounterPairVec> = Lazy::new(|| {
pub fn get() -> &'static Self { register_int_counter_pair_vec!(
static SELF: OnceLock<Metrics> = OnceLock::new(); "proxy_accepted_connections_total",
SELF.get_or_init(|| Metrics { "Number of client connections accepted.",
proxy: ProxyMetrics::default(), "proxy_closed_connections_total",
wake_compute_lock: ApiLockMetrics::new(), "Number of client connections closed.",
}) &["protocol"],
} )
} .unwrap()
});
#[derive(MetricGroup)] pub static COMPUTE_CONNECTION_LATENCY: Lazy<HistogramVec> = Lazy::new(|| {
#[metric(new())] register_histogram_vec!(
pub struct ProxyMetrics { "proxy_compute_connection_latency_seconds",
#[metric(flatten)] "Time it took for proxy to establish a connection to the compute endpoint",
pub db_connections: CounterPairVec<NumDbConnectionsGauge>, // http/ws/tcp, true/false, true/false, success/failure, client/client_and_cplane
#[metric(flatten)] // 3 * 6 * 2 * 2 = 72 counters
pub client_connections: CounterPairVec<NumClientConnectionsGauge>, &["protocol", "cold_start_info", "outcome", "excluded"],
#[metric(flatten)] // largest bucket = 2^16 * 0.5ms = 32s
pub connection_requests: CounterPairVec<NumConnectionRequestsGauge>, exponential_buckets(0.0005, 2.0, 16).unwrap(),
#[metric(flatten)] )
pub http_endpoint_pools: HttpEndpointPools, .unwrap()
});
/// Time it took for proxy to establish a connection to the compute endpoint. pub static CONSOLE_REQUEST_LATENCY: Lazy<HistogramVec> = Lazy::new(|| {
// largest bucket = 2^16 * 0.5ms = 32s register_histogram_vec!(
#[metric(metadata = Thresholds::exponential_buckets(0.0005, 2.0))] "proxy_console_request_latency",
pub compute_connection_latency_seconds: HistogramVec<ComputeConnectionLatencySet, 16>, "Time it took for proxy to establish a connection to the compute endpoint",
// proxy_wake_compute/proxy_get_role_info
/// Time it took for proxy to receive a response from control plane. &["request"],
#[metric(
// largest bucket = 2^16 * 0.2ms = 13s // largest bucket = 2^16 * 0.2ms = 13s
metadata = Thresholds::exponential_buckets(0.0002, 2.0), exponential_buckets(0.0002, 2.0, 16).unwrap(),
)] )
pub console_request_latency: HistogramVec<ConsoleRequestSet, 16>, .unwrap()
});
/// Time it takes to acquire a token to call console plane. pub static ALLOWED_IPS_BY_CACHE_OUTCOME: Lazy<IntCounterVec> = Lazy::new(|| {
// largest bucket = 3^16 * 0.05ms = 2.15s register_int_counter_vec!(
#[metric(metadata = Thresholds::exponential_buckets(0.00005, 3.0))] "proxy_allowed_ips_cache_misses",
pub control_plane_token_acquire_seconds: Histogram<16>, "Number of cache hits/misses for allowed ips",
// hit/miss
&["outcome"],
)
.unwrap()
});
/// Size of the HTTP request body lengths. pub static RATE_LIMITER_ACQUIRE_LATENCY: Lazy<Histogram> = Lazy::new(|| {
// smallest bucket = 16 bytes register_histogram!(
// largest bucket = 4^12 * 16 bytes = 256MB "proxy_control_plane_token_acquire_seconds",
#[metric(metadata = Thresholds::exponential_buckets(16.0, 4.0))] "Time it took for proxy to establish a connection to the compute endpoint",
pub http_conn_content_length_bytes: HistogramVec<StaticLabelSet<HttpDirection>, 12>, // largest bucket = 3^16 * 0.05ms = 2.15s
exponential_buckets(0.00005, 3.0, 16).unwrap(),
)
.unwrap()
});
/// Time it takes to reclaim unused connection pools. pub static RATE_LIMITER_LIMIT: Lazy<IntGaugeVec> = Lazy::new(|| {
#[metric(metadata = Thresholds::exponential_buckets(1e-6, 2.0))] register_int_gauge_vec!(
pub http_pool_reclaimation_lag_seconds: Histogram<16>, "semaphore_control_plane_limit",
"Current limit of the semaphore control plane",
&["limit"], // 2 counters
)
.unwrap()
});
/// Number of opened connections to a database. pub static NUM_CONNECTION_ACCEPTED_BY_SNI: Lazy<IntCounterVec> = Lazy::new(|| {
pub http_pool_opened_connections: Gauge, register_int_counter_vec!(
"proxy_accepted_connections_by_sni",
"Number of connections (per sni).",
&["kind"],
)
.unwrap()
});
/// Number of cache hits/misses for allowed ips. pub static ALLOWED_IPS_NUMBER: Lazy<Histogram> = Lazy::new(|| {
pub allowed_ips_cache_misses: CounterVec<StaticLabelSet<CacheOutcome>>, register_histogram!(
"proxy_allowed_ips_number",
"Number of allowed ips",
vec![0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 10.0, 20.0, 50.0, 100.0],
)
.unwrap()
});
/// Number of allowed ips pub static HTTP_CONTENT_LENGTH: Lazy<HistogramVec> = Lazy::new(|| {
#[metric(metadata = Thresholds::with_buckets([0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 10.0, 20.0, 50.0, 100.0]))] register_histogram_vec!(
pub allowed_ips_number: Histogram<10>, "proxy_http_conn_content_length_bytes",
"Number of bytes the HTTP response content consumes",
// request/response
&["direction"],
// smallest bucket = 16 bytes
// largest bucket = 4^12 * 16 bytes = 256MB
exponential_buckets(16.0, 4.0, 12).unwrap()
)
.unwrap()
});
/// Number of connections (per sni). pub static GC_LATENCY: Lazy<Histogram> = Lazy::new(|| {
pub accepted_connections_by_sni: CounterVec<StaticLabelSet<SniKind>>, register_histogram!(
"proxy_http_pool_reclaimation_lag_seconds",
"Time it takes to reclaim unused connection pools",
// 1us -> 65ms
exponential_buckets(1e-6, 2.0, 16).unwrap(),
)
.unwrap()
});
/// Number of connection failures (per kind). pub static ENDPOINT_POOLS: Lazy<IntCounterPair> = Lazy::new(|| {
pub connection_failures_total: CounterVec<StaticLabelSet<ConnectionFailureKind>>, register_int_counter_pair!(
"proxy_http_pool_endpoints_registered_total",
"Number of endpoints we have registered pools for",
"proxy_http_pool_endpoints_unregistered_total",
"Number of endpoints we have unregistered pools for",
)
.unwrap()
});
/// Number of wake-up failures (per kind). pub static NUM_OPEN_CLIENTS_IN_HTTP_POOL: Lazy<IntGauge> = Lazy::new(|| {
pub connection_failures_breakdown: CounterVec<ConnectionFailuresBreakdownSet>, register_int_gauge!(
"proxy_http_pool_opened_connections",
"Number of opened connections to a database.",
)
.unwrap()
});
/// Number of bytes sent/received between all clients and backends. pub static NUM_CANCELLATION_REQUESTS: Lazy<IntCounterVec> = Lazy::new(|| {
pub io_bytes: CounterVec<StaticLabelSet<Direction>>, register_int_counter_vec!(
"proxy_cancellation_requests_total",
"Number of cancellation requests (per found/not_found).",
&["source", "kind"],
)
.unwrap()
});
/// Number of errors by a given classification. pub const NUM_CANCELLATION_REQUESTS_SOURCE_FROM_CLIENT: &str = "from_client";
pub errors_total: CounterVec<StaticLabelSet<crate::error::ErrorKind>>, pub const NUM_CANCELLATION_REQUESTS_SOURCE_FROM_REDIS: &str = "from_redis";
/// Number of cancellation requests (per found/not_found).
pub cancellation_requests_total: CounterVec<CancellationRequestSet>,
/// Number of errors by a given classification
pub redis_errors_total: CounterVec<RedisErrorsSet>,
/// Number of TLS handshake failures
pub tls_handshake_failures: Counter,
/// Number of connection requests affected by authentication rate limits
pub requests_auth_rate_limits_total: Counter,
/// HLL approximate cardinality of endpoints that are connecting
pub connecting_endpoints: HyperLogLogVec<StaticLabelSet<Protocol>, 32>,
/// Number of endpoints affected by errors of a given classification
pub endpoints_affected_by_errors: HyperLogLogVec<StaticLabelSet<crate::error::ErrorKind>, 32>,
/// Number of endpoints affected by authentication rate limits
pub endpoints_auth_rate_limits: HyperLogLog<32>,
/// Number of invalid endpoints (per protocol, per rejected).
pub invalid_endpoints_total: CounterVec<InvalidEndpointsSet>,
}
#[derive(MetricGroup)]
#[metric(new())]
pub struct ApiLockMetrics {
/// Number of semaphores registered in this api lock
pub semaphores_registered: Counter,
/// Number of semaphores unregistered in this api lock
pub semaphores_unregistered: Counter,
/// Time it takes to reclaim unused semaphores in the api lock
#[metric(metadata = Thresholds::exponential_buckets(1e-6, 2.0))]
pub reclamation_lag_seconds: Histogram<16>,
/// Time it takes to acquire a semaphore lock
#[metric(metadata = Thresholds::exponential_buckets(1e-4, 2.0))]
pub semaphore_acquire_seconds: Histogram<16>,
}
impl Default for ProxyMetrics {
fn default() -> Self {
Self::new()
}
}
#[derive(FixedCardinalityLabel, Copy, Clone)]
#[label(singleton = "direction")]
pub enum HttpDirection {
Request,
Response,
}
#[derive(FixedCardinalityLabel, Copy, Clone)]
#[label(singleton = "direction")]
pub enum Direction {
Tx,
Rx,
}
#[derive(FixedCardinalityLabel, Clone, Copy, Debug)]
#[label(singleton = "protocol")]
pub enum Protocol {
Http,
Ws,
Tcp,
SniRouter,
}
impl Protocol {
pub fn as_str(&self) -> &'static str {
match self {
Protocol::Http => "http",
Protocol::Ws => "ws",
Protocol::Tcp => "tcp",
Protocol::SniRouter => "sni_router",
}
}
}
impl std::fmt::Display for Protocol {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
f.write_str(self.as_str())
}
}
#[derive(FixedCardinalityLabel, Copy, Clone)]
pub enum Bool {
True,
False,
}
#[derive(FixedCardinalityLabel, Copy, Clone)]
#[label(singleton = "outcome")]
pub enum Outcome {
Success,
Failed,
}
#[derive(FixedCardinalityLabel, Copy, Clone)]
#[label(singleton = "outcome")]
pub enum CacheOutcome {
Hit,
Miss,
}
#[derive(LabelGroup)]
#[label(set = ConsoleRequestSet)]
pub struct ConsoleRequest<'a> {
#[label(dynamic_with = ThreadedRodeo, default)]
pub request: &'a str,
}
#[derive(MetricGroup, Default)]
pub struct HttpEndpointPools {
/// Number of endpoints we have registered pools for
pub http_pool_endpoints_registered_total: Counter,
/// Number of endpoints we have unregistered pools for
pub http_pool_endpoints_unregistered_total: Counter,
}
pub struct HttpEndpointPoolsGuard<'a> {
dec: &'a Counter,
}
impl Drop for HttpEndpointPoolsGuard<'_> {
fn drop(&mut self) {
self.dec.inc();
}
}
impl HttpEndpointPools {
pub fn guard(&self) -> HttpEndpointPoolsGuard {
self.http_pool_endpoints_registered_total.inc();
HttpEndpointPoolsGuard {
dec: &self.http_pool_endpoints_unregistered_total,
}
}
}
pub struct NumDbConnectionsGauge;
impl CounterPairAssoc for NumDbConnectionsGauge {
const INC_NAME: &'static MetricName = MetricName::from_str("opened_db_connections_total");
const DEC_NAME: &'static MetricName = MetricName::from_str("closed_db_connections_total");
const INC_HELP: &'static str = "Number of opened connections to a database.";
const DEC_HELP: &'static str = "Number of closed connections to a database.";
type LabelGroupSet = StaticLabelSet<Protocol>;
}
pub type NumDbConnectionsGuard<'a> = metrics::MeasuredCounterPairGuard<'a, NumDbConnectionsGauge>;
pub struct NumClientConnectionsGauge;
impl CounterPairAssoc for NumClientConnectionsGauge {
const INC_NAME: &'static MetricName = MetricName::from_str("opened_client_connections_total");
const DEC_NAME: &'static MetricName = MetricName::from_str("closed_client_connections_total");
const INC_HELP: &'static str = "Number of opened connections from a client.";
const DEC_HELP: &'static str = "Number of closed connections from a client.";
type LabelGroupSet = StaticLabelSet<Protocol>;
}
pub type NumClientConnectionsGuard<'a> =
metrics::MeasuredCounterPairGuard<'a, NumClientConnectionsGauge>;
pub struct NumConnectionRequestsGauge;
impl CounterPairAssoc for NumConnectionRequestsGauge {
const INC_NAME: &'static MetricName = MetricName::from_str("accepted_connections_total");
const DEC_NAME: &'static MetricName = MetricName::from_str("closed_connections_total");
const INC_HELP: &'static str = "Number of client connections accepted.";
const DEC_HELP: &'static str = "Number of client connections closed.";
type LabelGroupSet = StaticLabelSet<Protocol>;
}
pub type NumConnectionRequestsGuard<'a> =
metrics::MeasuredCounterPairGuard<'a, NumConnectionRequestsGauge>;
#[derive(LabelGroup)]
#[label(set = ComputeConnectionLatencySet)]
pub struct ComputeConnectionLatencyGroup {
protocol: Protocol,
cold_start_info: ColdStartInfo,
outcome: ConnectOutcome,
excluded: LatencyExclusions,
}
#[derive(FixedCardinalityLabel, Copy, Clone)]
pub enum LatencyExclusions {
Client,
ClientAndCplane,
}
#[derive(FixedCardinalityLabel, Copy, Clone)]
#[label(singleton = "kind")]
pub enum SniKind {
Sni,
NoSni,
PasswordHack,
}
#[derive(FixedCardinalityLabel, Copy, Clone)]
#[label(singleton = "kind")]
pub enum ConnectionFailureKind {
ComputeCached,
ComputeUncached,
}
#[derive(FixedCardinalityLabel, Copy, Clone)]
#[label(singleton = "kind")]
pub enum WakeupFailureKind {
BadComputeAddress,
ApiTransportError,
QuotaExceeded,
ApiConsoleLocked,
ApiConsoleBadRequest,
ApiConsoleOtherServerError,
ApiConsoleOtherError,
TimeoutError,
}
#[derive(LabelGroup)]
#[label(set = ConnectionFailuresBreakdownSet)]
pub struct ConnectionFailuresBreakdownGroup {
pub kind: WakeupFailureKind,
pub retry: Bool,
}
#[derive(LabelGroup, Copy, Clone)]
#[label(set = RedisErrorsSet)]
pub struct RedisErrors<'a> {
#[label(dynamic_with = ThreadedRodeo, default)]
pub channel: &'a str,
}
#[derive(FixedCardinalityLabel, Copy, Clone)]
pub enum CancellationSource {
FromClient,
FromRedis,
Local,
}
#[derive(FixedCardinalityLabel, Copy, Clone)]
pub enum CancellationOutcome {
NotFound,
Found,
}
#[derive(LabelGroup)]
#[label(set = CancellationRequestSet)]
pub struct CancellationRequest {
pub source: CancellationSource,
pub kind: CancellationOutcome,
}
pub enum Waiting { pub enum Waiting {
Cplane, Cplane,
@@ -357,6 +185,20 @@ struct Accumulated {
compute: time::Duration, compute: time::Duration,
} }
enum Outcome {
Success,
Failed,
}
impl Outcome {
fn as_str(&self) -> &'static str {
match self {
Outcome::Success => "success",
Outcome::Failed => "failed",
}
}
}
pub struct LatencyTimer { pub struct LatencyTimer {
// time since the stopwatch was started // time since the stopwatch was started
start: time::Instant, start: time::Instant,
@@ -365,9 +207,9 @@ pub struct LatencyTimer {
// accumulated time on the stopwatch // accumulated time on the stopwatch
accumulated: Accumulated, accumulated: Accumulated,
// label data // label data
protocol: Protocol, protocol: &'static str,
cold_start_info: ColdStartInfo, cold_start_info: ColdStartInfo,
outcome: ConnectOutcome, outcome: Outcome,
} }
pub struct LatencyTimerPause<'a> { pub struct LatencyTimerPause<'a> {
@@ -377,7 +219,7 @@ pub struct LatencyTimerPause<'a> {
} }
impl LatencyTimer { impl LatencyTimer {
pub fn new(protocol: Protocol) -> Self { pub fn new(protocol: &'static str) -> Self {
Self { Self {
start: time::Instant::now(), start: time::Instant::now(),
stop: None, stop: None,
@@ -385,7 +227,7 @@ impl LatencyTimer {
protocol, protocol,
cold_start_info: ColdStartInfo::Unknown, cold_start_info: ColdStartInfo::Unknown,
// assume failed unless otherwise specified // assume failed unless otherwise specified
outcome: ConnectOutcome::Failed, outcome: Outcome::Failed,
} }
} }
@@ -406,7 +248,7 @@ impl LatencyTimer {
self.stop = Some(time::Instant::now()); self.stop = Some(time::Instant::now());
// success // success
self.outcome = ConnectOutcome::Success; self.outcome = Outcome::Success;
} }
} }
@@ -421,62 +263,128 @@ impl Drop for LatencyTimerPause<'_> {
} }
} }
#[derive(FixedCardinalityLabel, Clone, Copy, Debug)]
pub enum ConnectOutcome {
Success,
Failed,
}
impl Drop for LatencyTimer { impl Drop for LatencyTimer {
fn drop(&mut self) { fn drop(&mut self) {
let duration = self let duration = self
.stop .stop
.unwrap_or_else(time::Instant::now) .unwrap_or_else(time::Instant::now)
.duration_since(self.start); .duration_since(self.start);
// Excluding cplane communication from the accumulated time.
let metric = &Metrics::get().proxy.compute_connection_latency_seconds; COMPUTE_CONNECTION_LATENCY
.with_label_values(&[
// Excluding client communication from the accumulated time. self.protocol,
metric.observe( self.cold_start_info.as_str(),
ComputeConnectionLatencyGroup { self.outcome.as_str(),
protocol: self.protocol, "client",
cold_start_info: self.cold_start_info, ])
outcome: self.outcome, .observe((duration.saturating_sub(self.accumulated.client)).as_secs_f64());
excluded: LatencyExclusions::Client,
},
duration
.saturating_sub(self.accumulated.client)
.as_secs_f64(),
);
// Exclude client and cplane communication from the accumulated time. // Exclude client and cplane communication from the accumulated time.
let accumulated_total = self.accumulated.client + self.accumulated.cplane; let accumulated_total = self.accumulated.client + self.accumulated.cplane;
metric.observe( COMPUTE_CONNECTION_LATENCY
ComputeConnectionLatencyGroup { .with_label_values(&[
protocol: self.protocol, self.protocol,
cold_start_info: self.cold_start_info, self.cold_start_info.as_str(),
outcome: self.outcome, self.outcome.as_str(),
excluded: LatencyExclusions::ClientAndCplane, "client_and_cplane",
}, ])
duration.saturating_sub(accumulated_total).as_secs_f64(), .observe((duration.saturating_sub(accumulated_total)).as_secs_f64());
);
} }
} }
impl From<bool> for Bool { pub static NUM_CONNECTION_FAILURES: Lazy<IntCounterVec> = Lazy::new(|| {
fn from(value: bool) -> Self { register_int_counter_vec!(
if value { "proxy_connection_failures_total",
Bool::True "Number of connection failures (per kind).",
} else { &["kind"],
Bool::False )
} .unwrap()
});
pub static NUM_WAKEUP_FAILURES: Lazy<IntCounterVec> = Lazy::new(|| {
register_int_counter_vec!(
"proxy_connection_failures_breakdown",
"Number of wake-up failures (per kind).",
&["retry", "kind"],
)
.unwrap()
});
pub static NUM_BYTES_PROXIED_COUNTER: Lazy<IntCounterVec> = Lazy::new(|| {
register_int_counter_vec!(
"proxy_io_bytes",
"Number of bytes sent/received between all clients and backends.",
&["direction"],
)
.unwrap()
});
pub const fn bool_to_str(x: bool) -> &'static str {
if x {
"true"
} else {
"false"
} }
} }
#[derive(LabelGroup)] pub static CONNECTING_ENDPOINTS: Lazy<HyperLogLogVec<32>> = Lazy::new(|| {
#[label(set = InvalidEndpointsSet)] register_hll_vec!(
pub struct InvalidEndpointsGroup { 32,
pub protocol: Protocol, "proxy_connecting_endpoints",
pub rejected: Bool, "HLL approximate cardinality of endpoints that are connecting",
pub outcome: ConnectOutcome, &["protocol"],
} )
.unwrap()
});
pub static ERROR_BY_KIND: Lazy<IntCounterVec> = Lazy::new(|| {
register_int_counter_vec!(
"proxy_errors_total",
"Number of errors by a given classification",
&["type"],
)
.unwrap()
});
pub static ENDPOINT_ERRORS_BY_KIND: Lazy<HyperLogLogVec<32>> = Lazy::new(|| {
register_hll_vec!(
32,
"proxy_endpoints_affected_by_errors",
"Number of endpoints affected by errors of a given classification",
&["type"],
)
.unwrap()
});
pub static REDIS_BROKEN_MESSAGES: Lazy<IntCounterVec> = Lazy::new(|| {
register_int_counter_vec!(
"proxy_redis_errors_total",
"Number of errors by a given classification",
&["channel"],
)
.unwrap()
});
pub static TLS_HANDSHAKE_FAILURES: Lazy<IntCounter> = Lazy::new(|| {
register_int_counter!(
"proxy_tls_handshake_failures",
"Number of TLS handshake failures",
)
.unwrap()
});
pub static ENDPOINTS_AUTH_RATE_LIMITED: Lazy<HyperLogLog<32>> = Lazy::new(|| {
register_hll!(
32,
"proxy_endpoints_auth_rate_limits",
"Number of endpoints affected by authentication rate limits",
)
.unwrap()
});
pub static AUTH_RATE_LIMIT_HITS: Lazy<IntCounter> = Lazy::new(|| {
register_int_counter!(
"proxy_requests_auth_rate_limits_total",
"Number of connection requests affected by authentication rate limits",
)
.unwrap()
});

View File

@@ -5,13 +5,19 @@ use std::{
io, io,
net::SocketAddr, net::SocketAddr,
pin::{pin, Pin}, pin::{pin, Pin},
sync::Mutex,
task::{ready, Context, Poll}, task::{ready, Context, Poll},
}; };
use bytes::{Buf, BytesMut}; use bytes::{Buf, BytesMut};
use hyper::server::conn::AddrIncoming; use hyper::server::accept::Accept;
use hyper::server::conn::{AddrIncoming, AddrStream};
use metrics::IntCounterPairGuard;
use pin_project_lite::pin_project; use pin_project_lite::pin_project;
use tokio::io::{AsyncRead, AsyncReadExt, AsyncWrite, ReadBuf}; use tokio::io::{AsyncRead, AsyncReadExt, AsyncWrite, ReadBuf};
use uuid::Uuid;
use crate::metrics::NUM_CLIENT_CONNECTION_GAUGE;
pub struct ProxyProtocolAccept { pub struct ProxyProtocolAccept {
pub incoming: AddrIncoming, pub incoming: AddrIncoming,
@@ -325,6 +331,103 @@ impl<T: AsyncRead> AsyncRead for WithClientIp<T> {
} }
} }
impl Accept for ProxyProtocolAccept {
type Conn = WithConnectionGuard<WithClientIp<AddrStream>>;
type Error = io::Error;
/// Accept the next TCP connection from the wrapped `AddrIncoming`, wrapping it
/// in a `WithConnectionGuard` that carries a per-connection id, a tracing span,
/// and a client-connection gauge guard (decremented when the guard drops).
///
/// Fixes relative to the previous version:
/// - one UUID is generated per connection and used for BOTH the `http_conn`
///   span and the stored `connection_id` (previously two different UUIDs were
///   generated, so the id in the logs never matched the one on the connection);
/// - the end-of-stream (`None`) case is handled before logging, so shutdown no
///   longer emits a spurious "accepted new TCP connection" line.
fn poll_accept(
    mut self: Pin<&mut Self>,
    cx: &mut Context<'_>,
) -> Poll<Option<Result<Self::Conn, Self::Error>>> {
    let conn = ready!(Pin::new(&mut self.incoming).poll_accept(cx)?);

    // Listener exhausted: propagate end-of-stream without creating a span.
    let Some(conn) = conn else {
        return Poll::Ready(None);
    };

    // Single id shared by the span and the guard so logs correlate with
    // `connection_id`.
    let conn_id = Uuid::new_v4();
    let span = tracing::info_span!("http_conn", ?conn_id);
    {
        let _enter = span.enter();
        tracing::info!("accepted new TCP connection");
    }

    Poll::Ready(Some(Ok(WithConnectionGuard {
        inner: WithClientIp::new(conn),
        connection_id: conn_id,
        // Wrapped in Mutex<Option<..>> — presumably so the guard can be taken
        // and dropped early elsewhere; TODO confirm against users of `gauge`.
        gauge: Mutex::new(Some(
            NUM_CLIENT_CONNECTION_GAUGE
                .with_label_values(&[self.protocol])
                .guard(),
        )),
        span,
    })))
}
}
// Connection wrapper produced by `ProxyProtocolAccept::poll_accept`.
// `pin_project_lite` generates the projection and wires up `PinnedDrop`.
pin_project! {
pub struct WithConnectionGuard<T> {
// Underlying stream; structurally pinned so I/O traits can be forwarded.
#[pin]
pub inner: T,
// Per-connection identifier (see poll_accept, which also puts an id in `span`).
pub connection_id: Uuid,
// Client-connection gauge guard; decrements the gauge when dropped.
// Mutex<Option<..>> — presumably so it can be taken/dropped early by
// other code; TODO confirm against users of this field.
pub gauge: Mutex<Option<IntCounterPairGuard>>,
// Tracing span for this connection, entered again on drop below.
pub span: tracing::Span,
}
// Log connection teardown inside the connection's own span.
impl<S> PinnedDrop for WithConnectionGuard<S> {
fn drop(this: Pin<&mut Self>) {
let _enter = this.span.enter();
tracing::info!("HTTP connection closed")
}
}
}
// Transparent write pass-through: every AsyncWrite method simply delegates to
// the pinned inner stream, so the guard adds no write-path behavior.
impl<T: AsyncWrite> AsyncWrite for WithConnectionGuard<T> {
#[inline]
fn poll_write(
self: Pin<&mut Self>,
cx: &mut Context<'_>,
buf: &[u8],
) -> Poll<Result<usize, io::Error>> {
self.project().inner.poll_write(cx, buf)
}
#[inline]
fn poll_flush(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll<Result<(), io::Error>> {
self.project().inner.poll_flush(cx)
}
#[inline]
fn poll_shutdown(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll<Result<(), io::Error>> {
self.project().inner.poll_shutdown(cx)
}
#[inline]
fn poll_write_vectored(
self: Pin<&mut Self>,
cx: &mut Context<'_>,
bufs: &[io::IoSlice<'_>],
) -> Poll<Result<usize, io::Error>> {
self.project().inner.poll_write_vectored(cx, bufs)
}
// Must mirror the inner stream, otherwise vectored writes would be
// silently flattened by callers that check this flag.
#[inline]
fn is_write_vectored(&self) -> bool {
self.inner.is_write_vectored()
}
}
// Transparent read pass-through to the pinned inner stream; the guard adds no
// read-path behavior.
impl<T: AsyncRead> AsyncRead for WithConnectionGuard<T> {
fn poll_read(
self: Pin<&mut Self>,
cx: &mut Context<'_>,
buf: &mut ReadBuf<'_>,
) -> Poll<io::Result<()>> {
self.project().inner.poll_read(cx, buf)
}
}
#[cfg(test)] #[cfg(test)]
mod tests { mod tests {
use std::pin::pin; use std::pin::pin;

View File

@@ -7,7 +7,6 @@ pub mod handshake;
pub mod passthrough; pub mod passthrough;
pub mod retry; pub mod retry;
pub mod wake_compute; pub mod wake_compute;
pub use copy_bidirectional::copy_bidirectional_client_compute;
use crate::{ use crate::{
auth, auth,
@@ -16,14 +15,16 @@ use crate::{
config::{ProxyConfig, TlsConfig}, config::{ProxyConfig, TlsConfig},
context::RequestMonitoring, context::RequestMonitoring,
error::ReportableError, error::ReportableError,
metrics::{Metrics, NumClientConnectionsGuard}, metrics::{NUM_CLIENT_CONNECTION_GAUGE, NUM_CONNECTION_REQUESTS_GAUGE},
protocol2::WithClientIp, protocol2::WithClientIp,
proxy::handshake::{handshake, HandshakeData}, proxy::handshake::{handshake, HandshakeData},
rate_limiter::EndpointRateLimiter,
stream::{PqStream, Stream}, stream::{PqStream, Stream},
EndpointCacheKey, EndpointCacheKey,
}; };
use futures::TryFutureExt; use futures::TryFutureExt;
use itertools::Itertools; use itertools::Itertools;
use metrics::IntCounterPairGuard;
use once_cell::sync::OnceCell; use once_cell::sync::OnceCell;
use pq_proto::{BeMessage as Be, StartupMessageParams}; use pq_proto::{BeMessage as Be, StartupMessageParams};
use regex::Regex; use regex::Regex;
@@ -60,6 +61,7 @@ pub async fn task_main(
config: &'static ProxyConfig, config: &'static ProxyConfig,
listener: tokio::net::TcpListener, listener: tokio::net::TcpListener,
cancellation_token: CancellationToken, cancellation_token: CancellationToken,
endpoint_rate_limiter: Arc<EndpointRateLimiter>,
cancellation_handler: Arc<CancellationHandlerMain>, cancellation_handler: Arc<CancellationHandlerMain>,
) -> anyhow::Result<()> { ) -> anyhow::Result<()> {
scopeguard::defer! { scopeguard::defer! {
@@ -77,13 +79,13 @@ pub async fn task_main(
{ {
let (socket, peer_addr) = accept_result?; let (socket, peer_addr) = accept_result?;
let conn_gauge = Metrics::get() let conn_gauge = NUM_CLIENT_CONNECTION_GAUGE
.proxy .with_label_values(&["tcp"])
.client_connections .guard();
.guard(crate::metrics::Protocol::Tcp);
let session_id = uuid::Uuid::new_v4(); let session_id = uuid::Uuid::new_v4();
let cancellation_handler = Arc::clone(&cancellation_handler); let cancellation_handler = Arc::clone(&cancellation_handler);
let endpoint_rate_limiter = endpoint_rate_limiter.clone();
tracing::info!(protocol = "tcp", %session_id, "accepted new TCP connection"); tracing::info!(protocol = "tcp", %session_id, "accepted new TCP connection");
@@ -111,12 +113,7 @@ pub async fn task_main(
}, },
}; };
let mut ctx = RequestMonitoring::new( let mut ctx = RequestMonitoring::new(session_id, peer_addr, "tcp", &config.region);
session_id,
peer_addr,
crate::metrics::Protocol::Tcp,
&config.region,
);
let span = ctx.span.clone(); let span = ctx.span.clone();
let res = handle_client( let res = handle_client(
@@ -125,6 +122,7 @@ pub async fn task_main(
cancellation_handler, cancellation_handler,
socket, socket,
ClientMode::Tcp, ClientMode::Tcp,
endpoint_rate_limiter,
conn_gauge, conn_gauge,
) )
.instrument(span.clone()) .instrument(span.clone())
@@ -238,23 +236,20 @@ pub async fn handle_client<S: AsyncRead + AsyncWrite + Unpin>(
cancellation_handler: Arc<CancellationHandlerMain>, cancellation_handler: Arc<CancellationHandlerMain>,
stream: S, stream: S,
mode: ClientMode, mode: ClientMode,
conn_gauge: NumClientConnectionsGuard<'static>, endpoint_rate_limiter: Arc<EndpointRateLimiter>,
conn_gauge: IntCounterPairGuard,
) -> Result<Option<ProxyPassthrough<CancellationHandlerMainInternal, S>>, ClientRequestError> { ) -> Result<Option<ProxyPassthrough<CancellationHandlerMainInternal, S>>, ClientRequestError> {
info!( info!("handling interactive connection from client");
protocol = %ctx.protocol,
"handling interactive connection from client"
);
let metrics = &Metrics::get().proxy;
let proto = ctx.protocol; let proto = ctx.protocol;
// let _client_gauge = metrics.client_connections.guard(proto); let _request_gauge = NUM_CONNECTION_REQUESTS_GAUGE
let _request_gauge = metrics.connection_requests.guard(proto); .with_label_values(&[proto])
.guard();
let tls = config.tls_config.as_ref(); let tls = config.tls_config.as_ref();
let record_handshake_error = !ctx.has_private_peer_addr();
let pause = ctx.latency_timer.pause(crate::metrics::Waiting::Client); let pause = ctx.latency_timer.pause(crate::metrics::Waiting::Client);
let do_handshake = handshake(stream, mode.handshake_tls(tls), record_handshake_error); let do_handshake = handshake(stream, mode.handshake_tls(tls));
let (mut stream, params) = let (mut stream, params) =
match tokio::time::timeout(config.handshake_timeout, do_handshake).await?? { match tokio::time::timeout(config.handshake_timeout, do_handshake).await?? {
HandshakeData::Startup(stream, params) => (stream, params), HandshakeData::Startup(stream, params) => (stream, params),
@@ -283,6 +278,15 @@ pub async fn handle_client<S: AsyncRead + AsyncWrite + Unpin>(
Err(e) => stream.throw_error(e).await?, Err(e) => stream.throw_error(e).await?,
}; };
// check rate limit
if let Some(ep) = user_info.get_endpoint() {
if !endpoint_rate_limiter.check(ep, 1) {
return stream
.throw_error(auth::AuthError::too_many_connections())
.await?;
}
}
let user = user_info.get_user().to_owned(); let user = user_info.get_user().to_owned();
let user_info = match user_info let user_info = match user_info
.authenticate( .authenticate(

View File

@@ -4,7 +4,7 @@ use crate::{
console::{self, errors::WakeComputeError, CachedNodeInfo, NodeInfo}, console::{self, errors::WakeComputeError, CachedNodeInfo, NodeInfo},
context::RequestMonitoring, context::RequestMonitoring,
error::ReportableError, error::ReportableError,
metrics::{ConnectionFailureKind, Metrics}, metrics::NUM_CONNECTION_FAILURES,
proxy::{ proxy::{
retry::{retry_after, ShouldRetry}, retry::{retry_after, ShouldRetry},
wake_compute::wake_compute, wake_compute::wake_compute,
@@ -27,10 +27,10 @@ pub fn invalidate_cache(node_info: console::CachedNodeInfo) -> NodeInfo {
warn!("invalidating stalled compute node info cache entry"); warn!("invalidating stalled compute node info cache entry");
} }
let label = match is_cached { let label = match is_cached {
true => ConnectionFailureKind::ComputeCached, true => "compute_cached",
false => ConnectionFailureKind::ComputeUncached, false => "compute_uncached",
}; };
Metrics::get().proxy.connection_failures_total.inc(label); NUM_CONNECTION_FAILURES.with_label_values(&[label]).inc();
node_info.invalidate() node_info.invalidate()
} }

Some files were not shown because too many files have changed in this diff Show More