diff --git a/.github/actions/allure-report-generate/action.yml b/.github/actions/allure-report-generate/action.yml index 1ecb5ecc7e..f84beff20c 100644 --- a/.github/actions/allure-report-generate/action.yml +++ b/.github/actions/allure-report-generate/action.yml @@ -150,7 +150,7 @@ runs: # Use aws s3 cp (instead of aws s3 sync) to keep files from previous runs to make old URLs work, # and to keep files on the host to upload them to the database - time aws s3 cp --recursive --only-show-errors "${WORKDIR}/report" "s3://${BUCKET}/${REPORT_PREFIX}/${GITHUB_RUN_ID}" + time s5cmd --log error cp "${WORKDIR}/report/*" "s3://${BUCKET}/${REPORT_PREFIX}/${GITHUB_RUN_ID}/" # Generate redirect cat < ${WORKDIR}/index.html diff --git a/.github/actions/neon-branch-create/action.yml b/.github/actions/neon-branch-create/action.yml index f1eea34ab9..dea3fc2357 100644 --- a/.github/actions/neon-branch-create/action.yml +++ b/.github/actions/neon-branch-create/action.yml @@ -10,7 +10,7 @@ inputs: required: true api_host: desctiption: 'Neon API host' - default: console.stage.neon.tech + default: console-stage.neon.build outputs: dsn: description: 'Created Branch DSN (for main database)' diff --git a/.github/actions/neon-branch-delete/action.yml b/.github/actions/neon-branch-delete/action.yml index f8cd351dd9..8acba7ad00 100644 --- a/.github/actions/neon-branch-delete/action.yml +++ b/.github/actions/neon-branch-delete/action.yml @@ -13,7 +13,7 @@ inputs: required: true api_host: desctiption: 'Neon API host' - default: console.stage.neon.tech + default: console-stage.neon.build runs: using: "composite" diff --git a/.github/actions/neon-project-create/action.yml b/.github/actions/neon-project-create/action.yml index ae6464990e..7f0e599b97 100644 --- a/.github/actions/neon-project-create/action.yml +++ b/.github/actions/neon-project-create/action.yml @@ -13,7 +13,7 @@ inputs: default: 15 api_host: desctiption: 'Neon API host' - default: console.stage.neon.tech + default: 
console-stage.neon.build provisioner: desctiption: 'k8s-pod or k8s-neonvm' default: 'k8s-pod' diff --git a/.github/actions/neon-project-delete/action.yml b/.github/actions/neon-project-delete/action.yml index adc8510a34..b8ec6cac70 100644 --- a/.github/actions/neon-project-delete/action.yml +++ b/.github/actions/neon-project-delete/action.yml @@ -10,7 +10,7 @@ inputs: required: true api_host: desctiption: 'Neon API host' - default: console.stage.neon.tech + default: console-stage.neon.build runs: using: "composite" diff --git a/.github/workflows/approved-for-ci-run.yml b/.github/workflows/approved-for-ci-run.yml index 69c48d86b9..ab616d17e2 100644 --- a/.github/workflows/approved-for-ci-run.yml +++ b/.github/workflows/approved-for-ci-run.yml @@ -18,6 +18,7 @@ on: concurrency: group: ${{ github.workflow }}-${{ github.event.pull_request.number }} + cancel-in-progress: false env: GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} diff --git a/.github/workflows/build-build-tools-image.yml b/.github/workflows/build-build-tools-image.yml index 251423e701..c527cef1ac 100644 --- a/.github/workflows/build-build-tools-image.yml +++ b/.github/workflows/build-build-tools-image.yml @@ -21,6 +21,7 @@ defaults: concurrency: group: build-build-tools-image-${{ inputs.image-tag }} + cancel-in-progress: false # No permission for GITHUB_TOKEN by default; the **minimal required** set of permissions should be granted in each job. 
permissions: {} diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 71cb56df91..a6fdd1953e 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -480,6 +480,8 @@ jobs: BUILD_TAG: ${{ needs.tag.outputs.build-tag }} PAGESERVER_VIRTUAL_FILE_IO_ENGINE: tokio-epoll-uring PAGESERVER_GET_VECTORED_IMPL: vectored + PAGESERVER_GET_IMPL: vectored + PAGESERVER_VALIDATE_VEC_GET: true # Temporary disable this step until we figure out why it's so flaky # Ref https://github.com/neondatabase/neon/issues/4540 @@ -559,6 +561,9 @@ jobs: PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}" TEST_RESULT_CONNSTR: "${{ secrets.REGRESS_TEST_RESULT_CONNSTR_NEW }}" PAGESERVER_VIRTUAL_FILE_IO_ENGINE: tokio-epoll-uring + PAGESERVER_GET_VECTORED_IMPL: vectored + PAGESERVER_GET_IMPL: vectored + PAGESERVER_VALIDATE_VEC_GET: false # XXX: no coverage data handling here, since benchmarks are run on release builds, # while coverage is currently collected for the debug ones @@ -738,7 +743,7 @@ jobs: run: | mkdir -p .docker-custom echo DOCKER_CONFIG=$(pwd)/.docker-custom >> $GITHUB_ENV - - uses: docker/setup-buildx-action@v3 + - uses: docker/setup-buildx-action@v2 - uses: docker/login-action@v3 with: @@ -795,7 +800,7 @@ jobs: run: | mkdir -p .docker-custom echo DOCKER_CONFIG=$(pwd)/.docker-custom >> $GITHUB_ENV - - uses: docker/setup-buildx-action@v3 + - uses: docker/setup-buildx-action@v2 with: # Disable parallelism for docker buildkit. # As we already build everything with `make -j$(nproc)`, running it in additional level of parallelisam blows up the Runner. 
@@ -868,7 +873,7 @@ jobs: run: shell: sh -eu {0} env: - VM_BUILDER_VERSION: v0.23.2 + VM_BUILDER_VERSION: v0.28.1 steps: - name: Checkout @@ -1136,8 +1141,6 @@ jobs: -f deployPreprodRegion=true gh workflow --repo neondatabase/aws run deploy-prod.yml --ref main \ - -f deployPgSniRouter=false \ - -f deployProxy=false \ -f deployStorage=true \ -f deployStorageBroker=true \ -f deployStorageController=true \ diff --git a/.github/workflows/check-build-tools-image.yml b/.github/workflows/check-build-tools-image.yml index 28646dfc19..a1e22cf93f 100644 --- a/.github/workflows/check-build-tools-image.yml +++ b/.github/workflows/check-build-tools-image.yml @@ -28,7 +28,9 @@ jobs: - name: Get build-tools image tag for the current commit id: get-build-tools-tag env: - COMMIT_SHA: ${{ github.event.pull_request.head.sha || github.sha }} + # Usually, for COMMIT_SHA, we use `github.event.pull_request.head.sha || github.sha`, but here, even for PRs, + # we want to use `github.sha` i.e. point to a phantom merge commit to determine the image tag correctly. 
+ COMMIT_SHA: ${{ github.sha }} GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} run: | LAST_BUILD_TOOLS_SHA=$( diff --git a/.github/workflows/pin-build-tools-image.yml b/.github/workflows/pin-build-tools-image.yml index c941692066..d495a158e8 100644 --- a/.github/workflows/pin-build-tools-image.yml +++ b/.github/workflows/pin-build-tools-image.yml @@ -20,6 +20,7 @@ defaults: concurrency: group: pin-build-tools-image-${{ inputs.from-tag }} + cancel-in-progress: false permissions: {} diff --git a/Cargo.lock b/Cargo.lock index 9d78a382ea..c71a0e550e 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -270,6 +270,12 @@ dependencies = [ "critical-section", ] +[[package]] +name = "atomic-take" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a8ab6b55fe97976e46f91ddbed8d147d966475dc29b2032757ba47e02376fbc3" + [[package]] name = "autocfg" version = "1.1.0" @@ -298,7 +304,7 @@ dependencies = [ "fastrand 2.0.0", "hex", "http 0.2.9", - "hyper", + "hyper 0.14.26", "ring 0.17.6", "time", "tokio", @@ -335,7 +341,7 @@ dependencies = [ "bytes", "fastrand 2.0.0", "http 0.2.9", - "http-body", + "http-body 0.4.5", "percent-encoding", "pin-project-lite", "tracing", @@ -386,7 +392,7 @@ dependencies = [ "aws-types", "bytes", "http 0.2.9", - "http-body", + "http-body 0.4.5", "once_cell", "percent-encoding", "regex-lite", @@ -514,7 +520,7 @@ dependencies = [ "crc32fast", "hex", "http 0.2.9", - "http-body", + "http-body 0.4.5", "md-5", "pin-project-lite", "sha1", @@ -546,7 +552,7 @@ dependencies = [ "bytes-utils", "futures-core", "http 0.2.9", - "http-body", + "http-body 0.4.5", "once_cell", "percent-encoding", "pin-project-lite", @@ -585,15 +591,15 @@ dependencies = [ "aws-smithy-types", "bytes", "fastrand 2.0.0", - "h2", + "h2 0.3.26", "http 0.2.9", - "http-body", - "hyper", - "hyper-rustls", + "http-body 0.4.5", + "hyper 0.14.26", + "hyper-rustls 0.24.0", "once_cell", "pin-project-lite", "pin-utils", - "rustls 0.21.9", + "rustls 0.21.11", "tokio", 
"tracing", ] @@ -626,7 +632,7 @@ dependencies = [ "bytes-utils", "futures-core", "http 0.2.9", - "http-body", + "http-body 0.4.5", "itoa", "num-integer", "pin-project-lite", @@ -675,10 +681,10 @@ dependencies = [ "bytes", "futures-util", "http 0.2.9", - "http-body", - "hyper", + "http-body 0.4.5", + "hyper 0.14.26", "itoa", - "matchit", + "matchit 0.7.0", "memchr", "mime", "percent-encoding", @@ -691,7 +697,7 @@ dependencies = [ "sha1", "sync_wrapper", "tokio", - "tokio-tungstenite", + "tokio-tungstenite 0.20.0", "tower", "tower-layer", "tower-service", @@ -707,7 +713,7 @@ dependencies = [ "bytes", "futures-util", "http 0.2.9", - "http-body", + "http-body 0.4.5", "mime", "rustversion", "tower-layer", @@ -716,9 +722,9 @@ dependencies = [ [[package]] name = "azure_core" -version = "0.18.0" +version = "0.19.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a6218987c374650fdad0b476bfc675729762c28dfb35f58608a38a2b1ea337dd" +checksum = "70fd680c0d0424a518229b1150922f92653ba2ac933aa000abc8bf1ca08105f7" dependencies = [ "async-trait", "base64 0.21.1", @@ -734,7 +740,7 @@ dependencies = [ "pin-project", "quick-xml", "rand 0.8.5", - "reqwest", + "reqwest 0.11.19", "rustc_version", "serde", "serde_json", @@ -746,9 +752,9 @@ dependencies = [ [[package]] name = "azure_identity" -version = "0.18.1" +version = "0.19.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9e1eacc4f7fb2a73d57c39139d0fc3aed78435606055779ddaef4b43cdf919a8" +checksum = "a6d2060f5b2e1c664026ca4edd561306c473be887c1f7a81f10bf06f9b71c63f" dependencies = [ "async-lock", "async-trait", @@ -766,9 +772,9 @@ dependencies = [ [[package]] name = "azure_storage" -version = "0.18.0" +version = "0.19.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ade8f2653e408de88b9eafec9f48c3c26b94026375e88adbd34523a7dd9795a1" +checksum = "15d3da73bfa09350e1bd6ae2a260806fcf90048c7e78cd2d8f88be60b19a7266" dependencies = [ "RustyXML", 
"async-lock", @@ -785,9 +791,9 @@ dependencies = [ [[package]] name = "azure_storage_blobs" -version = "0.18.0" +version = "0.19.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "025701c7cc5b523100f0f3b2b01723564ec5a86c03236521c06826337047e872" +checksum = "149c21834a4105d761e3dd33d91c2a3064acc05a3c978848ea8089102ae45c94" dependencies = [ "RustyXML", "azure_core", @@ -806,9 +812,9 @@ dependencies = [ [[package]] name = "azure_svc_blobstorage" -version = "0.18.0" +version = "0.19.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "76051e5bb67cea1055abe5e530a0878feac7e0ab4cbbcb4a6adc953a58993389" +checksum = "88c888b7bf522d5405218b8613bf0fae7ddaae6ef3bf4ad42ae005993c96ab8b" dependencies = [ "azure_core", "bytes", @@ -859,6 +865,12 @@ version = "0.21.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3f1e31e207a6b8fb791a38ea3105e6cb541f55e4d029902d3039a4ad07cc4105" +[[package]] +name = "base64" +version = "0.22.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "72b3254f16251a8381aa12e40e3c4d2f0199f8c6508fbecb9d91f575e0fbb8c6" + [[package]] name = "base64-simd" version = "0.8.0" @@ -1124,7 +1136,7 @@ version = "4.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "191d9573962933b4027f932c600cd252ce27a8ad5979418fe78e43c07996f27b" dependencies = [ - "heck", + "heck 0.4.1", "proc-macro2", "quote", "syn 2.0.52", @@ -1196,7 +1208,7 @@ dependencies = [ "compute_api", "flate2", "futures", - "hyper", + "hyper 0.14.26", "nix 0.27.1", "notify", "num_cpus", @@ -1204,7 +1216,7 @@ dependencies = [ "postgres", "regex", "remote_storage", - "reqwest", + "reqwest 0.12.4", "rust-ini", "serde", "serde_json", @@ -1313,7 +1325,8 @@ dependencies = [ "git-version", "hex", "humantime", - "hyper", + "humantime-serde", + "hyper 0.14.26", "nix 0.27.1", "once_cell", "pageserver_api", @@ -1322,7 +1335,7 @@ dependencies = [ "postgres_backend", 
"postgres_connection", "regex", - "reqwest", + "reqwest 0.12.4", "safekeeper_api", "scopeguard", "serde", @@ -1462,12 +1475,9 @@ dependencies = [ [[package]] name = "crossbeam-utils" -version = "0.8.15" +version = "0.8.19" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3c063cd8cc95f5c377ed0d4b49a4b21f632396ff690e8470c29b3359b346984b" -dependencies = [ - "cfg-if", -] +checksum = "248e3bacc7dc6baa3b21e405ee045c3047101a49145e7e9eca583ab4c2ca5345" [[package]] name = "crossterm" @@ -1840,23 +1850,12 @@ checksum = "5443807d6dff69373d433ab9ef5378ad8df50ca6298caf15de6e52e24aaf54d5" [[package]] name = "errno" -version = "0.3.1" +version = "0.3.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4bcfec3a70f97c962c307b2d2c56e358cf1d00b558d74262b5f929ee8cc7e73a" +checksum = "a258e46cdc063eb8519c00b9fc845fc47bcfca4130e2f08e88665ceda8474245" dependencies = [ - "errno-dragonfly", - "libc", - "windows-sys 0.48.0", -] - -[[package]] -name = "errno-dragonfly" -version = "0.1.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "aa68f1b12764fab894d2755d2518754e71b4fd80ecfb822714a1206c2aab39bf" -dependencies = [ - "cc", "libc", + "windows-sys 0.52.0", ] [[package]] @@ -2213,6 +2212,25 @@ dependencies = [ "tracing", ] +[[package]] +name = "h2" +version = "0.4.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "816ec7294445779408f36fe57bc5b7fc1cf59664059096c65f905c1c61f58069" +dependencies = [ + "bytes", + "fnv", + "futures-core", + "futures-sink", + "futures-util", + "http 1.1.0", + "indexmap 2.0.1", + "slab", + "tokio", + "tokio-util", + "tracing", +] + [[package]] name = "half" version = "1.8.2" @@ -2294,6 +2312,12 @@ version = "0.4.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "95505c38b4572b2d910cecb0281560f54b440a19336cbbcb27bf6ce6adc6f5a8" +[[package]] +name = "heck" +version = "0.5.0" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea" + [[package]] name = "hermit-abi" version = "0.3.3" @@ -2345,6 +2369,17 @@ dependencies = [ "winapi", ] +[[package]] +name = "hostname" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f9c7c7c8ac16c798734b8a24560c1362120597c40d5e1459f09498f8f6c8f2ba" +dependencies = [ + "cfg-if", + "libc", + "windows 0.52.0", +] + [[package]] name = "http" version = "0.2.9" @@ -2378,6 +2413,29 @@ dependencies = [ "pin-project-lite", ] +[[package]] +name = "http-body" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1cac85db508abc24a2e48553ba12a996e87244a0395ce011e62b37158745d643" +dependencies = [ + "bytes", + "http 1.1.0", +] + +[[package]] +name = "http-body-util" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "41cb79eb393015dadd30fc252023adb0b2400a0caee0fa2a077e6e21a551e840" +dependencies = [ + "bytes", + "futures-util", + "http 1.1.0", + "http-body 1.0.0", + "pin-project-lite", +] + [[package]] name = "http-types" version = "2.12.0" @@ -2436,9 +2494,9 @@ dependencies = [ "futures-channel", "futures-core", "futures-util", - "h2", + "h2 0.3.26", "http 0.2.9", - "http-body", + "http-body 0.4.5", "httparse", "httpdate", "itoa", @@ -2450,6 +2508,27 @@ dependencies = [ "want", ] +[[package]] +name = "hyper" +version = "1.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "186548d73ac615b32a73aafe38fb4f56c0d340e110e5a200bcadbaf2e199263a" +dependencies = [ + "bytes", + "futures-channel", + "futures-util", + "h2 0.4.4", + "http 1.1.0", + "http-body 1.0.0", + "httparse", + "httpdate", + "itoa", + "pin-project-lite", + "smallvec", + "tokio", + "want", +] + [[package]] name = "hyper-rustls" version = "0.24.0" @@ -2457,21 +2536,38 @@ source = 
"registry+https://github.com/rust-lang/crates.io-index" checksum = "0646026eb1b3eea4cd9ba47912ea5ce9cc07713d105b1a14698f4e6433d348b7" dependencies = [ "http 0.2.9", - "hyper", + "hyper 0.14.26", "log", - "rustls 0.21.9", + "rustls 0.21.11", "rustls-native-certs 0.6.2", "tokio", "tokio-rustls 0.24.0", ] +[[package]] +name = "hyper-rustls" +version = "0.26.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a0bea761b46ae2b24eb4aef630d8d1c398157b6fc29e6350ecf090a0b70c952c" +dependencies = [ + "futures-util", + "http 1.1.0", + "hyper 1.2.0", + "hyper-util", + "rustls 0.22.4", + "rustls-pki-types", + "tokio", + "tokio-rustls 0.25.0", + "tower-service", +] + [[package]] name = "hyper-timeout" version = "0.4.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "bbb958482e8c7be4bc3cf272a766a2b0bf1a6755e7a6ae777f017a31d11b13b1" dependencies = [ - "hyper", + "hyper 0.14.26", "pin-project-lite", "tokio", "tokio-io-timeout", @@ -2484,7 +2580,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d6183ddfa99b85da61a140bea0efc93fdf56ceaa041b37d553518030827f9905" dependencies = [ "bytes", - "hyper", + "hyper 0.14.26", "native-tls", "tokio", "tokio-native-tls", @@ -2492,15 +2588,37 @@ dependencies = [ [[package]] name = "hyper-tungstenite" -version = "0.11.1" +version = "0.13.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7cc7dcb1ab67cd336f468a12491765672e61a3b6b148634dbfe2fe8acd3fe7d9" +checksum = "7a343d17fe7885302ed7252767dc7bb83609a874b6ff581142241ec4b73957ad" dependencies = [ - "hyper", + "http-body-util", + "hyper 1.2.0", + "hyper-util", "pin-project-lite", "tokio", - "tokio-tungstenite", - "tungstenite", + "tokio-tungstenite 0.21.0", + "tungstenite 0.21.0", +] + +[[package]] +name = "hyper-util" +version = "0.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ca38ef113da30126bbff9cd1705f9273e15d45498615d138b0c20279ac7a76aa" 
+dependencies = [ + "bytes", + "futures-channel", + "futures-util", + "http 1.1.0", + "http-body 1.0.0", + "hyper 1.2.0", + "pin-project-lite", + "socket2 0.5.5", + "tokio", + "tower", + "tower-service", + "tracing", ] [[package]] @@ -2514,7 +2632,7 @@ dependencies = [ "iana-time-zone-haiku", "js-sys", "wasm-bindgen", - "windows", + "windows 0.48.0", ] [[package]] @@ -2685,9 +2803,9 @@ dependencies = [ [[package]] name = "js-sys" -version = "0.3.63" +version = "0.3.69" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2f37a4a5928311ac501dee68b3c7613a1037d0edb30c8e5427bd832d55d1b790" +checksum = "29c15563dc2726973df627357ce0c9ddddbea194836909d655df6a75d2cf296d" dependencies = [ "wasm-bindgen", ] @@ -2794,6 +2912,12 @@ version = "0.3.8" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ef53942eb7bf7ff43a617b3e2c1c4a5ecf5944a7c1bc12d7ee39bbb15e5c1519" +[[package]] +name = "linux-raw-sys" +version = "0.4.13" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "01cda141df6706de531b6c46c3a33ecca755538219bd484262fa09410c13539c" + [[package]] name = "lock_api" version = "0.4.10" @@ -2831,6 +2955,12 @@ version = "0.7.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b87248edafb776e59e6ee64a79086f65890d3510f2c656c000bf2a7e8a0aea40" +[[package]] +name = "matchit" +version = "0.8.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "540f1c43aed89909c0cc0cc604e3bb2f7e7a341a3728a9e6cfe760e733cd11ed" + [[package]] name = "md-5" version = "0.10.5" @@ -2848,11 +2978,12 @@ checksum = "490cc448043f947bae3cbee9c203358d62dbee0db12107a74be5c30ccfd09771" [[package]] name = "measured" -version = "0.0.13" +version = "0.0.21" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f246648d027839a34b420e27c7de1165ace96e19ef894985d0a6ff89a7840a9f" +checksum = "652bc741286361c06de8cb4d89b21a6437f120c508c51713663589eeb9928ac5" 
dependencies = [ "bytes", + "crossbeam-utils", "hashbrown 0.14.0", "itoa", "lasso", @@ -2865,16 +2996,27 @@ dependencies = [ [[package]] name = "measured-derive" -version = "0.0.13" +version = "0.0.21" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "edaa5cc22d99d5d6d7d99c3b5b5f7e7f8034c22f1b5d62a1adecd2ed005d9b80" +checksum = "6ea497f33e1e856a376c32ad916f69a0bd3c597db1f912a399f842b01a4a685d" dependencies = [ - "heck", + "heck 0.5.0", "proc-macro2", "quote", "syn 2.0.52", ] +[[package]] +name = "measured-process" +version = "0.0.21" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b364ccb66937a814b6b2ad751d1a2f7a9d5a78c761144036825fb36bb0771000" +dependencies = [ + "libc", + "measured", + "procfs 0.16.0", +] + [[package]] name = "memchr" version = "2.6.4" @@ -2914,8 +3056,10 @@ version = "0.1.0" dependencies = [ "chrono", "libc", + "measured", + "measured-process", "once_cell", - "procfs", + "procfs 0.14.2", "prometheus", "rand 0.8.5", "rand_distr", @@ -2950,16 +3094,6 @@ version = "0.3.17" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6877bb514081ee2a7ff5ef9de3281f14a4dd4bceac4c09388074a6b5df8a139a" -[[package]] -name = "mime_guess" -version = "2.0.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4192263c238a5f0d0c6bfd21f336a313a4ce1c450542449ca191bb657b4642ef" -dependencies = [ - "mime", - "unicase", -] - [[package]] name = "minimal-lexical" version = "0.2.1" @@ -3086,6 +3220,16 @@ dependencies = [ "winapi", ] +[[package]] +name = "nu-ansi-term" +version = "0.46.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "77a8165726e8236064dbb45459242600304b42a5ea24ee2948e18e023bf7ba84" +dependencies = [ + "overload", + "winapi", +] + [[package]] name = "num" version = "0.4.1" @@ -3293,7 +3437,7 @@ dependencies = [ "bytes", "http 0.2.9", "opentelemetry_api", - "reqwest", + "reqwest 0.11.19", ] [[package]] @@ -3311,7 
+3455,7 @@ dependencies = [ "opentelemetry_api", "opentelemetry_sdk", "prost", - "reqwest", + "reqwest 0.11.19", "thiserror", "tokio", "tonic", @@ -3422,6 +3566,12 @@ version = "0.5.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4030760ffd992bef45b0ae3f10ce1aba99e33464c90d14dd7c039884963ddc7a" +[[package]] +name = "overload" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b15813163c1d831bf4a13c3610c05c0d03b39feb07f7e09fa234dac9b15aaf39" + [[package]] name = "p256" version = "0.11.1" @@ -3465,12 +3615,17 @@ dependencies = [ "camino", "clap", "git-version", + "humantime", "pageserver", + "pageserver_api", "postgres_ffi", + "remote_storage", "serde", "serde_json", "svg_fmt", "tokio", + "tokio-util", + "toml_edit", "utils", "workspace_hack", ] @@ -3505,7 +3660,7 @@ dependencies = [ "hex-literal", "humantime", "humantime-serde", - "hyper", + "hyper 0.14.26", "itertools", "leaky-bucket", "md5", @@ -3524,11 +3679,11 @@ dependencies = [ "postgres_connection", "postgres_ffi", "pq_proto", - "procfs", + "procfs 0.14.2", "rand 0.8.5", "regex", "remote_storage", - "reqwest", + "reqwest 0.12.4", "rpds", "scopeguard", "serde", @@ -3554,6 +3709,7 @@ dependencies = [ "tokio-util", "toml_edit", "tracing", + "twox-hash", "url", "utils", "walkdir", @@ -3597,7 +3753,7 @@ dependencies = [ "futures", "pageserver_api", "postgres", - "reqwest", + "reqwest 0.12.4", "serde", "thiserror", "tokio", @@ -3954,7 +4110,7 @@ dependencies = [ "futures", "once_cell", "pq_proto", - "rustls 0.22.2", + "rustls 0.22.4", "rustls-pemfile 2.1.1", "serde", "thiserror", @@ -4083,6 +4239,29 @@ dependencies = [ "rustix 0.36.16", ] +[[package]] +name = "procfs" +version = "0.16.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "731e0d9356b0c25f16f33b5be79b1c57b562f141ebfcdb0ad8ac2c13a24293b4" +dependencies = [ + "bitflags 2.4.1", + "hex", + "lazy_static", + "procfs-core", + "rustix 0.38.28", +] + 
+[[package]] +name = "procfs-core" +version = "0.16.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2d3554923a69f4ce04c4a754260c338f505ce22642d3830e049a399fc2059a29" +dependencies = [ + "bitflags 2.4.1", + "hex", +] + [[package]] name = "prometheus" version = "0.13.3" @@ -4095,7 +4274,7 @@ dependencies = [ "libc", "memchr", "parking_lot 0.12.1", - "procfs", + "procfs 0.14.2", "thiserror", ] @@ -4116,7 +4295,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "119533552c9a7ffacc21e099c24a0ac8bb19c2a2a3f363de84cd9b844feab270" dependencies = [ "bytes", - "heck", + "heck 0.4.1", "itertools", "lazy_static", "log", @@ -4160,6 +4339,7 @@ dependencies = [ "anyhow", "async-compression", "async-trait", + "atomic-take", "aws-config", "aws-sdk-iam", "aws-sigv4", @@ -4181,15 +4361,19 @@ dependencies = [ "hashlink", "hex", "hmac", - "hostname", + "hostname 0.3.1", "http 1.1.0", + "http-body-util", "humantime", - "hyper", + "hyper 0.14.26", + "hyper 1.2.0", "hyper-tungstenite", + "hyper-util", "ipnet", "itertools", "lasso", "md5", + "measured", "metrics", "native-tls", "once_cell", @@ -4210,14 +4394,14 @@ dependencies = [ "redis", "regex", "remote_storage", - "reqwest", + "reqwest 0.12.4", "reqwest-middleware", "reqwest-retry", "reqwest-tracing", "routerify", "rstest", "rustc-hash", - "rustls 0.22.2", + "rustls 0.22.4", "rustls-pemfile 2.1.1", "scopeguard", "serde", @@ -4237,6 +4421,7 @@ dependencies = [ "tokio-postgres-rustls", "tokio-rustls 0.25.0", "tokio-util", + "tower-service", "tracing", "tracing-opentelemetry", "tracing-subscriber", @@ -4409,7 +4594,7 @@ dependencies = [ "itoa", "percent-encoding", "pin-project-lite", - "rustls 0.22.2", + "rustls 0.22.4", "rustls-native-certs 0.7.0", "rustls-pemfile 2.1.1", "rustls-pki-types", @@ -4518,7 +4703,7 @@ dependencies = [ "futures-util", "http-types", "humantime", - "hyper", + "hyper 0.14.26", "itertools", "metrics", "once_cell", @@ -4527,6 +4712,7 @@ dependencies = 
[ "scopeguard", "serde", "serde_json", + "sync_wrapper", "test-context", "tokio", "tokio-stream", @@ -4548,73 +4734,110 @@ dependencies = [ "encoding_rs", "futures-core", "futures-util", - "h2", + "h2 0.3.26", "http 0.2.9", - "http-body", - "hyper", - "hyper-rustls", + "http-body 0.4.5", + "hyper 0.14.26", "hyper-tls", "ipnet", "js-sys", "log", "mime", - "mime_guess", "native-tls", "once_cell", "percent-encoding", "pin-project-lite", - "rustls 0.21.9", - "rustls-pemfile 1.0.2", "serde", "serde_json", "serde_urlencoded", "tokio", "tokio-native-tls", - "tokio-rustls 0.24.0", "tokio-util", "tower-service", "url", "wasm-bindgen", "wasm-bindgen-futures", - "wasm-streams", + "wasm-streams 0.3.0", "web-sys", - "webpki-roots 0.25.2", - "winreg", + "winreg 0.50.0", +] + +[[package]] +name = "reqwest" +version = "0.12.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "566cafdd92868e0939d3fb961bd0dc25fcfaaed179291093b3d43e6b3150ea10" +dependencies = [ + "base64 0.22.1", + "bytes", + "futures-channel", + "futures-core", + "futures-util", + "http 1.1.0", + "http-body 1.0.0", + "http-body-util", + "hyper 1.2.0", + "hyper-rustls 0.26.0", + "hyper-util", + "ipnet", + "js-sys", + "log", + "mime", + "once_cell", + "percent-encoding", + "pin-project-lite", + "rustls 0.22.4", + "rustls-pemfile 2.1.1", + "rustls-pki-types", + "serde", + "serde_json", + "serde_urlencoded", + "sync_wrapper", + "tokio", + "tokio-rustls 0.25.0", + "tokio-util", + "tower-service", + "url", + "wasm-bindgen", + "wasm-bindgen-futures", + "wasm-streams 0.4.0", + "web-sys", + "webpki-roots 0.26.1", + "winreg 0.52.0", ] [[package]] name = "reqwest-middleware" -version = "0.2.2" +version = "0.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4531c89d50effe1fac90d095c8b133c20c5c714204feee0bfc3fd158e784209d" +checksum = "0209efb52486ad88136190094ee214759ef7507068b27992256ed6610eb71a01" dependencies = [ "anyhow", "async-trait", - "http 0.2.9", - 
"reqwest", + "http 1.1.0", + "reqwest 0.12.4", "serde", - "task-local-extensions", "thiserror", + "tower-service", ] [[package]] name = "reqwest-retry" -version = "0.2.2" +version = "0.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "48d0fd6ef4c6d23790399fe15efc8d12cd9f3d4133958f9bd7801ee5cbaec6c4" +checksum = "40f342894422862af74c50e1e9601cf0931accc9c6981e5eb413c46603b616b5" dependencies = [ "anyhow", "async-trait", "chrono", "futures", "getrandom 0.2.11", - "http 0.2.9", - "hyper", + "http 1.1.0", + "hyper 1.2.0", "parking_lot 0.11.2", - "reqwest", + "reqwest 0.12.4", "reqwest-middleware", "retry-policies", - "task-local-extensions", "tokio", "tracing", "wasm-timer", @@ -4622,27 +4845,27 @@ dependencies = [ [[package]] name = "reqwest-tracing" -version = "0.4.7" +version = "0.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5a0152176687dd5cfe7f507ac1cb1a491c679cfe483afd133a7db7aaea818bb3" +checksum = "b253954a1979e02eabccd7e9c3d61d8f86576108baa160775e7f160bb4e800a3" dependencies = [ "anyhow", "async-trait", "getrandom 0.2.11", - "matchit", + "http 1.1.0", + "matchit 0.8.2", "opentelemetry", - "reqwest", + "reqwest 0.12.4", "reqwest-middleware", - "task-local-extensions", "tracing", "tracing-opentelemetry", ] [[package]] name = "retry-policies" -version = "0.1.2" +version = "0.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e09bbcb5003282bcb688f0bae741b278e9c7e8f378f561522c9806c58e075d9b" +checksum = "493b4243e32d6eedd29f9a398896e35c6943a123b55eec97dcaee98310d25810" dependencies = [ "anyhow", "chrono", @@ -4696,7 +4919,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "496c1d3718081c45ba9c31fbfc07417900aa96f4070ff90dc29961836b7a9945" dependencies = [ "http 0.2.9", - "hyper", + "hyper 0.14.26", "lazy_static", "percent-encoding", "regex", @@ -4809,10 +5032,23 @@ dependencies = [ ] [[package]] -name = "rustls" -version = "0.21.9" +name 
= "rustix" +version = "0.38.28" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "629648aced5775d558af50b2b4c7b02983a04b312126d45eeead26e7caa498b9" +checksum = "72e572a5e8ca657d7366229cdde4bd14c4eb5499a9573d4d366fe1b599daa316" +dependencies = [ + "bitflags 2.4.1", + "errno", + "libc", + "linux-raw-sys 0.4.13", + "windows-sys 0.52.0", +] + +[[package]] +name = "rustls" +version = "0.21.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7fecbfb7b1444f477b345853b1fce097a2c6fb637b2bfb87e6bc5db0f043fae4" dependencies = [ "log", "ring 0.17.6", @@ -4822,9 +5058,9 @@ dependencies = [ [[package]] name = "rustls" -version = "0.22.2" +version = "0.22.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e87c9956bd9807afa1f77e0f7594af32566e830e088a5576d27c5b6f30f49d41" +checksum = "bf4ef73721ac7bcd79b2b315da7779d8fc09718c6b3d2d1b2d94850eb8c18432" dependencies = [ "log", "ring 0.17.6", @@ -4938,6 +5174,7 @@ dependencies = [ "aws-smithy-async", "bincode", "bytes", + "camino", "chrono", "clap", "crc32c", @@ -4947,18 +5184,23 @@ dependencies = [ "hex", "histogram", "itertools", + "native-tls", "pageserver", "pageserver_api", + "postgres-native-tls", + "postgres_ffi", "rand 0.8.5", "remote_storage", - "reqwest", + "reqwest 0.12.4", "serde", "serde_json", "serde_with", "thiserror", "tokio", + "tokio-postgres", "tokio-rustls 0.25.0", "tokio-stream", + "tokio-util", "tracing", "tracing-appender", "tracing-subscriber", @@ -4988,7 +5230,7 @@ dependencies = [ "git-version", "hex", "humantime", - "hyper", + "hyper 0.14.26", "metrics", "once_cell", "parking_lot 0.12.1", @@ -5000,7 +5242,7 @@ dependencies = [ "rand 0.8.5", "regex", "remote_storage", - "reqwest", + "reqwest 0.12.4", "safekeeper_api", "scopeguard", "sd-notify", @@ -5130,13 +5372,13 @@ checksum = "bebd363326d05ec3e2f532ab7660680f3b02130d780c299bca73469d521bc0ed" [[package]] name = "sentry" -version = "0.31.6" +version = "0.32.3" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "2e95efd0cefa32028cdb9766c96de71d96671072f9fb494dc9fb84c0ef93e52b" +checksum = "00421ed8fa0c995f07cde48ba6c89e80f2b312f74ff637326f392fbfd23abe02" dependencies = [ "httpdate", - "reqwest", - "rustls 0.21.9", + "reqwest 0.12.4", + "rustls 0.21.11", "sentry-backtrace", "sentry-contexts", "sentry-core", @@ -5149,9 +5391,9 @@ dependencies = [ [[package]] name = "sentry-backtrace" -version = "0.31.6" +version = "0.32.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6ac2bac6f310c4c4c4bb094d1541d32ae497f8c5c23405e85492cefdfe0971a9" +checksum = "a79194074f34b0cbe5dd33896e5928bbc6ab63a889bd9df2264af5acb186921e" dependencies = [ "backtrace", "once_cell", @@ -5161,11 +5403,11 @@ dependencies = [ [[package]] name = "sentry-contexts" -version = "0.31.6" +version = "0.32.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6c3e17295cecdbacf66c5bd38d6e1147e09e1e9d824d2d5341f76638eda02a3a" +checksum = "eba8870c5dba2bfd9db25c75574a11429f6b95957b0a78ac02e2970dd7a5249a" dependencies = [ - "hostname", + "hostname 0.4.0", "libc", "os_info", "rustc_version", @@ -5175,9 +5417,9 @@ dependencies = [ [[package]] name = "sentry-core" -version = "0.31.6" +version = "0.32.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8339474f587f36cb110fa1ed1b64229eea6d47b0b886375579297b7e47aeb055" +checksum = "46a75011ea1c0d5c46e9e57df03ce81f5c7f0a9e199086334a1f9c0a541e0826" dependencies = [ "once_cell", "rand 0.8.5", @@ -5188,9 +5430,9 @@ dependencies = [ [[package]] name = "sentry-panic" -version = "0.31.6" +version = "0.32.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "875b69f506da75bd664029eafb05f8934297d2990192896d17325f066bd665b7" +checksum = "2eaa3ecfa3c8750c78dcfd4637cfa2598b95b52897ed184b4dc77fcf7d95060d" dependencies = [ "sentry-backtrace", "sentry-core", @@ -5198,9 +5440,9 @@ dependencies = [ [[package]] 
name = "sentry-tracing" -version = "0.31.6" +version = "0.32.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "89feead9bdd116f8035e89567651340fc382db29240b6c55ef412078b08d1aa3" +checksum = "f715932bf369a61b7256687c6f0554141b7ce097287e30e3f7ed6e9de82498fe" dependencies = [ "sentry-backtrace", "sentry-core", @@ -5210,13 +5452,13 @@ dependencies = [ [[package]] name = "sentry-types" -version = "0.31.6" +version = "0.32.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "99dc599bd6646884fc403d593cdcb9816dd67c50cff3271c01ff123617908dcd" +checksum = "4519c900ce734f7a0eb7aba0869dfb225a7af8820634a7dd51449e3b093cfb7c" dependencies = [ "debugid", - "getrandom 0.2.11", "hex", + "rand 0.8.5", "serde", "serde_json", "thiserror", @@ -5473,9 +5715,9 @@ dependencies = [ [[package]] name = "smallvec" -version = "1.11.0" +version = "1.13.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "62bb4feee49fdd9f707ef802e22365a35de4b7b299de4763d44bfea899442ff9" +checksum = "e6ecd384b10a64542d77071bd64bd7b231f4ed5940fba55e98c3de13824cf3d7" [[package]] name = "smol_str" @@ -5567,7 +5809,7 @@ dependencies = [ "futures-util", "git-version", "humantime", - "hyper", + "hyper 0.14.26", "metrics", "once_cell", "parking_lot 0.12.1", @@ -5598,7 +5840,7 @@ dependencies = [ "git-version", "hex", "humantime", - "hyper", + "hyper 0.14.26", "itertools", "lasso", "measured", @@ -5608,10 +5850,12 @@ dependencies = [ "pageserver_client", "postgres_connection", "r2d2", - "reqwest", + "reqwest 0.12.4", "routerify", "serde", "serde_json", + "strum", + "strum_macros", "thiserror", "tokio", "tokio-util", @@ -5627,10 +5871,10 @@ dependencies = [ "anyhow", "clap", "comfy-table", - "hyper", + "hyper 0.14.26", "pageserver_api", "pageserver_client", - "reqwest", + "reqwest 0.12.4", "serde", "serde_json", "thiserror", @@ -5668,7 +5912,7 @@ version = "0.24.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum 
= "1e385be0d24f186b4ce2f9982191e7101bb737312ad61c1f2f984f34bcf85d59" dependencies = [ - "heck", + "heck 0.4.1", "proc-macro2", "quote", "rustversion", @@ -5684,8 +5928,7 @@ checksum = "81cdd64d312baedb58e21336b31bc043b77e01cc99033ce76ef539f78e965ebc" [[package]] name = "svg_fmt" version = "0.4.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f83ba502a3265efb76efb89b0a2f7782ad6f2675015d4ce37e4b547dda42b499" +source = "git+https://github.com/neondatabase/fork--nical--rust_debug?branch=neon#c1820b28664b5df68de7f043fccf2ed5d67b6ae8" [[package]] name = "syn" @@ -5714,6 +5957,9 @@ name = "sync_wrapper" version = "0.1.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2047c6ded9c721764247e62cd3b03c09ffc529b2ba5b10ec482ae507a4a70160" +dependencies = [ + "futures-core", +] [[package]] name = "synstructure" @@ -6047,7 +6293,7 @@ checksum = "0ea13f22eda7127c827983bdaf0d7fff9df21c8817bab02815ac277a21143677" dependencies = [ "futures", "ring 0.17.6", - "rustls 0.22.2", + "rustls 0.22.4", "tokio", "tokio-postgres", "tokio-rustls 0.25.0", @@ -6060,7 +6306,7 @@ version = "0.24.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e0d409377ff5b1e3ca6437aa86c1eb7d40c134bfec254e44c830defa92669db5" dependencies = [ - "rustls 0.21.9", + "rustls 0.21.11", "tokio", ] @@ -6070,7 +6316,7 @@ version = "0.25.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "775e0c0f0adb3a2f22a00c4745d728b479985fc15ee7ca6a2608388c5569860f" dependencies = [ - "rustls 0.22.2", + "rustls 0.22.4", "rustls-pki-types", "tokio", ] @@ -6110,7 +6356,19 @@ dependencies = [ "futures-util", "log", "tokio", - "tungstenite", + "tungstenite 0.20.1", +] + +[[package]] +name = "tokio-tungstenite" +version = "0.21.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c83b561d025642014097b66e6c1bb422783339e0909e4429cde4749d1990bc38" +dependencies = [ + "futures-util", + "log", + 
"tokio", + "tungstenite 0.21.0", ] [[package]] @@ -6177,10 +6435,10 @@ dependencies = [ "bytes", "futures-core", "futures-util", - "h2", + "h2 0.3.26", "http 0.2.9", - "http-body", - "hyper", + "http-body 0.4.5", + "hyper 0.14.26", "hyper-timeout", "percent-encoding", "pin-project", @@ -6320,12 +6578,14 @@ dependencies = [ [[package]] name = "tracing-opentelemetry" -version = "0.20.0" +version = "0.21.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fc09e402904a5261e42cf27aea09ccb7d5318c6717a9eec3d8e2e65c56b18f19" +checksum = "75327c6b667828ddc28f5e3f169036cb793c3f588d83bf0f262a7f062ffed3c8" dependencies = [ "once_cell", "opentelemetry", + "opentelemetry_sdk", + "smallvec", "tracing", "tracing-core", "tracing-log", @@ -6349,6 +6609,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "30a651bc37f915e81f087d86e62a18eec5f79550c7faff886f7090b4ea757c77" dependencies = [ "matchers", + "nu-ansi-term", "once_cell", "regex", "serde", @@ -6366,11 +6627,11 @@ dependencies = [ name = "tracing-utils" version = "0.1.0" dependencies = [ - "hyper", + "hyper 0.14.26", "opentelemetry", "opentelemetry-otlp", "opentelemetry-semantic-conventions", - "reqwest", + "reqwest 0.12.4", "tokio", "tracing", "tracing-opentelemetry", @@ -6403,6 +6664,25 @@ dependencies = [ "utf-8", ] +[[package]] +name = "tungstenite" +version = "0.21.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9ef1a641ea34f399a848dea702823bbecfb4c486f911735368f1f137cb8257e1" +dependencies = [ + "byteorder", + "bytes", + "data-encoding", + "http 1.1.0", + "httparse", + "log", + "rand 0.8.5", + "sha1", + "thiserror", + "url", + "utf-8", +] + [[package]] name = "twox-hash" version = "1.6.3" @@ -6437,15 +6717,6 @@ dependencies = [ "libc", ] -[[package]] -name = "unicase" -version = "2.6.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "50f37be617794602aabbeee0be4f259dc1778fabe05e2d67ee8f79326d5cb4f6" 
-dependencies = [ - "version_check", -] - [[package]] name = "unicode-bidi" version = "0.3.13" @@ -6500,7 +6771,7 @@ dependencies = [ "base64 0.21.1", "log", "once_cell", - "rustls 0.21.9", + "rustls 0.21.11", "rustls-webpki 0.100.2", "url", "webpki-roots 0.23.1", @@ -6568,7 +6839,7 @@ dependencies = [ "hex", "hex-literal", "humantime", - "hyper", + "hyper 0.14.26", "jsonwebtoken", "leaky-bucket", "metrics", @@ -6727,9 +6998,9 @@ checksum = "9c8d87e72b64a3b4db28d11ce29237c246188f4f51057d65a7eab63b7987e423" [[package]] name = "wasm-bindgen" -version = "0.2.86" +version = "0.2.92" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5bba0e8cb82ba49ff4e229459ff22a191bbe9a1cb3a341610c9c33efc27ddf73" +checksum = "4be2531df63900aeb2bca0daaaddec08491ee64ceecbee5076636a3b026795a8" dependencies = [ "cfg-if", "wasm-bindgen-macro", @@ -6737,9 +7008,9 @@ dependencies = [ [[package]] name = "wasm-bindgen-backend" -version = "0.2.86" +version = "0.2.92" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "19b04bc93f9d6bdee709f6bd2118f57dd6679cf1176a1af464fca3ab0d66d8fb" +checksum = "614d787b966d3989fa7bb98a654e369c762374fd3213d212cfc0251257e747da" dependencies = [ "bumpalo", "log", @@ -6752,9 +7023,9 @@ dependencies = [ [[package]] name = "wasm-bindgen-futures" -version = "0.4.36" +version = "0.4.42" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2d1985d03709c53167ce907ff394f5316aa22cb4e12761295c5dc57dacb6297e" +checksum = "76bc14366121efc8dbb487ab05bcc9d346b3b5ec0eaa76e46594cabbe51762c0" dependencies = [ "cfg-if", "js-sys", @@ -6764,9 +7035,9 @@ dependencies = [ [[package]] name = "wasm-bindgen-macro" -version = "0.2.86" +version = "0.2.92" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "14d6b024f1a526bb0234f52840389927257beb670610081360e5a03c5df9c258" +checksum = "a1f8823de937b71b9460c0c34e25f3da88250760bec0ebac694b49997550d726" dependencies = [ "quote", 
"wasm-bindgen-macro-support", @@ -6774,9 +7045,9 @@ dependencies = [ [[package]] name = "wasm-bindgen-macro-support" -version = "0.2.86" +version = "0.2.92" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e128beba882dd1eb6200e1dc92ae6c5dbaa4311aa7bb211ca035779e5efc39f8" +checksum = "e94f17b526d0a461a191c78ea52bbce64071ed5c04c9ffe424dcb38f74171bb7" dependencies = [ "proc-macro2", "quote", @@ -6787,9 +7058,9 @@ dependencies = [ [[package]] name = "wasm-bindgen-shared" -version = "0.2.86" +version = "0.2.92" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ed9d5b4305409d1fc9482fee2d7f9bcbf24b3972bf59817ef757e23982242a93" +checksum = "af190c94f2773fdb3729c55b007a722abb5384da03bc0986df4c289bf5567e96" [[package]] name = "wasm-streams" @@ -6804,6 +7075,19 @@ dependencies = [ "web-sys", ] +[[package]] +name = "wasm-streams" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b65dc4c90b63b118468cf747d8bf3566c1913ef60be765b5730ead9e0a3ba129" +dependencies = [ + "futures-util", + "js-sys", + "wasm-bindgen", + "wasm-bindgen-futures", + "web-sys", +] + [[package]] name = "wasm-timer" version = "0.2.5" @@ -6821,9 +7105,9 @@ dependencies = [ [[package]] name = "web-sys" -version = "0.3.63" +version = "0.3.69" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3bdd9ef4e984da1187bf8110c5cf5b845fbc87a23602cdf912386a76fcd3a7c2" +checksum = "77afa9a11836342370f4817622a2f0f418b134426d91a82dfb48f532d2ec13ef" dependencies = [ "js-sys", "wasm-bindgen", @@ -6844,6 +7128,15 @@ version = "0.25.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "14247bb57be4f377dfb94c72830b8ce8fc6beac03cf4bf7b9732eadd414123fc" +[[package]] +name = "webpki-roots" +version = "0.26.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b3de34ae270483955a94f4b21bdaaeb83d508bb84a01435f393818edb0012009" +dependencies = [ + 
"rustls-pki-types", +] + [[package]] name = "which" version = "4.4.0" @@ -6895,6 +7188,25 @@ dependencies = [ "windows-targets 0.48.0", ] +[[package]] +name = "windows" +version = "0.52.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e48a53791691ab099e5e2ad123536d0fff50652600abaf43bbf952894110d0be" +dependencies = [ + "windows-core", + "windows-targets 0.52.4", +] + +[[package]] +name = "windows-core" +version = "0.52.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "33ab640c8d7e35bf8ba19b884ba838ceb4fba93a4e8c65a9059d08afcfc683d9" +dependencies = [ + "windows-targets 0.52.4", +] + [[package]] name = "windows-sys" version = "0.42.0" @@ -6928,6 +7240,15 @@ dependencies = [ "windows-targets 0.48.0", ] +[[package]] +name = "windows-sys" +version = "0.52.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "282be5f36a8ce781fad8c8ae18fa3f9beff57ec1b52cb3de0789201425d9a33d" +dependencies = [ + "windows-targets 0.52.4", +] + [[package]] name = "windows-targets" version = "0.42.2" @@ -6958,6 +7279,21 @@ dependencies = [ "windows_x86_64_msvc 0.48.0", ] +[[package]] +name = "windows-targets" +version = "0.52.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7dd37b7e5ab9018759f893a1952c9420d060016fc19a472b4bb20d1bdd694d1b" +dependencies = [ + "windows_aarch64_gnullvm 0.52.4", + "windows_aarch64_msvc 0.52.4", + "windows_i686_gnu 0.52.4", + "windows_i686_msvc 0.52.4", + "windows_x86_64_gnu 0.52.4", + "windows_x86_64_gnullvm 0.52.4", + "windows_x86_64_msvc 0.52.4", +] + [[package]] name = "windows_aarch64_gnullvm" version = "0.42.2" @@ -6970,6 +7306,12 @@ version = "0.48.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "91ae572e1b79dba883e0d315474df7305d12f569b400fcf90581b06062f7e1bc" +[[package]] +name = "windows_aarch64_gnullvm" +version = "0.52.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum 
= "bcf46cf4c365c6f2d1cc93ce535f2c8b244591df96ceee75d8e83deb70a9cac9" + [[package]] name = "windows_aarch64_msvc" version = "0.42.2" @@ -6982,6 +7324,12 @@ version = "0.48.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b2ef27e0d7bdfcfc7b868b317c1d32c641a6fe4629c171b8928c7b08d98d7cf3" +[[package]] +name = "windows_aarch64_msvc" +version = "0.52.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "da9f259dd3bcf6990b55bffd094c4f7235817ba4ceebde8e6d11cd0c5633b675" + [[package]] name = "windows_i686_gnu" version = "0.42.2" @@ -6994,6 +7342,12 @@ version = "0.48.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "622a1962a7db830d6fd0a69683c80a18fda201879f0f447f065a3b7467daa241" +[[package]] +name = "windows_i686_gnu" +version = "0.52.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b474d8268f99e0995f25b9f095bc7434632601028cf86590aea5c8a5cb7801d3" + [[package]] name = "windows_i686_msvc" version = "0.42.2" @@ -7006,6 +7360,12 @@ version = "0.48.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4542c6e364ce21bf45d69fdd2a8e455fa38d316158cfd43b3ac1c5b1b19f8e00" +[[package]] +name = "windows_i686_msvc" +version = "0.52.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1515e9a29e5bed743cb4415a9ecf5dfca648ce85ee42e15873c3cd8610ff8e02" + [[package]] name = "windows_x86_64_gnu" version = "0.42.2" @@ -7018,6 +7378,12 @@ version = "0.48.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ca2b8a661f7628cbd23440e50b05d705db3686f894fc9580820623656af974b1" +[[package]] +name = "windows_x86_64_gnu" +version = "0.52.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5eee091590e89cc02ad514ffe3ead9eb6b660aedca2183455434b93546371a03" + [[package]] name = "windows_x86_64_gnullvm" version = "0.42.2" @@ -7030,6 +7396,12 @@ version = "0.48.0" source = 
"registry+https://github.com/rust-lang/crates.io-index" checksum = "7896dbc1f41e08872e9d5e8f8baa8fdd2677f29468c4e156210174edc7f7b953" +[[package]] +name = "windows_x86_64_gnullvm" +version = "0.52.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "77ca79f2451b49fa9e2af39f0747fe999fcda4f5e241b2898624dca97a1f2177" + [[package]] name = "windows_x86_64_msvc" version = "0.42.2" @@ -7042,6 +7414,12 @@ version = "0.48.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1a515f5799fe4961cb532f983ce2b23082366b898e52ffbce459c86f67c8378a" +[[package]] +name = "windows_x86_64_msvc" +version = "0.52.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "32b752e52a2da0ddfbdbcc6fceadfeede4c939ed16d13e648833a61dfb611ed8" + [[package]] name = "winnow" version = "0.4.6" @@ -7061,6 +7439,16 @@ dependencies = [ "windows-sys 0.48.0", ] +[[package]] +name = "winreg" +version = "0.52.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a277a57398d4bfa075df44f501a17cfdf8542d224f0d36095a2adc7aee4ef0a5" +dependencies = [ + "cfg-if", + "windows-sys 0.48.0", +] + [[package]] name = "workspace_hack" version = "0.1.0" @@ -7090,11 +7478,10 @@ dependencies = [ "futures-sink", "futures-util", "getrandom 0.2.11", - "hashbrown 0.13.2", "hashbrown 0.14.0", "hex", "hmac", - "hyper", + "hyper 0.14.26", "indexmap 1.9.3", "itertools", "libc", @@ -7111,8 +7498,9 @@ dependencies = [ "regex", "regex-automata 0.4.3", "regex-syntax 0.8.2", - "reqwest", - "rustls 0.21.9", + "reqwest 0.11.19", + "reqwest 0.12.4", + "rustls 0.21.11", "scopeguard", "serde", "serde_json", @@ -7121,6 +7509,7 @@ dependencies = [ "subtle", "syn 1.0.109", "syn 2.0.52", + "sync_wrapper", "time", "time-macros", "tokio", @@ -7132,7 +7521,6 @@ dependencies = [ "tower", "tracing", "tracing-core", - "tungstenite", "url", "uuid", "zeroize", diff --git a/Cargo.toml b/Cargo.toml index 3c6077648e..a6d406dc2f 100644 --- a/Cargo.toml 
+++ b/Cargo.toml @@ -44,10 +44,11 @@ license = "Apache-2.0" anyhow = { version = "1.0", features = ["backtrace"] } arc-swap = "1.6" async-compression = { version = "0.4.0", features = ["tokio", "gzip", "zstd"] } -azure_core = "0.18" -azure_identity = "0.18" -azure_storage = "0.18" -azure_storage_blobs = "0.18" +atomic-take = "1.1.0" +azure_core = "0.19" +azure_identity = "0.19" +azure_storage = "0.19" +azure_storage_blobs = "0.19" flate2 = "1.0.26" async-stream = "0.3" async-trait = "0.1" @@ -97,7 +98,7 @@ http-types = { version = "2", default-features = false } humantime = "2.1" humantime-serde = "1.1.1" hyper = "0.14" -hyper-tungstenite = "0.11" +hyper-tungstenite = "0.13.0" inotify = "0.10.2" ipnet = "2.9.0" itertools = "0.10" @@ -106,7 +107,8 @@ lasso = "0.7" leaky-bucket = "1.0.1" libc = "0.2" md5 = "0.7.0" -measured = { version = "0.0.13", features=["default", "lasso"] } +measured = { version = "0.0.21", features=["lasso"] } +measured-process = { version = "0.0.21" } memoffset = "0.8" native-tls = "0.2" nix = { version = "0.27", features = ["fs", "process", "socket", "signal", "poll"] } @@ -128,10 +130,10 @@ prost = "0.11" rand = "0.8" redis = { version = "0.25.2", features = ["tokio-rustls-comp", "keep-alive"] } regex = "1.10.2" -reqwest = { version = "0.11", default-features = false, features = ["rustls-tls"] } -reqwest-tracing = { version = "0.4.7", features = ["opentelemetry_0_20"] } -reqwest-middleware = "0.2.0" -reqwest-retry = "0.2.2" +reqwest = { version = "0.12", default-features = false, features = ["rustls-tls"] } +reqwest-tracing = { version = "0.5", features = ["opentelemetry_0_20"] } +reqwest-middleware = "0.3.0" +reqwest-retry = "0.5" routerify = "3" rpds = "0.13" rustc-hash = "1.1.0" @@ -141,7 +143,7 @@ rustls-split = "0.3" scopeguard = "1.1" sysinfo = "0.29.2" sd-notify = "0.4.1" -sentry = { version = "0.31", default-features = false, features = ["backtrace", "contexts", "panic", "rustls", "reqwest" ] } +sentry = { version = "0.32", 
default-features = false, features = ["backtrace", "contexts", "panic", "rustls", "reqwest" ] } serde = { version = "1.0", features = ["derive"] } serde_json = "1" serde_path_to_error = "0.1" @@ -155,7 +157,8 @@ socket2 = "0.5" strum = "0.24" strum_macros = "0.24" "subtle" = "2.5.0" -svg_fmt = "0.4.1" +# https://github.com/nical/rust_debug/pull/4 +svg_fmt = { git = "https://github.com/neondatabase/fork--nical--rust_debug", branch = "neon" } sync_wrapper = "0.1.2" tar = "0.4" task-local-extensions = "0.1.4" @@ -174,10 +177,11 @@ tokio-util = { version = "0.7.10", features = ["io", "rt"] } toml = "0.7" toml_edit = "0.19" tonic = {version = "0.9", features = ["tls", "tls-roots"]} +tower-service = "0.3.2" tracing = "0.1" tracing-error = "0.2.0" -tracing-opentelemetry = "0.20.0" -tracing-subscriber = { version = "0.3", default_features = false, features = ["smallvec", "fmt", "tracing-log", "std", "env-filter", "json"] } +tracing-opentelemetry = "0.21.0" +tracing-subscriber = { version = "0.3", default_features = false, features = ["smallvec", "fmt", "tracing-log", "std", "env-filter", "json", "ansi"] } twox-hash = { version = "1.6.3", default-features = false } url = "2.2" urlencoding = "2.1" diff --git a/Dockerfile.build-tools b/Dockerfile.build-tools index 1ed6f87473..19739cc1f8 100644 --- a/Dockerfile.build-tools +++ b/Dockerfile.build-tools @@ -58,8 +58,14 @@ RUN curl -fsSL "https://github.com/protocolbuffers/protobuf/releases/download/v$ && mv protoc/include/google /usr/local/include/google \ && rm -rf protoc.zip protoc +# s5cmd +ENV S5CMD_VERSION=2.2.2 +RUN curl -sL "https://github.com/peak/s5cmd/releases/download/v${S5CMD_VERSION}/s5cmd_${S5CMD_VERSION}_Linux-$(uname -m | sed 's/x86_64/64bit/g' | sed 's/aarch64/arm64/g').tar.gz" | tar zxvf - s5cmd \ + && chmod +x s5cmd \ + && mv s5cmd /usr/local/bin/s5cmd + # LLVM -ENV LLVM_VERSION=17 +ENV LLVM_VERSION=18 RUN curl -fsSL 'https://apt.llvm.org/llvm-snapshot.gpg.key' | apt-key add - \ && echo "deb 
http://apt.llvm.org/bullseye/ llvm-toolchain-bullseye-${LLVM_VERSION} main" > /etc/apt/sources.list.d/llvm.stable.list \ && apt update \ @@ -135,7 +141,7 @@ WORKDIR /home/nonroot # Rust # Please keep the version of llvm (installed above) in sync with rust llvm (`rustc --version --verbose | grep LLVM`) -ENV RUSTC_VERSION=1.77.0 +ENV RUSTC_VERSION=1.78.0 ENV RUSTUP_HOME="/home/nonroot/.rustup" ENV PATH="/home/nonroot/.cargo/bin:${PATH}" RUN curl -sSO https://static.rust-lang.org/rustup/dist/$(uname -m)-unknown-linux-gnu/rustup-init && whoami && \ diff --git a/Makefile b/Makefile index f13f080f1a..5e2b3c4367 100644 --- a/Makefile +++ b/Makefile @@ -25,14 +25,16 @@ ifeq ($(UNAME_S),Linux) # Seccomp BPF is only available for Linux PG_CONFIGURE_OPTS += --with-libseccomp else ifeq ($(UNAME_S),Darwin) - # macOS with brew-installed openssl requires explicit paths - # It can be configured with OPENSSL_PREFIX variable - OPENSSL_PREFIX ?= $(shell brew --prefix openssl@3) - PG_CONFIGURE_OPTS += --with-includes=$(OPENSSL_PREFIX)/include --with-libraries=$(OPENSSL_PREFIX)/lib - PG_CONFIGURE_OPTS += PKG_CONFIG_PATH=$(shell brew --prefix icu4c)/lib/pkgconfig - # macOS already has bison and flex in the system, but they are old and result in postgres-v14 target failure - # brew formulae are keg-only and not symlinked into HOMEBREW_PREFIX, force their usage - EXTRA_PATH_OVERRIDES += $(shell brew --prefix bison)/bin/:$(shell brew --prefix flex)/bin/: + ifndef DISABLE_HOMEBREW + # macOS with brew-installed openssl requires explicit paths + # It can be configured with OPENSSL_PREFIX variable + OPENSSL_PREFIX ?= $(shell brew --prefix openssl@3) + PG_CONFIGURE_OPTS += --with-includes=$(OPENSSL_PREFIX)/include --with-libraries=$(OPENSSL_PREFIX)/lib + PG_CONFIGURE_OPTS += PKG_CONFIG_PATH=$(shell brew --prefix icu4c)/lib/pkgconfig + # macOS already has bison and flex in the system, but they are old and result in postgres-v14 target failure + # brew formulae are keg-only and not symlinked into 
HOMEBREW_PREFIX, force their usage + EXTRA_PATH_OVERRIDES += $(shell brew --prefix bison)/bin/:$(shell brew --prefix flex)/bin/: + endif endif # Use -C option so that when PostgreSQL "make install" installs the diff --git a/compute_tools/src/bin/compute_ctl.rs b/compute_tools/src/bin/compute_ctl.rs index 117919786e..67c5250376 100644 --- a/compute_tools/src/bin/compute_ctl.rs +++ b/compute_tools/src/bin/compute_ctl.rs @@ -47,7 +47,7 @@ use chrono::Utc; use clap::Arg; use signal_hook::consts::{SIGQUIT, SIGTERM}; use signal_hook::{consts::SIGINT, iterator::Signals}; -use tracing::{error, info}; +use tracing::{error, info, warn}; use url::Url; use compute_api::responses::ComputeStatus; @@ -62,6 +62,7 @@ use compute_tools::logger::*; use compute_tools::monitor::launch_monitor; use compute_tools::params::*; use compute_tools::spec::*; +use compute_tools::swap::resize_swap; // this is an arbitrary build tag. Fine as a default / for testing purposes // in-case of not-set environment var @@ -110,6 +111,7 @@ fn main() -> Result<()> { .expect("Postgres connection string is required"); let spec_json = matches.get_one::("spec"); let spec_path = matches.get_one::("spec-path"); + let resize_swap_on_bind = matches.get_flag("resize-swap-on-bind"); // Extract OpenTelemetry context for the startup actions from the // TRACEPARENT and TRACESTATE env variables, and attach it to the current @@ -226,14 +228,14 @@ fn main() -> Result<()> { // If this is a pooled VM, prewarm before starting HTTP server and becoming // available for binding. Prewarming helps Postgres start quicker later, - // because QEMU will already have it's memory allocated from the host, and + // because QEMU will already have its memory allocated from the host, and // the necessary binaries will already be cached. if !spec_set { compute.prewarm_postgres()?; } - // Launch http service first, so we were able to serve control-plane - // requests, while configuration is still in progress. 
+ // Launch http service first, so that we can serve control-plane requests + // while configuration is still in progress. let _http_handle = launch_http_server(http_port, &compute).expect("cannot launch http endpoint thread"); @@ -253,21 +255,22 @@ fn main() -> Result<()> { break; } } + + // Record for how long we slept waiting for the spec. + let now = Utc::now(); + state.metrics.wait_for_spec_ms = now + .signed_duration_since(state.start_time) + .to_std() + .unwrap() + .as_millis() as u64; + + // Reset start time, so that the total startup time that is calculated later will + // not include the time that we waited for the spec. + state.start_time = now; } // We got all we need, update the state. let mut state = compute.state.lock().unwrap(); - - // Record for how long we slept waiting for the spec. - state.metrics.wait_for_spec_ms = Utc::now() - .signed_duration_since(state.start_time) - .to_std() - .unwrap() - .as_millis() as u64; - // Reset start time to the actual start of the configuration, so that - // total startup time was properly measured at the end. - state.start_time = Utc::now(); - state.status = ComputeStatus::Init; compute.state_changed.notify_all(); @@ -275,33 +278,72 @@ fn main() -> Result<()> { "running compute with features: {:?}", state.pspec.as_ref().unwrap().spec.features ); + // before we release the mutex, fetch the swap size (if any) for later. 
+ let swap_size_bytes = state.pspec.as_ref().unwrap().spec.swap_size_bytes; drop(state); // Launch remaining service threads let _monitor_handle = launch_monitor(&compute); let _configurator_handle = launch_configurator(&compute); - // Start Postgres + let mut prestartup_failed = false; let mut delay_exit = false; - let mut exit_code = None; - let pg = match compute.start_compute(extension_server_port) { - Ok(pg) => Some(pg), - Err(err) => { - error!("could not start the compute node: {:#}", err); - let mut state = compute.state.lock().unwrap(); - state.error = Some(format!("{:?}", err)); - state.status = ComputeStatus::Failed; - // Notify others that Postgres failed to start. In case of configuring the - // empty compute, it's likely that API handler is still waiting for compute - // state change. With this we will notify it that compute is in Failed state, - // so control plane will know about it earlier and record proper error instead - // of timeout. - compute.state_changed.notify_all(); - drop(state); // unlock - delay_exit = true; - None + + // Resize swap to the desired size if the compute spec says so + if let (Some(size_bytes), true) = (swap_size_bytes, resize_swap_on_bind) { + // To avoid 'swapoff' hitting postgres startup, we need to run resize-swap to completion + // *before* starting postgres. + // + // In theory, we could do this asynchronously if SkipSwapon was enabled for VMs, but this + // carries a risk of introducing hard-to-debug issues - e.g. if postgres sometimes gets + // OOM-killed during startup because swap wasn't available yet. + match resize_swap(size_bytes) { + Ok(()) => { + let size_gib = size_bytes as f32 / (1 << 20) as f32; // just for more coherent display. + info!(%size_bytes, %size_gib, "resized swap"); + } + Err(err) => { + let err = err.context("failed to resize swap"); + error!("{err:#}"); + + // Mark compute startup as failed; don't try to start postgres, and report this + // error to the control plane when it next asks. 
+ prestartup_failed = true; + let mut state = compute.state.lock().unwrap(); + state.error = Some(format!("{err:?}")); + state.status = ComputeStatus::Failed; + compute.state_changed.notify_all(); + delay_exit = true; + } } - }; + } + + // Start Postgres + let mut pg = None; + let mut exit_code = None; + + if !prestartup_failed { + pg = match compute.start_compute(extension_server_port) { + Ok(pg) => Some(pg), + Err(err) => { + error!("could not start the compute node: {:#}", err); + let mut state = compute.state.lock().unwrap(); + state.error = Some(format!("{:?}", err)); + state.status = ComputeStatus::Failed; + // Notify others that Postgres failed to start. In case of configuring the + // empty compute, it's likely that API handler is still waiting for compute + // state change. With this we will notify it that compute is in Failed state, + // so control plane will know about it earlier and record proper error instead + // of timeout. + compute.state_changed.notify_all(); + drop(state); // unlock + delay_exit = true; + None + } + }; + } else { + warn!("skipping postgres startup because pre-startup step failed"); + } // Start the vm-monitor if directed to. The vm-monitor only runs on linux // because it requires cgroups. 
@@ -526,6 +568,11 @@ fn cli() -> clap::Command { ) .value_name("FILECACHE_CONNSTR"), ) + .arg( + Arg::new("resize-swap-on-bind") + .long("resize-swap-on-bind") + .action(clap::ArgAction::SetTrue), + ) } /// When compute_ctl is killed, send also termination signal to sync-safekeepers diff --git a/compute_tools/src/compute.rs b/compute_tools/src/compute.rs index 88dc4aca2b..40060f4117 100644 --- a/compute_tools/src/compute.rs +++ b/compute_tools/src/compute.rs @@ -818,9 +818,15 @@ impl ComputeNode { Client::connect(zenith_admin_connstr.as_str(), NoTls) .context("broken cloud_admin credential: tried connecting with cloud_admin but could not authenticate, and zenith_admin does not work either")?; // Disable forwarding so that users don't get a cloud_admin role - client.simple_query("SET neon.forward_ddl = false")?; - client.simple_query("CREATE USER cloud_admin WITH SUPERUSER")?; - client.simple_query("GRANT zenith_admin TO cloud_admin")?; + + let mut func = || { + client.simple_query("SET neon.forward_ddl = false")?; + client.simple_query("CREATE USER cloud_admin WITH SUPERUSER")?; + client.simple_query("GRANT zenith_admin TO cloud_admin")?; + Ok::<_, anyhow::Error>(()) + }; + func().context("apply_config setup cloud_admin")?; + drop(client); // reconnect with connstring with expected name @@ -832,24 +838,29 @@ impl ComputeNode { }; // Disable DDL forwarding because control plane already knows about these roles/databases. - client.simple_query("SET neon.forward_ddl = false")?; + client + .simple_query("SET neon.forward_ddl = false") + .context("apply_config SET neon.forward_ddl = false")?; // Proceed with post-startup configuration. Note, that order of operations is important. 
let spec = &compute_state.pspec.as_ref().expect("spec must be set").spec; - create_neon_superuser(spec, &mut client)?; - cleanup_instance(&mut client)?; - handle_roles(spec, &mut client)?; - handle_databases(spec, &mut client)?; - handle_role_deletions(spec, connstr.as_str(), &mut client)?; + create_neon_superuser(spec, &mut client).context("apply_config create_neon_superuser")?; + cleanup_instance(&mut client).context("apply_config cleanup_instance")?; + handle_roles(spec, &mut client).context("apply_config handle_roles")?; + handle_databases(spec, &mut client).context("apply_config handle_databases")?; + handle_role_deletions(spec, connstr.as_str(), &mut client) + .context("apply_config handle_role_deletions")?; handle_grants( spec, &mut client, connstr.as_str(), self.has_feature(ComputeFeature::AnonExtension), - )?; - handle_extensions(spec, &mut client)?; - handle_extension_neon(&mut client)?; - create_availability_check_data(&mut client)?; + ) + .context("apply_config handle_grants")?; + handle_extensions(spec, &mut client).context("apply_config handle_extensions")?; + handle_extension_neon(&mut client).context("apply_config handle_extension_neon")?; + create_availability_check_data(&mut client) + .context("apply_config create_availability_check_data")?; // 'Close' connection drop(client); @@ -857,7 +868,7 @@ impl ComputeNode { // Run migrations separately to not hold up cold starts thread::spawn(move || { let mut client = Client::connect(connstr.as_str(), NoTls)?; - handle_migrations(&mut client) + handle_migrations(&mut client).context("apply_config handle_migrations") }); Ok(()) } diff --git a/compute_tools/src/config.rs b/compute_tools/src/config.rs index f1fd8637f5..89c866b20c 100644 --- a/compute_tools/src/config.rs +++ b/compute_tools/src/config.rs @@ -6,8 +6,8 @@ use std::path::Path; use anyhow::Result; use crate::pg_helpers::escape_conf_value; -use crate::pg_helpers::PgOptionsSerialize; -use compute_api::spec::{ComputeMode, ComputeSpec}; +use 
crate::pg_helpers::{GenericOptionExt, PgOptionsSerialize}; +use compute_api::spec::{ComputeMode, ComputeSpec, GenericOption}; /// Check that `line` is inside a text file and put it there if it is not. /// Create file if it doesn't exist. @@ -92,6 +92,27 @@ pub fn write_postgres_conf( } } + if cfg!(target_os = "linux") { + // Check /proc/sys/vm/overcommit_memory -- if it equals 2 (i.e. linux memory overcommit is + // disabled), then the control plane has enabled swap and we should set + // dynamic_shared_memory_type = 'mmap'. + // + // This is (maybe?) temporary - for more, see https://github.com/neondatabase/cloud/issues/12047. + let overcommit_memory_contents = std::fs::read_to_string("/proc/sys/vm/overcommit_memory") + // ignore any errors - they may be expected to occur under certain situations (e.g. when + // not running in Linux). + .unwrap_or_else(|_| String::new()); + if overcommit_memory_contents.trim() == "2" { + let opt = GenericOption { + name: "dynamic_shared_memory_type".to_owned(), + value: Some("mmap".to_owned()), + vartype: "enum".to_owned(), + }; + + write!(file, "{}", opt.to_pg_setting())?; + } + } + // If there are any extra options in the 'settings' field, append those if spec.cluster.settings.is_some() { writeln!(file, "# Managed by compute_ctl: begin")?; diff --git a/compute_tools/src/lib.rs b/compute_tools/src/lib.rs index 4e01ffd954..eac808385c 100644 --- a/compute_tools/src/lib.rs +++ b/compute_tools/src/lib.rs @@ -14,4 +14,5 @@ pub mod monitor; pub mod params; pub mod pg_helpers; pub mod spec; +pub mod swap; pub mod sync_sk; diff --git a/compute_tools/src/pg_helpers.rs b/compute_tools/src/pg_helpers.rs index 5deb50d6b7..fa0822748b 100644 --- a/compute_tools/src/pg_helpers.rs +++ b/compute_tools/src/pg_helpers.rs @@ -44,7 +44,7 @@ pub fn escape_conf_value(s: &str) -> String { format!("'{}'", res) } -trait GenericOptionExt { +pub trait GenericOptionExt { fn to_pg_option(&self) -> String; fn to_pg_setting(&self) -> String; } diff --git 
a/compute_tools/src/spec.rs b/compute_tools/src/spec.rs index 5643634633..3a6e18b638 100644 --- a/compute_tools/src/spec.rs +++ b/compute_tools/src/spec.rs @@ -2,7 +2,7 @@ use std::fs::File; use std::path::Path; use std::str::FromStr; -use anyhow::{anyhow, bail, Result}; +use anyhow::{anyhow, bail, Context, Result}; use postgres::config::Config; use postgres::{Client, NoTls}; use reqwest::StatusCode; @@ -490,7 +490,7 @@ pub fn handle_databases(spec: &ComputeSpec, client: &mut Client) -> Result<()> { "rename_db" => { let new_name = op.new_name.as_ref().unwrap(); - if existing_dbs.get(&op.name).is_some() { + if existing_dbs.contains_key(&op.name) { let query: String = format!( "ALTER DATABASE {} RENAME TO {}", op.name.pg_quote(), @@ -698,7 +698,8 @@ pub fn handle_grants( // it is important to run this after all grants if enable_anon_extension { - handle_extension_anon(spec, &db.owner, &mut db_client, false)?; + handle_extension_anon(spec, &db.owner, &mut db_client, false) + .context("handle_grants handle_extension_anon")?; } } @@ -813,28 +814,36 @@ $$;"#, // Add new migrations below. 
]; - let mut query = "CREATE SCHEMA IF NOT EXISTS neon_migration"; - client.simple_query(query)?; + let mut func = || { + let query = "CREATE SCHEMA IF NOT EXISTS neon_migration"; + client.simple_query(query)?; - query = "CREATE TABLE IF NOT EXISTS neon_migration.migration_id (key INT NOT NULL PRIMARY KEY, id bigint NOT NULL DEFAULT 0)"; - client.simple_query(query)?; + let query = "CREATE TABLE IF NOT EXISTS neon_migration.migration_id (key INT NOT NULL PRIMARY KEY, id bigint NOT NULL DEFAULT 0)"; + client.simple_query(query)?; - query = "INSERT INTO neon_migration.migration_id VALUES (0, 0) ON CONFLICT DO NOTHING"; - client.simple_query(query)?; + let query = "INSERT INTO neon_migration.migration_id VALUES (0, 0) ON CONFLICT DO NOTHING"; + client.simple_query(query)?; - query = "ALTER SCHEMA neon_migration OWNER TO cloud_admin"; - client.simple_query(query)?; + let query = "ALTER SCHEMA neon_migration OWNER TO cloud_admin"; + client.simple_query(query)?; - query = "REVOKE ALL ON SCHEMA neon_migration FROM PUBLIC"; - client.simple_query(query)?; + let query = "REVOKE ALL ON SCHEMA neon_migration FROM PUBLIC"; + client.simple_query(query)?; + Ok::<_, anyhow::Error>(()) + }; + func().context("handle_migrations prepare")?; - query = "SELECT id FROM neon_migration.migration_id"; - let row = client.query_one(query, &[])?; + let query = "SELECT id FROM neon_migration.migration_id"; + let row = client + .query_one(query, &[]) + .context("handle_migrations get migration_id")?; let mut current_migration: usize = row.get::<&str, i64>("id") as usize; let starting_migration_id = current_migration; - query = "BEGIN"; - client.simple_query(query)?; + let query = "BEGIN"; + client + .simple_query(query) + .context("handle_migrations begin")?; while current_migration < migrations.len() { let migration = &migrations[current_migration]; @@ -842,7 +851,9 @@ $$;"#, info!("Skip migration id={}", current_migration); } else { info!("Running migration:\n{}\n", migration); - 
client.simple_query(migration)?; + client.simple_query(migration).with_context(|| { + format!("handle_migrations current_migration={}", current_migration) + })?; } current_migration += 1; } @@ -850,10 +861,14 @@ $$;"#, "UPDATE neon_migration.migration_id SET id={}", migrations.len() ); - client.simple_query(&setval)?; + client + .simple_query(&setval) + .context("handle_migrations update id")?; - query = "COMMIT"; - client.simple_query(query)?; + let query = "COMMIT"; + client + .simple_query(query) + .context("handle_migrations commit")?; info!( "Ran {} migrations", diff --git a/compute_tools/src/swap.rs b/compute_tools/src/swap.rs new file mode 100644 index 0000000000..c22b6bc14e --- /dev/null +++ b/compute_tools/src/swap.rs @@ -0,0 +1,36 @@ +use anyhow::{anyhow, Context}; +use tracing::warn; + +pub const RESIZE_SWAP_BIN: &str = "/neonvm/bin/resize-swap"; + +pub fn resize_swap(size_bytes: u64) -> anyhow::Result<()> { + // run `/neonvm/bin/resize-swap --once {size_bytes}` + // + // Passing '--once' causes resize-swap to delete itself after successful completion, which + // means that if compute_ctl restarts later, we won't end up calling 'swapoff' while + // postgres is running. + // + // NOTE: resize-swap is not very clever. If present, --once MUST be the first arg. 
+ let child_result = std::process::Command::new("/usr/bin/sudo") + .arg(RESIZE_SWAP_BIN) + .arg("--once") + .arg(size_bytes.to_string()) + .spawn(); + + if matches!(&child_result, Err(e) if e.kind() == std::io::ErrorKind::NotFound) { + warn!("ignoring \"not found\" error from resize-swap to avoid swapoff while compute is running"); + return Ok(()); + } + + child_result + .context("spawn() failed") + .and_then(|mut child| child.wait().context("wait() failed")) + .and_then(|status| match status.success() { + true => Ok(()), + false => Err(anyhow!("process exited with {status}")), + }) + // wrap any prior error with the overall context that we couldn't run the command + .with_context(|| { + format!("could not run `/usr/bin/sudo {RESIZE_SWAP_BIN} --once {size_bytes}`") + }) +} diff --git a/control_plane/Cargo.toml b/control_plane/Cargo.toml index b544a8c587..2ce041068e 100644 --- a/control_plane/Cargo.toml +++ b/control_plane/Cargo.toml @@ -17,6 +17,7 @@ nix.workspace = true once_cell.workspace = true postgres.workspace = true hex.workspace = true +humantime-serde.workspace = true hyper.workspace = true regex.workspace = true reqwest = { workspace = true, features = ["blocking", "json"] } diff --git a/control_plane/src/bin/neon_local.rs b/control_plane/src/bin/neon_local.rs index 56495dd2da..e01d5c9799 100644 --- a/control_plane/src/bin/neon_local.rs +++ b/control_plane/src/bin/neon_local.rs @@ -14,15 +14,15 @@ use control_plane::pageserver::{PageServerNode, PAGESERVER_REMOTE_STORAGE_DIR}; use control_plane::safekeeper::SafekeeperNode; use control_plane::storage_controller::StorageController; use control_plane::{broker, local_env}; +use pageserver_api::config::{ + DEFAULT_HTTP_LISTEN_PORT as DEFAULT_PAGESERVER_HTTP_PORT, + DEFAULT_PG_LISTEN_PORT as DEFAULT_PAGESERVER_PG_PORT, +}; use pageserver_api::controller_api::PlacementPolicy; use pageserver_api::models::{ ShardParameters, TenantCreateRequest, TimelineCreateRequest, TimelineInfo, }; use 
pageserver_api::shard::{ShardCount, ShardStripeSize, TenantShardId}; -use pageserver_api::{ - DEFAULT_HTTP_LISTEN_PORT as DEFAULT_PAGESERVER_HTTP_PORT, - DEFAULT_PG_LISTEN_PORT as DEFAULT_PAGESERVER_PG_PORT, -}; use postgres_backend::AuthType; use postgres_connection::parse_host_port; use safekeeper_api::{ @@ -417,6 +417,54 @@ async fn handle_tenant( println!("{} {:?}", t.id, t.state); } } + Some(("import", import_match)) => { + let tenant_id = parse_tenant_id(import_match)?.unwrap_or_else(TenantId::generate); + + let storage_controller = StorageController::from_env(env); + let create_response = storage_controller.tenant_import(tenant_id).await?; + + let shard_zero = create_response + .shards + .first() + .expect("Import response omitted shards"); + + let attached_pageserver_id = shard_zero.node_id; + let pageserver = + PageServerNode::from_env(env, env.get_pageserver_conf(attached_pageserver_id)?); + + println!( + "Imported tenant {tenant_id}, attached to pageserver {attached_pageserver_id}" + ); + + let timelines = pageserver + .http_client + .list_timelines(shard_zero.shard_id) + .await?; + + // Pick a 'main' timeline that has no ancestors, the rest will get arbitrary names + let main_timeline = timelines + .iter() + .find(|t| t.ancestor_timeline_id.is_none()) + .expect("No timelines found") + .timeline_id; + + let mut branch_i = 0; + for timeline in timelines.iter() { + let branch_name = if timeline.timeline_id == main_timeline { + "main".to_string() + } else { + branch_i += 1; + format!("branch_{branch_i}") + }; + + println!( + "Importing timeline {tenant_id}/{} as branch {branch_name}", + timeline.timeline_id + ); + + env.register_branch_mapping(branch_name, tenant_id, timeline.timeline_id)?; + } + } Some(("create", create_match)) => { let tenant_conf: HashMap<_, _> = create_match .get_many::("config") @@ -789,6 +837,8 @@ async fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Re .copied() .unwrap_or(false); + let allow_multiple = 
sub_args.get_flag("allow-multiple"); + let mode = match (lsn, hot_standby) { (Some(lsn), false) => ComputeMode::Static(lsn), (None, true) => ComputeMode::Replica, @@ -806,7 +856,9 @@ async fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Re _ => {} } - cplane.check_conflicting_endpoints(mode, tenant_id, timeline_id)?; + if !allow_multiple { + cplane.check_conflicting_endpoints(mode, tenant_id, timeline_id)?; + } cplane.new_endpoint( &endpoint_id, @@ -835,6 +887,8 @@ async fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Re let remote_ext_config = sub_args.get_one::("remote-ext-config"); + let allow_multiple = sub_args.get_flag("allow-multiple"); + // If --safekeepers argument is given, use only the listed safekeeper nodes. let safekeepers = if let Some(safekeepers_str) = sub_args.get_one::("safekeepers") { @@ -860,11 +914,13 @@ async fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Re .cloned() .unwrap_or_default(); - cplane.check_conflicting_endpoints( - endpoint.mode, - endpoint.tenant_id, - endpoint.timeline_id, - )?; + if !allow_multiple { + cplane.check_conflicting_endpoints( + endpoint.mode, + endpoint.tenant_id, + endpoint.timeline_id, + )?; + } let (pageservers, stripe_size) = if let Some(pageserver_id) = pageserver_id { let conf = env.get_pageserver_conf(pageserver_id).unwrap(); @@ -1231,7 +1287,7 @@ async fn try_stop_all(env: &local_env::LocalEnv, immediate: bool) { match ComputeControlPlane::load(env.clone()) { Ok(cplane) => { for (_k, node) in cplane.endpoints { - if let Err(e) = node.stop(if immediate { "immediate" } else { "fast " }, false) { + if let Err(e) = node.stop(if immediate { "immediate" } else { "fast" }, false) { eprintln!("postgres stop failed: {e:#}"); } } @@ -1396,6 +1452,12 @@ fn cli() -> Command { .help("If set, will create test user `user` and `neondb` database. 
Requires `update-catalog = true`") .required(false); + let allow_multiple = Arg::new("allow-multiple") + .help("Allow multiple primary endpoints running on the same branch. Shouldn't be used normally, but useful for tests.") + .long("allow-multiple") + .action(ArgAction::SetTrue) + .required(false); + Command::new("Neon CLI") .arg_required_else_help(true) .version(GIT_VERSION) @@ -1417,6 +1479,7 @@ fn cli() -> Command { .subcommand( Command::new("timeline") .about("Manage timelines") + .arg_required_else_help(true) .subcommand(Command::new("list") .about("List all timelines, available to this pageserver") .arg(tenant_id_arg.clone())) @@ -1479,6 +1542,8 @@ fn cli() -> Command { .subcommand(Command::new("config") .arg(tenant_id_arg.clone()) .arg(Arg::new("config").short('c').num_args(1).action(ArgAction::Append).required(false))) + .subcommand(Command::new("import").arg(tenant_id_arg.clone().required(true)) + .about("Import a tenant that is present in remote storage, and create branches for its timelines")) ) .subcommand( Command::new("pageserver") @@ -1503,8 +1568,8 @@ fn cli() -> Command { Command::new("storage_controller") .arg_required_else_help(true) .about("Manage storage_controller") - .subcommand(Command::new("start").about("Start local pageserver").arg(pageserver_config_args.clone())) - .subcommand(Command::new("stop").about("Stop local pageserver") + .subcommand(Command::new("start").about("Start storage controller")) + .subcommand(Command::new("stop").about("Stop storage controller") .arg(stop_mode_arg.clone())) ) .subcommand( @@ -1550,6 +1615,7 @@ fn cli() -> Command { .arg(pg_version_arg.clone()) .arg(hot_standby_arg.clone()) .arg(update_catalog) + .arg(allow_multiple.clone()) ) .subcommand(Command::new("start") .about("Start postgres.\n If the endpoint doesn't exist yet, it is created.") @@ -1558,6 +1624,7 @@ fn cli() -> Command { .arg(safekeepers_arg) .arg(remote_ext_config_args) .arg(create_test_user) + .arg(allow_multiple.clone()) ) 
.subcommand(Command::new("reconfigure") .about("Reconfigure the endpoint") diff --git a/control_plane/src/endpoint.rs b/control_plane/src/endpoint.rs index 03f7db99fb..20371e1cb8 100644 --- a/control_plane/src/endpoint.rs +++ b/control_plane/src/endpoint.rs @@ -554,6 +554,7 @@ impl Endpoint { format_version: 1.0, operation_uuid: None, features: self.features.clone(), + swap_size_bytes: None, cluster: Cluster { cluster_id: None, // project ID: not used name: None, // project name: not used diff --git a/control_plane/src/local_env.rs b/control_plane/src/local_env.rs index bd3dbef453..6437d04ec8 100644 --- a/control_plane/src/local_env.rs +++ b/control_plane/src/local_env.rs @@ -17,6 +17,7 @@ use std::net::Ipv4Addr; use std::net::SocketAddr; use std::path::{Path, PathBuf}; use std::process::{Command, Stdio}; +use std::time::Duration; use utils::{ auth::{encode_from_key_file, Claims}, id::{NodeId, TenantId, TenantTimelineId, TimelineId}, @@ -66,6 +67,10 @@ pub struct LocalEnv { pub broker: NeonBroker, + // Configuration for the storage controller (1 per neon_local environment) + #[serde(default)] + pub storage_controller: NeonStorageControllerConf, + /// This Vec must always contain at least one pageserver pub pageservers: Vec, @@ -98,6 +103,29 @@ pub struct NeonBroker { pub listen_addr: SocketAddr, } +/// Broker config for cluster internal communication. +#[derive(Serialize, Deserialize, PartialEq, Eq, Clone, Debug)] +#[serde(default)] +pub struct NeonStorageControllerConf { + /// Heartbeat timeout before marking a node offline + #[serde(with = "humantime_serde")] + pub max_unavailable: Duration, +} + +impl NeonStorageControllerConf { + // Use a shorter pageserver unavailability interval than the default to speed up tests. 
+ const DEFAULT_MAX_UNAVAILABLE_INTERVAL: std::time::Duration = + std::time::Duration::from_secs(10); +} + +impl Default for NeonStorageControllerConf { + fn default() -> Self { + Self { + max_unavailable: Self::DEFAULT_MAX_UNAVAILABLE_INTERVAL, + } + } +} + // Dummy Default impl to satisfy Deserialize derive. impl Default for NeonBroker { fn default() -> Self { @@ -129,6 +157,8 @@ pub struct PageServerConf { pub(crate) virtual_file_io_engine: Option, pub(crate) get_vectored_impl: Option, + pub(crate) get_impl: Option, + pub(crate) validate_vectored_get: Option, } impl Default for PageServerConf { @@ -141,6 +171,8 @@ impl Default for PageServerConf { http_auth_type: AuthType::Trust, virtual_file_io_engine: None, get_vectored_impl: None, + get_impl: None, + validate_vectored_get: None, } } } @@ -156,6 +188,7 @@ pub struct SafekeeperConf { pub remote_storage: Option, pub backup_threads: Option, pub auth_enabled: bool, + pub listen_addr: Option, } impl Default for SafekeeperConf { @@ -169,6 +202,7 @@ impl Default for SafekeeperConf { remote_storage: None, backup_threads: None, auth_enabled: false, + listen_addr: None, } } } @@ -348,7 +382,10 @@ impl LocalEnv { // Find neon binaries. if env.neon_distrib_dir == Path::new("") { - env.neon_distrib_dir = env::current_exe()?.parent().unwrap().to_owned(); + env::current_exe()? 
+ .parent() + .unwrap() + .clone_into(&mut env.neon_distrib_dir); } if env.pageservers.is_empty() { diff --git a/control_plane/src/pageserver.rs b/control_plane/src/pageserver.rs index abf815f07a..1a64391306 100644 --- a/control_plane/src/pageserver.rs +++ b/control_plane/src/pageserver.rs @@ -92,6 +92,8 @@ impl PageServerNode { http_auth_type, virtual_file_io_engine, get_vectored_impl, + get_impl, + validate_vectored_get, } = &self.conf; let id = format!("id={}", id); @@ -111,6 +113,16 @@ impl PageServerNode { } else { String::new() }; + let get_impl = if let Some(get_impl) = get_impl { + format!("get_impl='{get_impl}'") + } else { + String::new() + }; + let validate_vectored_get = if let Some(validate_vectored_get) = validate_vectored_get { + format!("validate_vectored_get={validate_vectored_get}") + } else { + String::new() + }; let broker_endpoint_param = format!("broker_endpoint='{}'", self.env.broker.client_url()); @@ -124,6 +136,8 @@ impl PageServerNode { broker_endpoint_param, virtual_file_io_engine, get_vectored_impl, + get_impl, + validate_vectored_get, ]; if let Some(control_plane_api) = &self.env.control_plane_api { @@ -234,12 +248,13 @@ impl PageServerNode { // situation: the metadata is written by some other script. 
std::fs::write( metadata_path, - serde_json::to_vec(&serde_json::json!({ - "host": "localhost", - "port": self.pg_connection_config.port(), - "http_host": "localhost", - "http_port": http_port, - })) + serde_json::to_vec(&pageserver_api::config::NodeMetadata { + postgres_host: "localhost".to_string(), + postgres_port: self.pg_connection_config.port(), + http_host: "localhost".to_string(), + http_port, + other: HashMap::new(), + }) .unwrap(), ) .expect("Failed to write metadata file"); @@ -434,6 +449,11 @@ impl PageServerNode { .map(serde_json::from_str) .transpose() .context("parse `timeline_get_throttle` from json")?, + switch_to_aux_file_v2: settings + .remove("switch_to_aux_file_v2") + .map(|x| x.parse::()) + .transpose() + .context("Failed to parse 'switch_to_aux_file_v2' as bool")?, }; if !settings.is_empty() { bail!("Unrecognized tenant settings: {settings:?}") @@ -552,6 +572,11 @@ impl PageServerNode { .map(serde_json::from_str) .transpose() .context("parse `timeline_get_throttle` from json")?, + switch_to_aux_file_v2: settings + .remove("switch_to_aux_file_v2") + .map(|x| x.parse::()) + .transpose() + .context("Failed to parse 'switch_to_aux_file_v2' as bool")?, } }; diff --git a/control_plane/src/safekeeper.rs b/control_plane/src/safekeeper.rs index 6ac71dfe51..d62a2e80b5 100644 --- a/control_plane/src/safekeeper.rs +++ b/control_plane/src/safekeeper.rs @@ -70,24 +70,31 @@ pub struct SafekeeperNode { pub pg_connection_config: PgConnectionConfig, pub env: LocalEnv, pub http_client: reqwest::Client, + pub listen_addr: String, pub http_base_url: String, } impl SafekeeperNode { pub fn from_env(env: &LocalEnv, conf: &SafekeeperConf) -> SafekeeperNode { + let listen_addr = if let Some(ref listen_addr) = conf.listen_addr { + listen_addr.clone() + } else { + "127.0.0.1".to_string() + }; SafekeeperNode { id: conf.id, conf: conf.clone(), - pg_connection_config: Self::safekeeper_connection_config(conf.pg_port), + pg_connection_config: 
Self::safekeeper_connection_config(&listen_addr, conf.pg_port), env: env.clone(), http_client: reqwest::Client::new(), - http_base_url: format!("http://127.0.0.1:{}/v1", conf.http_port), + http_base_url: format!("http://{}:{}/v1", listen_addr, conf.http_port), + listen_addr, } } /// Construct libpq connection string for connecting to this safekeeper. - fn safekeeper_connection_config(port: u16) -> PgConnectionConfig { - PgConnectionConfig::new_host_port(url::Host::parse("127.0.0.1").unwrap(), port) + fn safekeeper_connection_config(addr: &str, port: u16) -> PgConnectionConfig { + PgConnectionConfig::new_host_port(url::Host::parse(addr).unwrap(), port) } pub fn datadir_path_by_id(env: &LocalEnv, sk_id: NodeId) -> PathBuf { @@ -111,8 +118,8 @@ impl SafekeeperNode { ); io::stdout().flush().unwrap(); - let listen_pg = format!("127.0.0.1:{}", self.conf.pg_port); - let listen_http = format!("127.0.0.1:{}", self.conf.http_port); + let listen_pg = format!("{}:{}", self.listen_addr, self.conf.pg_port); + let listen_http = format!("{}:{}", self.listen_addr, self.conf.http_port); let id = self.id; let datadir = self.datadir_path(); @@ -139,7 +146,7 @@ impl SafekeeperNode { availability_zone, ]; if let Some(pg_tenant_only_port) = self.conf.pg_tenant_only_port { - let listen_pg_tenant_only = format!("127.0.0.1:{}", pg_tenant_only_port); + let listen_pg_tenant_only = format!("{}:{}", self.listen_addr, pg_tenant_only_port); args.extend(["--listen-pg-tenant-only".to_owned(), listen_pg_tenant_only]); } if !self.conf.sync { diff --git a/control_plane/src/storage_controller.rs b/control_plane/src/storage_controller.rs index 7f2b973391..f1c43f4036 100644 --- a/control_plane/src/storage_controller.rs +++ b/control_plane/src/storage_controller.rs @@ -1,6 +1,8 @@ -use crate::{background_process, local_env::LocalEnv}; +use crate::{ + background_process, + local_env::{LocalEnv, NeonStorageControllerConf}, +}; use camino::{Utf8Path, Utf8PathBuf}; -use hyper::Method; use pageserver_api::{ 
controller_api::{ NodeConfigureRequest, NodeRegisterRequest, TenantCreateResponse, TenantLocateResponse, @@ -14,6 +16,7 @@ use pageserver_api::{ }; use pageserver_client::mgmt_api::ResponseErrorMessageExt; use postgres_backend::AuthType; +use reqwest::Method; use serde::{de::DeserializeOwned, Deserialize, Serialize}; use std::{fs, str::FromStr}; use tokio::process::Command; @@ -32,15 +35,13 @@ pub struct StorageController { public_key: Option, postgres_port: u16, client: reqwest::Client, + config: NeonStorageControllerConf, } const COMMAND: &str = "storage_controller"; const STORAGE_CONTROLLER_POSTGRES_VERSION: u32 = 16; -// Use a shorter pageserver unavailability interval than the default to speed up tests. -const NEON_LOCAL_MAX_UNAVAILABLE_INTERVAL: std::time::Duration = std::time::Duration::from_secs(10); - #[derive(Serialize, Deserialize)] pub struct AttachHookRequest { pub tenant_shard_id: TenantShardId, @@ -135,6 +136,7 @@ impl StorageController { client: reqwest::ClientBuilder::new() .build() .expect("Failed to construct http client"), + config: env.storage_controller.clone(), } } @@ -272,8 +274,6 @@ impl StorageController { // Run migrations on every startup, in case something changed. 
let database_url = self.setup_database().await?; - let max_unavailable: humantime::Duration = NEON_LOCAL_MAX_UNAVAILABLE_INTERVAL.into(); - let mut args = vec![ "-l", &self.listen, @@ -283,7 +283,7 @@ impl StorageController { "--database-url", &database_url, "--max-unavailable-interval", - &max_unavailable.to_string(), + &humantime::Duration::from(self.config.max_unavailable).to_string(), ] .into_iter() .map(|s| s.to_string()) @@ -379,7 +379,7 @@ impl StorageController { /// Simple HTTP request wrapper for calling into storage controller async fn dispatch( &self, - method: hyper::Method, + method: reqwest::Method, path: String, body: Option, ) -> anyhow::Result @@ -472,6 +472,16 @@ impl StorageController { .await } + #[instrument(skip(self))] + pub async fn tenant_import(&self, tenant_id: TenantId) -> anyhow::Result { + self.dispatch::<(), TenantCreateResponse>( + Method::POST, + format!("debug/v1/tenant/{tenant_id}/import"), + None, + ) + .await + } + #[instrument(skip(self))] pub async fn tenant_locate(&self, tenant_id: TenantId) -> anyhow::Result { self.dispatch::<(), _>( diff --git a/control_plane/storcon_cli/src/main.rs b/control_plane/storcon_cli/src/main.rs index 2edd09eac1..c19bc96cdb 100644 --- a/control_plane/storcon_cli/src/main.rs +++ b/control_plane/storcon_cli/src/main.rs @@ -1,20 +1,19 @@ -use std::{collections::HashMap, str::FromStr}; +use std::{collections::HashMap, str::FromStr, time::Duration}; use clap::{Parser, Subcommand}; -use hyper::Method; use pageserver_api::{ controller_api::{ NodeAvailabilityWrapper, NodeDescribeResponse, ShardSchedulingPolicy, TenantDescribeResponse, TenantPolicyRequest, }, models::{ - ShardParameters, TenantConfig, TenantConfigRequest, TenantCreateRequest, - TenantShardSplitRequest, TenantShardSplitResponse, + LocationConfigSecondary, ShardParameters, TenantConfig, TenantConfigRequest, + TenantCreateRequest, TenantShardSplitRequest, TenantShardSplitResponse, }, shard::{ShardStripeSize, TenantShardId}, }; use 
pageserver_client::mgmt_api::{self, ResponseErrorMessageExt}; -use reqwest::Url; +use reqwest::{Method, StatusCode, Url}; use serde::{de::DeserializeOwned, Serialize}; use utils::id::{NodeId, TenantId}; @@ -120,6 +119,12 @@ enum Command { #[arg(long)] tenant_id: TenantId, }, + /// For a tenant which hasn't been onboarded to the storage controller yet, add it in secondary + /// mode so that it can warm up content on a pageserver. + TenantWarmup { + #[arg(long)] + tenant_id: TenantId, + }, } #[derive(Parser)] @@ -226,7 +231,7 @@ impl Client { /// Simple HTTP request wrapper for calling into storage controller async fn dispatch( &self, - method: hyper::Method, + method: Method, path: String, body: Option, ) -> mgmt_api::Result @@ -581,6 +586,94 @@ async fn main() -> anyhow::Result<()> { } println!("{table}"); } + Command::TenantWarmup { tenant_id } => { + let describe_response = storcon_client + .dispatch::<(), TenantDescribeResponse>( + Method::GET, + format!("control/v1/tenant/{tenant_id}"), + None, + ) + .await; + match describe_response { + Ok(describe) => { + if matches!(describe.policy, PlacementPolicy::Secondary) { + // Fine: it's already known to controller in secondary mode: calling + // again to put it into secondary mode won't cause problems. + } else { + anyhow::bail!("Tenant already present with policy {:?}", describe.policy); + } + } + Err(mgmt_api::Error::ApiError(StatusCode::NOT_FOUND, _)) => { + // Fine: this tenant isn't know to the storage controller yet. 
+ } + Err(e) => { + // Unexpected API error + return Err(e.into()); + } + } + + vps_client + .location_config( + TenantShardId::unsharded(tenant_id), + pageserver_api::models::LocationConfig { + mode: pageserver_api::models::LocationConfigMode::Secondary, + generation: None, + secondary_conf: Some(LocationConfigSecondary { warm: true }), + shard_number: 0, + shard_count: 0, + shard_stripe_size: ShardParameters::DEFAULT_STRIPE_SIZE.0, + tenant_conf: TenantConfig::default(), + }, + None, + true, + ) + .await?; + + let describe_response = storcon_client + .dispatch::<(), TenantDescribeResponse>( + Method::GET, + format!("control/v1/tenant/{tenant_id}"), + None, + ) + .await?; + + let secondary_ps_id = describe_response + .shards + .first() + .unwrap() + .node_secondary + .first() + .unwrap(); + + println!("Tenant {tenant_id} warming up on pageserver {secondary_ps_id}"); + loop { + let (status, progress) = vps_client + .tenant_secondary_download( + TenantShardId::unsharded(tenant_id), + Some(Duration::from_secs(10)), + ) + .await?; + println!( + "Progress: {}/{} layers, {}/{} bytes", + progress.layers_downloaded, + progress.layers_total, + progress.bytes_downloaded, + progress.bytes_total + ); + match status { + StatusCode::OK => { + println!("Download complete"); + break; + } + StatusCode::ACCEPTED => { + // Loop + } + _ => { + anyhow::bail!("Unexpected download status: {status}"); + } + } + } + } } Ok(()) diff --git a/docs/storage_controller.md b/docs/storage_controller.md new file mode 100644 index 0000000000..daf4d0c8b7 --- /dev/null +++ b/docs/storage_controller.md @@ -0,0 +1,150 @@ +# Storage Controller + +## Concepts + +The storage controller sits between administrative API clients and pageservers, and handles the details of mapping tenants to pageserver tenant shards. For example, creating a tenant is one API call to the storage controller, +which is mapped into many API calls to many pageservers (for multiple shards, and for secondary locations). 
+ +It implements a pageserver-compatible API that may be used for CRUD operations on tenants and timelines, translating these requests into appropriate operations on the shards within a tenant, which may be on many different pageservers. Using this API, the storage controller may be used in the same way as the pageserver's administrative HTTP API, hiding +the underlying details of how data is spread across multiple nodes. + +The storage controller also manages generations, high availability (via secondary locations) and live migrations for tenants under its management. This is done with a reconciliation loop pattern, where tenants have an “intent” state and a “reconcile” task that tries to make the outside world match the intent. + +## APIs + +The storage controller’s HTTP server implements four logically separate APIs: + +- `/v1/...` path is the pageserver-compatible API. This has to be at the path root because that’s where clients expect to find it on a pageserver. +- `/control/v1/...` path is the storage controller’s API, which enables operations such as registering and managing pageservers, or executing shard splits. +- `/debug/v1/...` path contains endpoints which are either exclusively used in tests, or are for use by engineers when supporting a deployed system. +- `/upcall/v1/...` path contains endpoints that are called by pageservers. This includes the `/re-attach` and `/validate` APIs used by pageservers + to ensure data safety with generation numbers. + +The API is authenticated with a JWT token, and tokens must have scope `pageserverapi` (i.e. the same scope as pageservers’ APIs). + +See the `http.rs` file in the source for where the HTTP APIs are implemented. + +## Database + +The storage controller uses a postgres database to persist a subset of its state.
Note that the storage controller does _not_ keep all its state in the database: this is a design choice to enable most operations to be done efficiently in memory, rather than having to read from the database. See `persistence.rs` for a more comprehensive comment explaining what we do and do not persist: a useful metaphor is that we persist objects like tenants and nodes, but we do not +persist the _relationships_ between them: the attachment state of a tenant's shards to nodes is kept in memory and +rebuilt on startup. + +The file `persistence.rs` contains all the code for accessing the database, and has a large doc comment that goes into more detail about exactly what we persist and why. + +The `diesel` crate is used for defining models & migrations. + +Running a local cluster with `cargo neon` automatically starts a vanilla postgress process to host the storage controller’s database. + +### Diesel tip: migrations + +If you need to modify the database schema, here’s how to create a migration: + +- Install the diesel CLI with `cargo install diesel_cli` +- Use `diesel migration generate ` to create a new migration +- Populate the SQL files in the `migrations/` subdirectory +- Use `DATABASE_URL=... diesel migration run` to apply the migration you just wrote: this will update the `[schema.rs](http://schema.rs)` file automatically. + - This requires a running database: the easiest way to do that is to just run `cargo neon init ; cargo neon start`, which will leave a database available at `postgresql://localhost:1235/attachment_service` +- Commit the migration files and the changes to schema.rs +- If you need to iterate, you can rewind migrations with `diesel migration revert -a` and then `diesel migration run` again. +- The migrations are build into the storage controller binary, and automatically run at startup after it is deployed, so once you’ve committed a migration no further steps are needed. 
+ +## storcon_cli + +The `storcon_cli` tool enables interactive management of the storage controller. This is usually +only necessary for debug, but may also be used to manage nodes (e.g. marking a node as offline). + +`storcon_cli --help` includes details on commands. + +# Deploying + +This section is aimed at engineers deploying the storage controller outside of Neon's cloud platform, as +part of a self-hosted system. + +_General note: since the default `neon_local` environment includes a storage controller, this is a useful +reference when figuring out deployment._ + +## Database + +It is **essential** that the database used by the storage controller is durable (**do not store it on ephemeral +local disk**). This database contains pageserver generation numbers, which are essential to data safety on the pageserver. + +The resource requirements for the database are very low: a single CPU core and 1GiB of memory should work well for most deployments. The physical size of the database is typically under a gigabyte. + +Set the URL to the database using the `--database-url` CLI option. + +There is no need to run migrations manually: the storage controller automatically applies migrations +when it starts up. + +## Configure pageservers to use the storage controller + +1. The pageserver `control_plane_api` and `control_plane_api_token` should be set in the `pageserver.toml` file. The API setting should + point to the "upcall" prefix, for example `http://127.0.0.1:1234/upcall/v1/` is used in neon_local clusters. +2. Create a `metadata.json` file in the same directory as `pageserver.toml`: this enables the pageserver to automatically register itself + with the storage controller when it starts up. See the example below for the format of this file. 
+ +### Example `metadata.json` + +``` +{"host":"acmehost.localdomain","http_host":"acmehost.localdomain","http_port":9898,"port":64000} +``` + +- `port` and `host` refer to the _postgres_ port and host, and these must be accessible from wherever + postgres runs. +- `http_port` and `http_host` refer to the pageserver's HTTP api, this must be accessible from where + the storage controller runs. + +## Handle compute notifications. + +The storage controller independently moves tenant attachments between pageservers in response to +changes such as a pageserver node becoming unavailable, or the tenant's shard count changing. To enable +postgres clients to handle such changes, the storage controller calls an API hook when a tenant's pageserver +location changes. + +The hook is configured using the storage controller's `--compute-hook-url` CLI option. If the hook requires +JWT auth, the token may be provided with `--control-plane-jwt-token`. The hook will be invoked with a `PUT` request. + +In the Neon cloud service, this hook is implemented by Neon's internal cloud control plane. In `neon_local` systems +the storage controller integrates directly with neon_local to reconfigure local postgres processes instead of calling +the compute hook. + +When implementing an on-premise Neon deployment, you must implement a service that handles the compute hook. This is not complicated: +the request body has format of the `ComputeHookNotifyRequest` structure, provided below for convenience. + +``` +struct ComputeHookNotifyRequestShard { + node_id: NodeId, + shard_number: ShardNumber, +} + +struct ComputeHookNotifyRequest { + tenant_id: TenantId, + stripe_size: Option, + shards: Vec, +} +``` + +When a notification is received: + +1. Modify postgres configuration for this tenant: + + - set `neon.pageserver_connstr` to a comma-separated list of postgres connection strings to pageservers according to the `shards` list. 
The + shards identified by `NodeId` must be converted to the address+port of the node. + - if stripe_size is not None, set `neon.stripe_size` to this value + +2. Send SIGHUP to postgres to reload configuration +3. Respond with 200 to the notification request. Do not return success if postgres was not updated: if an error is returned, the controller + will retry the notification until it succeeds.. + +### Example notification body + +``` +{ + "tenant_id": "1f359dd625e519a1a4e8d7509690f6fc", + "stripe_size": 32768, + "shards": [ + {"node_id": 344, "shard_number": 0}, + {"node_id": 722, "shard_number": 1}, + ], +} +``` diff --git a/libs/compute_api/src/spec.rs b/libs/compute_api/src/spec.rs index 71ae66c45c..1c4ee2089f 100644 --- a/libs/compute_api/src/spec.rs +++ b/libs/compute_api/src/spec.rs @@ -33,6 +33,23 @@ pub struct ComputeSpec { #[serde(default)] pub features: Vec, + /// If compute_ctl was passed `--resize-swap-on-bind`, a value of `Some(_)` instructs + /// compute_ctl to `/neonvm/bin/resize-swap` with the given size, when the spec is first + /// received. + /// + /// Both this field and `--resize-swap-on-bind` are required, so that the control plane's + /// spec generation doesn't need to be aware of the actual compute it's running on, while + /// guaranteeing gradual rollout of swap. Otherwise, without `--resize-swap-on-bind`, we could + /// end up trying to resize swap in VMs without it -- or end up *not* resizing swap, thus + /// giving every VM much more swap than it should have (32GiB). + /// + /// Eventually we may remove `--resize-swap-on-bind` and exclusively use `swap_size_bytes` for + /// enabling the swap resizing behavior once rollout is complete. + /// + /// See neondatabase/cloud#12047 for more. + #[serde(default)] + pub swap_size_bytes: Option, + /// Expected cluster state at the end of transition process. 
pub cluster: Cluster, pub delta_operations: Option>, diff --git a/libs/metrics/Cargo.toml b/libs/metrics/Cargo.toml index f6a49a0166..0bd804051c 100644 --- a/libs/metrics/Cargo.toml +++ b/libs/metrics/Cargo.toml @@ -10,11 +10,13 @@ libc.workspace = true once_cell.workspace = true chrono.workspace = true twox-hash.workspace = true +measured.workspace = true workspace_hack.workspace = true [target.'cfg(target_os = "linux")'.dependencies] procfs.workspace = true +measured-process.workspace = true [dev-dependencies] rand = "0.8" diff --git a/libs/metrics/src/hll.rs b/libs/metrics/src/hll.rs index dfb4461ce9..f53511ab5c 100644 --- a/libs/metrics/src/hll.rs +++ b/libs/metrics/src/hll.rs @@ -7,14 +7,19 @@ //! use significantly less memory than this, but can only approximate the cardinality. use std::{ - collections::HashMap, - hash::{BuildHasher, BuildHasherDefault, Hash, Hasher}, - sync::{atomic::AtomicU8, Arc, RwLock}, + hash::{BuildHasher, BuildHasherDefault, Hash}, + sync::atomic::AtomicU8, }; -use prometheus::{ - core::{self, Describer}, - proto, Opts, +use measured::{ + label::{LabelGroupVisitor, LabelName, LabelValue, LabelVisitor}, + metric::{ + group::{Encoding, MetricValue}, + name::MetricNameEncoder, + Metric, MetricType, MetricVec, + }, + text::TextEncoder, + LabelGroup, }; use twox_hash::xxh3; @@ -93,203 +98,25 @@ macro_rules! 
register_hll { /// ``` /// /// See for estimates on alpha -#[derive(Clone)] -pub struct HyperLogLogVec { - core: Arc>, +pub type HyperLogLogVec = MetricVec, L>; +pub type HyperLogLog = Metric>; + +pub struct HyperLogLogState { + shards: [AtomicU8; N], } - -struct HyperLogLogVecCore { - pub children: RwLock, BuildHasherDefault>>, - pub desc: core::Desc, - pub opts: Opts, -} - -impl core::Collector for HyperLogLogVec { - fn desc(&self) -> Vec<&core::Desc> { - vec![&self.core.desc] - } - - fn collect(&self) -> Vec { - let mut m = proto::MetricFamily::default(); - m.set_name(self.core.desc.fq_name.clone()); - m.set_help(self.core.desc.help.clone()); - m.set_field_type(proto::MetricType::GAUGE); - - let mut metrics = Vec::new(); - for child in self.core.children.read().unwrap().values() { - child.core.collect_into(&mut metrics); - } - m.set_metric(metrics); - - vec![m] +impl Default for HyperLogLogState { + fn default() -> Self { + #[allow(clippy::declare_interior_mutable_const)] + const ZERO: AtomicU8 = AtomicU8::new(0); + Self { shards: [ZERO; N] } } } -impl HyperLogLogVec { - /// Create a new [`HyperLogLogVec`] based on the provided - /// [`Opts`] and partitioned by the given label names. At least one label name must be - /// provided. - pub fn new(opts: Opts, label_names: &[&str]) -> prometheus::Result { - assert!(N.is_power_of_two()); - let variable_names = label_names.iter().map(|s| (*s).to_owned()).collect(); - let opts = opts.variable_labels(variable_names); - - let desc = opts.describe()?; - let v = HyperLogLogVecCore { - children: RwLock::new(HashMap::default()), - desc, - opts, - }; - - Ok(Self { core: Arc::new(v) }) - } - - /// `get_metric_with_label_values` returns the [`HyperLogLog

`] for the given slice - /// of label values (same order as the VariableLabels in Desc). If that combination of - /// label values is accessed for the first time, a new [`HyperLogLog

`] is created. - /// - /// An error is returned if the number of label values is not the same as the - /// number of VariableLabels in Desc. - pub fn get_metric_with_label_values( - &self, - vals: &[&str], - ) -> prometheus::Result> { - self.core.get_metric_with_label_values(vals) - } - - /// `with_label_values` works as `get_metric_with_label_values`, but panics if an error - /// occurs. - pub fn with_label_values(&self, vals: &[&str]) -> HyperLogLog { - self.get_metric_with_label_values(vals).unwrap() - } +impl MetricType for HyperLogLogState { + type Metadata = (); } -impl HyperLogLogVecCore { - pub fn get_metric_with_label_values( - &self, - vals: &[&str], - ) -> prometheus::Result> { - let h = self.hash_label_values(vals)?; - - if let Some(metric) = self.children.read().unwrap().get(&h).cloned() { - return Ok(metric); - } - - self.get_or_create_metric(h, vals) - } - - pub(crate) fn hash_label_values(&self, vals: &[&str]) -> prometheus::Result { - if vals.len() != self.desc.variable_labels.len() { - return Err(prometheus::Error::InconsistentCardinality { - expect: self.desc.variable_labels.len(), - got: vals.len(), - }); - } - - let mut h = xxh3::Hash64::default(); - for val in vals { - h.write(val.as_bytes()); - } - - Ok(h.finish()) - } - - fn get_or_create_metric( - &self, - hash: u64, - label_values: &[&str], - ) -> prometheus::Result> { - let mut children = self.children.write().unwrap(); - // Check exist first. - if let Some(metric) = children.get(&hash).cloned() { - return Ok(metric); - } - - let metric = HyperLogLog::with_opts_and_label_values(&self.opts, label_values)?; - children.insert(hash, metric.clone()); - Ok(metric) - } -} - -/// HLL is a probabilistic cardinality measure. -/// -/// How to use this time-series for a metric name `my_metrics_total_hll`: -/// -/// ```promql -/// # harmonic mean -/// 1 / ( -/// sum ( -/// 2 ^ -( -/// # HLL merge operation -/// max (my_metrics_total_hll{}) by (hll_shard, other_labels...) 
-/// ) -/// ) without (hll_shard) -/// ) -/// * alpha -/// * shards_count -/// * shards_count -/// ``` -/// -/// If you want an estimate over time, you can use the following query: -/// -/// ```promql -/// # harmonic mean -/// 1 / ( -/// sum ( -/// 2 ^ -( -/// # HLL merge operation -/// max ( -/// max_over_time(my_metrics_total_hll{}[$__rate_interval]) -/// ) by (hll_shard, other_labels...) -/// ) -/// ) without (hll_shard) -/// ) -/// * alpha -/// * shards_count -/// * shards_count -/// ``` -/// -/// In the case of low cardinality, you might want to use the linear counting approximation: -/// -/// ```promql -/// # LinearCounting(m, V) = m log (m / V) -/// shards_count * ln(shards_count / -/// # calculate V = how many shards contain a 0 -/// count(max (proxy_connecting_endpoints{}) by (hll_shard, protocol) == 0) without (hll_shard) -/// ) -/// ``` -/// -/// See for estimates on alpha -#[derive(Clone)] -pub struct HyperLogLog { - core: Arc>, -} - -impl HyperLogLog { - /// Create a [`HyperLogLog`] with the `name` and `help` arguments. - pub fn new, S2: Into>(name: S1, help: S2) -> prometheus::Result { - assert!(N.is_power_of_two()); - let opts = Opts::new(name, help); - Self::with_opts(opts) - } - - /// Create a [`HyperLogLog`] with the `opts` options. - pub fn with_opts(opts: Opts) -> prometheus::Result { - Self::with_opts_and_label_values(&opts, &[]) - } - - fn with_opts_and_label_values(opts: &Opts, label_values: &[&str]) -> prometheus::Result { - let desc = opts.describe()?; - let labels = make_label_pairs(&desc, label_values)?; - - let v = HyperLogLogCore { - shards: [0; N].map(AtomicU8::new), - desc, - labels, - }; - Ok(Self { core: Arc::new(v) }) - } - +impl HyperLogLogState { pub fn measure(&self, item: &impl Hash) { // changing the hasher will break compatibility with previous measurements. 
self.record(BuildHasherDefault::::default().hash_one(item)); @@ -299,42 +126,11 @@ impl HyperLogLog { let p = N.ilog2() as u8; let j = hash & (N as u64 - 1); let rho = (hash >> p).leading_zeros() as u8 + 1 - p; - self.core.shards[j as usize].fetch_max(rho, std::sync::atomic::Ordering::Relaxed); - } -} - -struct HyperLogLogCore { - shards: [AtomicU8; N], - desc: core::Desc, - labels: Vec, -} - -impl core::Collector for HyperLogLog { - fn desc(&self) -> Vec<&core::Desc> { - vec![&self.core.desc] + self.shards[j as usize].fetch_max(rho, std::sync::atomic::Ordering::Relaxed); } - fn collect(&self) -> Vec { - let mut m = proto::MetricFamily::default(); - m.set_name(self.core.desc.fq_name.clone()); - m.set_help(self.core.desc.help.clone()); - m.set_field_type(proto::MetricType::GAUGE); - - let mut metrics = Vec::new(); - self.core.collect_into(&mut metrics); - m.set_metric(metrics); - - vec![m] - } -} - -impl HyperLogLogCore { - fn collect_into(&self, metrics: &mut Vec) { - self.shards.iter().enumerate().for_each(|(i, x)| { - let mut shard_label = proto::LabelPair::default(); - shard_label.set_name("hll_shard".to_owned()); - shard_label.set_value(format!("{i}")); - + fn take_sample(&self) -> [u8; N] { + self.shards.each_ref().map(|x| { // We reset the counter to 0 so we can perform a cardinality measure over any time slice in prometheus. // This seems like it would be a race condition, @@ -344,85 +140,90 @@ impl HyperLogLogCore { // TODO: maybe we shouldn't reset this on every collect, instead, only after a time window. // this would mean that a dev port-forwarding the metrics url won't break the sampling. 
- let v = x.swap(0, std::sync::atomic::Ordering::Relaxed); - - let mut m = proto::Metric::default(); - let mut c = proto::Gauge::default(); - c.set_value(v as f64); - m.set_gauge(c); - - let mut labels = Vec::with_capacity(self.labels.len() + 1); - labels.extend_from_slice(&self.labels); - labels.push(shard_label); - - m.set_label(labels); - metrics.push(m); + x.swap(0, std::sync::atomic::Ordering::Relaxed) }) } } - -fn make_label_pairs( - desc: &core::Desc, - label_values: &[&str], -) -> prometheus::Result> { - if desc.variable_labels.len() != label_values.len() { - return Err(prometheus::Error::InconsistentCardinality { - expect: desc.variable_labels.len(), - got: label_values.len(), - }); +impl measured::metric::MetricEncoding> + for HyperLogLogState +{ + fn write_type( + name: impl MetricNameEncoder, + enc: &mut TextEncoder, + ) -> Result<(), std::io::Error> { + enc.write_type(&name, measured::text::MetricType::Gauge) } + fn collect_into( + &self, + _: &(), + labels: impl LabelGroup, + name: impl MetricNameEncoder, + enc: &mut TextEncoder, + ) -> Result<(), std::io::Error> { + struct I64(i64); + impl LabelValue for I64 { + fn visit(&self, v: V) -> V::Output { + v.write_int(self.0) + } + } - let total_len = desc.variable_labels.len() + desc.const_label_pairs.len(); - if total_len == 0 { - return Ok(vec![]); - } + struct HllShardLabel { + hll_shard: i64, + } - if desc.variable_labels.is_empty() { - return Ok(desc.const_label_pairs.clone()); - } + impl LabelGroup for HllShardLabel { + fn visit_values(&self, v: &mut impl LabelGroupVisitor) { + const LE: &LabelName = LabelName::from_str("hll_shard"); + v.write_value(LE, &I64(self.hll_shard)); + } + } - let mut label_pairs = Vec::with_capacity(total_len); - for (i, n) in desc.variable_labels.iter().enumerate() { - let mut label_pair = proto::LabelPair::default(); - label_pair.set_name(n.clone()); - label_pair.set_value(label_values[i].to_owned()); - label_pairs.push(label_pair); + self.take_sample() + .into_iter() + 
.enumerate() + .try_for_each(|(hll_shard, val)| { + enc.write_metric_value( + name.by_ref(), + labels.by_ref().compose_with(HllShardLabel { + hll_shard: hll_shard as i64, + }), + MetricValue::Int(val as i64), + ) + }) } - - for label_pair in &desc.const_label_pairs { - label_pairs.push(label_pair.clone()); - } - label_pairs.sort(); - Ok(label_pairs) } #[cfg(test)] mod tests { use std::collections::HashSet; - use prometheus::{proto, Opts}; + use measured::{label::StaticLabelSet, FixedCardinalityLabel}; use rand::{rngs::StdRng, Rng, SeedableRng}; use rand_distr::{Distribution, Zipf}; use crate::HyperLogLogVec; - fn collect(hll: &HyperLogLogVec<32>) -> Vec { - let mut metrics = vec![]; - hll.core - .children - .read() - .unwrap() - .values() - .for_each(|c| c.core.collect_into(&mut metrics)); - metrics + #[derive(FixedCardinalityLabel, Clone, Copy)] + #[label(singleton = "x")] + enum Label { + A, + B, } - fn get_cardinality(metrics: &[proto::Metric], filter: impl Fn(&proto::Metric) -> bool) -> f64 { + + fn collect(hll: &HyperLogLogVec, 32>) -> ([u8; 32], [u8; 32]) { + // cannot go through the `hll.collect_family_into` interface yet... + // need to see if I can fix the conflicting impls problem in measured. 
+ ( + hll.get_metric(hll.with_labels(Label::A)).take_sample(), + hll.get_metric(hll.with_labels(Label::B)).take_sample(), + ) + } + + fn get_cardinality(samples: &[[u8; 32]]) -> f64 { let mut buckets = [0.0; 32]; - for metric in metrics.chunks_exact(32) { - if filter(&metric[0]) { - for (i, m) in metric.iter().enumerate() { - buckets[i] = f64::max(buckets[i], m.get_gauge().get_value()); - } + for &sample in samples { + for (i, m) in sample.into_iter().enumerate() { + buckets[i] = f64::max(buckets[i], m as f64); } } @@ -437,7 +238,7 @@ mod tests { } fn test_cardinality(n: usize, dist: impl Distribution) -> ([usize; 3], [f64; 3]) { - let hll = HyperLogLogVec::<32>::new(Opts::new("foo", "bar"), &["x"]).unwrap(); + let hll = HyperLogLogVec::, 32>::new(); let mut iter = StdRng::seed_from_u64(0x2024_0112).sample_iter(dist); let mut set_a = HashSet::new(); @@ -445,18 +246,20 @@ mod tests { for x in iter.by_ref().take(n) { set_a.insert(x.to_bits()); - hll.with_label_values(&["a"]).measure(&x.to_bits()); + hll.get_metric(hll.with_labels(Label::A)) + .measure(&x.to_bits()); } for x in iter.by_ref().take(n) { set_b.insert(x.to_bits()); - hll.with_label_values(&["b"]).measure(&x.to_bits()); + hll.get_metric(hll.with_labels(Label::B)) + .measure(&x.to_bits()); } let merge = &set_a | &set_b; - let metrics = collect(&hll); - let len = get_cardinality(&metrics, |_| true); - let len_a = get_cardinality(&metrics, |l| l.get_label()[0].get_value() == "a"); - let len_b = get_cardinality(&metrics, |l| l.get_label()[0].get_value() == "b"); + let (a, b) = collect(&hll); + let len = get_cardinality(&[a, b]); + let len_a = get_cardinality(&[a]); + let len_b = get_cardinality(&[b]); ([merge.len(), set_a.len(), set_b.len()], [len, len_a, len_b]) } diff --git a/libs/metrics/src/lib.rs b/libs/metrics/src/lib.rs index 22b0a18933..8e0dbe6ce4 100644 --- a/libs/metrics/src/lib.rs +++ b/libs/metrics/src/lib.rs @@ -4,6 +4,17 @@ //! a default registry. 
#![deny(clippy::undocumented_unsafe_blocks)] +use measured::{ + label::{LabelGroupSet, LabelGroupVisitor, LabelName, NoLabels}, + metric::{ + counter::CounterState, + gauge::GaugeState, + group::{Encoding, MetricValue}, + name::{MetricName, MetricNameEncoder}, + MetricEncoding, MetricFamilyEncoding, + }, + FixedCardinalityLabel, LabelGroup, MetricGroup, +}; use once_cell::sync::Lazy; use prometheus::core::{ Atomic, AtomicU64, Collector, GenericCounter, GenericCounterVec, GenericGauge, GenericGaugeVec, @@ -11,6 +22,7 @@ use prometheus::core::{ pub use prometheus::opts; pub use prometheus::register; pub use prometheus::Error; +use prometheus::Registry; pub use prometheus::{core, default_registry, proto}; pub use prometheus::{exponential_buckets, linear_buckets}; pub use prometheus::{register_counter_vec, Counter, CounterVec}; @@ -23,13 +35,12 @@ pub use prometheus::{register_int_counter_vec, IntCounterVec}; pub use prometheus::{register_int_gauge, IntGauge}; pub use prometheus::{register_int_gauge_vec, IntGaugeVec}; pub use prometheus::{Encoder, TextEncoder}; -use prometheus::{Registry, Result}; pub mod launch_timestamp; mod wrappers; pub use wrappers::{CountedReader, CountedWriter}; mod hll; -pub use hll::{HyperLogLog, HyperLogLogVec}; +pub use hll::{HyperLogLog, HyperLogLogState, HyperLogLogVec}; #[cfg(target_os = "linux")] pub mod more_process_metrics; @@ -59,7 +70,7 @@ static INTERNAL_REGISTRY: Lazy = Lazy::new(Registry::new); /// Register a collector in the internal registry. MUST be called before the first call to `gather()`. /// Otherwise, we can have a deadlock in the `gather()` call, trying to register a new collector /// while holding the lock. 
-pub fn register_internal(c: Box) -> Result<()> { +pub fn register_internal(c: Box) -> prometheus::Result<()> { INTERNAL_REGISTRY.register(c) } @@ -96,6 +107,127 @@ pub const DISK_WRITE_SECONDS_BUCKETS: &[f64] = &[ 0.000_050, 0.000_100, 0.000_500, 0.001, 0.003, 0.005, 0.01, 0.05, 0.1, 0.3, 0.5, ]; +pub struct BuildInfo { + pub revision: &'static str, + pub build_tag: &'static str, +} + +// todo: allow label group without the set +impl LabelGroup for BuildInfo { + fn visit_values(&self, v: &mut impl LabelGroupVisitor) { + const REVISION: &LabelName = LabelName::from_str("revision"); + v.write_value(REVISION, &self.revision); + const BUILD_TAG: &LabelName = LabelName::from_str("build_tag"); + v.write_value(BUILD_TAG, &self.build_tag); + } +} + +impl MetricFamilyEncoding for BuildInfo +where + GaugeState: MetricEncoding, +{ + fn collect_family_into( + &self, + name: impl measured::metric::name::MetricNameEncoder, + enc: &mut T, + ) -> Result<(), T::Err> { + enc.write_help(&name, "Build/version information")?; + GaugeState::write_type(&name, enc)?; + GaugeState { + count: std::sync::atomic::AtomicI64::new(1), + } + .collect_into(&(), self, name, enc) + } +} + +#[derive(MetricGroup)] +#[metric(new(build_info: BuildInfo))] +pub struct NeonMetrics { + #[cfg(target_os = "linux")] + #[metric(namespace = "process")] + #[metric(init = measured_process::ProcessCollector::for_self())] + process: measured_process::ProcessCollector, + + #[metric(namespace = "libmetrics")] + #[metric(init = LibMetrics::new(build_info))] + libmetrics: LibMetrics, +} + +#[derive(MetricGroup)] +#[metric(new(build_info: BuildInfo))] +pub struct LibMetrics { + #[metric(init = build_info)] + build_info: BuildInfo, + + #[metric(flatten)] + rusage: Rusage, + + serve_count: CollectionCounter, +} + +fn write_gauge( + x: i64, + labels: impl LabelGroup, + name: impl MetricNameEncoder, + enc: &mut Enc, +) -> Result<(), Enc::Err> { + enc.write_metric_value(name, labels, MetricValue::Int(x)) +} + 
+#[derive(Default)] +struct Rusage; + +#[derive(FixedCardinalityLabel, Clone, Copy)] +#[label(singleton = "io_operation")] +enum IoOp { + Read, + Write, +} + +impl MetricGroup for Rusage +where + GaugeState: MetricEncoding, +{ + fn collect_group_into(&self, enc: &mut T) -> Result<(), T::Err> { + const DISK_IO: &MetricName = MetricName::from_str("disk_io_bytes_total"); + const MAXRSS: &MetricName = MetricName::from_str("maxrss_kb"); + + let ru = get_rusage_stats(); + + enc.write_help( + DISK_IO, + "Bytes written and read from disk, grouped by the operation (read|write)", + )?; + GaugeState::write_type(DISK_IO, enc)?; + write_gauge(ru.ru_inblock * BYTES_IN_BLOCK, IoOp::Read, DISK_IO, enc)?; + write_gauge(ru.ru_oublock * BYTES_IN_BLOCK, IoOp::Write, DISK_IO, enc)?; + + enc.write_help(MAXRSS, "Memory usage (Maximum Resident Set Size)")?; + GaugeState::write_type(MAXRSS, enc)?; + write_gauge(ru.ru_maxrss, IoOp::Read, MAXRSS, enc)?; + + Ok(()) + } +} + +#[derive(Default)] +struct CollectionCounter(CounterState); + +impl MetricFamilyEncoding for CollectionCounter +where + CounterState: MetricEncoding, +{ + fn collect_family_into( + &self, + name: impl measured::metric::name::MetricNameEncoder, + enc: &mut T, + ) -> Result<(), T::Err> { + self.0.inc(); + enc.write_help(&name, "Number of metric requests made")?; + self.0.collect_into(&(), NoLabels, name, enc) + } +} + pub fn set_build_info_metric(revision: &str, build_tag: &str) { let metric = register_int_gauge_vec!( "libmetrics_build_info", @@ -105,6 +237,7 @@ pub fn set_build_info_metric(revision: &str, build_tag: &str) { .expect("Failed to register build info metric"); metric.with_label_values(&[revision, build_tag]).set(1); } +const BYTES_IN_BLOCK: i64 = 512; // Records I/O stats in a "cross-platform" way. // Compiles both on macOS and Linux, but current macOS implementation always returns 0 as values for I/O stats. 
@@ -117,14 +250,22 @@ pub fn set_build_info_metric(revision: &str, build_tag: &str) { fn update_rusage_metrics() { let rusage_stats = get_rusage_stats(); - const BYTES_IN_BLOCK: i64 = 512; DISK_IO_BYTES .with_label_values(&["read"]) .set(rusage_stats.ru_inblock * BYTES_IN_BLOCK); DISK_IO_BYTES .with_label_values(&["write"]) .set(rusage_stats.ru_oublock * BYTES_IN_BLOCK); - MAXRSS_KB.set(rusage_stats.ru_maxrss); + + // On macOS, the unit of maxrss is bytes; on Linux, it's kilobytes. https://stackoverflow.com/a/59915669 + #[cfg(target_os = "macos")] + { + MAXRSS_KB.set(rusage_stats.ru_maxrss / 1024); + } + #[cfg(not(target_os = "macos"))] + { + MAXRSS_KB.set(rusage_stats.ru_maxrss); + } } fn get_rusage_stats() -> libc::rusage { @@ -151,6 +292,7 @@ macro_rules! register_int_counter_pair_vec { } }}; } + /// Create an [`IntCounterPair`] and registers to default registry. #[macro_export(local_inner_macros)] macro_rules! register_int_counter_pair { @@ -188,7 +330,10 @@ impl GenericCounterPairVec

{ /// /// An error is returned if the number of label values is not the same as the /// number of VariableLabels in Desc. - pub fn get_metric_with_label_values(&self, vals: &[&str]) -> Result> { + pub fn get_metric_with_label_values( + &self, + vals: &[&str], + ) -> prometheus::Result> { Ok(GenericCounterPair { inc: self.inc.get_metric_with_label_values(vals)?, dec: self.dec.get_metric_with_label_values(vals)?, @@ -201,7 +346,7 @@ impl GenericCounterPairVec

{ self.get_metric_with_label_values(vals).unwrap() } - pub fn remove_label_values(&self, res: &mut [Result<()>; 2], vals: &[&str]) { + pub fn remove_label_values(&self, res: &mut [prometheus::Result<()>; 2], vals: &[&str]) { res[0] = self.inc.remove_label_values(vals); res[1] = self.dec.remove_label_values(vals); } @@ -285,3 +430,171 @@ pub type IntCounterPair = GenericCounterPair; /// A guard for [`IntCounterPair`] that will decrement the gauge on drop pub type IntCounterPairGuard = GenericCounterPairGuard; + +pub trait CounterPairAssoc { + const INC_NAME: &'static MetricName; + const DEC_NAME: &'static MetricName; + + const INC_HELP: &'static str; + const DEC_HELP: &'static str; + + type LabelGroupSet: LabelGroupSet; +} + +pub struct CounterPairVec { + vec: measured::metric::MetricVec, +} + +impl Default for CounterPairVec +where + A::LabelGroupSet: Default, +{ + fn default() -> Self { + Self { + vec: Default::default(), + } + } +} + +impl CounterPairVec { + pub fn guard( + &self, + labels: ::Group<'_>, + ) -> MeasuredCounterPairGuard<'_, A> { + let id = self.vec.with_labels(labels); + self.vec.get_metric(id).inc.inc(); + MeasuredCounterPairGuard { vec: &self.vec, id } + } + pub fn inc(&self, labels: ::Group<'_>) { + let id = self.vec.with_labels(labels); + self.vec.get_metric(id).inc.inc(); + } + pub fn dec(&self, labels: ::Group<'_>) { + let id = self.vec.with_labels(labels); + self.vec.get_metric(id).dec.inc(); + } + pub fn remove_metric( + &self, + labels: ::Group<'_>, + ) -> Option { + let id = self.vec.with_labels(labels); + self.vec.remove_metric(id) + } +} + +impl ::measured::metric::group::MetricGroup for CounterPairVec +where + T: ::measured::metric::group::Encoding, + A: CounterPairAssoc, + ::measured::metric::counter::CounterState: ::measured::metric::MetricEncoding, +{ + fn collect_group_into(&self, enc: &mut T) -> Result<(), T::Err> { + // write decrement first to avoid a race condition where inc - dec < 0 + T::write_help(enc, A::DEC_NAME, 
A::DEC_HELP)?; + self.vec + .collect_family_into(A::DEC_NAME, &mut Dec(&mut *enc))?; + + T::write_help(enc, A::INC_NAME, A::INC_HELP)?; + self.vec + .collect_family_into(A::INC_NAME, &mut Inc(&mut *enc))?; + + Ok(()) + } +} + +#[derive(MetricGroup, Default)] +pub struct MeasuredCounterPairState { + pub inc: CounterState, + pub dec: CounterState, +} + +impl measured::metric::MetricType for MeasuredCounterPairState { + type Metadata = (); +} + +pub struct MeasuredCounterPairGuard<'a, A: CounterPairAssoc> { + vec: &'a measured::metric::MetricVec, + id: measured::metric::LabelId, +} + +impl Drop for MeasuredCounterPairGuard<'_, A> { + fn drop(&mut self) { + self.vec.get_metric(self.id).dec.inc(); + } +} + +/// [`MetricEncoding`] for [`MeasuredCounterPairState`] that only writes the inc counter to the inner encoder. +struct Inc(T); +/// [`MetricEncoding`] for [`MeasuredCounterPairState`] that only writes the dec counter to the inner encoder. +struct Dec(T); + +impl Encoding for Inc { + type Err = T::Err; + + fn write_help(&mut self, name: impl MetricNameEncoder, help: &str) -> Result<(), Self::Err> { + self.0.write_help(name, help) + } + + fn write_metric_value( + &mut self, + name: impl MetricNameEncoder, + labels: impl LabelGroup, + value: MetricValue, + ) -> Result<(), Self::Err> { + self.0.write_metric_value(name, labels, value) + } +} + +impl MetricEncoding> for MeasuredCounterPairState +where + CounterState: MetricEncoding, +{ + fn write_type(name: impl MetricNameEncoder, enc: &mut Inc) -> Result<(), T::Err> { + CounterState::write_type(name, &mut enc.0) + } + fn collect_into( + &self, + metadata: &(), + labels: impl LabelGroup, + name: impl MetricNameEncoder, + enc: &mut Inc, + ) -> Result<(), T::Err> { + self.inc.collect_into(metadata, labels, name, &mut enc.0) + } +} + +impl Encoding for Dec { + type Err = T::Err; + + fn write_help(&mut self, name: impl MetricNameEncoder, help: &str) -> Result<(), Self::Err> { + self.0.write_help(name, help) + } + + fn 
write_metric_value( + &mut self, + name: impl MetricNameEncoder, + labels: impl LabelGroup, + value: MetricValue, + ) -> Result<(), Self::Err> { + self.0.write_metric_value(name, labels, value) + } +} + +/// Write the dec counter to the encoder +impl MetricEncoding> for MeasuredCounterPairState +where + CounterState: MetricEncoding, +{ + fn write_type(name: impl MetricNameEncoder, enc: &mut Dec) -> Result<(), T::Err> { + CounterState::write_type(name, &mut enc.0) + } + fn collect_into( + &self, + metadata: &(), + labels: impl LabelGroup, + name: impl MetricNameEncoder, + enc: &mut Dec, + ) -> Result<(), T::Err> { + self.dec.collect_into(metadata, labels, name, &mut enc.0) + } +} diff --git a/libs/pageserver_api/src/config.rs b/libs/pageserver_api/src/config.rs new file mode 100644 index 0000000000..d996a62349 --- /dev/null +++ b/libs/pageserver_api/src/config.rs @@ -0,0 +1,31 @@ +use std::collections::HashMap; + +use const_format::formatcp; + +#[cfg(test)] +mod tests; + +pub const DEFAULT_PG_LISTEN_PORT: u16 = 64000; +pub const DEFAULT_PG_LISTEN_ADDR: &str = formatcp!("127.0.0.1:{DEFAULT_PG_LISTEN_PORT}"); +pub const DEFAULT_HTTP_LISTEN_PORT: u16 = 9898; +pub const DEFAULT_HTTP_LISTEN_ADDR: &str = formatcp!("127.0.0.1:{DEFAULT_HTTP_LISTEN_PORT}"); + +// Certain metadata (e.g. externally-addressable name, AZ) is delivered +// as a separate structure. This information is not neeed by the pageserver +// itself, it is only used for registering the pageserver with the control +// plane and/or storage controller. +// +#[derive(PartialEq, Eq, Debug, serde::Serialize, serde::Deserialize)] +pub struct NodeMetadata { + #[serde(rename = "host")] + pub postgres_host: String, + #[serde(rename = "port")] + pub postgres_port: u16, + pub http_host: String, + pub http_port: u16, + + // Deployment tools may write fields to the metadata file beyond what we + // use in this type: this type intentionally only names fields that require. 
+ #[serde(flatten)] + pub other: HashMap, +} diff --git a/libs/pageserver_api/src/config/tests.rs b/libs/pageserver_api/src/config/tests.rs new file mode 100644 index 0000000000..edeefc156e --- /dev/null +++ b/libs/pageserver_api/src/config/tests.rs @@ -0,0 +1,22 @@ +use super::*; + +#[test] +fn test_node_metadata_v1_backward_compatibilty() { + let v1 = serde_json::to_vec(&serde_json::json!({ + "host": "localhost", + "port": 23, + "http_host": "localhost", + "http_port": 42, + })); + + assert_eq!( + serde_json::from_slice::(&v1.unwrap()).unwrap(), + NodeMetadata { + postgres_host: "localhost".to_string(), + postgres_port: 23, + http_host: "localhost".to_string(), + http_port: 42, + other: HashMap::new(), + } + ) +} diff --git a/libs/pageserver_api/src/key.rs b/libs/pageserver_api/src/key.rs index 852670af2c..2511de00d5 100644 --- a/libs/pageserver_api/src/key.rs +++ b/libs/pageserver_api/src/key.rs @@ -1,5 +1,6 @@ use anyhow::{bail, Result}; use byteorder::{ByteOrder, BE}; +use bytes::BufMut; use postgres_ffi::relfile_utils::{FSM_FORKNUM, VISIBILITYMAP_FORKNUM}; use postgres_ffi::{Oid, TransactionId}; use serde::{Deserialize, Serialize}; @@ -21,15 +22,107 @@ pub struct Key { pub field6: u32, } +/// The storage key size. pub const KEY_SIZE: usize = 18; +/// The metadata key size. 2B fewer than the storage key size because field2 is not fully utilized. +/// See [`Key::to_i128`] for more information on the encoding. +pub const METADATA_KEY_SIZE: usize = 16; + +/// The key prefix start range for the metadata keys. All keys with the first byte >= 0x40 is a metadata key. +pub const METADATA_KEY_BEGIN_PREFIX: u8 = 0x60; +pub const METADATA_KEY_END_PREFIX: u8 = 0x7F; + +/// The (reserved) key prefix of relation sizes. +pub const RELATION_SIZE_PREFIX: u8 = 0x61; + +/// The key prefix of AUX file keys. +pub const AUX_KEY_PREFIX: u8 = 0x62; + +/// Check if the key falls in the range of metadata keys. 
+pub const fn is_metadata_key_slice(key: &[u8]) -> bool { + key[0] >= METADATA_KEY_BEGIN_PREFIX && key[0] < METADATA_KEY_END_PREFIX +} + impl Key { + /// Check if the key falls in the range of metadata keys. + pub const fn is_metadata_key(&self) -> bool { + self.field1 >= METADATA_KEY_BEGIN_PREFIX && self.field1 < METADATA_KEY_END_PREFIX + } + + /// Encode a metadata key to a storage key. + pub fn from_metadata_key_fixed_size(key: &[u8; METADATA_KEY_SIZE]) -> Self { + assert!(is_metadata_key_slice(key), "key not in metadata key range"); + Key { + field1: key[0], + field2: u16::from_be_bytes(key[1..3].try_into().unwrap()) as u32, + field3: u32::from_be_bytes(key[3..7].try_into().unwrap()), + field4: u32::from_be_bytes(key[7..11].try_into().unwrap()), + field5: key[11], + field6: u32::from_be_bytes(key[12..16].try_into().unwrap()), + } + } + + /// Encode a metadata key to a storage key. + pub fn from_metadata_key(key: &[u8]) -> Self { + Self::from_metadata_key_fixed_size(key.try_into().expect("expect 16 byte metadata key")) + } + + /// Extract a metadata key to a writer. The result should always be 16 bytes. + pub fn extract_metadata_key_to_writer(&self, mut writer: impl BufMut) { + writer.put_u8(self.field1); + assert!(self.field2 <= 0xFFFF); + writer.put_u16(self.field2 as u16); + writer.put_u32(self.field3); + writer.put_u32(self.field4); + writer.put_u8(self.field5); + writer.put_u32(self.field6); + } + + /// Get the range of metadata keys. + pub const fn metadata_key_range() -> Range { + Key { + field1: METADATA_KEY_BEGIN_PREFIX, + field2: 0, + field3: 0, + field4: 0, + field5: 0, + field6: 0, + }..Key { + field1: METADATA_KEY_END_PREFIX, + field2: 0, + field3: 0, + field4: 0, + field5: 0, + field6: 0, + } + } + + /// Get the range of aux keys. 
+ pub fn metadata_aux_key_range() -> Range { + Key { + field1: AUX_KEY_PREFIX, + field2: 0, + field3: 0, + field4: 0, + field5: 0, + field6: 0, + }..Key { + field1: AUX_KEY_PREFIX + 1, + field2: 0, + field3: 0, + field4: 0, + field5: 0, + field6: 0, + } + } + /// 'field2' is used to store tablespaceid for relations and small enum numbers for other relish. /// As long as Neon does not support tablespace (because of lack of access to local file system), /// we can assume that only some predefined namespace OIDs are used which can fit in u16 pub fn to_i128(&self) -> i128 { assert!(self.field2 < 0xFFFF || self.field2 == 0xFFFFFFFF || self.field2 == 0x22222222); - (((self.field1 & 0xf) as i128) << 120) + (((self.field1 & 0x7F) as i128) << 120) | (((self.field2 & 0xFFFF) as i128) << 104) | ((self.field3 as i128) << 72) | ((self.field4 as i128) << 40) @@ -39,7 +132,7 @@ impl Key { pub const fn from_i128(x: i128) -> Self { Key { - field1: ((x >> 120) & 0xf) as u8, + field1: ((x >> 120) & 0x7F) as u8, field2: ((x >> 104) & 0xFFFF) as u32, field3: (x >> 72) as u32, field4: (x >> 40) as u32, @@ -48,11 +141,11 @@ impl Key { } } - pub fn next(&self) -> Key { + pub const fn next(&self) -> Key { self.add(1) } - pub fn add(&self, x: u32) -> Key { + pub const fn add(&self, x: u32) -> Key { let mut key = *self; let r = key.field6.overflowing_add(x); @@ -81,6 +174,8 @@ impl Key { key } + /// Convert a 18B slice to a key. This function should not be used for metadata keys because field2 is handled differently. + /// Use [`Key::from_metadata_key`] instead. pub fn from_slice(b: &[u8]) -> Self { Key { field1: b[0], @@ -92,6 +187,8 @@ impl Key { } } + /// Convert a key to a 18B slice. This function should not be used for metadata keys because field2 is handled differently. + /// Use [`Key::extract_metadata_key_to_writer`] instead. 
pub fn write_to_byte_slice(&self, buf: &mut [u8]) { buf[0] = self.field1; BE::write_u32(&mut buf[1..5], self.field2); @@ -475,12 +572,17 @@ pub const AUX_FILES_KEY: Key = Key { // Reverse mappings for a few Keys. // These are needed by WAL redo manager. +/// Non inherited range for vectored get. +pub const NON_INHERITED_RANGE: Range = AUX_FILES_KEY..AUX_FILES_KEY.next(); +/// Sparse keyspace range for vectored get. Missing key error will be ignored for this range. +pub const NON_INHERITED_SPARSE_RANGE: Range = Key::metadata_key_range(); + // AUX_FILES currently stores only data for logical replication (slots etc), and // we don't preserve these on a branch because safekeepers can't follow timeline // switch (and generally it likely should be optional), so ignore these. #[inline(always)] pub fn is_inherited_key(key: Key) -> bool { - key != AUX_FILES_KEY + !NON_INHERITED_RANGE.contains(&key) && !NON_INHERITED_SPARSE_RANGE.contains(&key) } #[inline(always)] @@ -556,11 +658,14 @@ impl std::str::FromStr for Key { mod tests { use std::str::FromStr; + use crate::key::is_metadata_key_slice; use crate::key::Key; use rand::Rng; use rand::SeedableRng; + use super::AUX_KEY_PREFIX; + #[test] fn display_fromstr_bijection() { let mut rng = rand::rngs::StdRng::seed_from_u64(42); @@ -576,4 +681,16 @@ mod tests { assert_eq!(key, Key::from_str(&format!("{key}")).unwrap()); } + + #[test] + fn test_metadata_keys() { + let mut metadata_key = vec![AUX_KEY_PREFIX]; + metadata_key.extend_from_slice(&[0xFF; 15]); + let encoded_key = Key::from_metadata_key(&metadata_key); + let mut output_key = Vec::new(); + encoded_key.extract_metadata_key_to_writer(&mut output_key); + assert_eq!(metadata_key, output_key); + assert!(encoded_key.is_metadata_key()); + assert!(is_metadata_key_slice(&metadata_key)); + } } diff --git a/libs/pageserver_api/src/keyspace.rs b/libs/pageserver_api/src/keyspace.rs index 05fa4562e1..a9ad3aca18 100644 --- a/libs/pageserver_api/src/keyspace.rs +++ 
b/libs/pageserver_api/src/keyspace.rs @@ -1,7 +1,10 @@ use postgres_ffi::BLCKSZ; use std::ops::Range; -use crate::key::Key; +use crate::{ + key::Key, + shard::{ShardCount, ShardIdentity}, +}; use itertools::Itertools; /// @@ -14,44 +17,279 @@ pub struct KeySpace { pub ranges: Vec>, } -impl KeySpace { +/// A wrapper type for sparse keyspaces. +#[derive(Clone, Debug, Default, PartialEq, Eq)] +pub struct SparseKeySpace(pub KeySpace); + +/// Represents a contiguous half-open range of the keyspace, masked according to a particular +/// ShardNumber's stripes: within this range of keys, only some "belong" to the current +/// shard. +/// +/// When we iterate over keys within this object, we will skip any keys that don't belong +/// to this shard. +/// +/// The start + end keys may not belong to the shard: these specify where layer files should +/// start + end, but we will never actually read/write those keys. +#[derive(Clone, Debug, PartialEq, Eq)] +pub struct ShardedRange<'a> { + pub shard_identity: &'a ShardIdentity, + pub range: Range, +} + +// Calculate the size of a range within the blocks of the same relation, or spanning only the +// top page in the previous relation's space. +fn contiguous_range_len(range: &Range) -> u32 { + debug_assert!(is_contiguous_range(range)); + if range.start.field6 == 0xffffffff { + range.end.field6 + 1 + } else { + range.end.field6 - range.start.field6 + } +} + +/// Return true if this key range includes only keys in the same relation's data blocks, or +/// just spanning one relation and the logical size (0xffffffff) block of the relation before it. +/// +/// Contiguous in this context means we know the keys are in use _somewhere_, but it might not +/// be on our shard. Later in ShardedRange we do the extra work to figure out how much +/// of a given contiguous range is present on one shard. +/// +/// This matters, because: +/// - Within such ranges, keys are used contiguously. Outside such ranges it is sparse. 
+/// - Within such ranges, we may calculate distances using simple subtraction of field6. +fn is_contiguous_range(range: &Range) -> bool { + range.start.field1 == range.end.field1 + && range.start.field2 == range.end.field2 + && range.start.field3 == range.end.field3 + && range.start.field4 == range.end.field4 + && (range.start.field5 == range.end.field5 + || (range.start.field6 == 0xffffffff && range.start.field5 + 1 == range.end.field5)) +} + +impl<'a> ShardedRange<'a> { + pub fn new(range: Range, shard_identity: &'a ShardIdentity) -> Self { + Self { + shard_identity, + range, + } + } + + /// Break up this range into chunks, each of which has at least one local key in it if the + /// total range has at least one local key. + pub fn fragment(self, target_nblocks: u32) -> Vec<(u32, Range)> { + // Optimization for single-key case (e.g. logical size keys) + if self.range.end == self.range.start.add(1) { + return vec![( + if self.shard_identity.is_key_disposable(&self.range.start) { + 0 + } else { + 1 + }, + self.range, + )]; + } + + if !is_contiguous_range(&self.range) { + // Ranges that span relations are not fragmented. We only get these ranges as a result + // of operations that act on existing layers, so we trust that the existing range is + // reasonably small. + return vec![(u32::MAX, self.range)]; + } + + let mut fragments: Vec<(u32, Range)> = Vec::new(); + + let mut cursor = self.range.start; + while cursor < self.range.end { + let advance_by = self.distance_to_next_boundary(cursor); + let is_fragment_disposable = self.shard_identity.is_key_disposable(&cursor); + + // If the previous fragment is undersized, then we seek to consume enough + // blocks to complete it. + let (want_blocks, merge_last_fragment) = match fragments.last_mut() { + Some(frag) if frag.0 < target_nblocks => (target_nblocks - frag.0, Some(frag)), + Some(frag) => { + // Prev block is complete, want the full number. 
+ ( + target_nblocks, + if is_fragment_disposable { + // If this current range will be empty (not shard-local data), we will merge into previous + Some(frag) + } else { + None + }, + ) + } + None => { + // First iteration, want the full number + (target_nblocks, None) + } + }; + + let advance_by = if is_fragment_disposable { + advance_by + } else { + std::cmp::min(advance_by, want_blocks) + }; + + let next_cursor = cursor.add(advance_by); + + let this_frag = ( + if is_fragment_disposable { + 0 + } else { + advance_by + }, + cursor..next_cursor, + ); + cursor = next_cursor; + + if let Some(last_fragment) = merge_last_fragment { + // Previous fragment was short or this one is empty, merge into it + last_fragment.0 += this_frag.0; + last_fragment.1.end = this_frag.1.end; + } else { + fragments.push(this_frag); + } + } + + fragments + } + + /// Estimate the physical pages that are within this range, on this shard. This returns + /// u32::MAX if the range spans relations: this return value should be interpreted as "large". + pub fn page_count(&self) -> u32 { + // Special cases for single keys like logical sizes + if self.range.end == self.range.start.add(1) { + return if self.shard_identity.is_key_disposable(&self.range.start) { + 0 + } else { + 1 + }; + } + + // We can only do an authentic calculation of contiguous key ranges + if !is_contiguous_range(&self.range) { + return u32::MAX; + } + + // Special case for single sharded tenants: our logical and physical sizes are the same + if self.shard_identity.count < ShardCount::new(2) { + return contiguous_range_len(&self.range); + } + + // Normal path: step through stripes and part-stripes in the range, evaluate whether each one belongs + // to Self, and add the stripe's block count to our total if so. 
+ let mut result: u64 = 0; + let mut cursor = self.range.start; + while cursor < self.range.end { + // Count up to the next stripe_size boundary or end of range + let advance_by = self.distance_to_next_boundary(cursor); + + // If this blocks in this stripe belong to us, add them to our count + if !self.shard_identity.is_key_disposable(&cursor) { + result += advance_by as u64; + } + + cursor = cursor.add(advance_by); + } + + if result > u32::MAX as u64 { + u32::MAX + } else { + result as u32 + } + } + + /// Advance the cursor to the next potential fragment boundary: this is either + /// a stripe boundary, or the end of the range. + fn distance_to_next_boundary(&self, cursor: Key) -> u32 { + let distance_to_range_end = contiguous_range_len(&(cursor..self.range.end)); + + if self.shard_identity.count < ShardCount::new(2) { + // Optimization: don't bother stepping through stripes if the tenant isn't sharded. + return distance_to_range_end; + } + + if cursor.field6 == 0xffffffff { + // We are wrapping from one relation's logical size to the next relation's first data block + return 1; + } + + let stripe_index = cursor.field6 / self.shard_identity.stripe_size.0; + let stripe_remainder = self.shard_identity.stripe_size.0 + - (cursor.field6 - stripe_index * self.shard_identity.stripe_size.0); + + if cfg!(debug_assertions) { + // We should never overflow field5 and field6 -- our callers check this earlier + // and would have returned their u32::MAX cases if the input range violated this. 
+ let next_cursor = cursor.add(stripe_remainder); + debug_assert!( + next_cursor.field1 == cursor.field1 + && next_cursor.field2 == cursor.field2 + && next_cursor.field3 == cursor.field3 + && next_cursor.field4 == cursor.field4 + && next_cursor.field5 == cursor.field5 + ) + } + + std::cmp::min(stripe_remainder, distance_to_range_end) + } + + /// Whereas `page_count` estimates the number of pages physically in this range on this shard, + /// this function simply calculates the number of pages in the space, without accounting for those + /// pages that would not actually be stored on this node. /// + /// Don't use this function in code that works with physical entities like layer files. + fn raw_size(range: &Range) -> u32 { + if is_contiguous_range(range) { + contiguous_range_len(range) + } else { + u32::MAX + } + } +} + +impl KeySpace { + /// Create a key space with a single range. + pub fn single(key_range: Range) -> Self { + Self { + ranges: vec![key_range], + } + } + /// Partition a key space into roughly chunks of roughly 'target_size' bytes /// in each partition. /// - pub fn partition(&self, target_size: u64) -> KeyPartitioning { + pub fn partition(&self, shard_identity: &ShardIdentity, target_size: u64) -> KeyPartitioning { // Assume that each value is 8k in size. - let target_nblocks = (target_size / BLCKSZ as u64) as usize; + let target_nblocks = (target_size / BLCKSZ as u64) as u32; let mut parts = Vec::new(); let mut current_part = Vec::new(); let mut current_part_size: usize = 0; for range in &self.ranges { - // If appending the next contiguous range in the keyspace to the current - // partition would cause it to be too large, start a new partition. 
- let this_size = key_range_size(range) as usize; - if current_part_size + this_size > target_nblocks && !current_part.is_empty() { - parts.push(KeySpace { - ranges: current_part, - }); - current_part = Vec::new(); - current_part_size = 0; - } + // While doing partitioning, wrap the range in ShardedRange so that our size calculations + // will respect shard striping rather than assuming all keys within a range are present. + let range = ShardedRange::new(range.clone(), shard_identity); - // If the next range is larger than 'target_size', split it into - // 'target_size' chunks. - let mut remain_size = this_size; - let mut start = range.start; - while remain_size > target_nblocks { - let next = start.add(target_nblocks as u32); - parts.push(KeySpace { - ranges: vec![start..next], - }); - start = next; - remain_size -= target_nblocks + // Chunk up the range into parts that each contain up to target_size local blocks + for (frag_on_shard_size, frag_range) in range.fragment(target_nblocks) { + // If appending the next contiguous range in the keyspace to the current + // partition would cause it to be too large, and our current partition + // covers at least one block that is physically present in this shard, + // then start a new partition + if current_part_size + frag_on_shard_size as usize > target_nblocks as usize + && current_part_size > 0 + { + parts.push(KeySpace { + ranges: current_part, + }); + current_part = Vec::new(); + current_part_size = 0; + } + current_part.push(frag_range.start..frag_range.end); + current_part_size += frag_on_shard_size as usize; } - current_part.push(start..range.end); - current_part_size += remain_size; } // add last partition that wasn't full yet. @@ -64,6 +302,10 @@ impl KeySpace { KeyPartitioning { parts } } + pub fn is_empty(&self) -> bool { + self.total_raw_size() == 0 + } + /// Merge another keyspace into the current one. 
/// Note: the keyspaces must not ovelap (enforced via assertions) pub fn merge(&mut self, other: &KeySpace) { @@ -94,12 +336,13 @@ impl KeySpace { /// Remove all keys in `other` from `self`. /// This can involve splitting or removing of existing ranges. - pub fn remove_overlapping_with(&mut self, other: &KeySpace) { + /// Returns the removed keyspace + pub fn remove_overlapping_with(&mut self, other: &KeySpace) -> KeySpace { let (self_start, self_end) = match (self.start(), self.end()) { (Some(start), Some(end)) => (start, end), _ => { // self is empty - return; + return KeySpace::default(); } }; @@ -112,30 +355,37 @@ impl KeySpace { .skip_while(|range| self_start >= range.end) .take_while(|range| self_end > range.start); + let mut removed_accum = KeySpaceRandomAccum::new(); for range in other_ranges { while let Some(overlap_at) = self.overlaps_at(range) { let overlapped = self.ranges[overlap_at].clone(); if overlapped.start < range.start && overlapped.end <= range.end { // Higher part of the range is completely overlapped. + removed_accum.add_range(range.start..self.ranges[overlap_at].end); self.ranges[overlap_at].end = range.start; } if overlapped.start >= range.start && overlapped.end > range.end { // Lower part of the range is completely overlapped. + removed_accum.add_range(self.ranges[overlap_at].start..range.end); self.ranges[overlap_at].start = range.end; } if overlapped.start < range.start && overlapped.end > range.end { // Middle part of the range is overlapped. 
+ removed_accum.add_range(range.clone()); self.ranges[overlap_at].end = range.start; self.ranges .insert(overlap_at + 1, range.end..overlapped.end); } if overlapped.start >= range.start && overlapped.end <= range.end { // Whole range is overlapped + removed_accum.add_range(self.ranges[overlap_at].clone()); self.ranges.remove(overlap_at); } } } + + removed_accum.to_keyspace() } pub fn start(&self) -> Option { @@ -146,11 +396,11 @@ impl KeySpace { self.ranges.last().map(|range| range.end) } - #[allow(unused)] - pub fn total_size(&self) -> usize { + /// The size of the keyspace in pages, before accounting for sharding + pub fn total_raw_size(&self) -> usize { self.ranges .iter() - .map(|range| key_range_size(range) as usize) + .map(|range| ShardedRange::raw_size(range) as usize) .sum() } @@ -170,6 +420,11 @@ impl KeySpace { pub fn overlaps(&self, range: &Range) -> bool { self.overlaps_at(range).is_some() } + + /// Check if the keyspace contains a key + pub fn contains(&self, key: &Key) -> bool { + self.overlaps(&(*key..key.next())) + } } /// @@ -184,10 +439,33 @@ pub struct KeyPartitioning { pub parts: Vec, } +/// Represents a partitioning of the sparse key space. +#[derive(Clone, Debug, Default)] +pub struct SparseKeyPartitioning { + pub parts: Vec, +} + impl KeyPartitioning { pub fn new() -> Self { KeyPartitioning { parts: Vec::new() } } + + /// Convert a key partitioning to a sparse partition. + pub fn into_sparse(self) -> SparseKeyPartitioning { + SparseKeyPartitioning { + parts: self.parts.into_iter().map(SparseKeySpace).collect(), + } + } +} + +impl SparseKeyPartitioning { + /// Note: use this function with caution. Attempt to handle a sparse keyspace in the same way as a dense keyspace will + /// cause long/dead loops. 
+ pub fn into_dense(self) -> KeyPartitioning { + KeyPartitioning { + parts: self.parts.into_iter().map(|x| x.0).collect(), + } + } } /// @@ -219,7 +497,7 @@ impl KeySpaceAccum { #[inline(always)] pub fn add_range(&mut self, range: Range) { - self.size += key_range_size(&range) as u64; + self.size += ShardedRange::raw_size(&range) as u64; match self.accum.as_mut() { Some(accum) => { @@ -251,7 +529,9 @@ impl KeySpaceAccum { std::mem::take(self).to_keyspace() } - pub fn size(&self) -> u64 { + // The total number of keys in this object, ignoring any sharding effects that might cause some of + // the keys to be omitted in storage on this shard. + pub fn raw_size(&self) -> u64 { self.size } } @@ -307,36 +587,19 @@ impl KeySpaceRandomAccum { } } -#[inline(always)] -pub fn key_range_size(key_range: &Range) -> u32 { - let start = key_range.start; - let end = key_range.end; - - if end.field1 != start.field1 - || end.field2 != start.field2 - || end.field3 != start.field3 - || end.field4 != start.field4 - { - return u32::MAX; - } - - let start = (start.field5 as u64) << 32 | start.field6 as u64; - let end = (end.field5 as u64) << 32 | end.field6 as u64; - - let diff = end - start; - if diff > u32::MAX as u64 { - u32::MAX - } else { - diff as u32 - } -} - pub fn singleton_range(key: Key) -> Range { key..key.next() } #[cfg(test)] mod tests { + use rand::{RngCore, SeedableRng}; + + use crate::{ + models::ShardParameters, + shard::{ShardCount, ShardNumber}, + }; + use super::*; use std::fmt::Write; @@ -379,14 +642,17 @@ mod tests { accum.add_range(range.clone()); } - let expected_size: u64 = ranges.iter().map(|r| key_range_size(r) as u64).sum(); - assert_eq!(accum.size(), expected_size); + let expected_size: u64 = ranges + .iter() + .map(|r| ShardedRange::raw_size(r) as u64) + .sum(); + assert_eq!(accum.raw_size(), expected_size); assert_ks_eq(&accum.consume_keyspace(), ranges.clone()); - assert_eq!(accum.size(), 0); + assert_eq!(accum.raw_size(), 0); 
assert_ks_eq(&accum.consume_keyspace(), vec![]); - assert_eq!(accum.size(), 0); + assert_eq!(accum.raw_size(), 0); for range in &ranges { accum.add_range(range.clone()); @@ -553,7 +819,16 @@ mod tests { Key::from_i128(11)..Key::from_i128(13), ], }; - key_space1.remove_overlapping_with(&key_space2); + let removed = key_space1.remove_overlapping_with(&key_space2); + let removed_expected = KeySpace { + ranges: vec![ + Key::from_i128(2)..Key::from_i128(3), + Key::from_i128(6)..Key::from_i128(7), + Key::from_i128(11)..Key::from_i128(12), + ], + }; + assert_eq!(removed, removed_expected); + assert_eq!( key_space1.ranges, vec![ @@ -583,7 +858,17 @@ mod tests { Key::from_i128(14)..Key::from_i128(17), ], }; - key_space1.remove_overlapping_with(&key_space2); + + let removed = key_space1.remove_overlapping_with(&key_space2); + let removed_expected = KeySpace { + ranges: vec![ + Key::from_i128(3)..Key::from_i128(5), + Key::from_i128(8)..Key::from_i128(10), + Key::from_i128(14)..Key::from_i128(15), + ], + }; + assert_eq!(removed, removed_expected); + assert_eq!( key_space1.ranges, vec![ @@ -610,7 +895,11 @@ mod tests { Key::from_i128(15)..Key::from_i128(17), ], }; - key_space1.remove_overlapping_with(&key_space2); + + let removed = key_space1.remove_overlapping_with(&key_space2); + let removed_expected = KeySpace::default(); + assert_eq!(removed, removed_expected); + assert_eq!( key_space1.ranges, vec![ @@ -637,7 +926,17 @@ mod tests { let key_space2 = KeySpace { ranges: vec![Key::from_i128(9)..Key::from_i128(19)], }; - key_space1.remove_overlapping_with(&key_space2); + + let removed = key_space1.remove_overlapping_with(&key_space2); + let removed_expected = KeySpace { + ranges: vec![ + Key::from_i128(9)..Key::from_i128(10), + Key::from_i128(12)..Key::from_i128(15), + Key::from_i128(17)..Key::from_i128(19), + ], + }; + assert_eq!(removed, removed_expected); + assert_eq!( key_space1.ranges, vec![ @@ -650,4 +949,412 @@ mod tests { ] ); } + #[test] + fn 
sharded_range_relation_gap() { + let shard_identity = ShardIdentity::new( + ShardNumber(0), + ShardCount::new(4), + ShardParameters::DEFAULT_STRIPE_SIZE, + ) + .unwrap(); + + let range = ShardedRange::new( + Range { + start: Key::from_hex("000000067F00000005000040100300000000").unwrap(), + end: Key::from_hex("000000067F00000005000040130000004000").unwrap(), + }, + &shard_identity, + ); + + // Key range spans relations, expect MAX + assert_eq!(range.page_count(), u32::MAX); + } + + #[test] + fn shard_identity_keyspaces_single_key() { + let shard_identity = ShardIdentity::new( + ShardNumber(1), + ShardCount::new(4), + ShardParameters::DEFAULT_STRIPE_SIZE, + ) + .unwrap(); + + let range = ShardedRange::new( + Range { + start: Key::from_hex("000000067f000000010000007000ffffffff").unwrap(), + end: Key::from_hex("000000067f00000001000000700100000000").unwrap(), + }, + &shard_identity, + ); + // Single-key range on logical size key + assert_eq!(range.page_count(), 1); + } + + /// Test the helper that we use to identify ranges which go outside the data blocks of a single relation + #[test] + fn contiguous_range_check() { + assert!(!is_contiguous_range( + &(Key::from_hex("000000067f00000001000004df00fffffffe").unwrap() + ..Key::from_hex("000000067f00000001000004df0100000003").unwrap()) + ),); + + // The ranges goes all the way up to the 0xffffffff, including it: this is + // not considered a rel block range because 0xffffffff stores logical sizes, + // not blocks. 
+ assert!(!is_contiguous_range( + &(Key::from_hex("000000067f00000001000004df00fffffffe").unwrap() + ..Key::from_hex("000000067f00000001000004df0100000000").unwrap()) + ),); + + // Keys within the normal data region of a relation + assert!(is_contiguous_range( + &(Key::from_hex("000000067f00000001000004df0000000000").unwrap() + ..Key::from_hex("000000067f00000001000004df0000000080").unwrap()) + ),); + + // The logical size key of one forkno, then some blocks in the next + assert!(is_contiguous_range( + &(Key::from_hex("000000067f00000001000004df00ffffffff").unwrap() + ..Key::from_hex("000000067f00000001000004df0100000080").unwrap()) + ),); + } + + #[test] + fn shard_identity_keyspaces_forkno_gap() { + let shard_identity = ShardIdentity::new( + ShardNumber(1), + ShardCount::new(4), + ShardParameters::DEFAULT_STRIPE_SIZE, + ) + .unwrap(); + + let range = ShardedRange::new( + Range { + start: Key::from_hex("000000067f00000001000004df00fffffffe").unwrap(), + end: Key::from_hex("000000067f00000001000004df0100000003").unwrap(), + }, + &shard_identity, + ); + + // Range spanning the end of one forkno and the start of the next: we do not attempt to + // calculate a valid size, because we have no way to know if they keys between start + // and end are actually in use. 
+ assert_eq!(range.page_count(), u32::MAX); + } + + #[test] + fn shard_identity_keyspaces_one_relation() { + for shard_number in 0..4 { + let shard_identity = ShardIdentity::new( + ShardNumber(shard_number), + ShardCount::new(4), + ShardParameters::DEFAULT_STRIPE_SIZE, + ) + .unwrap(); + + let range = ShardedRange::new( + Range { + start: Key::from_hex("000000067f00000001000000ae0000000000").unwrap(), + end: Key::from_hex("000000067f00000001000000ae0000000001").unwrap(), + }, + &shard_identity, + ); + + // Very simple case: range covering block zero of one relation, where that block maps to shard zero + if shard_number == 0 { + assert_eq!(range.page_count(), 1); + } else { + // Other shards should perceive the range's size as zero + assert_eq!(range.page_count(), 0); + } + } + } + + /// Test helper: construct a ShardedRange and call fragment() on it, returning + /// the total page count in the range and the fragments. + fn do_fragment( + range_start: Key, + range_end: Key, + shard_identity: &ShardIdentity, + target_nblocks: u32, + ) -> (u32, Vec<(u32, Range)>) { + let range = ShardedRange::new( + Range { + start: range_start, + end: range_end, + }, + shard_identity, + ); + + let page_count = range.page_count(); + let fragments = range.fragment(target_nblocks); + + // Invariant: we always get at least one fragment + assert!(!fragments.is_empty()); + + // Invariant: the first/last fragment start/end should equal the input start/end + assert_eq!(fragments.first().unwrap().1.start, range_start); + assert_eq!(fragments.last().unwrap().1.end, range_end); + + if page_count > 0 { + // Invariant: every fragment must contain at least one shard-local page, if the + // total range contains at least one shard-local page + let all_nonzero = fragments.iter().all(|f| f.0 > 0); + if !all_nonzero { + eprintln!("Found a zero-length fragment: {:?}", fragments); + } + assert!(all_nonzero); + } else { + // A range with no shard-local pages should always be returned as a single fragment 
+ assert_eq!(fragments, vec![(0, range_start..range_end)]); + } + + // Invariant: fragments must be ordered and non-overlapping + let mut last: Option> = None; + for frag in &fragments { + if let Some(last) = last { + assert!(frag.1.start >= last.end); + assert!(frag.1.start > last.start); + } + last = Some(frag.1.clone()) + } + + // Invariant: fragments respect target_nblocks + for frag in &fragments { + assert!(frag.0 == u32::MAX || frag.0 <= target_nblocks); + } + + (page_count, fragments) + } + + /// Really simple tests for fragment(), on a range that just contains a single stripe + /// for a single tenant. + #[test] + fn sharded_range_fragment_simple() { + let shard_identity = ShardIdentity::new( + ShardNumber(0), + ShardCount::new(4), + ShardParameters::DEFAULT_STRIPE_SIZE, + ) + .unwrap(); + + // A range which we happen to know covers exactly one stripe which belongs to this shard + let input_start = Key::from_hex("000000067f00000001000000ae0000000000").unwrap(); + let input_end = Key::from_hex("000000067f00000001000000ae0000008000").unwrap(); + + // Ask for stripe_size blocks, we get the whole stripe + assert_eq!( + do_fragment(input_start, input_end, &shard_identity, 32768), + (32768, vec![(32768, input_start..input_end)]) + ); + + // Ask for more, we still get the whole stripe + assert_eq!( + do_fragment(input_start, input_end, &shard_identity, 10000000), + (32768, vec![(32768, input_start..input_end)]) + ); + + // Ask for target_nblocks of half the stripe size, we get two halves + assert_eq!( + do_fragment(input_start, input_end, &shard_identity, 16384), + ( + 32768, + vec![ + (16384, input_start..input_start.add(16384)), + (16384, input_start.add(16384)..input_end) + ] + ) + ); + } + + #[test] + fn sharded_range_fragment_multi_stripe() { + let shard_identity = ShardIdentity::new( + ShardNumber(0), + ShardCount::new(4), + ShardParameters::DEFAULT_STRIPE_SIZE, + ) + .unwrap(); + + // A range which covers multiple stripes, exactly one of which belongs to 
the current shard. + let input_start = Key::from_hex("000000067f00000001000000ae0000000000").unwrap(); + let input_end = Key::from_hex("000000067f00000001000000ae0000020000").unwrap(); + // Ask for all the blocks, get a fragment that covers the whole range but reports + // its size to be just the blocks belonging to our shard. + assert_eq!( + do_fragment(input_start, input_end, &shard_identity, 131072), + (32768, vec![(32768, input_start..input_end)]) + ); + + // Ask for a sub-stripe quantity + assert_eq!( + do_fragment(input_start, input_end, &shard_identity, 16000), + ( + 32768, + vec![ + (16000, input_start..input_start.add(16000)), + (16000, input_start.add(16000)..input_start.add(32000)), + (768, input_start.add(32000)..input_end), + ] + ) + ); + + // Try on a range that starts slightly after our owned stripe + assert_eq!( + do_fragment(input_start.add(1), input_end, &shard_identity, 131072), + (32767, vec![(32767, input_start.add(1)..input_end)]) + ); + } + + /// Test our calculations work correctly when we start a range from the logical size key of + /// a previous relation. 
+ #[test] + fn sharded_range_fragment_starting_from_logical_size() { + let input_start = Key::from_hex("000000067f00000001000000ae00ffffffff").unwrap(); + let input_end = Key::from_hex("000000067f00000001000000ae0100008000").unwrap(); + + // Shard 0 owns the first stripe in the relation, and the preceding logical size is shard local too + let shard_identity = ShardIdentity::new( + ShardNumber(0), + ShardCount::new(4), + ShardParameters::DEFAULT_STRIPE_SIZE, + ) + .unwrap(); + assert_eq!( + do_fragment(input_start, input_end, &shard_identity, 0x10000), + (0x8001, vec![(0x8001, input_start..input_end)]) + ); + + // Shard 1 does not own the first stripe in the relation, but it does own the logical size (all shards + // store all logical sizes) + let shard_identity = ShardIdentity::new( + ShardNumber(1), + ShardCount::new(4), + ShardParameters::DEFAULT_STRIPE_SIZE, + ) + .unwrap(); + assert_eq!( + do_fragment(input_start, input_end, &shard_identity, 0x10000), + (0x1, vec![(0x1, input_start..input_end)]) + ); + } + + /// Test that ShardedRange behaves properly when used on un-sharded data + #[test] + fn sharded_range_fragment_unsharded() { + let shard_identity = ShardIdentity::unsharded(); + + let input_start = Key::from_hex("000000067f00000001000000ae0000000000").unwrap(); + let input_end = Key::from_hex("000000067f00000001000000ae0000010000").unwrap(); + assert_eq!( + do_fragment(input_start, input_end, &shard_identity, 0x8000), + ( + 0x10000, + vec![ + (0x8000, input_start..input_start.add(0x8000)), + (0x8000, input_start.add(0x8000)..input_start.add(0x10000)) + ] + ) + ); + } + + #[test] + fn sharded_range_fragment_cross_relation() { + let shard_identity = ShardIdentity::unsharded(); + + // A range that spans relations: expect fragmentation to give up and return a u32::MAX size + let input_start = Key::from_hex("000000067f00000001000000ae0000000000").unwrap(); + let input_end = Key::from_hex("000000068f00000001000000ae0000010000").unwrap(); + assert_eq!( + 
do_fragment(input_start, input_end, &shard_identity, 0x8000), + (u32::MAX, vec![(u32::MAX, input_start..input_end),]) + ); + + // Same, but using a sharded identity + let shard_identity = ShardIdentity::new( + ShardNumber(0), + ShardCount::new(4), + ShardParameters::DEFAULT_STRIPE_SIZE, + ) + .unwrap(); + assert_eq!( + do_fragment(input_start, input_end, &shard_identity, 0x8000), + (u32::MAX, vec![(u32::MAX, input_start..input_end),]) + ); + } + + #[test] + fn sharded_range_fragment_tiny_nblocks() { + let shard_identity = ShardIdentity::unsharded(); + + // A range that spans relations: expect fragmentation to give up and return a u32::MAX size + let input_start = Key::from_hex("000000067F00000001000004E10000000000").unwrap(); + let input_end = Key::from_hex("000000067F00000001000004E10000000038").unwrap(); + assert_eq!( + do_fragment(input_start, input_end, &shard_identity, 16), + ( + 0x38, + vec![ + (16, input_start..input_start.add(16)), + (16, input_start.add(16)..input_start.add(32)), + (16, input_start.add(32)..input_start.add(48)), + (8, input_start.add(48)..input_end), + ] + ) + ); + } + + #[test] + fn sharded_range_fragment_fuzz() { + // Use a fixed seed: we don't want to explicitly pick values, but we do want + // the test to be reproducible. 
+ let mut prng = rand::rngs::StdRng::seed_from_u64(0xdeadbeef); + + for _i in 0..1000 { + let shard_identity = if prng.next_u32() % 2 == 0 { + ShardIdentity::unsharded() + } else { + let shard_count = prng.next_u32() % 127 + 1; + ShardIdentity::new( + ShardNumber((prng.next_u32() % shard_count) as u8), + ShardCount::new(shard_count as u8), + ShardParameters::DEFAULT_STRIPE_SIZE, + ) + .unwrap() + }; + + let target_nblocks = prng.next_u32() % 65536 + 1; + + let start_offset = prng.next_u32() % 16384; + + // Try ranges up to 4GiB in size, that are always at least 1 + let range_size = prng.next_u32() % 8192 + 1; + + // A range that spans relations: expect fragmentation to give up and return a u32::MAX size + let input_start = Key::from_hex("000000067F00000001000004E10000000000") + .unwrap() + .add(start_offset); + let input_end = input_start.add(range_size); + + // This test's main success conditions are the invariants baked into do_fragment + let (_total_size, fragments) = + do_fragment(input_start, input_end, &shard_identity, target_nblocks); + + // Pick a random key within the range and check it appears in the output + let example_key = input_start.add(prng.next_u32() % range_size); + + // Panic on unwrap if it isn't found + let example_key_frag = fragments + .iter() + .find(|f| f.1.contains(&example_key)) + .unwrap(); + + // Check that the fragment containing our random key has a nonzero size if + // that key is shard-local + let example_key_local = !shard_identity.is_key_disposable(&example_key); + if example_key_local { + assert!(example_key_frag.0 > 0); + } + } + } } diff --git a/libs/pageserver_api/src/lib.rs b/libs/pageserver_api/src/lib.rs index 1b948d60c3..532185a366 100644 --- a/libs/pageserver_api/src/lib.rs +++ b/libs/pageserver_api/src/lib.rs @@ -1,6 +1,5 @@ #![deny(unsafe_code)] #![deny(clippy::undocumented_unsafe_blocks)] -use const_format::formatcp; pub mod controller_api; pub mod key; @@ -11,7 +10,4 @@ pub mod shard; /// Public API types pub mod 
upcall_api; -pub const DEFAULT_PG_LISTEN_PORT: u16 = 64000; -pub const DEFAULT_PG_LISTEN_ADDR: &str = formatcp!("127.0.0.1:{DEFAULT_PG_LISTEN_PORT}"); -pub const DEFAULT_HTTP_LISTEN_PORT: u16 = 9898; -pub const DEFAULT_HTTP_LISTEN_ADDR: &str = formatcp!("127.0.0.1:{DEFAULT_HTTP_LISTEN_PORT}"); +pub mod config; diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs index b4909f247f..a54cdb520d 100644 --- a/libs/pageserver_api/src/models.rs +++ b/libs/pageserver_api/src/models.rs @@ -303,6 +303,7 @@ pub struct TenantConfig { pub lazy_slru_download: Option, pub timeline_get_throttle: Option, pub image_layer_creation_check_threshold: Option, + pub switch_to_aux_file_v2: Option, } #[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)] @@ -429,7 +430,6 @@ pub struct StatusResponse { #[derive(Serialize, Deserialize, Debug)] #[serde(deny_unknown_fields)] pub struct TenantLocationConfigRequest { - pub tenant_id: Option, #[serde(flatten)] pub config: LocationConfig, // as we have a flattened field, we should reject all unknown fields in it } @@ -747,10 +747,18 @@ pub struct TimelineGcRequest { pub gc_horizon: Option, } +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct WalRedoManagerProcessStatus { + pub pid: u32, + /// The strum-generated `into::<&'static str>()` for `pageserver::walredo::ProcessKind`. + /// `ProcessKind` are a transitory thing, so, they have no enum representation in `pageserver_api`. + pub kind: Cow<'static, str>, +} + #[derive(Debug, Clone, Serialize, Deserialize)] pub struct WalRedoManagerStatus { pub last_redo_at: Option>, - pub pid: Option, + pub process: Option, } /// The progress of a secondary tenant is mostly useful when doing a long running download: e.g. 
initiating @@ -772,6 +780,17 @@ pub struct SecondaryProgress { pub bytes_total: u64, } +#[derive(Serialize, Deserialize, Debug)] +pub struct TenantScanRemoteStorageShard { + pub tenant_shard_id: TenantShardId, + pub generation: Option, +} + +#[derive(Serialize, Deserialize, Debug, Default)] +pub struct TenantScanRemoteStorageResponse { + pub shards: Vec, +} + pub mod virtual_file { #[derive( Copy, @@ -839,39 +858,72 @@ impl TryFrom for PagestreamBeMessageTag { } } +// In the V2 protocol version, a GetPage request contains two LSN values: +// +// request_lsn: Get the page version at this point in time. Lsn::Max is a special value that means +// "get the latest version present". It's used by the primary server, which knows that no one else +// is writing WAL. 'not_modified_since' must be set to a proper value even if request_lsn is +// Lsn::Max. Standby servers use the current replay LSN as the request LSN. +// +// not_modified_since: Hint to the pageserver that the client knows that the page has not been +// modified between 'not_modified_since' and the request LSN. It's always correct to set +// 'not_modified_since equal' to 'request_lsn' (unless Lsn::Max is used as the 'request_lsn'), but +// passing an earlier LSN can speed up the request, by allowing the pageserver to process the +// request without waiting for 'request_lsn' to arrive. +// +// The legacy V1 interface contained only one LSN, and a boolean 'latest' flag. The V1 interface was +// sufficient for the primary; the 'lsn' was equivalent to the 'not_modified_since' value, and +// 'latest' was set to true. The V2 interface was added because there was no correct way for a +// standby to request a page at a particular non-latest LSN, and also include the +// 'not_modified_since' hint. 
That led to an awkward choice of either using an old LSN in the +// request, if the standby knows that the page hasn't been modified since, and risk getting an error +// if that LSN has fallen behind the GC horizon, or requesting the current replay LSN, which could +// require the pageserver unnecessarily to wait for the WAL to arrive up to that point. The new V2 +// interface allows sending both LSNs, and let the pageserver do the right thing. There is no +// difference in the responses between V1 and V2. +// +// The Request structs below reflect the V2 interface. If V1 is used, the parse function +// maps the old format requests to the new format. +// +#[derive(Clone, Copy)] +pub enum PagestreamProtocolVersion { + V1, + V2, +} + #[derive(Debug, PartialEq, Eq)] pub struct PagestreamExistsRequest { - pub latest: bool, - pub lsn: Lsn, + pub request_lsn: Lsn, + pub not_modified_since: Lsn, pub rel: RelTag, } #[derive(Debug, PartialEq, Eq)] pub struct PagestreamNblocksRequest { - pub latest: bool, - pub lsn: Lsn, + pub request_lsn: Lsn, + pub not_modified_since: Lsn, pub rel: RelTag, } #[derive(Debug, PartialEq, Eq)] pub struct PagestreamGetPageRequest { - pub latest: bool, - pub lsn: Lsn, + pub request_lsn: Lsn, + pub not_modified_since: Lsn, pub rel: RelTag, pub blkno: u32, } #[derive(Debug, PartialEq, Eq)] pub struct PagestreamDbSizeRequest { - pub latest: bool, - pub lsn: Lsn, + pub request_lsn: Lsn, + pub not_modified_since: Lsn, pub dbnode: u32, } #[derive(Debug, PartialEq, Eq)] pub struct PagestreamGetSlruSegmentRequest { - pub latest: bool, - pub lsn: Lsn, + pub request_lsn: Lsn, + pub not_modified_since: Lsn, pub kind: u8, pub segno: u32, } @@ -918,14 +970,16 @@ pub struct TenantHistorySize { } impl PagestreamFeMessage { + /// Serialize a compute -> pageserver message. This is currently only used in testing + /// tools. Always uses protocol version 2. 
pub fn serialize(&self) -> Bytes { let mut bytes = BytesMut::new(); match self { Self::Exists(req) => { bytes.put_u8(0); - bytes.put_u8(u8::from(req.latest)); - bytes.put_u64(req.lsn.0); + bytes.put_u64(req.request_lsn.0); + bytes.put_u64(req.not_modified_since.0); bytes.put_u32(req.rel.spcnode); bytes.put_u32(req.rel.dbnode); bytes.put_u32(req.rel.relnode); @@ -934,8 +988,8 @@ impl PagestreamFeMessage { Self::Nblocks(req) => { bytes.put_u8(1); - bytes.put_u8(u8::from(req.latest)); - bytes.put_u64(req.lsn.0); + bytes.put_u64(req.request_lsn.0); + bytes.put_u64(req.not_modified_since.0); bytes.put_u32(req.rel.spcnode); bytes.put_u32(req.rel.dbnode); bytes.put_u32(req.rel.relnode); @@ -944,8 +998,8 @@ impl PagestreamFeMessage { Self::GetPage(req) => { bytes.put_u8(2); - bytes.put_u8(u8::from(req.latest)); - bytes.put_u64(req.lsn.0); + bytes.put_u64(req.request_lsn.0); + bytes.put_u64(req.not_modified_since.0); bytes.put_u32(req.rel.spcnode); bytes.put_u32(req.rel.dbnode); bytes.put_u32(req.rel.relnode); @@ -955,15 +1009,15 @@ impl PagestreamFeMessage { Self::DbSize(req) => { bytes.put_u8(3); - bytes.put_u8(u8::from(req.latest)); - bytes.put_u64(req.lsn.0); + bytes.put_u64(req.request_lsn.0); + bytes.put_u64(req.not_modified_since.0); bytes.put_u32(req.dbnode); } Self::GetSlruSegment(req) => { bytes.put_u8(4); - bytes.put_u8(u8::from(req.latest)); - bytes.put_u64(req.lsn.0); + bytes.put_u64(req.request_lsn.0); + bytes.put_u64(req.not_modified_since.0); bytes.put_u8(req.kind); bytes.put_u32(req.segno); } @@ -972,18 +1026,40 @@ impl PagestreamFeMessage { bytes.into() } - pub fn parse(body: &mut R) -> anyhow::Result { - // TODO these gets can fail - + pub fn parse( + body: &mut R, + protocol_version: PagestreamProtocolVersion, + ) -> anyhow::Result { // these correspond to the NeonMessageTag enum in pagestore_client.h // // TODO: consider using protobuf or serde bincode for less error prone // serialization. 
let msg_tag = body.read_u8()?; + + let (request_lsn, not_modified_since) = match protocol_version { + PagestreamProtocolVersion::V2 => ( + Lsn::from(body.read_u64::()?), + Lsn::from(body.read_u64::()?), + ), + PagestreamProtocolVersion::V1 => { + // In the old protocol, each message starts with a boolean 'latest' flag, + // followed by 'lsn'. Convert that to the two LSNs, 'request_lsn' and + // 'not_modified_since', used in the new protocol version. + let latest = body.read_u8()? != 0; + let request_lsn = Lsn::from(body.read_u64::()?); + if latest { + (Lsn::MAX, request_lsn) // get latest version + } else { + (request_lsn, request_lsn) // get version at specified LSN + } + } + }; + + // The rest of the messages are the same between V1 and V2 match msg_tag { 0 => Ok(PagestreamFeMessage::Exists(PagestreamExistsRequest { - latest: body.read_u8()? != 0, - lsn: Lsn::from(body.read_u64::()?), + request_lsn, + not_modified_since, rel: RelTag { spcnode: body.read_u32::()?, dbnode: body.read_u32::()?, @@ -992,8 +1068,8 @@ impl PagestreamFeMessage { }, })), 1 => Ok(PagestreamFeMessage::Nblocks(PagestreamNblocksRequest { - latest: body.read_u8()? != 0, - lsn: Lsn::from(body.read_u64::()?), + request_lsn, + not_modified_since, rel: RelTag { spcnode: body.read_u32::()?, dbnode: body.read_u32::()?, @@ -1002,8 +1078,8 @@ impl PagestreamFeMessage { }, })), 2 => Ok(PagestreamFeMessage::GetPage(PagestreamGetPageRequest { - latest: body.read_u8()? != 0, - lsn: Lsn::from(body.read_u64::()?), + request_lsn, + not_modified_since, rel: RelTag { spcnode: body.read_u32::()?, dbnode: body.read_u32::()?, @@ -1013,14 +1089,14 @@ impl PagestreamFeMessage { blkno: body.read_u32::()?, })), 3 => Ok(PagestreamFeMessage::DbSize(PagestreamDbSizeRequest { - latest: body.read_u8()? != 0, - lsn: Lsn::from(body.read_u64::()?), + request_lsn, + not_modified_since, dbnode: body.read_u32::()?, })), 4 => Ok(PagestreamFeMessage::GetSlruSegment( PagestreamGetSlruSegmentRequest { - latest: body.read_u8()? 
!= 0, - lsn: Lsn::from(body.read_u64::()?), + request_lsn, + not_modified_since, kind: body.read_u8()?, segno: body.read_u32::()?, }, @@ -1148,8 +1224,8 @@ mod tests { // Test serialization/deserialization of PagestreamFeMessage let messages = vec![ PagestreamFeMessage::Exists(PagestreamExistsRequest { - latest: true, - lsn: Lsn(4), + request_lsn: Lsn(4), + not_modified_since: Lsn(3), rel: RelTag { forknum: 1, spcnode: 2, @@ -1158,8 +1234,8 @@ mod tests { }, }), PagestreamFeMessage::Nblocks(PagestreamNblocksRequest { - latest: false, - lsn: Lsn(4), + request_lsn: Lsn(4), + not_modified_since: Lsn(4), rel: RelTag { forknum: 1, spcnode: 2, @@ -1168,8 +1244,8 @@ mod tests { }, }), PagestreamFeMessage::GetPage(PagestreamGetPageRequest { - latest: true, - lsn: Lsn(4), + request_lsn: Lsn(4), + not_modified_since: Lsn(3), rel: RelTag { forknum: 1, spcnode: 2, @@ -1179,14 +1255,16 @@ mod tests { blkno: 7, }), PagestreamFeMessage::DbSize(PagestreamDbSizeRequest { - latest: true, - lsn: Lsn(4), + request_lsn: Lsn(4), + not_modified_since: Lsn(3), dbnode: 7, }), ]; for msg in messages { let bytes = msg.serialize(); - let reconstructed = PagestreamFeMessage::parse(&mut bytes.reader()).unwrap(); + let reconstructed = + PagestreamFeMessage::parse(&mut bytes.reader(), PagestreamProtocolVersion::V2) + .unwrap(); assert!(msg == reconstructed); } } diff --git a/libs/pageserver_api/src/models/partitioning.rs b/libs/pageserver_api/src/models/partitioning.rs index 0d287f7be0..f6644be635 100644 --- a/libs/pageserver_api/src/models/partitioning.rs +++ b/libs/pageserver_api/src/models/partitioning.rs @@ -1,9 +1,11 @@ use utils::lsn::Lsn; +use crate::keyspace::SparseKeySpace; + #[derive(Debug, PartialEq, Eq)] pub struct Partitioning { pub keys: crate::keyspace::KeySpace, - + pub sparse_keys: crate::keyspace::SparseKeySpace, pub at_lsn: Lsn, } @@ -32,6 +34,8 @@ impl serde::Serialize for Partitioning { let mut map = serializer.serialize_map(Some(2))?; map.serialize_key("keys")?; 
map.serialize_value(&KeySpace(&self.keys))?; + map.serialize_key("sparse_keys")?; + map.serialize_value(&KeySpace(&self.sparse_keys.0))?; map.serialize_key("at_lsn")?; map.serialize_value(&WithDisplay(&self.at_lsn))?; map.end() @@ -99,6 +103,7 @@ impl<'a> serde::Deserialize<'a> for Partitioning { #[derive(serde::Deserialize)] struct De { keys: KeySpace, + sparse_keys: KeySpace, #[serde_as(as = "serde_with::DisplayFromStr")] at_lsn: Lsn, } @@ -107,6 +112,7 @@ impl<'a> serde::Deserialize<'a> for Partitioning { Ok(Self { at_lsn: de.at_lsn, keys: de.keys.0, + sparse_keys: SparseKeySpace(de.sparse_keys.0), }) } } @@ -133,6 +139,12 @@ mod tests { "030000000000000000000000000000000003" ] ], + "sparse_keys": [ + [ + "620000000000000000000000000000000000", + "620000000000000000000000000000000003" + ] + ], "at_lsn": "0/2240160" } "#; diff --git a/libs/pageserver_api/src/shard.rs b/libs/pageserver_api/src/shard.rs index a2a9165184..ff6d3d91b6 100644 --- a/libs/pageserver_api/src/shard.rs +++ b/libs/pageserver_api/src/shard.rs @@ -5,21 +5,99 @@ use crate::{ models::ShardParameters, }; use hex::FromHex; +use postgres_ffi::relfile_utils::INIT_FORKNUM; use serde::{Deserialize, Serialize}; use utils::id::TenantId; +/// See docs/rfcs/031-sharding-static.md for an overview of sharding. +/// +/// This module contains a variety of types used to represent the concept of sharding +/// a Neon tenant across multiple physical shards. Since there are quite a few of these, +/// we provide an summary here. +/// +/// Types used to describe shards: +/// - [`ShardCount`] describes how many shards make up a tenant, plus the magic `unsharded` value +/// which identifies a tenant which is not shard-aware. This means its storage paths do not include +/// a shard suffix. +/// - [`ShardNumber`] is simply the zero-based index of a shard within a tenant. +/// - [`ShardIndex`] is the 2-tuple of `ShardCount` and `ShardNumber`, it's just like a `TenantShardId` +/// without the tenant ID. 
This is useful for things that are implicitly scoped to a particular +/// tenant, such as layer files. +/// - [`ShardIdentity`]` is the full description of a particular shard's parameters, in sufficient +/// detail to convert a [`Key`] to a [`ShardNumber`] when deciding where to write/read. +/// - The [`ShardSlug`] is a terse formatter for ShardCount and ShardNumber, written as +/// four hex digits. An unsharded tenant is `0000`. +/// - [`TenantShardId`] is the unique ID of a particular shard within a particular tenant +/// +/// Types used to describe the parameters for data distribution in a sharded tenant: +/// - [`ShardStripeSize`] controls how long contiguous runs of [`Key`]s (stripes) are when distributed across +/// multiple shards. Its value is given in 8kiB pages. +/// - [`ShardLayout`] describes the data distribution scheme, and at time of writing is +/// always zero: this is provided for future upgrades that might introduce different +/// data distribution schemes. +/// +/// Examples: +/// - A legacy unsharded tenant has one shard with ShardCount(0), ShardNumber(0), and its slug is 0000 +/// - A single sharded tenant has one shard with ShardCount(1), ShardNumber(0), and its slug is 0001 +/// - In a tenant with 4 shards, each shard has ShardCount(N), ShardNumber(i) where i in 0..N-1 (inclusive), +/// and their slugs are 0004, 0104, 0204, and 0304. + #[derive(Ord, PartialOrd, Eq, PartialEq, Clone, Copy, Serialize, Deserialize, Debug, Hash)] pub struct ShardNumber(pub u8); #[derive(Ord, PartialOrd, Eq, PartialEq, Clone, Copy, Serialize, Deserialize, Debug, Hash)] pub struct ShardCount(u8); +/// Combination of ShardNumber and ShardCount. For use within the context of a particular tenant, +/// when we need to know which shard we're dealing with, but do not need to know the full +/// ShardIdentity (because we won't be doing any page->shard mapping), and do not need to know +/// the fully qualified TenantShardId. 
+#[derive(Eq, PartialEq, PartialOrd, Ord, Clone, Copy, Hash)] +pub struct ShardIndex { + pub shard_number: ShardNumber, + pub shard_count: ShardCount, +} + +/// The ShardIdentity contains enough information to map a [`Key`] to a [`ShardNumber`], +/// and to check whether that [`ShardNumber`] is the same as the current shard. +#[derive(Clone, Copy, Serialize, Deserialize, Eq, PartialEq, Debug)] +pub struct ShardIdentity { + pub number: ShardNumber, + pub count: ShardCount, + pub stripe_size: ShardStripeSize, + layout: ShardLayout, +} + +/// Formatting helper, for generating the `shard_id` label in traces. +struct ShardSlug<'a>(&'a TenantShardId); + +/// TenantShardId globally identifies a particular shard in a particular tenant. +/// +/// These are written as `-`, for example: +/// # The second shard in a two-shard tenant +/// 072f1291a5310026820b2fe4b2968934-0102 +/// +/// If the `ShardCount` is _unsharded_, the `TenantShardId` is written without +/// a shard suffix and is equivalent to the encoding of a `TenantId`: this enables +/// an unsharded [`TenantShardId`] to be used interchangably with a [`TenantId`]. +/// +/// The human-readable encoding of an unsharded TenantShardId, such as used in API URLs, +/// is both forward and backward compatible with TenantId: a legacy TenantId can be +/// decoded as a TenantShardId, and when re-encoded it will be parseable +/// as a TenantId. +#[derive(Eq, PartialEq, PartialOrd, Ord, Clone, Copy, Hash)] +pub struct TenantShardId { + pub tenant_id: TenantId, + pub shard_number: ShardNumber, + pub shard_count: ShardCount, +} + impl ShardCount { pub const MAX: Self = Self(u8::MAX); /// The internal value of a ShardCount may be zero, which means "1 shard, but use /// legacy format for TenantShardId that excludes the shard suffix", also known - /// as `TenantShardId::unsharded`. + /// as [`TenantShardId::unsharded`]. /// /// This method returns the actual number of shards, i.e. 
if our internal value is /// zero, we return 1 (unsharded tenants have 1 shard). @@ -38,6 +116,9 @@ impl ShardCount { self.0 } + /// Whether the `ShardCount` is for an unsharded tenant, so uses one shard but + /// uses the legacy format for `TenantShardId`. See also the documentation for + /// [`Self::count`]. pub fn is_unsharded(&self) -> bool { self.0 == 0 } @@ -53,33 +134,6 @@ impl ShardNumber { pub const MAX: Self = Self(u8::MAX); } -/// TenantShardId identify the units of work for the Pageserver. -/// -/// These are written as `-`, for example: -/// -/// # The second shard in a two-shard tenant -/// 072f1291a5310026820b2fe4b2968934-0102 -/// -/// Historically, tenants could not have multiple shards, and were identified -/// by TenantId. To support this, TenantShardId has a special legacy -/// mode where `shard_count` is equal to zero: this represents a single-sharded -/// tenant which should be written as a TenantId with no suffix. -/// -/// The human-readable encoding of TenantShardId, such as used in API URLs, -/// is both forward and backward compatible: a legacy TenantId can be -/// decoded as a TenantShardId, and when re-encoded it will be parseable -/// as a TenantId. -/// -/// Note that the binary encoding is _not_ backward compatible, because -/// at the time sharding is introduced, there are no existing binary structures -/// containing TenantId that we need to handle. -#[derive(Eq, PartialEq, PartialOrd, Ord, Clone, Copy, Hash)] -pub struct TenantShardId { - pub tenant_id: TenantId, - pub shard_number: ShardNumber, - pub shard_count: ShardCount, -} - impl TenantShardId { pub fn unsharded(tenant_id: TenantId) -> Self { Self { @@ -111,10 +165,13 @@ impl TenantShardId { } /// Convenience for code that has special behavior on the 0th shard. 
- pub fn is_zero(&self) -> bool { + pub fn is_shard_zero(&self) -> bool { self.shard_number == ShardNumber(0) } + /// The "unsharded" value is distinct from simply having a single shard: it represents + /// a tenant which is not shard-aware at all, and whose storage paths will not include + /// a shard suffix. pub fn is_unsharded(&self) -> bool { self.shard_number == ShardNumber(0) && self.shard_count.is_unsharded() } @@ -150,9 +207,6 @@ impl TenantShardId { } } -/// Formatting helper -struct ShardSlug<'a>(&'a TenantShardId); - impl<'a> std::fmt::Display for ShardSlug<'a> { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { write!( @@ -222,16 +276,6 @@ impl From<[u8; 18]> for TenantShardId { } } -/// For use within the context of a particular tenant, when we need to know which -/// shard we're dealing with, but do not need to know the full ShardIdentity (because -/// we won't be doing any page->shard mapping), and do not need to know the fully qualified -/// TenantShardId. -#[derive(Eq, PartialEq, PartialOrd, Ord, Clone, Copy, Hash)] -pub struct ShardIndex { - pub shard_number: ShardNumber, - pub shard_count: ShardCount, -} - impl ShardIndex { pub fn new(number: ShardNumber, count: ShardCount) -> Self { Self { @@ -246,6 +290,9 @@ impl ShardIndex { } } + /// The "unsharded" value is distinct from simply having a single shard: it represents + /// a tenant which is not shard-aware at all, and whose storage paths will not include + /// a shard suffix. pub fn is_unsharded(&self) -> bool { self.shard_number == ShardNumber(0) && self.shard_count == ShardCount(0) } @@ -313,6 +360,8 @@ impl Serialize for TenantShardId { if serializer.is_human_readable() { serializer.collect_str(self) } else { + // Note: while human encoding of [`TenantShardId`] is backward and forward + // compatible, this binary encoding is not. 
let mut packed: [u8; 18] = [0; 18]; packed[0..16].clone_from_slice(&self.tenant_id.as_arr()); packed[16] = self.shard_number.0; @@ -390,16 +439,6 @@ const LAYOUT_BROKEN: ShardLayout = ShardLayout(255); /// Default stripe size in pages: 256MiB divided by 8kiB page size. const DEFAULT_STRIPE_SIZE: ShardStripeSize = ShardStripeSize(256 * 1024 / 8); -/// The ShardIdentity contains the information needed for one member of map -/// to resolve a key to a shard, and then check whether that shard is ==self. -#[derive(Clone, Copy, Serialize, Deserialize, Eq, PartialEq, Debug)] -pub struct ShardIdentity { - pub number: ShardNumber, - pub count: ShardCount, - pub stripe_size: ShardStripeSize, - layout: ShardLayout, -} - #[derive(thiserror::Error, Debug, PartialEq, Eq)] pub enum ShardConfigError { #[error("Invalid shard count")] @@ -414,7 +453,7 @@ impl ShardIdentity { /// An identity with number=0 count=0 is a "none" identity, which represents legacy /// tenants. Modern single-shard tenants should not use this: they should /// have number=0 count=1. - pub fn unsharded() -> Self { + pub const fn unsharded() -> Self { Self { number: ShardNumber(0), count: ShardCount(0), @@ -439,6 +478,9 @@ impl ShardIdentity { } } + /// The "unsharded" value is distinct from simply having a single shard: it represents + /// a tenant which is not shard-aware at all, and whose storage paths will not include + /// a shard suffix. pub fn is_unsharded(&self) -> bool { self.number == ShardNumber(0) && self.count == ShardCount(0) } @@ -487,6 +529,8 @@ impl ShardIdentity { } /// Return true if the key should be ingested by this shard + /// + /// Shards must ingest _at least_ keys which return true from this check. 
pub fn is_key_local(&self, key: &Key) -> bool { assert!(!self.is_broken()); if self.count < ShardCount(2) || (key_is_shard0(key) && self.number == ShardNumber(0)) { @@ -497,7 +541,9 @@ impl ShardIdentity { } /// Return true if the key should be discarded if found in this shard's - /// data store, e.g. during compaction after a split + /// data store, e.g. during compaction after a split. + /// + /// Shards _may_ drop keys which return false here, but are not obliged to. pub fn is_key_disposable(&self, key: &Key) -> bool { if key_is_shard0(key) { // Q: Why can't we dispose of shard0 content if we're not shard 0? @@ -523,7 +569,7 @@ impl ShardIdentity { /// Convenience for checking if this identity is the 0th shard in a tenant, /// for special cases on shard 0 such as ingesting relation sizes. - pub fn is_zero(&self) -> bool { + pub fn is_shard_zero(&self) -> bool { self.number == ShardNumber(0) } } @@ -606,7 +652,13 @@ fn key_is_shard0(key: &Key) -> bool { // relation pages are distributed to shards other than shard zero. Everything else gets // stored on shard 0. This guarantees that shard 0 can independently serve basebackup // requests, and any request other than those for particular blocks in relations. - !is_rel_block_key(key) + // + // The only exception to this rule is "initfork" data -- this relates to postgres's UNLOGGED table + // type. These are special relations, usually with only 0 or 1 blocks, and we store them on shard 0 + // because they must be included in basebackups. 
+ let is_initfork = key.field5 == INIT_FORKNUM; + + !is_rel_block_key(key) || is_initfork } /// Provide the same result as the function in postgres `hashfn.h` with the same name diff --git a/libs/postgres_ffi/src/lib.rs b/libs/postgres_ffi/src/lib.rs index aa6845b9b1..0d6986778a 100644 --- a/libs/postgres_ffi/src/lib.rs +++ b/libs/postgres_ffi/src/lib.rs @@ -118,7 +118,9 @@ pub use v14::bindings::{TimeLineID, TimestampTz, XLogRecPtr, XLogSegNo}; // Likewise for these, although the assumption that these don't change is a little more iffy. pub use v14::bindings::{MultiXactOffset, MultiXactStatus}; pub use v14::bindings::{PageHeaderData, XLogRecord}; -pub use v14::xlog_utils::{XLOG_SIZE_OF_XLOG_RECORD, XLOG_SIZE_OF_XLOG_SHORT_PHD}; +pub use v14::xlog_utils::{ + XLOG_SIZE_OF_XLOG_LONG_PHD, XLOG_SIZE_OF_XLOG_RECORD, XLOG_SIZE_OF_XLOG_SHORT_PHD, +}; pub use v14::bindings::{CheckPoint, ControlFileData}; diff --git a/libs/postgres_ffi/src/xlog_utils.rs b/libs/postgres_ffi/src/xlog_utils.rs index 4a66a0ab1d..0bbb91afc2 100644 --- a/libs/postgres_ffi/src/xlog_utils.rs +++ b/libs/postgres_ffi/src/xlog_utils.rs @@ -331,7 +331,10 @@ impl CheckPoint { /// Returns 'true' if the XID was updated. pub fn update_next_xid(&mut self, xid: u32) -> bool { // nextXid should be greater than any XID in WAL, so increment provided XID and check for wraparround. - let mut new_xid = std::cmp::max(xid.wrapping_add(1), pg_constants::FIRST_NORMAL_TRANSACTION_ID); + let mut new_xid = std::cmp::max( + xid.wrapping_add(1), + pg_constants::FIRST_NORMAL_TRANSACTION_ID, + ); // To reduce number of metadata checkpoints, we forward align XID on XID_CHECKPOINT_INTERVAL. // XID_CHECKPOINT_INTERVAL should not be larger than BLCKSZ*CLOG_XACTS_PER_BYTE new_xid = @@ -367,8 +370,16 @@ pub fn generate_wal_segment(segno: u64, system_id: u64, lsn: Lsn) -> Result 0 { + assert!(seg_off >= XLOG_SIZE_OF_XLOG_LONG_PHD); + // xlp_rem_len doesn't include page header, hence the subtraction. 
+ ( + seg_off - XLOG_SIZE_OF_XLOG_LONG_PHD, + pg_constants::XLP_FIRST_IS_CONTRECORD, + ) } else { (0, 0) }; @@ -397,20 +408,22 @@ pub fn generate_wal_segment(segno: u64, system_id: u64, lsn: Lsn) -> Result 0 { + assert!(page_off >= XLOG_SIZE_OF_XLOG_SHORT_PHD as u64); + ( + (page_off - XLOG_SIZE_OF_XLOG_SHORT_PHD as u64) as u32, + pg_constants::XLP_FIRST_IS_CONTRECORD, + ) + } else { + (0, 0) + }; let header = XLogPageHeaderData { xlp_magic: XLOG_PAGE_MAGIC as u16, - xlp_info: if page_off >= pg_constants::SIZE_OF_PAGE_HEADER as u64 { - pg_constants::XLP_FIRST_IS_CONTRECORD - } else { - 0 - }, + xlp_info, xlp_tli: PG_TLI, xlp_pageaddr: lsn.page_lsn().0, - xlp_rem_len: if page_off >= pg_constants::SIZE_OF_PAGE_HEADER as u64 { - page_off as u32 - } else { - 0u32 - }, + xlp_rem_len, ..Default::default() // Put 0 in padding fields. }; let hdr_bytes = header.encode()?; diff --git a/libs/postgres_ffi/wal_craft/src/lib.rs b/libs/postgres_ffi/wal_craft/src/lib.rs index 23786e3b08..262068cbda 100644 --- a/libs/postgres_ffi/wal_craft/src/lib.rs +++ b/libs/postgres_ffi/wal_craft/src/lib.rs @@ -4,7 +4,9 @@ use log::*; use postgres::types::PgLsn; use postgres::Client; use postgres_ffi::{WAL_SEGMENT_SIZE, XLOG_BLCKSZ}; -use postgres_ffi::{XLOG_SIZE_OF_XLOG_RECORD, XLOG_SIZE_OF_XLOG_SHORT_PHD}; +use postgres_ffi::{ + XLOG_SIZE_OF_XLOG_LONG_PHD, XLOG_SIZE_OF_XLOG_RECORD, XLOG_SIZE_OF_XLOG_SHORT_PHD, +}; use std::path::{Path, PathBuf}; use std::process::Command; use std::time::{Duration, Instant}; @@ -262,11 +264,21 @@ fn craft_internal( intermediate_lsns.insert(0, initial_lsn); } - // Some records may be not flushed, e.g. non-transactional logical messages. + // Some records may be not flushed, e.g. non-transactional logical messages. Flush now. // - // Note: this is broken if pg_current_wal_insert_lsn is at page boundary - // because pg_current_wal_insert_lsn skips page headers. 
- client.execute("select neon_xlogflush(pg_current_wal_insert_lsn())", &[])?; + // If the previous WAL record ended exactly at page boundary, pg_current_wal_insert_lsn + // returns the position just after the page header on the next page. That's where the next + // record will be inserted. But the page header hasn't actually been written to the WAL + // yet, and if you try to flush it, you get a "request to flush past end of generated WAL" + // error. Because of that, if the insert location is just after a page header, back off to + // previous page boundary. + let mut lsn = u64::from(client.pg_current_wal_insert_lsn()?); + if lsn % WAL_SEGMENT_SIZE as u64 == XLOG_SIZE_OF_XLOG_LONG_PHD as u64 { + lsn -= XLOG_SIZE_OF_XLOG_LONG_PHD as u64; + } else if lsn % XLOG_BLCKSZ as u64 == XLOG_SIZE_OF_XLOG_SHORT_PHD as u64 { + lsn -= XLOG_SIZE_OF_XLOG_SHORT_PHD as u64; + } + client.execute("select neon_xlogflush($1)", &[&PgLsn::from(lsn)])?; Ok(intermediate_lsns) } @@ -320,38 +332,49 @@ impl Crafter for LastWalRecordXlogSwitchEndsOnPageBoundary { client.execute("CREATE table t(x int)", &[])?; - // Add padding so the XLOG_SWITCH record ends exactly on XLOG_BLCKSZ boundary. - // We will use logical message as the padding. We start with detecting how much WAL - // it takes for one logical message, considering all alignments and headers. - let base_wal_advance = { + // Add padding so the XLOG_SWITCH record ends exactly on XLOG_BLCKSZ boundary. We + // will use carefully-sized logical messages to advance WAL insert location such + // that there is just enough space on the page for the XLOG_SWITCH record. + loop { + // We start with measuring how much WAL it takes for one logical message, + // considering all alignments and headers. let before_lsn = client.pg_current_wal_insert_lsn()?; - // Small non-empty message bigger than few bytes is more likely than an empty - // message to have the same format as the big padding message. 
client.execute( "SELECT pg_logical_emit_message(false, 'swch', REPEAT('a', 10))", &[], )?; - // The XLOG_SWITCH record has no data => its size is exactly XLOG_SIZE_OF_XLOG_RECORD. - (u64::from(client.pg_current_wal_insert_lsn()?) - u64::from(before_lsn)) as usize - + XLOG_SIZE_OF_XLOG_RECORD - }; - let mut remaining_lsn = - XLOG_BLCKSZ - u64::from(client.pg_current_wal_insert_lsn()?) as usize % XLOG_BLCKSZ; - if remaining_lsn < base_wal_advance { - remaining_lsn += XLOG_BLCKSZ; + let after_lsn = client.pg_current_wal_insert_lsn()?; + + // Did the record cross a page boundary? If it did, start over. Crossing a + // page boundary adds to the apparent size of the record because of the page + // header, which throws off the calculation. + if u64::from(before_lsn) / XLOG_BLCKSZ as u64 + != u64::from(after_lsn) / XLOG_BLCKSZ as u64 + { + continue; + } + // base_size is the size of a logical message without the payload + let base_size = u64::from(after_lsn) - u64::from(before_lsn) - 10; + + // Is there enough space on the page for another logical message and an + // XLOG_SWITCH? If not, start over. + let page_remain = XLOG_BLCKSZ as u64 - u64::from(after_lsn) % XLOG_BLCKSZ as u64; + if page_remain < base_size - XLOG_SIZE_OF_XLOG_RECORD as u64 { + continue; + } + + // We will write another logical message, such that after the logical message + // record, there will be space for exactly one XLOG_SWITCH. How large should + // the logical message's payload be? An XLOG_SWITCH record has no data => its + // size is exactly XLOG_SIZE_OF_XLOG_RECORD. 
+ let repeats = page_remain - base_size - XLOG_SIZE_OF_XLOG_RECORD as u64; + + client.execute( + "SELECT pg_logical_emit_message(false, 'swch', REPEAT('a', $1))", + &[&(repeats as i32)], + )?; + break; } - let repeats = 10 + remaining_lsn - base_wal_advance; - info!( - "current_wal_insert_lsn={}, remaining_lsn={}, base_wal_advance={}, repeats={}", - client.pg_current_wal_insert_lsn()?, - remaining_lsn, - base_wal_advance, - repeats - ); - client.execute( - "SELECT pg_logical_emit_message(false, 'swch', REPEAT('a', $1))", - &[&(repeats as i32)], - )?; info!( "current_wal_insert_lsn={}, XLOG_SIZE_OF_XLOG_RECORD={}", client.pg_current_wal_insert_lsn()?, diff --git a/libs/remote_storage/Cargo.toml b/libs/remote_storage/Cargo.toml index 4a53f485ca..78da01c9a0 100644 --- a/libs/remote_storage/Cargo.toml +++ b/libs/remote_storage/Cargo.toml @@ -38,6 +38,7 @@ azure_storage_blobs.workspace = true futures-util.workspace = true http-types.workspace = true itertools.workspace = true +sync_wrapper = { workspace = true, features = ["futures"] } [dev-dependencies] camino-tempfile.workspace = true diff --git a/libs/remote_storage/src/azure_blob.rs b/libs/remote_storage/src/azure_blob.rs index 5fff3e25c9..24c1248304 100644 --- a/libs/remote_storage/src/azure_blob.rs +++ b/libs/remote_storage/src/azure_blob.rs @@ -3,6 +3,7 @@ use std::borrow::Cow; use std::collections::HashMap; use std::env; +use std::io; use std::num::NonZeroU32; use std::pin::Pin; use std::str::FromStr; @@ -20,6 +21,7 @@ use azure_storage_blobs::blob::CopyStatus; use azure_storage_blobs::prelude::ClientBuilder; use azure_storage_blobs::{blob::operations::GetBlobBuilder, prelude::ContainerClient}; use bytes::Bytes; +use futures::future::Either; use futures::stream::Stream; use futures_util::StreamExt; use futures_util::TryStreamExt; @@ -128,12 +130,12 @@ impl AzureBlobStorage { let kind = RequestKind::Get; let _permit = self.permit(kind, cancel).await?; + let cancel_or_timeout = 
crate::support::cancel_or_timeout(self.timeout, cancel.clone()); + let cancel_or_timeout_ = crate::support::cancel_or_timeout(self.timeout, cancel.clone()); let mut etag = None; let mut last_modified = None; let mut metadata = HashMap::new(); - // TODO give proper streaming response instead of buffering into RAM - // https://github.com/neondatabase/neon/issues/5563 let download = async { let response = builder @@ -152,39 +154,46 @@ impl AzureBlobStorage { Err(_elapsed) => Err(DownloadError::Timeout), }); - let mut response = std::pin::pin!(response); + let mut response = Box::pin(response); - let mut bufs = Vec::new(); - while let Some(part) = response.next().await { - let part = part?; - if etag.is_none() { - etag = Some(part.blob.properties.etag); - } - if last_modified.is_none() { - last_modified = Some(part.blob.properties.last_modified.into()); - } - if let Some(blob_meta) = part.blob.metadata { - metadata.extend(blob_meta.iter().map(|(k, v)| (k.to_owned(), v.to_owned()))); - } - let data = part - .data - .collect() - .await - .map_err(|e| DownloadError::Other(e.into()))?; - bufs.push(data); - } - - if bufs.is_empty() { + let Some(part) = response.next().await else { return Err(DownloadError::Other(anyhow::anyhow!( - "Azure GET response contained no buffers" + "Azure GET response contained no response body" ))); + }; + let part = part?; + if etag.is_none() { + etag = Some(part.blob.properties.etag); } + if last_modified.is_none() { + last_modified = Some(part.blob.properties.last_modified.into()); + } + if let Some(blob_meta) = part.blob.metadata { + metadata.extend(blob_meta.iter().map(|(k, v)| (k.to_owned(), v.to_owned()))); + } + // unwrap safety: if these were None, bufs would be empty and we would have returned an error already let etag = etag.unwrap(); let last_modified = last_modified.unwrap(); + let tail_stream = response + .map(|part| match part { + Ok(part) => Either::Left(part.data.map(|r| r.map_err(io::Error::other))), + Err(e) => { + 
Either::Right(futures::stream::once(async { Err(io::Error::other(e)) })) + } + }) + .flatten(); + let stream = part + .data + .map(|r| r.map_err(io::Error::other)) + .chain(sync_wrapper::SyncStream::new(tail_stream)); + //.chain(SyncStream::from_pin(Box::pin(tail_stream))); + + let download_stream = crate::support::DownloadStream::new(cancel_or_timeout_, stream); + Ok(Download { - download_stream: Box::pin(futures::stream::iter(bufs.into_iter().map(Ok))), + download_stream: Box::pin(download_stream), etag, last_modified, metadata: Some(StorageMetadata(metadata)), @@ -193,7 +202,10 @@ impl AzureBlobStorage { tokio::select! { bufs = download => bufs, - _ = cancel.cancelled() => Err(DownloadError::Cancelled), + cancel_or_timeout = cancel_or_timeout => match cancel_or_timeout { + TimeoutOrCancel::Timeout => Err(DownloadError::Timeout), + TimeoutOrCancel::Cancel => Err(DownloadError::Cancelled), + }, } } diff --git a/libs/remote_storage/src/lib.rs b/libs/remote_storage/src/lib.rs index e708854be2..708662f20f 100644 --- a/libs/remote_storage/src/lib.rs +++ b/libs/remote_storage/src/lib.rs @@ -21,11 +21,13 @@ use std::{ fmt::Debug, num::{NonZeroU32, NonZeroUsize}, pin::Pin, + str::FromStr, sync::Arc, time::{Duration, SystemTime}, }; use anyhow::{bail, Context}; +use aws_sdk_s3::types::StorageClass; use camino::{Utf8Path, Utf8PathBuf}; use bytes::Bytes; @@ -53,11 +55,11 @@ pub use error::{DownloadError, TimeTravelError, TimeoutOrCancel}; /// ~3500 PUT/COPY/POST/DELETE or 5500 GET/HEAD S3 requests /// pub const DEFAULT_REMOTE_STORAGE_S3_CONCURRENCY_LIMIT: usize = 100; -/// We set this a little bit low as we currently buffer the entire file into RAM +/// Set this limit analogously to the S3 limit /// /// Here, a limit of max 20k concurrent connections was noted. 
/// -pub const DEFAULT_REMOTE_STORAGE_AZURE_CONCURRENCY_LIMIT: usize = 30; +pub const DEFAULT_REMOTE_STORAGE_AZURE_CONCURRENCY_LIMIT: usize = 100; /// No limits on the client side, which currenltly means 1000 for AWS S3. /// pub const DEFAULT_MAX_KEYS_PER_LIST_RESPONSE: Option = None; @@ -134,6 +136,11 @@ impl RemotePath { pub fn strip_prefix(&self, p: &RemotePath) -> Result<&Utf8Path, std::path::StripPrefixError> { self.0.strip_prefix(&p.0) } + + pub fn add_trailing_slash(&self) -> Self { + // Unwrap safety inputs are guararnteed to be valid UTF-8 + Self(format!("{}/", self.0).try_into().unwrap()) + } } /// We don't need callers to be able to pass arbitrary delimiters: just control @@ -157,47 +164,21 @@ pub struct Listing { /// providing basic CRUD operations for storage files. #[allow(async_fn_in_trait)] pub trait RemoteStorage: Send + Sync + 'static { - /// Lists all top level subdirectories for a given prefix - /// Note: here we assume that if the prefix is passed it was obtained via remote_object_id - /// which already takes into account any kind of global prefix (prefix_in_bucket for S3 or storage_root for LocalFS) - /// so this method doesnt need to. - async fn list_prefixes( - &self, - prefix: Option<&RemotePath>, - cancel: &CancellationToken, - ) -> Result, DownloadError> { - let result = self - .list(prefix, ListingMode::WithDelimiter, None, cancel) - .await? - .prefixes; - Ok(result) - } - /// Lists all files in directory "recursively" - /// (not really recursively, because AWS has a flat namespace) - /// Note: This is subtely different than list_prefixes, - /// because it is for listing files instead of listing - /// names sharing common prefixes. - /// For example, - /// list_files("foo/bar") = ["foo/bar/cat123.txt", - /// "foo/bar/cat567.txt", "foo/bar/dog123.txt", "foo/bar/dog456.txt"] - /// whereas, - /// list_prefixes("foo/bar/") = ["cat", "dog"] - /// See `test_real_s3.rs` for more details. 
+ /// List objects in remote storage, with semantics matching AWS S3's ListObjectsV2. + /// (see ``) + /// + /// Note that the prefix is relative to any `prefix_in_bucket` configured for the client, not + /// from the absolute root of the bucket. + /// + /// `mode` configures whether to use a delimiter. Without a delimiter all keys + /// within the prefix are listed in the `keys` of the result. With a delimiter, any "directories" at the top level of + /// the prefix are returned in the `prefixes` of the result, and keys in the top level of the prefix are + /// returned in `keys` (). + /// + /// `max_keys` controls the maximum number of keys that will be returned. If this is None, this function + /// will iteratively call listobjects until it runs out of keys. Note that this is not safe to use on + /// unlimted size buckets, as the full list of objects is allocated into a monolithic data structure. /// - /// max_keys limits max number of keys returned; None means unlimited. - async fn list_files( - &self, - prefix: Option<&RemotePath>, - max_keys: Option, - cancel: &CancellationToken, - ) -> Result, DownloadError> { - let result = self - .list(prefix, ListingMode::NoDelimiter, max_keys, cancel) - .await? - .keys; - Ok(result) - } - async fn list( &self, prefix: Option<&RemotePath>, @@ -336,41 +317,6 @@ impl GenericRemoteStorage> { } } - // A function for listing all the files in a "directory" - // Example: - // list_files("foo/bar") = ["foo/bar/a.txt", "foo/bar/b.txt"] - // - // max_keys limits max number of keys returned; None means unlimited. 
- pub async fn list_files( - &self, - folder: Option<&RemotePath>, - max_keys: Option, - cancel: &CancellationToken, - ) -> Result, DownloadError> { - match self { - Self::LocalFs(s) => s.list_files(folder, max_keys, cancel).await, - Self::AwsS3(s) => s.list_files(folder, max_keys, cancel).await, - Self::AzureBlob(s) => s.list_files(folder, max_keys, cancel).await, - Self::Unreliable(s) => s.list_files(folder, max_keys, cancel).await, - } - } - - // lists common *prefixes*, if any of files - // Example: - // list_prefixes("foo123","foo567","bar123","bar432") = ["foo", "bar"] - pub async fn list_prefixes( - &self, - prefix: Option<&RemotePath>, - cancel: &CancellationToken, - ) -> Result, DownloadError> { - match self { - Self::LocalFs(s) => s.list_prefixes(prefix, cancel).await, - Self::AwsS3(s) => s.list_prefixes(prefix, cancel).await, - Self::AzureBlob(s) => s.list_prefixes(prefix, cancel).await, - Self::Unreliable(s) => s.list_prefixes(prefix, cancel).await, - } - } - /// See [`RemoteStorage::upload`] pub async fn upload( &self, @@ -619,6 +565,7 @@ pub struct S3Config { /// See [`DEFAULT_REMOTE_STORAGE_S3_CONCURRENCY_LIMIT`] for more details. pub concurrency_limit: NonZeroUsize, pub max_keys_per_list_response: Option, + pub upload_storage_class: Option, } impl Debug for S3Config { @@ -747,6 +694,18 @@ impl RemoteStorageConfig { endpoint, concurrency_limit, max_keys_per_list_response, + upload_storage_class: toml + .get("upload_storage_class") + .map(|prefix_in_bucket| -> anyhow::Result<_> { + let s = parse_toml_string("upload_storage_class", prefix_in_bucket)?; + let storage_class = StorageClass::from_str(&s).expect("infallible"); + #[allow(deprecated)] + if matches!(storage_class, StorageClass::Unknown(_)) { + bail!("Specified storage class unknown to SDK: '{s}'. 
Allowed values: {:?}", StorageClass::values()); + } + Ok(storage_class) + }) + .transpose()?, }) } (_, _, _, Some(_), None) => { diff --git a/libs/remote_storage/src/local_fs.rs b/libs/remote_storage/src/local_fs.rs index 8cad863731..1f7bcfc982 100644 --- a/libs/remote_storage/src/local_fs.rs +++ b/libs/remote_storage/src/local_fs.rs @@ -5,11 +5,9 @@ //! volume is mounted to the local FS. use std::{ - borrow::Cow, - future::Future, + collections::HashSet, io::ErrorKind, num::NonZeroU32, - pin::Pin, time::{Duration, SystemTime, UNIX_EPOCH}, }; @@ -22,11 +20,11 @@ use tokio::{ io::{self, AsyncReadExt, AsyncSeekExt, AsyncWriteExt}, }; use tokio_util::{io::ReaderStream, sync::CancellationToken}; -use tracing::*; -use utils::{crashsafe::path_with_suffix_extension, fs_ext::is_directory_empty}; +use utils::crashsafe::path_with_suffix_extension; use crate::{ Download, DownloadError, Listing, ListingMode, RemotePath, TimeTravelError, TimeoutOrCancel, + REMOTE_STORAGE_PREFIX_SEPARATOR, }; use super::{RemoteStorage, StorageMetadata}; @@ -93,7 +91,47 @@ impl LocalFs { #[cfg(test)] async fn list_all(&self) -> anyhow::Result> { - Ok(get_all_files(&self.storage_root, true) + use std::{future::Future, pin::Pin}; + fn get_all_files<'a, P>( + directory_path: P, + ) -> Pin>> + Send + Sync + 'a>> + where + P: AsRef + Send + Sync + 'a, + { + Box::pin(async move { + let directory_path = directory_path.as_ref(); + if directory_path.exists() { + if directory_path.is_dir() { + let mut paths = Vec::new(); + let mut dir_contents = fs::read_dir(directory_path).await?; + while let Some(dir_entry) = dir_contents.next_entry().await? 
{ + let file_type = dir_entry.file_type().await?; + let entry_path = + Utf8PathBuf::from_path_buf(dir_entry.path()).map_err(|pb| { + anyhow::Error::msg(format!( + "non-Unicode path: {}", + pb.to_string_lossy() + )) + })?; + if file_type.is_symlink() { + tracing::debug!("{entry_path:?} is a symlink, skipping") + } else if file_type.is_dir() { + paths.extend(get_all_files(&entry_path).await?.into_iter()) + } else { + paths.push(entry_path); + } + } + Ok(paths) + } else { + bail!("Path {directory_path:?} is not a directory") + } + } else { + Ok(Vec::new()) + } + }) + } + + Ok(get_all_files(&self.storage_root) .await? .into_iter() .map(|path| { @@ -120,6 +158,14 @@ impl LocalFs { // S3 object list prefixes can be arbitrary strings, but when reading // the local filesystem we need a directory to start calling read_dir on. let mut initial_dir = full_path.clone(); + + // If there's no trailing slash, we have to start looking from one above: even if + // `initial_dir` is a directory, we should still list any prefixes in the parent + // that start with the same string. + if !full_path.to_string().ends_with('/') { + initial_dir.pop(); + } + loop { // Did we make it to the root? 
if initial_dir.parent().is_none() { @@ -295,61 +341,66 @@ impl RemoteStorage for LocalFs { let op = async { let mut result = Listing::default(); - if let ListingMode::NoDelimiter = mode { - let keys = self - .list_recursive(prefix) - .await - .map_err(DownloadError::Other)?; - - result.keys = keys - .into_iter() - .filter(|k| { - let path = k.with_base(&self.storage_root); - !path.is_dir() - }) - .collect(); - - if let Some(max_keys) = max_keys { - result.keys.truncate(max_keys.get() as usize); - } - - return Ok(result); - } - - let path = match prefix { - Some(prefix) => Cow::Owned(prefix.with_base(&self.storage_root)), - None => Cow::Borrowed(&self.storage_root), - }; - - let prefixes_to_filter = get_all_files(path.as_ref(), false) + // Filter out directories: in S3 directories don't exist, only the keys within them do. + let keys = self + .list_recursive(prefix) .await .map_err(DownloadError::Other)?; + let keys = keys + .into_iter() + .filter(|k| { + let path = k.with_base(&self.storage_root); + !path.is_dir() + }) + .collect(); - // filter out empty directories to mirror s3 behavior. - for prefix in prefixes_to_filter { - if prefix.is_dir() - && is_directory_empty(&prefix) - .await - .map_err(DownloadError::Other)? - { - continue; - } - - let stripped = prefix - .strip_prefix(&self.storage_root) - .context("Failed to strip prefix") - .and_then(RemotePath::new) - .expect( - "We list files for storage root, hence should be able to remote the prefix", - ); - - if prefix.is_dir() { - result.prefixes.push(stripped); - } else { - result.keys.push(stripped); + if let ListingMode::NoDelimiter = mode { + result.keys = keys; + } else { + let mut prefixes = HashSet::new(); + for key in keys { + // If the part after the prefix includes a "/", take only the first part and put it in `prefixes`. 
+ let relative_key = if let Some(prefix) = prefix { + let mut prefix = prefix.clone(); + // We only strip the dirname of the prefix, so that when we strip it from the start of keys we + // end up with full file/dir names. + let prefix_full_local_path = prefix.with_base(&self.storage_root); + let has_slash = prefix.0.to_string().ends_with('/'); + let strip_prefix = if prefix_full_local_path.is_dir() && has_slash { + prefix + } else { + prefix.0.pop(); + prefix + }; + + RemotePath::new(key.strip_prefix(&strip_prefix).unwrap()).unwrap() + } else { + key + }; + + let relative_key = format!("{}", relative_key); + if relative_key.contains(REMOTE_STORAGE_PREFIX_SEPARATOR) { + let first_part = relative_key + .split(REMOTE_STORAGE_PREFIX_SEPARATOR) + .next() + .unwrap() + .to_owned(); + prefixes.insert(first_part); + } else { + result + .keys + .push(RemotePath::from_string(&relative_key).unwrap()); + } } + result.prefixes = prefixes + .into_iter() + .map(|s| RemotePath::from_string(&s).unwrap()) + .collect(); } + if let Some(max_keys) = max_keys { + result.keys.truncate(max_keys.get() as usize); + } Ok(result) }; @@ -560,50 +611,6 @@ fn storage_metadata_path(original_path: &Utf8Path) -> Utf8PathBuf { path_with_suffix_extension(original_path, "metadata") } -fn get_all_files<'a, P>( - directory_path: P, - recursive: bool, -) -> Pin>> + Send + Sync + 'a>> -where - P: AsRef + Send + Sync + 'a, -{ - Box::pin(async move { - let directory_path = directory_path.as_ref(); - if directory_path.exists() { - if directory_path.is_dir() { - let mut paths = Vec::new(); - let mut dir_contents = fs::read_dir(directory_path).await?; - while let Some(dir_entry) = dir_contents.next_entry().await? 
{ - let file_type = dir_entry.file_type().await?; - let entry_path = - Utf8PathBuf::from_path_buf(dir_entry.path()).map_err(|pb| { - anyhow::Error::msg(format!( - "non-Unicode path: {}", - pb.to_string_lossy() - )) - })?; - if file_type.is_symlink() { - debug!("{entry_path:?} is a symlink, skipping") - } else if file_type.is_dir() { - if recursive { - paths.extend(get_all_files(&entry_path, true).await?.into_iter()) - } else { - paths.push(entry_path) - } - } else { - paths.push(entry_path); - } - } - Ok(paths) - } else { - bail!("Path {directory_path:?} is not a directory") - } - } else { - Ok(Vec::new()) - } - }) -} - async fn create_target_directory(target_file_path: &Utf8Path) -> anyhow::Result<()> { let target_dir = match target_file_path.parent() { Some(parent_dir) => parent_dir, @@ -923,13 +930,18 @@ mod fs_tests { // No delimiter: should recursively list everything let (storage, cancel) = create_storage()?; let child = upload_dummy_file(&storage, "grandparent/parent/child", None, &cancel).await?; + let child_sibling = + upload_dummy_file(&storage, "grandparent/parent/child_sibling", None, &cancel).await?; let uncle = upload_dummy_file(&storage, "grandparent/uncle", None, &cancel).await?; let listing = storage .list(None, ListingMode::NoDelimiter, None, &cancel) .await?; assert!(listing.prefixes.is_empty()); - assert_eq!(listing.keys, [uncle.clone(), child.clone()].to_vec()); + assert_eq!( + listing.keys.into_iter().collect::>(), + HashSet::from([uncle.clone(), child.clone(), child_sibling.clone()]) + ); // Delimiter: should only go one deep let listing = storage @@ -942,7 +954,25 @@ mod fs_tests { ); assert!(listing.keys.is_empty()); - // Delimiter & prefix + // Delimiter & prefix with a trailing slash + let listing = storage + .list( + Some(&RemotePath::from_string("timelines/some_timeline/grandparent/").unwrap()), + ListingMode::WithDelimiter, + None, + &cancel, + ) + .await?; + assert_eq!( + listing.keys, + 
[RemotePath::from_string("uncle").unwrap()].to_vec() + ); + assert_eq!( + listing.prefixes, + [RemotePath::from_string("parent").unwrap()].to_vec() + ); + + // Delimiter and prefix without a trailing slash let listing = storage .list( Some(&RemotePath::from_string("timelines/some_timeline/grandparent").unwrap()), @@ -951,12 +981,66 @@ mod fs_tests { &cancel, ) .await?; + assert_eq!(listing.keys, [].to_vec()); assert_eq!( listing.prefixes, - [RemotePath::from_string("timelines/some_timeline/grandparent/parent").unwrap()] - .to_vec() + [RemotePath::from_string("grandparent").unwrap()].to_vec() + ); + + // Delimiter and prefix that's partway through a path component + let listing = storage + .list( + Some(&RemotePath::from_string("timelines/some_timeline/grandp").unwrap()), + ListingMode::WithDelimiter, + None, + &cancel, + ) + .await?; + assert_eq!(listing.keys, [].to_vec()); + assert_eq!( + listing.prefixes, + [RemotePath::from_string("grandparent").unwrap()].to_vec() + ); + + Ok(()) + } + + #[tokio::test] + async fn list_part_component() -> anyhow::Result<()> { + // No delimiter: should recursively list everything + let (storage, cancel) = create_storage()?; + + // Imitates what happens in a tenant path when we have an unsharded path and a sharded path, and do a listing + // of the unsharded path: although there is a "directory" at the unsharded path, it should be handled as + // a freeform prefix. 
+ let _child_a = + upload_dummy_file(&storage, "grandparent/tenant-01/child", None, &cancel).await?; + let _child_b = + upload_dummy_file(&storage, "grandparent/tenant/child", None, &cancel).await?; + + // Delimiter and prefix that's partway through a path component + let listing = storage + .list( + Some( + &RemotePath::from_string("timelines/some_timeline/grandparent/tenant").unwrap(), + ), + ListingMode::WithDelimiter, + None, + &cancel, + ) + .await?; + assert_eq!(listing.keys, [].to_vec()); + + let mut found_prefixes = listing.prefixes.clone(); + found_prefixes.sort(); + assert_eq!( + found_prefixes, + [ + RemotePath::from_string("tenant").unwrap(), + RemotePath::from_string("tenant-01").unwrap(), + ] + .to_vec() ); - assert_eq!(listing.keys, [uncle.clone()].to_vec()); Ok(()) } diff --git a/libs/remote_storage/src/s3_bucket.rs b/libs/remote_storage/src/s3_bucket.rs index 1cb85cfb1b..c0b89cee2a 100644 --- a/libs/remote_storage/src/s3_bucket.rs +++ b/libs/remote_storage/src/s3_bucket.rs @@ -30,7 +30,7 @@ use aws_sdk_s3::{ config::{AsyncSleep, Builder, IdentityCache, Region, SharedAsyncSleep}, error::SdkError, operation::get_object::GetObjectError, - types::{Delete, DeleteMarkerEntry, ObjectIdentifier, ObjectVersion}, + types::{Delete, DeleteMarkerEntry, ObjectIdentifier, ObjectVersion, StorageClass}, Client, }; use aws_smithy_async::rt::sleep::TokioSleep; @@ -62,6 +62,7 @@ pub struct S3Bucket { bucket_name: String, prefix_in_bucket: Option, max_keys_per_list_response: Option, + upload_storage_class: Option, concurrency_limiter: ConcurrencyLimiter, // Per-request timeout. Accessible for tests. 
pub timeout: Duration, @@ -154,6 +155,7 @@ impl S3Bucket { max_keys_per_list_response: aws_config.max_keys_per_list_response, prefix_in_bucket, concurrency_limiter: ConcurrencyLimiter::new(aws_config.concurrency_limit.get()), + upload_storage_class: aws_config.upload_storage_class.clone(), timeout, }) } @@ -178,10 +180,7 @@ impl S3Bucket { pub fn relative_path_to_s3_object(&self, path: &RemotePath) -> String { assert_eq!(std::path::MAIN_SEPARATOR, REMOTE_STORAGE_PREFIX_SEPARATOR); - let path_string = path - .get_path() - .as_str() - .trim_end_matches(REMOTE_STORAGE_PREFIX_SEPARATOR); + let path_string = path.get_path().as_str(); match &self.prefix_in_bucket { Some(prefix) => prefix.clone() + "/" + path_string, None => path_string.to_string(), @@ -471,16 +470,11 @@ impl RemoteStorage for S3Bucket { // get the passed prefix or if it is not set use prefix_in_bucket value let list_prefix = prefix .map(|p| self.relative_path_to_s3_object(p)) - .or_else(|| self.prefix_in_bucket.clone()) - .map(|mut p| { - // required to end with a separator - // otherwise request will return only the entry of a prefix - if matches!(mode, ListingMode::WithDelimiter) - && !p.ends_with(REMOTE_STORAGE_PREFIX_SEPARATOR) - { - p.push(REMOTE_STORAGE_PREFIX_SEPARATOR); - } - p + .or_else(|| { + self.prefix_in_bucket.clone().map(|mut s| { + s.push(REMOTE_STORAGE_PREFIX_SEPARATOR); + s + }) }); let _permit = self.permit(kind, cancel).await?; @@ -549,11 +543,15 @@ impl RemoteStorage for S3Bucket { } } - result.prefixes.extend( - prefixes - .iter() - .filter_map(|o| Some(self.s3_object_to_relative_path(o.prefix()?))), - ); + // S3 gives us prefixes like "foo/", we return them like "foo" + result.prefixes.extend(prefixes.iter().filter_map(|o| { + Some( + self.s3_object_to_relative_path( + o.prefix()? 
+ .trim_end_matches(REMOTE_STORAGE_PREFIX_SEPARATOR), + ), + ) + })); continuation_token = match response.next_continuation_token { Some(new_token) => Some(new_token), @@ -586,6 +584,7 @@ impl RemoteStorage for S3Bucket { .bucket(self.bucket_name.clone()) .key(self.relative_path_to_s3_object(to)) .set_metadata(metadata.map(|m| m.0)) + .set_storage_class(self.upload_storage_class.clone()) .content_length(from_size_bytes.try_into()?) .body(bytes_stream) .send(); @@ -637,6 +636,7 @@ impl RemoteStorage for S3Bucket { .copy_object() .bucket(self.bucket_name.clone()) .key(self.relative_path_to_s3_object(to)) + .set_storage_class(self.upload_storage_class.clone()) .copy_source(copy_source) .send(); @@ -894,6 +894,7 @@ impl RemoteStorage for S3Bucket { .copy_object() .bucket(self.bucket_name.clone()) .key(key) + .set_storage_class(self.upload_storage_class.clone()) .copy_source(&source_id) .send(); @@ -1050,22 +1051,22 @@ mod tests { Some("/test/prefix/"), ]; let expected_outputs = [ - vec!["", "some/path", "some/path"], - vec!["/", "/some/path", "/some/path"], + vec!["", "some/path", "some/path/"], + vec!["/", "/some/path", "/some/path/"], vec![ "test/prefix/", "test/prefix/some/path", - "test/prefix/some/path", + "test/prefix/some/path/", ], vec![ "test/prefix/", "test/prefix/some/path", - "test/prefix/some/path", + "test/prefix/some/path/", ], vec![ "test/prefix/", "test/prefix/some/path", - "test/prefix/some/path", + "test/prefix/some/path/", ], ]; @@ -1077,6 +1078,7 @@ mod tests { endpoint: None, concurrency_limit: NonZeroUsize::new(100).unwrap(), max_keys_per_list_response: Some(5), + upload_storage_class: None, }; let storage = S3Bucket::new(&config, std::time::Duration::ZERO).expect("remote storage init"); diff --git a/libs/remote_storage/src/simulate_failures.rs b/libs/remote_storage/src/simulate_failures.rs index f5344d3ae2..c467a2d196 100644 --- a/libs/remote_storage/src/simulate_failures.rs +++ b/libs/remote_storage/src/simulate_failures.rs @@ -107,27 +107,6 @@ 
impl UnreliableWrapper { type VoidStorage = crate::LocalFs; impl RemoteStorage for UnreliableWrapper { - async fn list_prefixes( - &self, - prefix: Option<&RemotePath>, - cancel: &CancellationToken, - ) -> Result, DownloadError> { - self.attempt(RemoteOp::ListPrefixes(prefix.cloned())) - .map_err(DownloadError::Other)?; - self.inner.list_prefixes(prefix, cancel).await - } - - async fn list_files( - &self, - folder: Option<&RemotePath>, - max_keys: Option, - cancel: &CancellationToken, - ) -> Result, DownloadError> { - self.attempt(RemoteOp::ListPrefixes(folder.cloned())) - .map_err(DownloadError::Other)?; - self.inner.list_files(folder, max_keys, cancel).await - } - async fn list( &self, prefix: Option<&RemotePath>, diff --git a/libs/remote_storage/tests/common/tests.rs b/libs/remote_storage/tests/common/tests.rs index 72f6f956e0..673151c8ef 100644 --- a/libs/remote_storage/tests/common/tests.rs +++ b/libs/remote_storage/tests/common/tests.rs @@ -1,5 +1,6 @@ use anyhow::Context; use camino::Utf8Path; +use remote_storage::ListingMode; use remote_storage::RemotePath; use std::sync::Arc; use std::{collections::HashSet, num::NonZeroU32}; @@ -54,9 +55,9 @@ async fn pagination_should_work(ctx: &mut MaybeEnabledStorageWithTestBlobs) -> a let base_prefix = RemotePath::new(Utf8Path::new(ctx.enabled.base_prefix)) .context("common_prefix construction")?; let root_remote_prefixes = test_client - .list_prefixes(None, &cancel) - .await - .context("client list root prefixes failure")? + .list(None, ListingMode::WithDelimiter, None, &cancel) + .await? + .prefixes .into_iter() .collect::>(); assert_eq!( @@ -65,9 +66,14 @@ async fn pagination_should_work(ctx: &mut MaybeEnabledStorageWithTestBlobs) -> a ); let nested_remote_prefixes = test_client - .list_prefixes(Some(&base_prefix), &cancel) - .await - .context("client list nested prefixes failure")? + .list( + Some(&base_prefix.add_trailing_slash()), + ListingMode::WithDelimiter, + None, + &cancel, + ) + .await? 
+ .prefixes .into_iter() .collect::>(); let remote_only_prefixes = nested_remote_prefixes @@ -90,11 +96,13 @@ async fn pagination_should_work(ctx: &mut MaybeEnabledStorageWithTestBlobs) -> a /// /// First, create a set of S3 objects with keys `random_prefix/folder{j}/blob_{i}.txt` in [`upload_remote_data`] /// Then performs the following queries: -/// 1. `list_files(None)`. This should return all files `random_prefix/folder{j}/blob_{i}.txt` -/// 2. `list_files("folder1")`. This should return all files `random_prefix/folder1/blob_{i}.txt` +/// 1. `list(None)`. This should return all files `random_prefix/folder{j}/blob_{i}.txt` +/// 2. `list("folder1")`. This should return all files `random_prefix/folder1/blob_{i}.txt` #[test_context(MaybeEnabledStorageWithSimpleTestBlobs)] #[tokio::test] -async fn list_files_works(ctx: &mut MaybeEnabledStorageWithSimpleTestBlobs) -> anyhow::Result<()> { +async fn list_no_delimiter_works( + ctx: &mut MaybeEnabledStorageWithSimpleTestBlobs, +) -> anyhow::Result<()> { let ctx = match ctx { MaybeEnabledStorageWithSimpleTestBlobs::Enabled(ctx) => ctx, MaybeEnabledStorageWithSimpleTestBlobs::Disabled => return Ok(()), @@ -107,29 +115,36 @@ async fn list_files_works(ctx: &mut MaybeEnabledStorageWithSimpleTestBlobs) -> a let base_prefix = RemotePath::new(Utf8Path::new("folder1")).context("common_prefix construction")?; let root_files = test_client - .list_files(None, None, &cancel) + .list(None, ListingMode::NoDelimiter, None, &cancel) .await .context("client list root files failure")? + .keys .into_iter() .collect::>(); assert_eq!( root_files, ctx.remote_blobs.clone(), - "remote storage list_files on root mismatches with the uploads." + "remote storage list on root mismatches with the uploads." ); // Test that max_keys limit works. In total there are about 21 files (see // upload_simple_remote_data call in test_real_s3.rs). 
let limited_root_files = test_client - .list_files(None, Some(NonZeroU32::new(2).unwrap()), &cancel) + .list( + None, + ListingMode::NoDelimiter, + Some(NonZeroU32::new(2).unwrap()), + &cancel, + ) .await .context("client list root files failure")?; - assert_eq!(limited_root_files.len(), 2); + assert_eq!(limited_root_files.keys.len(), 2); let nested_remote_files = test_client - .list_files(Some(&base_prefix), None, &cancel) + .list(Some(&base_prefix), ListingMode::NoDelimiter, None, &cancel) .await .context("client list nested files failure")? + .keys .into_iter() .collect::>(); let trim_remote_blobs: HashSet<_> = ctx @@ -141,7 +156,7 @@ async fn list_files_works(ctx: &mut MaybeEnabledStorageWithSimpleTestBlobs) -> a .collect(); assert_eq!( nested_remote_files, trim_remote_blobs, - "remote storage list_files on subdirrectory mismatches with the uploads." + "remote storage list on subdirrectory mismatches with the uploads." ); Ok(()) } @@ -199,7 +214,11 @@ async fn delete_objects_works(ctx: &mut MaybeEnabledStorage) -> anyhow::Result<( ctx.client.delete_objects(&[path1, path2], &cancel).await?; - let prefixes = ctx.client.list_prefixes(None, &cancel).await?; + let prefixes = ctx + .client + .list(None, ListingMode::WithDelimiter, None, &cancel) + .await? + .prefixes; assert_eq!(prefixes.len(), 1); diff --git a/libs/remote_storage/tests/test_real_azure.rs b/libs/remote_storage/tests/test_real_azure.rs index 6aa02868e6..cd0b2be4b5 100644 --- a/libs/remote_storage/tests/test_real_azure.rs +++ b/libs/remote_storage/tests/test_real_azure.rs @@ -132,10 +132,6 @@ impl AsyncTestContext for MaybeEnabledStorageWithTestBlobs { } } -// NOTE: the setups for the list_prefixes test and the list_files test are very similar -// However, they are not idential. The list_prefixes function is concerned with listing prefixes, -// whereas the list_files function is concerned with listing files. 
-// See `RemoteStorage::list_files` documentation for more details enum MaybeEnabledStorageWithSimpleTestBlobs { Enabled(AzureWithSimpleTestBlobs), Disabled, diff --git a/libs/remote_storage/tests/test_real_s3.rs b/libs/remote_storage/tests/test_real_s3.rs index c5d5216f00..a273abe867 100644 --- a/libs/remote_storage/tests/test_real_s3.rs +++ b/libs/remote_storage/tests/test_real_s3.rs @@ -12,8 +12,8 @@ use anyhow::Context; use camino::Utf8Path; use futures_util::StreamExt; use remote_storage::{ - DownloadError, GenericRemoteStorage, RemotePath, RemoteStorageConfig, RemoteStorageKind, - S3Config, + DownloadError, GenericRemoteStorage, ListingMode, RemotePath, RemoteStorageConfig, + RemoteStorageKind, S3Config, }; use test_context::test_context; use test_context::AsyncTestContext; @@ -75,11 +75,14 @@ async fn s3_time_travel_recovery_works(ctx: &mut MaybeEnabledStorage) -> anyhow: client: &Arc, cancel: &CancellationToken, ) -> anyhow::Result> { - Ok(retry(|| client.list_files(None, None, cancel)) - .await - .context("list root files failure")? - .into_iter() - .collect::>()) + Ok( + retry(|| client.list(None, ListingMode::NoDelimiter, None, cancel)) + .await + .context("list root files failure")? + .keys + .into_iter() + .collect::>(), + ) } let cancel = CancellationToken::new(); @@ -294,10 +297,6 @@ impl AsyncTestContext for MaybeEnabledStorageWithTestBlobs { } } -// NOTE: the setups for the list_prefixes test and the list_files test are very similar -// However, they are not idential. The list_prefixes function is concerned with listing prefixes, -// whereas the list_files function is concerned with listing files. 
-// See `RemoteStorage::list_files` documentation for more details enum MaybeEnabledStorageWithSimpleTestBlobs { Enabled(S3WithSimpleTestBlobs), Disabled, @@ -381,6 +380,7 @@ fn create_s3_client( endpoint: None, concurrency_limit: NonZeroUsize::new(100).unwrap(), max_keys_per_list_response, + upload_storage_class: None, }), timeout: RemoteStorageConfig::DEFAULT_TIMEOUT, }; diff --git a/libs/utils/src/generation.rs b/libs/utils/src/generation.rs index af15cee924..b703e883de 100644 --- a/libs/utils/src/generation.rs +++ b/libs/utils/src/generation.rs @@ -34,6 +34,8 @@ pub enum Generation { /// scenarios where pageservers might otherwise issue conflicting writes to /// remote storage impl Generation { + pub const MAX: Self = Self::Valid(u32::MAX); + /// Create a new Generation that represents a legacy key format with /// no generation suffix pub fn none() -> Self { diff --git a/libs/utils/src/lib.rs b/libs/utils/src/lib.rs index b09350d11e..2953f0aad4 100644 --- a/libs/utils/src/lib.rs +++ b/libs/utils/src/lib.rs @@ -92,6 +92,8 @@ pub mod zstd; pub mod env; +pub mod poison; + /// This is a shortcut to embed git sha into binaries and avoid copying the same build script to all packages /// /// we have several cases: diff --git a/libs/utils/src/poison.rs b/libs/utils/src/poison.rs new file mode 100644 index 0000000000..0bf5664f47 --- /dev/null +++ b/libs/utils/src/poison.rs @@ -0,0 +1,121 @@ +//! Protect a piece of state from reuse after it is left in an inconsistent state. +//! +//! # Example +//! +//! ``` +//! # tokio_test::block_on(async { +//! use utils::poison::Poison; +//! use std::time::Duration; +//! +//! struct State { +//! clean: bool, +//! } +//! let state = tokio::sync::Mutex::new(Poison::new("mystate", State { clean: true })); +//! +//! let mut mutex_guard = state.lock().await; +//! let mut poison_guard = mutex_guard.check_and_arm()?; +//! let state = poison_guard.data_mut(); +//! state.clean = false; +//! 
// If we get cancelled at this await point, subsequent check_and_arm() calls will fail. +//! tokio::time::sleep(Duration::from_secs(10)).await; +//! state.clean = true; +//! poison_guard.disarm(); +//! # Ok::<(), utils::poison::Error>(()) +//! # }); +//! ``` + +use tracing::warn; + +pub struct Poison { + what: &'static str, + state: State, + data: T, +} + +#[derive(Clone, Copy)] +enum State { + Clean, + Armed, + Poisoned { at: chrono::DateTime }, +} + +impl Poison { + /// We log `what` `warning!` level if the [`Guard`] gets dropped without being [`Guard::disarm`]ed. + pub fn new(what: &'static str, data: T) -> Self { + Self { + what, + state: State::Clean, + data, + } + } + + /// Check for poisoning and return a [`Guard`] that provides access to the wrapped state. + pub fn check_and_arm(&mut self) -> Result, Error> { + match self.state { + State::Clean => { + self.state = State::Armed; + Ok(Guard(self)) + } + State::Armed => unreachable!("transient state"), + State::Poisoned { at } => Err(Error::Poisoned { + what: self.what, + at, + }), + } + } +} + +/// Use [`Self::data`] and [`Self::data_mut`] to access the wrapped state. +/// Once modifications are done, use [`Self::disarm`]. +/// If [`Guard`] gets dropped instead of calling [`Self::disarm`], the state is poisoned +/// and subsequent calls to [`Poison::check_and_arm`] will fail with an error. 
+pub struct Guard<'a, T>(&'a mut Poison); + +impl<'a, T> Guard<'a, T> { + pub fn data(&self) -> &T { + &self.0.data + } + pub fn data_mut(&mut self) -> &mut T { + &mut self.0.data + } + + pub fn disarm(self) { + match self.0.state { + State::Clean => unreachable!("we set it to Armed in check_and_arm()"), + State::Armed => { + self.0.state = State::Clean; + } + State::Poisoned { at } => { + unreachable!("we fail check_and_arm() if it's in that state: {at}") + } + } + } +} + +impl<'a, T> Drop for Guard<'a, T> { + fn drop(&mut self) { + match self.0.state { + State::Clean => { + // set by disarm() + } + State::Armed => { + // still armed => poison it + let at = chrono::Utc::now(); + self.0.state = State::Poisoned { at }; + warn!(at=?at, "poisoning {}", self.0.what); + } + State::Poisoned { at } => { + unreachable!("we fail check_and_arm() if it's in that state: {at}") + } + } + } +} + +#[derive(thiserror::Error, Debug)] +pub enum Error { + #[error("poisoned at {at}: {what}")] + Poisoned { + what: &'static str, + at: chrono::DateTime, + }, +} diff --git a/libs/utils/src/seqwait.rs b/libs/utils/src/seqwait.rs index 0544c5be03..375b227b99 100644 --- a/libs/utils/src/seqwait.rs +++ b/libs/utils/src/seqwait.rs @@ -2,11 +2,10 @@ use std::cmp::{Eq, Ordering}; use std::collections::BinaryHeap; -use std::fmt::Debug; use std::mem; use std::sync::Mutex; use std::time::Duration; -use tokio::sync::watch::{channel, Receiver, Sender}; +use tokio::sync::watch::{self, channel}; use tokio::time::timeout; /// An error happened while waiting for a number @@ -35,23 +34,73 @@ pub trait MonotonicCounter { fn cnt_value(&self) -> V; } -/// Internal components of a `SeqWait` -struct SeqWaitInt +/// Heap of waiters, lowest numbers pop first. +struct Waiters where - S: MonotonicCounter, V: Ord, { - waiters: BinaryHeap>, - current: S, - shutdown: bool, + heap: BinaryHeap>, + /// Number of the first waiter in the heap, or None if there are no waiters. 
+ status_channel: watch::Sender>, +} + +impl Waiters +where + V: Ord + Copy, +{ + fn new() -> Self { + Waiters { + heap: BinaryHeap::new(), + status_channel: channel(None).0, + } + } + + /// `status_channel` contains the number of the first waiter in the heap. + /// This function should be called whenever waiters heap changes. + fn update_status(&self) { + let first_waiter = self.heap.peek().map(|w| w.wake_num); + let _ = self.status_channel.send_replace(first_waiter); + } + + /// Add new waiter to the heap, return a channel that will be notified when the number arrives. + fn add(&mut self, num: V) -> watch::Receiver<()> { + let (tx, rx) = channel(()); + self.heap.push(Waiter { + wake_num: num, + wake_channel: tx, + }); + self.update_status(); + rx + } + + /// Pop all waiters <= num from the heap. Collect channels in a vector, + /// so that caller can wake them up. + fn pop_leq(&mut self, num: V) -> Vec> { + let mut wake_these = Vec::new(); + while let Some(n) = self.heap.peek() { + if n.wake_num > num { + break; + } + wake_these.push(self.heap.pop().unwrap().wake_channel); + } + self.update_status(); + wake_these + } + + /// Used on shutdown to efficiently drop all waiters. + fn take_all(&mut self) -> BinaryHeap> { + let heap = mem::take(&mut self.heap); + self.update_status(); + heap + } } struct Waiter where T: Ord, { - wake_num: T, // wake me when this number arrives ... - wake_channel: Sender<()>, // ... by sending a message to this channel + wake_num: T, // wake me when this number arrives ... + wake_channel: watch::Sender<()>, // ... by sending a message to this channel } // BinaryHeap is a max-heap, and we want a min-heap. 
Reverse the ordering here @@ -76,6 +125,17 @@ impl PartialEq for Waiter { impl Eq for Waiter {} +/// Internal components of a `SeqWait` +struct SeqWaitInt +where + S: MonotonicCounter, + V: Ord, +{ + waiters: Waiters, + current: S, + shutdown: bool, +} + /// A tool for waiting on a sequence number /// /// This provides a way to wait the arrival of a number. @@ -108,7 +168,7 @@ where /// Create a new `SeqWait`, initialized to a particular number pub fn new(starting_num: S) -> Self { let internal = SeqWaitInt { - waiters: BinaryHeap::new(), + waiters: Waiters::new(), current: starting_num, shutdown: false, }; @@ -128,9 +188,8 @@ where // Block any future waiters from starting internal.shutdown = true; - // This will steal the entire waiters map. - // When we drop it all waiters will be woken. - mem::take(&mut internal.waiters) + // Take all waiters to drop them later. + internal.waiters.take_all() // Drop the lock as we exit this scope. }; @@ -196,7 +255,7 @@ where /// Register and return a channel that will be notified when a number arrives, /// or None, if it has already arrived. - fn queue_for_wait(&self, num: V) -> Result>, SeqWaitError> { + fn queue_for_wait(&self, num: V) -> Result>, SeqWaitError> { let mut internal = self.internal.lock().unwrap(); if internal.current.cnt_value() >= num { return Ok(None); @@ -205,12 +264,8 @@ where return Err(SeqWaitError::Shutdown); } - // Create a new channel. - let (tx, rx) = channel(()); - internal.waiters.push(Waiter { - wake_num: num, - wake_channel: tx, - }); + // Add waiter channel to the queue. + let rx = internal.waiters.add(num); // Drop the lock as we exit this scope. Ok(Some(rx)) } @@ -231,16 +286,8 @@ where } internal.current.cnt_advance(num); - // Pop all waiters <= num from the heap. Collect them in a vector, and - // wake them up after releasing the lock. 
- let mut wake_these = Vec::new(); - while let Some(n) = internal.waiters.peek() { - if n.wake_num > num { - break; - } - wake_these.push(internal.waiters.pop().unwrap().wake_channel); - } - wake_these + // Pop all waiters <= num from the heap. + internal.waiters.pop_leq(num) }; for tx in wake_these { @@ -255,6 +302,23 @@ where pub fn load(&self) -> S { self.internal.lock().unwrap().current } + + /// Get a Receiver for the current status. + /// + /// The current status is the number of the first waiter in the queue, + /// or None if there are no waiters. + /// + /// This receiver will be notified whenever the status changes. + /// It is useful for receiving notifications when the first waiter + /// starts waiting for a number, or when there are no more waiters left. + pub fn status_receiver(&self) -> watch::Receiver> { + self.internal + .lock() + .unwrap() + .waiters + .status_channel + .subscribe() + } } #[cfg(test)] diff --git a/libs/utils/src/sync/heavier_once_cell.rs b/libs/utils/src/sync/heavier_once_cell.rs index 8eee1f72a6..1abd3d9861 100644 --- a/libs/utils/src/sync/heavier_once_cell.rs +++ b/libs/utils/src/sync/heavier_once_cell.rs @@ -192,6 +192,14 @@ impl OnceCell { } } + /// Like [`Guard::take_and_deinit`], but will return `None` if this OnceCell was never + /// initialized. + pub fn take_and_deinit(&mut self) -> Option<(T, InitPermit)> { + let inner = self.inner.get_mut().unwrap(); + + inner.take_and_deinit() + } + /// Return the number of [`Self::get_or_init`] calls waiting for initialization to complete. pub fn initializer_count(&self) -> usize { self.initializers.load(Ordering::Relaxed) @@ -246,15 +254,23 @@ impl<'a, T> Guard<'a, T> { /// The permit will be on a semaphore part of the new internal value, and any following /// [`OnceCell::get_or_init`] will wait on it to complete. 
pub fn take_and_deinit(mut self) -> (T, InitPermit) { + self.0 + .take_and_deinit() + .expect("guard is not created unless value has been initialized") + } +} + +impl Inner { + pub fn take_and_deinit(&mut self) -> Option<(T, InitPermit)> { + let value = self.value.take()?; + let mut swapped = Inner::default(); let sem = swapped.init_semaphore.clone(); // acquire and forget right away, moving the control over to InitPermit sem.try_acquire().expect("we just created this").forget(); - std::mem::swap(&mut *self.0, &mut swapped); - swapped - .value - .map(|v| (v, InitPermit(sem))) - .expect("guard is not created unless value has been initialized") + let permit = InitPermit(sem); + std::mem::swap(self, &mut swapped); + Some((value, permit)) } } @@ -263,6 +279,13 @@ impl<'a, T> Guard<'a, T> { /// On drop, this type will return the permit. pub struct InitPermit(Arc); +impl std::fmt::Debug for InitPermit { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + let ptr = Arc::as_ptr(&self.0) as *const (); + f.debug_tuple("InitPermit").field(&ptr).finish() + } +} + impl Drop for InitPermit { fn drop(&mut self) { assert_eq!( @@ -559,4 +582,22 @@ mod tests { assert_eq!(*target.get().unwrap(), 11); } + + #[tokio::test] + async fn take_and_deinit_on_mut() { + use std::convert::Infallible; + + let mut target = OnceCell::::default(); + assert!(target.take_and_deinit().is_none()); + + target + .get_or_init(|permit| async move { Ok::<_, Infallible>((42, permit)) }) + .await + .unwrap(); + + let again = target.take_and_deinit(); + assert!(matches!(again, Some((42, _))), "{again:?}"); + + assert!(target.take_and_deinit().is_none()); + } } diff --git a/pageserver/Cargo.toml b/pageserver/Cargo.toml index e82aee2462..0f1a7fe688 100644 --- a/pageserver/Cargo.toml +++ b/pageserver/Cargo.toml @@ -69,6 +69,7 @@ tokio-stream.workspace = true tokio-util.workspace = true toml_edit = { workspace = true, features = [ "serde" ] } tracing.workspace = true +twox-hash.workspace = 
true url.workspace = true walkdir.workspace = true metrics.workspace = true diff --git a/pageserver/benches/bench_walredo.rs b/pageserver/benches/bench_walredo.rs index ffe607be4b..5b871c5d5e 100644 --- a/pageserver/benches/bench_walredo.rs +++ b/pageserver/benches/bench_walredo.rs @@ -27,30 +27,50 @@ //! //! # Reference Numbers //! -//! 2024-04-04 on i3en.3xlarge +//! 2024-04-15 on i3en.3xlarge //! //! ```text -//! short/1 time: [25.925 µs 26.060 µs 26.209 µs] -//! short/2 time: [31.277 µs 31.483 µs 31.722 µs] -//! short/4 time: [45.496 µs 45.831 µs 46.182 µs] -//! short/8 time: [84.298 µs 84.920 µs 85.566 µs] -//! short/16 time: [185.04 µs 186.41 µs 187.88 µs] -//! short/32 time: [385.01 µs 386.77 µs 388.70 µs] -//! short/64 time: [770.24 µs 773.04 µs 776.04 µs] -//! short/128 time: [1.5017 ms 1.5064 ms 1.5113 ms] -//! medium/1 time: [106.65 µs 107.20 µs 107.85 µs] -//! medium/2 time: [153.28 µs 154.24 µs 155.56 µs] -//! medium/4 time: [325.67 µs 327.01 µs 328.71 µs] -//! medium/8 time: [646.82 µs 650.17 µs 653.91 µs] -//! medium/16 time: [1.2645 ms 1.2701 ms 1.2762 ms] -//! medium/32 time: [2.4409 ms 2.4550 ms 2.4692 ms] -//! medium/64 time: [4.6814 ms 4.7114 ms 4.7408 ms] -//! medium/128 time: [8.7790 ms 8.9037 ms 9.0282 ms] +//! async-short/1 time: [24.584 µs 24.737 µs 24.922 µs] +//! async-short/2 time: [33.479 µs 33.660 µs 33.888 µs] +//! async-short/4 time: [42.713 µs 43.046 µs 43.440 µs] +//! async-short/8 time: [71.814 µs 72.478 µs 73.240 µs] +//! async-short/16 time: [132.73 µs 134.45 µs 136.22 µs] +//! async-short/32 time: [258.31 µs 260.73 µs 263.27 µs] +//! async-short/64 time: [511.61 µs 514.44 µs 517.51 µs] +//! async-short/128 time: [992.64 µs 998.23 µs 1.0042 ms] +//! async-medium/1 time: [110.11 µs 110.50 µs 110.96 µs] +//! async-medium/2 time: [153.06 µs 153.85 µs 154.99 µs] +//! async-medium/4 time: [317.51 µs 319.92 µs 322.85 µs] +//! async-medium/8 time: [638.30 µs 644.68 µs 652.12 µs] +//! 
async-medium/16 time: [1.2651 ms 1.2773 ms 1.2914 ms] +//! async-medium/32 time: [2.5117 ms 2.5410 ms 2.5720 ms] +//! async-medium/64 time: [4.8088 ms 4.8555 ms 4.9047 ms] +//! async-medium/128 time: [8.8311 ms 8.9849 ms 9.1263 ms] +//! sync-short/1 time: [25.503 µs 25.626 µs 25.771 µs] +//! sync-short/2 time: [30.850 µs 31.013 µs 31.208 µs] +//! sync-short/4 time: [45.543 µs 45.856 µs 46.193 µs] +//! sync-short/8 time: [84.114 µs 84.639 µs 85.220 µs] +//! sync-short/16 time: [185.22 µs 186.15 µs 187.13 µs] +//! sync-short/32 time: [377.43 µs 378.87 µs 380.46 µs] +//! sync-short/64 time: [756.49 µs 759.04 µs 761.70 µs] +//! sync-short/128 time: [1.4825 ms 1.4874 ms 1.4923 ms] +//! sync-medium/1 time: [105.66 µs 106.01 µs 106.43 µs] +//! sync-medium/2 time: [153.10 µs 153.84 µs 154.72 µs] +//! sync-medium/4 time: [327.13 µs 329.44 µs 332.27 µs] +//! sync-medium/8 time: [654.26 µs 658.73 µs 663.63 µs] +//! sync-medium/16 time: [1.2682 ms 1.2748 ms 1.2816 ms] +//! sync-medium/32 time: [2.4456 ms 2.4595 ms 2.4731 ms] +//! sync-medium/64 time: [4.6523 ms 4.6890 ms 4.7256 ms] +//! sync-medium/128 time: [8.7215 ms 8.8323 ms 8.9344 ms] //! 
``` use bytes::{Buf, Bytes}; use criterion::{BenchmarkId, Criterion}; -use pageserver::{config::PageServerConf, walrecord::NeonWalRecord, walredo::PostgresRedoManager}; +use pageserver::{ + config::PageServerConf, + walrecord::NeonWalRecord, + walredo::{PostgresRedoManager, ProcessKind}, +}; use pageserver_api::{key::Key, shard::TenantShardId}; use std::{ sync::Arc, @@ -60,33 +80,39 @@ use tokio::{sync::Barrier, task::JoinSet}; use utils::{id::TenantId, lsn::Lsn}; fn bench(c: &mut Criterion) { - { - let nclients = [1, 2, 4, 8, 16, 32, 64, 128]; - for nclients in nclients { - let mut group = c.benchmark_group("short"); - group.bench_with_input( - BenchmarkId::from_parameter(nclients), - &nclients, - |b, nclients| { - let redo_work = Arc::new(Request::short_input()); - b.iter_custom(|iters| bench_impl(Arc::clone(&redo_work), iters, *nclients)); - }, - ); + for process_kind in &[ProcessKind::Async, ProcessKind::Sync] { + { + let nclients = [1, 2, 4, 8, 16, 32, 64, 128]; + for nclients in nclients { + let mut group = c.benchmark_group(format!("{process_kind}-short")); + group.bench_with_input( + BenchmarkId::from_parameter(nclients), + &nclients, + |b, nclients| { + let redo_work = Arc::new(Request::short_input()); + b.iter_custom(|iters| { + bench_impl(*process_kind, Arc::clone(&redo_work), iters, *nclients) + }); + }, + ); + } } - } - { - let nclients = [1, 2, 4, 8, 16, 32, 64, 128]; - for nclients in nclients { - let mut group = c.benchmark_group("medium"); - group.bench_with_input( - BenchmarkId::from_parameter(nclients), - &nclients, - |b, nclients| { - let redo_work = Arc::new(Request::medium_input()); - b.iter_custom(|iters| bench_impl(Arc::clone(&redo_work), iters, *nclients)); - }, - ); + { + let nclients = [1, 2, 4, 8, 16, 32, 64, 128]; + for nclients in nclients { + let mut group = c.benchmark_group(format!("{process_kind}-medium")); + group.bench_with_input( + BenchmarkId::from_parameter(nclients), + &nclients, + |b, nclients| { + let redo_work = 
Arc::new(Request::medium_input()); + b.iter_custom(|iters| { + bench_impl(*process_kind, Arc::clone(&redo_work), iters, *nclients) + }); + }, + ); + } } } } @@ -94,10 +120,16 @@ criterion::criterion_group!(benches, bench); criterion::criterion_main!(benches); // Returns the sum of each client's wall-clock time spent executing their share of the n_redos. -fn bench_impl(redo_work: Arc, n_redos: u64, nclients: u64) -> Duration { +fn bench_impl( + process_kind: ProcessKind, + redo_work: Arc, + n_redos: u64, + nclients: u64, +) -> Duration { let repo_dir = camino_tempfile::tempdir_in(env!("CARGO_TARGET_TMPDIR")).unwrap(); - let conf = PageServerConf::dummy_conf(repo_dir.path().to_path_buf()); + let mut conf = PageServerConf::dummy_conf(repo_dir.path().to_path_buf()); + conf.walredo_process_kind = process_kind; let conf = Box::leak(Box::new(conf)); let tenant_shard_id = TenantShardId::unsharded(TenantId::generate()); @@ -113,25 +145,40 @@ fn bench_impl(redo_work: Arc, n_redos: u64, nclients: u64) -> Duration let manager = PostgresRedoManager::new(conf, tenant_shard_id); let manager = Arc::new(manager); + // divide the amount of work equally among the clients. 
+ let nredos_per_client = n_redos / nclients; for _ in 0..nclients { rt.block_on(async { tasks.spawn(client( Arc::clone(&manager), Arc::clone(&start), Arc::clone(&redo_work), - // divide the amount of work equally among the clients - n_redos / nclients, + nredos_per_client, )) }); } - rt.block_on(async move { - let mut total_wallclock_time = std::time::Duration::from_millis(0); + let elapsed = rt.block_on(async move { + let mut total_wallclock_time = Duration::ZERO; while let Some(res) = tasks.join_next().await { total_wallclock_time += res.unwrap(); } total_wallclock_time - }) + }); + + // consistency check to ensure process kind setting worked + if nredos_per_client > 0 { + assert_eq!( + manager + .status() + .process + .map(|p| p.kind) + .expect("the benchmark work causes a walredo process to be spawned"), + std::borrow::Cow::Borrowed(process_kind.into()) + ); + } + + elapsed } async fn client( diff --git a/pageserver/client/src/mgmt_api.rs b/pageserver/client/src/mgmt_api.rs index 3c9982ffb8..6df8b2170d 100644 --- a/pageserver/client/src/mgmt_api.rs +++ b/pageserver/client/src/mgmt_api.rs @@ -243,6 +243,19 @@ impl Client { Ok(()) } + pub async fn tenant_scan_remote_storage( + &self, + tenant_id: TenantId, + ) -> Result { + let uri = format!( + "{}/v1/tenant/{tenant_id}/scan_remote_storage", + self.mgmt_api_endpoint + ); + let response = self.request(Method::GET, &uri, ()).await?; + let body = response.json().await.map_err(Error::ReceiveBody)?; + Ok(body) + } + pub async fn tenant_config(&self, req: &TenantConfigRequest) -> Result<()> { let uri = format!("{}/v1/tenant/config", self.mgmt_api_endpoint); self.request(Method::PUT, &uri, req).await?; @@ -271,6 +284,34 @@ impl Client { Ok((status, progress)) } + pub async fn tenant_secondary_status( + &self, + tenant_shard_id: TenantShardId, + ) -> Result { + let path = reqwest::Url::parse(&format!( + "{}/v1/tenant/{}/secondary/status", + self.mgmt_api_endpoint, tenant_shard_id + )) + .expect("Cannot build URL"); + + 
self.request(Method::GET, path, ()) + .await? + .json() + .await + .map_err(Error::ReceiveBody) + } + + pub async fn tenant_heatmap_upload(&self, tenant_id: TenantShardId) -> Result<()> { + let path = reqwest::Url::parse(&format!( + "{}/v1/tenant/{}/heatmap_upload", + self.mgmt_api_endpoint, tenant_id + )) + .expect("Cannot build URL"); + + self.request(Method::POST, path, ()).await?; + Ok(()) + } + pub async fn location_config( &self, tenant_shard_id: TenantShardId, @@ -278,10 +319,7 @@ impl Client { flush_ms: Option, lazy: bool, ) -> Result<()> { - let req_body = TenantLocationConfigRequest { - tenant_id: Some(tenant_shard_id), - config, - }; + let req_body = TenantLocationConfigRequest { config }; let mut path = reqwest::Url::parse(&format!( "{}/v1/tenant/{}/location_config", diff --git a/pageserver/client/src/page_service.rs b/pageserver/client/src/page_service.rs index 49175b3b90..f9507fc47a 100644 --- a/pageserver/client/src/page_service.rs +++ b/pageserver/client/src/page_service.rs @@ -60,7 +60,7 @@ impl Client { ) -> anyhow::Result { let copy_both: tokio_postgres::CopyBothDuplex = self .client - .copy_both_simple(&format!("pagestream {tenant_id} {timeline_id}")) + .copy_both_simple(&format!("pagestream_v2 {tenant_id} {timeline_id}")) .await?; let Client { cancel_on_client_drop, diff --git a/pageserver/compaction/src/compact_tiered.rs b/pageserver/compaction/src/compact_tiered.rs index 5261746b22..137b93055a 100644 --- a/pageserver/compaction/src/compact_tiered.rs +++ b/pageserver/compaction/src/compact_tiered.rs @@ -18,6 +18,7 @@ //! database size. For example, if the logical database size is 10 GB, we would //! generate new image layers every 10 GB of WAL. 
use futures::StreamExt; +use pageserver_api::shard::ShardIdentity; use tracing::{debug, info}; use std::collections::{HashSet, VecDeque}; @@ -125,6 +126,7 @@ async fn compact_level( } let mut state = LevelCompactionState { + shard_identity: *executor.get_shard_identity(), target_file_size, _lsn_range: lsn_range.clone(), layers: layer_fragments, @@ -164,6 +166,8 @@ struct LevelCompactionState<'a, E> where E: CompactionJobExecutor, { + shard_identity: ShardIdentity, + // parameters target_file_size: u64, @@ -366,6 +370,7 @@ where .executor .get_keyspace(&job.key_range, job.lsn_range.end, ctx) .await?, + &self.shard_identity, ) * 8192; let wal_size = job @@ -430,7 +435,7 @@ where keyspace, self.target_file_size / 8192, ); - while let Some(key_range) = window.choose_next_image() { + while let Some(key_range) = window.choose_next_image(&self.shard_identity) { new_jobs.push(CompactionJob:: { key_range, lsn_range: job.lsn_range.clone(), @@ -623,7 +628,12 @@ impl KeyspaceWindowPos { } // Advance the cursor until it reaches 'target_keysize'. - fn advance_until_size(&mut self, w: &KeyspaceWindowHead, max_size: u64) { + fn advance_until_size( + &mut self, + w: &KeyspaceWindowHead, + max_size: u64, + shard_identity: &ShardIdentity, + ) { while self.accum_keysize < max_size && !self.reached_end(w) { let curr_range = &w.keyspace[self.keyspace_idx]; if self.end_key < curr_range.start { @@ -632,7 +642,7 @@ impl KeyspaceWindowPos { } // We're now within 'curr_range'. Can we advance past it completely? 
- let distance = K::key_range_size(&(self.end_key..curr_range.end)); + let distance = K::key_range_size(&(self.end_key..curr_range.end), shard_identity); if (self.accum_keysize + distance as u64) < max_size { // oh yeah, it fits self.end_key = curr_range.end; @@ -641,7 +651,7 @@ impl KeyspaceWindowPos { } else { // advance within the range let skip_key = self.end_key.skip_some(); - let distance = K::key_range_size(&(self.end_key..skip_key)); + let distance = K::key_range_size(&(self.end_key..skip_key), shard_identity); if (self.accum_keysize + distance as u64) < max_size { self.end_key = skip_key; self.accum_keysize += distance as u64; @@ -677,7 +687,7 @@ where } } - fn choose_next_image(&mut self) -> Option> { + fn choose_next_image(&mut self, shard_identity: &ShardIdentity) -> Option> { if self.start_pos.keyspace_idx == self.head.keyspace.len() { // we've reached the end return None; @@ -687,6 +697,7 @@ where next_pos.advance_until_size( &self.head, self.start_pos.accum_keysize + self.head.target_keysize, + shard_identity, ); // See if we can gobble up the rest of the keyspace if we stretch out the layer, up to @@ -695,6 +706,7 @@ where end_pos.advance_until_size( &self.head, self.start_pos.accum_keysize + (self.head.target_keysize * 5 / 4), + shard_identity, ); if end_pos.reached_end(&self.head) { // gobble up any unused keyspace between the last used key and end of the range diff --git a/pageserver/compaction/src/helpers.rs b/pageserver/compaction/src/helpers.rs index 9de6363d6e..1b80373ba7 100644 --- a/pageserver/compaction/src/helpers.rs +++ b/pageserver/compaction/src/helpers.rs @@ -5,6 +5,7 @@ use crate::interface::*; use futures::future::BoxFuture; use futures::{Stream, StreamExt}; use itertools::Itertools; +use pageserver_api::shard::ShardIdentity; use pin_project_lite::pin_project; use std::collections::BinaryHeap; use std::collections::VecDeque; @@ -13,11 +14,17 @@ use std::ops::{DerefMut, Range}; use std::pin::Pin; use std::task::{ready, Poll}; -pub fn 
keyspace_total_size(keyspace: &CompactionKeySpace) -> u64 +pub fn keyspace_total_size( + keyspace: &CompactionKeySpace, + shard_identity: &ShardIdentity, +) -> u64 where K: CompactionKey, { - keyspace.iter().map(|r| K::key_range_size(r) as u64).sum() + keyspace + .iter() + .map(|r| K::key_range_size(r, shard_identity) as u64) + .sum() } pub fn overlaps_with(a: &Range, b: &Range) -> bool { diff --git a/pageserver/compaction/src/interface.rs b/pageserver/compaction/src/interface.rs index 5dc62e506f..35519b5d0a 100644 --- a/pageserver/compaction/src/interface.rs +++ b/pageserver/compaction/src/interface.rs @@ -4,7 +4,7 @@ //! All the heavy lifting is done by the create_image and create_delta //! functions that the implementor provides. use futures::Future; -use pageserver_api::{key::Key, keyspace::key_range_size}; +use pageserver_api::{key::Key, keyspace::ShardedRange, shard::ShardIdentity}; use std::ops::Range; use utils::lsn::Lsn; @@ -32,6 +32,8 @@ pub trait CompactionJobExecutor { // Functions that the planner uses to support its decisions // ---- + fn get_shard_identity(&self) -> &ShardIdentity; + /// Return all layers that overlap the given bounding box. fn get_layers( &mut self, @@ -98,7 +100,7 @@ pub trait CompactionKey: std::cmp::Ord + Clone + Copy + std::fmt::Display { /// /// This returns u32, for compatibility with Repository::key. If the /// distance is larger, return u32::MAX. 
- fn key_range_size(key_range: &Range) -> u32; + fn key_range_size(key_range: &Range, shard_identity: &ShardIdentity) -> u32; // return "self + 1" fn next(&self) -> Self; @@ -113,8 +115,8 @@ impl CompactionKey for Key { const MIN: Self = Self::MIN; const MAX: Self = Self::MAX; - fn key_range_size(r: &std::ops::Range) -> u32 { - key_range_size(r) + fn key_range_size(r: &std::ops::Range, shard_identity: &ShardIdentity) -> u32 { + ShardedRange::new(r.clone(), shard_identity).page_count() } fn next(&self) -> Key { (self as &Key).next() diff --git a/pageserver/compaction/src/simulator.rs b/pageserver/compaction/src/simulator.rs index 6c00df3a65..3543df64fa 100644 --- a/pageserver/compaction/src/simulator.rs +++ b/pageserver/compaction/src/simulator.rs @@ -3,6 +3,7 @@ mod draw; use draw::{LayerTraceEvent, LayerTraceFile, LayerTraceOp}; use futures::StreamExt; +use pageserver_api::shard::ShardIdentity; use rand::Rng; use tracing::info; @@ -71,7 +72,7 @@ impl interface::CompactionKey for Key { const MIN: Self = u64::MIN; const MAX: Self = u64::MAX; - fn key_range_size(key_range: &Range) -> u32 { + fn key_range_size(key_range: &Range, _shard_identity: &ShardIdentity) -> u32 { std::cmp::min(key_range.end - key_range.start, u32::MAX as u64) as u32 } @@ -434,6 +435,11 @@ impl interface::CompactionJobExecutor for MockTimeline { type ImageLayer = Arc; type RequestContext = MockRequestContext; + fn get_shard_identity(&self) -> &ShardIdentity { + static IDENTITY: ShardIdentity = ShardIdentity::unsharded(); + &IDENTITY + } + async fn get_layers( &mut self, key_range: &Range, diff --git a/pageserver/ctl/Cargo.toml b/pageserver/ctl/Cargo.toml index c5cd451e8d..843f5dd862 100644 --- a/pageserver/ctl/Cargo.toml +++ b/pageserver/ctl/Cargo.toml @@ -12,9 +12,14 @@ bytes.workspace = true camino.workspace = true clap = { workspace = true, features = ["string"] } git-version.workspace = true +humantime.workspace = true pageserver = { path = ".." 
} +pageserver_api.workspace = true +remote_storage = { path = "../../libs/remote_storage" } postgres_ffi.workspace = true tokio.workspace = true +tokio-util.workspace = true +toml_edit.workspace = true utils.workspace = true svg_fmt.workspace = true workspace_hack.workspace = true diff --git a/pageserver/ctl/src/draw_timeline_dir.rs b/pageserver/ctl/src/draw_timeline_dir.rs index 0e77ef0563..9a556cb3d4 100644 --- a/pageserver/ctl/src/draw_timeline_dir.rs +++ b/pageserver/ctl/src/draw_timeline_dir.rs @@ -9,18 +9,45 @@ //! Coordinates in both axis are compressed for better readability. //! (see ) //! -//! Example use: +//! The plain text API was chosen so that we can easily work with filenames from various +//! sources; see the Usage section below for examples. +//! +//! # Usage +//! +//! ## Producing the SVG +//! //! ```bash -//! $ ls test_output/test_pgbench\[neon-45-684\]/repo/tenants/$TENANT/timelines/$TIMELINE | \ -//! $ grep "__" | cargo run --release --bin pagectl draw-timeline-dir > out.svg -//! $ firefox out.svg +//! +//! # local timeline dir +//! ls test_output/test_pgbench\[neon-45-684\]/repo/tenants/$TENANT/timelines/$TIMELINE | \ +//! grep "__" | cargo run --release --bin pagectl draw-timeline-dir > out.svg +//! +//! # Layer map dump from `/v1/tenant/$TENANT/timeline/$TIMELINE/layer` +//! (jq -r '.historic_layers[] | .layer_file_name' | cargo run -p pagectl draw-timeline) < layer-map.json > out.svg +//! +//! # From an `index_part.json` in S3 +//! (jq -r '.layer_metadata | keys[]' | cargo run -p pagectl draw-timeline ) < index_part.json-00000016 > out.svg +//! //! ``` //! -//! This API was chosen so that we can easily work with filenames extracted from ssh, -//! or from pageserver log files. +//! ## Viewing //! -//! TODO Consider shipping this as a grafana panel plugin: -//! +//! **Inkscape** is better than the built-in viewers in browsers. +//! +//! After selecting a layer file rectangle, use "Open XML Editor" (Ctrl|Cmd + Shift + X) +//! 
to see the layer file name in the comment field. +//! +//! ```bash +//! +//! # Linux +//! inkscape out.svg +//! +//! # macOS +//! /Applications/Inkscape.app/Contents/MacOS/inkscape out.svg +//! +//! ``` +//! + use anyhow::Result; use pageserver::repository::Key; use pageserver::METADATA_FILE_NAME; @@ -65,7 +92,12 @@ fn parse_filename(name: &str) -> (Range, Range) { pub fn main() -> Result<()> { // Parse layer filenames from stdin - let mut ranges: Vec<(Range, Range)> = vec![]; + struct Layer { + filename: String, + key_range: Range, + lsn_range: Range, + } + let mut files: Vec = vec![]; let stdin = io::stdin(); for line in stdin.lock().lines() { let line = line.unwrap(); @@ -76,14 +108,23 @@ pub fn main() -> Result<()> { // Don't try and parse "metadata" like a key-lsn range continue; } - let range = parse_filename(filename); - ranges.push(range); + let (key_range, lsn_range) = parse_filename(filename); + files.push(Layer { + filename: filename.to_owned(), + key_range, + lsn_range, + }); } // Collect all coordinates let mut keys: Vec = vec![]; let mut lsns: Vec = vec![]; - for (keyr, lsnr) in &ranges { + for Layer { + key_range: keyr, + lsn_range: lsnr, + .. 
+ } in &files + { keys.push(keyr.start); keys.push(keyr.end); lsns.push(lsnr.start); @@ -107,7 +148,12 @@ pub fn main() -> Result<()> { h: stretch * lsn_map.len() as f32 } ); - for (keyr, lsnr) in &ranges { + for Layer { + filename, + key_range: keyr, + lsn_range: lsnr, + } in &files + { let key_start = *key_map.get(&keyr.start).unwrap(); let key_end = *key_map.get(&keyr.end).unwrap(); let key_diff = key_end - key_start; @@ -151,6 +197,7 @@ pub fn main() -> Result<()> { .fill(fill) .stroke(Stroke::Color(rgb(0, 0, 0), 0.1)) .border_radius(0.4) + .comment(filename) ); } println!("{}", EndSvg); diff --git a/pageserver/ctl/src/main.rs b/pageserver/ctl/src/main.rs index e73d961e36..1fb75584fc 100644 --- a/pageserver/ctl/src/main.rs +++ b/pageserver/ctl/src/main.rs @@ -9,6 +9,11 @@ mod index_part; mod layer_map_analyzer; mod layers; +use std::{ + str::FromStr, + time::{Duration, SystemTime}, +}; + use camino::{Utf8Path, Utf8PathBuf}; use clap::{Parser, Subcommand}; use index_part::IndexPartCmd; @@ -20,8 +25,16 @@ use pageserver::{ tenant::{dump_layerfile_from_path, metadata::TimelineMetadata}, virtual_file, }; +use pageserver_api::shard::TenantShardId; use postgres_ffi::ControlFileData; -use utils::{lsn::Lsn, project_git_version}; +use remote_storage::{RemotePath, RemoteStorageConfig}; +use tokio_util::sync::CancellationToken; +use utils::{ + id::TimelineId, + logging::{self, LogFormat, TracingErrorLayerEnablement}, + lsn::Lsn, + project_git_version, +}; project_git_version!(GIT_VERSION); @@ -43,6 +56,7 @@ enum Commands { #[command(subcommand)] IndexPart(IndexPartCmd), PrintLayerFile(PrintLayerFileCmd), + TimeTravelRemotePrefix(TimeTravelRemotePrefixCmd), DrawTimeline {}, AnalyzeLayerMap(AnalyzeLayerMapCmd), #[command(subcommand)] @@ -68,6 +82,26 @@ struct PrintLayerFileCmd { path: Utf8PathBuf, } +/// Roll back the time for the specified prefix using S3 history. +/// +/// The command is fairly low level and powerful. 
Validation is only very light, +so it is more powerful, and thus potentially more dangerous. +#[derive(Parser)] +struct TimeTravelRemotePrefixCmd { + /// A configuration string for the remote_storage configuration. + /// + /// Example: `remote_storage = { bucket_name = "aws-storage-bucket-name", bucket_region = "us-east-2" }` + config_toml_str: String, + /// remote prefix to time travel recover. For safety reasons, we require it to contain + /// a timeline or tenant ID in the prefix. + prefix: String, + /// Timestamp to travel to. Given in format like `2024-01-20T10:45:45Z`. Assumes UTC and second accuracy. + travel_to: String, + /// Timestamp of the start of the operation, must be after any changes we want to roll back. + /// You can use a timestamp from a few seconds before invoking the command. Same format as `travel_to`. + done_if_after: Option, +} + +#[derive(Parser)] +struct AnalyzeLayerMapCmd { + /// Pageserver data path + path: Utf8PathBuf, +} + #[tokio::main] async fn main() -> anyhow::Result<()> { + logging::init( + LogFormat::Plain, + TracingErrorLayerEnablement::EnableWithRustLogFilter, + logging::Output::Stdout, + )?; + + logging::replace_panic_hook_with_tracing_panic_hook().forget(); + let cli = CliOpts::parse(); match cli.command { @@ -105,6 +147,42 @@ async fn main() -> anyhow::Result<()> { print_layerfile(&cmd.path).await?; } } + Commands::TimeTravelRemotePrefix(cmd) => { + let timestamp = humantime::parse_rfc3339(&cmd.travel_to) + .map_err(|_e| anyhow::anyhow!("Invalid time for travel_to: '{}'", cmd.travel_to))?; + + let done_if_after = if let Some(done_if_after) = &cmd.done_if_after { + humantime::parse_rfc3339(done_if_after).map_err(|_e| { + anyhow::anyhow!("Invalid time for done_if_after: '{}'", done_if_after) + })? 
+ } else { + const SAFETY_MARGIN: Duration = Duration::from_secs(3); + tokio::time::sleep(SAFETY_MARGIN).await; + // Convert to string representation and back to get rid of sub-second values + let done_if_after = SystemTime::now(); + tokio::time::sleep(SAFETY_MARGIN).await; + done_if_after + }; + + let timestamp = strip_subsecond(timestamp); + let done_if_after = strip_subsecond(done_if_after); + + let Some(prefix) = validate_prefix(&cmd.prefix) else { + println!("specified prefix '{}' failed validation", cmd.prefix); + return Ok(()); + }; + let toml_document = toml_edit::Document::from_str(&cmd.config_toml_str)?; + let toml_item = toml_document + .get("remote_storage") + .expect("need remote_storage"); + let config = RemoteStorageConfig::from_toml(toml_item)?.expect("incomplete config"); + let storage = remote_storage::GenericRemoteStorage::from_config(&config); + let cancel = CancellationToken::new(); + storage + .unwrap() + .time_travel_recover(Some(&prefix), timestamp, done_if_after, &cancel) + .await?; + } }; Ok(()) } @@ -185,3 +263,89 @@ fn handle_metadata( Ok(()) } + +/// Ensures that the given S3 prefix is sufficiently constrained. +/// The command is very risky already and we don't want to expose something +/// that allows usually unintentional and quite catastrophic time travel of +/// an entire bucket, which would be a major catastrophe that is only +/// one character change away (similar to "rm -r /home /username/foobar"). 
+fn validate_prefix(prefix: &str) -> Option { + if prefix.is_empty() { + // Empty prefix means we want to specify the *whole* bucket + return None; + } + let components = prefix.split('/').collect::>(); + let (last, components) = { + let last = components.last()?; + if last.is_empty() { + ( + components.iter().nth_back(1)?, + &components[..(components.len() - 1)], + ) + } else { + (last, &components[..]) + } + }; + 'valid: { + if let Ok(_timeline_id) = TimelineId::from_str(last) { + // Ends in either a tenant or timeline ID + break 'valid; + } + if *last == "timelines" { + if let Some(before_last) = components.iter().nth_back(1) { + if let Ok(_tenant_id) = TenantShardId::from_str(before_last) { + // Has a valid tenant id + break 'valid; + } + } + } + + return None; + } + RemotePath::from_string(prefix).ok() +} + +fn strip_subsecond(timestamp: SystemTime) -> SystemTime { + let ts_str = humantime::format_rfc3339_seconds(timestamp).to_string(); + humantime::parse_rfc3339(&ts_str).expect("can't parse just created timestamp") +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_validate_prefix() { + assert_eq!(validate_prefix(""), None); + assert_eq!(validate_prefix("/"), None); + #[track_caller] + fn assert_valid(prefix: &str) { + let remote_path = RemotePath::from_string(prefix).unwrap(); + assert_eq!(validate_prefix(prefix), Some(remote_path)); + } + assert_valid("wal/3aa8fcc61f6d357410b7de754b1d9001/641e5342083b2235ee3deb8066819683/"); + // Path is not relative but absolute + assert_eq!( + validate_prefix( + "/wal/3aa8fcc61f6d357410b7de754b1d9001/641e5342083b2235ee3deb8066819683/" + ), + None + ); + assert_valid("wal/3aa8fcc61f6d357410b7de754b1d9001/"); + // Partial tenant IDs should be invalid, S3 will match all tenants with the specific ID prefix + assert_eq!(validate_prefix("wal/3aa8fcc61f6d357410b7d"), None); + assert_eq!(validate_prefix("wal"), None); + assert_eq!(validate_prefix("/wal/"), None); + 
assert_valid("pageserver/v1/tenants/3aa8fcc61f6d357410b7de754b1d9001"); + // Partial tenant ID + assert_eq!( + validate_prefix("pageserver/v1/tenants/3aa8fcc61f6d357410b"), + None + ); + assert_valid("pageserver/v1/tenants/3aa8fcc61f6d357410b7de754b1d9001/timelines"); + assert_valid("pageserver/v1/tenants/3aa8fcc61f6d357410b7de754b1d9001-0004/timelines"); + assert_valid("pageserver/v1/tenants/3aa8fcc61f6d357410b7de754b1d9001/timelines/"); + assert_valid("pageserver/v1/tenants/3aa8fcc61f6d357410b7de754b1d9001/timelines/641e5342083b2235ee3deb8066819683"); + assert_eq!(validate_prefix("pageserver/v1/tenants/"), None); + } +} diff --git a/pageserver/pagebench/src/cmd/getpage_latest_lsn.rs b/pageserver/pagebench/src/cmd/getpage_latest_lsn.rs index c3d8e61a2c..5043a207fc 100644 --- a/pageserver/pagebench/src/cmd/getpage_latest_lsn.rs +++ b/pageserver/pagebench/src/cmd/getpage_latest_lsn.rs @@ -312,8 +312,12 @@ async fn main_impl( let (rel_tag, block_no) = key_to_rel_block(key).expect("we filter non-rel-block keys out above"); PagestreamGetPageRequest { - latest: rng.gen_bool(args.req_latest_probability), - lsn: r.timeline_lsn, + request_lsn: if rng.gen_bool(args.req_latest_probability) { + Lsn::MAX + } else { + r.timeline_lsn + }, + not_modified_since: r.timeline_lsn, rel: rel_tag, blkno: block_no, } diff --git a/pageserver/src/aux_file.rs b/pageserver/src/aux_file.rs new file mode 100644 index 0000000000..a343acaf7a --- /dev/null +++ b/pageserver/src/aux_file.rs @@ -0,0 +1,112 @@ +use pageserver_api::key::{Key, AUX_KEY_PREFIX, METADATA_KEY_SIZE}; +use tracing::warn; + +/// Create a metadata key from a hash, encoded as [AUX_KEY_PREFIX, 2B directory prefix, first 13B of 128b xxhash]. 
+fn aux_hash_to_metadata_key(dir_level1: u8, dir_level2: u8, data: &[u8]) -> Key { + let mut key = [0; METADATA_KEY_SIZE]; + let hash = twox_hash::xxh3::hash128(data).to_be_bytes(); + key[0] = AUX_KEY_PREFIX; + key[1] = dir_level1; + key[2] = dir_level2; + key[3..16].copy_from_slice(&hash[0..13]); + Key::from_metadata_key_fixed_size(&key) +} + +const AUX_DIR_PG_LOGICAL: u8 = 0x01; +const AUX_DIR_PG_REPLSLOT: u8 = 0x02; +const AUX_DIR_PG_UNKNOWN: u8 = 0xFF; + +/// Encode the aux file into a fixed-size key. +/// +/// The first byte is the AUX key prefix. We use the next 2 bytes of the key for the directory / aux file type. +/// We have one-to-one mapping for each of the aux file that we support. We hash the remaining part of the path +/// (usually a single file name, or several components) into 13-byte hash. The way we determine the 2-byte prefix +/// is roughly based on the first two components of the path, one unique number for one component. +/// +/// * pg_logical/mappings -> 0x0101 +/// * pg_logical/snapshots -> 0x0102 +/// * pg_logical/replorigin_checkpoint -> 0x0103 +/// * pg_logical/others -> 0x01FF +/// * pg_replslot/ -> 0x0201 +/// * others -> 0xFFFF +/// +/// If you add new AUX files to this function, please also add a test case to `test_encoding_portable`. +/// The new file type must have never been written to the storage before. Otherwise, there could be data +/// corruptions as the new file belongs to a new prefix but it might have been stored under the `others` prefix. 
+pub fn encode_aux_file_key(path: &str) -> Key { + if let Some(fname) = path.strip_prefix("pg_logical/mappings/") { + aux_hash_to_metadata_key(AUX_DIR_PG_LOGICAL, 0x01, fname.as_bytes()) + } else if let Some(fname) = path.strip_prefix("pg_logical/snapshots/") { + aux_hash_to_metadata_key(AUX_DIR_PG_LOGICAL, 0x02, fname.as_bytes()) + } else if path == "pg_logical/replorigin_checkpoint" { + aux_hash_to_metadata_key(AUX_DIR_PG_LOGICAL, 0x03, b"") + } else if let Some(fname) = path.strip_prefix("pg_logical/") { + if cfg!(debug_assertions) { + warn!( + "unsupported pg_logical aux file type: {}, putting to 0x01FF, would affect path scanning", + path + ); + } + aux_hash_to_metadata_key(AUX_DIR_PG_LOGICAL, 0xFF, fname.as_bytes()) + } else if let Some(fname) = path.strip_prefix("pg_replslot/") { + aux_hash_to_metadata_key(AUX_DIR_PG_REPLSLOT, 0x01, fname.as_bytes()) + } else { + if cfg!(debug_assertions) { + warn!( + "unsupported aux file type: {}, putting to 0xFFFF, would affect path scanning", + path + ); + } + aux_hash_to_metadata_key(AUX_DIR_PG_UNKNOWN, 0xFF, path.as_bytes()) + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_hash_portable() { + // AUX file encoding requires the hash to be portable across all platforms. This test case checks + // if the algorithm produces the same hash across different environments. + assert_eq!( + 305317690835051308206966631765527126151, + twox_hash::xxh3::hash128("test1".as_bytes()) + ); + assert_eq!( + 85104974691013376326742244813280798847, + twox_hash::xxh3::hash128("test/test2".as_bytes()) + ); + assert_eq!(0, twox_hash::xxh3::hash128("".as_bytes())); + } + + #[test] + fn test_encoding_portable() { + // To correct retrieve AUX files, the generated keys for the same file must be the same for all versions + // of the page server. 
+ assert_eq!( + "6200000101E5B20C5F8DD5AA3289D6D9EAFA", + encode_aux_file_key("pg_logical/mappings/test1").to_string() + ); + assert_eq!( + "620000010239AAC544893139B26F501B97E6", + encode_aux_file_key("pg_logical/snapshots/test2").to_string() + ); + assert_eq!( + "620000010300000000000000000000000000", + encode_aux_file_key("pg_logical/replorigin_checkpoint").to_string() + ); + assert_eq!( + "62000001FF8635AF2134B7266EC5B4189FD6", + encode_aux_file_key("pg_logical/unsupported").to_string() + ); + assert_eq!( + "6200000201772D0E5D71DE14DA86142A1619", + encode_aux_file_key("pg_replslot/test3").to_string() + ); + assert_eq!( + "620000FFFF1866EBEB53B807B26A2416F317", + encode_aux_file_key("other_file_not_supported").to_string() + ); + } +} diff --git a/pageserver/src/basebackup.rs b/pageserver/src/basebackup.rs index 0479d05f8f..58b18dae7d 100644 --- a/pageserver/src/basebackup.rs +++ b/pageserver/src/basebackup.rs @@ -10,7 +10,7 @@ //! This module is responsible for creation of such tarball //! from data stored in object storage. //! -use anyhow::{anyhow, bail, ensure, Context}; +use anyhow::{anyhow, Context}; use bytes::{BufMut, Bytes, BytesMut}; use fail::fail_point; use pageserver_api::key::{key_to_slru_block, Key}; @@ -38,6 +38,14 @@ use postgres_ffi::PG_TLI; use postgres_ffi::{BLCKSZ, RELSEG_SIZE, WAL_SEGMENT_SIZE}; use utils::lsn::Lsn; +#[derive(Debug, thiserror::Error)] +pub enum BasebackupError { + #[error("basebackup pageserver error {0:#}")] + Server(#[from] anyhow::Error), + #[error("basebackup client error {0:#}")] + Client(#[source] io::Error), +} + /// Create basebackup with non-rel data in it. /// Only include relational data if 'full_backup' is true. 
/// @@ -53,7 +61,7 @@ pub async fn send_basebackup_tarball<'a, W>( prev_lsn: Option, full_backup: bool, ctx: &'a RequestContext, -) -> anyhow::Result<()> +) -> Result<(), BasebackupError> where W: AsyncWrite + Send + Sync + Unpin, { @@ -92,8 +100,10 @@ where // Consolidate the derived and the provided prev_lsn values let prev_lsn = if let Some(provided_prev_lsn) = prev_lsn { - if backup_prev != Lsn(0) { - ensure!(backup_prev == provided_prev_lsn); + if backup_prev != Lsn(0) && backup_prev != provided_prev_lsn { + return Err(BasebackupError::Server(anyhow!( + "backup_prev {backup_prev} != provided_prev_lsn {provided_prev_lsn}" + ))); } provided_prev_lsn } else { @@ -159,15 +169,26 @@ where } } - async fn add_block(&mut self, key: &Key, block: Bytes) -> anyhow::Result<()> { + async fn add_block(&mut self, key: &Key, block: Bytes) -> Result<(), BasebackupError> { let (kind, segno, _) = key_to_slru_block(*key)?; match kind { SlruKind::Clog => { - ensure!(block.len() == BLCKSZ as usize || block.len() == BLCKSZ as usize + 8); + if !(block.len() == BLCKSZ as usize || block.len() == BLCKSZ as usize + 8) { + return Err(BasebackupError::Server(anyhow!( + "invalid SlruKind::Clog record: block.len()={}", + block.len() + ))); + } } SlruKind::MultiXactMembers | SlruKind::MultiXactOffsets => { - ensure!(block.len() == BLCKSZ as usize); + if block.len() != BLCKSZ as usize { + return Err(BasebackupError::Server(anyhow!( + "invalid {:?} record: block.len()={}", + kind, + block.len() + ))); + } } } @@ -194,12 +215,15 @@ where Ok(()) } - async fn flush(&mut self) -> anyhow::Result<()> { + async fn flush(&mut self) -> Result<(), BasebackupError> { let nblocks = self.buf.len() / BLCKSZ as usize; let (kind, segno) = self.current_segment.take().unwrap(); let segname = format!("{}/{:>04X}", kind.to_str(), segno); let header = new_tar_header(&segname, self.buf.len() as u64)?; - self.ar.append(&header, self.buf.as_slice()).await?; + self.ar + .append(&header, self.buf.as_slice()) + .await + 
.map_err(BasebackupError::Client)?; self.total_blocks += nblocks; debug!("Added to basebackup slru {} relsize {}", segname, nblocks); @@ -209,7 +233,7 @@ where Ok(()) } - async fn finish(mut self) -> anyhow::Result<()> { + async fn finish(mut self) -> Result<(), BasebackupError> { let res = if self.current_segment.is_none() || self.buf.is_empty() { Ok(()) } else { @@ -226,7 +250,7 @@ impl<'a, W> Basebackup<'a, W> where W: AsyncWrite + Send + Sync + Unpin, { - async fn send_tarball(mut self) -> anyhow::Result<()> { + async fn send_tarball(mut self) -> Result<(), BasebackupError> { // TODO include checksum let lazy_slru_download = self.timeline.get_lazy_slru_download() && !self.full_backup; @@ -262,16 +286,25 @@ where let slru_partitions = self .timeline .get_slru_keyspace(Version::Lsn(self.lsn), self.ctx) - .await? - .partition(Timeline::MAX_GET_VECTORED_KEYS * BLCKSZ as u64); + .await + .map_err(|e| BasebackupError::Server(e.into()))? + .partition( + self.timeline.get_shard_identity(), + Timeline::MAX_GET_VECTORED_KEYS * BLCKSZ as u64, + ); let mut slru_builder = SlruSegmentsBuilder::new(&mut self.ar); for part in slru_partitions.parts { - let blocks = self.timeline.get_vectored(part, self.lsn, self.ctx).await?; + let blocks = self + .timeline + .get_vectored(part, self.lsn, self.ctx) + .await + .map_err(|e| BasebackupError::Server(e.into()))?; for (key, block) in blocks { - slru_builder.add_block(&key, block?).await?; + let block = block.map_err(|e| BasebackupError::Server(e.into()))?; + slru_builder.add_block(&key, block).await?; } } slru_builder.finish().await?; @@ -279,8 +312,11 @@ where let mut min_restart_lsn: Lsn = Lsn::MAX; // Create tablespace directories - for ((spcnode, dbnode), has_relmap_file) in - self.timeline.list_dbdirs(self.lsn, self.ctx).await? + for ((spcnode, dbnode), has_relmap_file) in self + .timeline + .list_dbdirs(self.lsn, self.ctx) + .await + .map_err(|e| BasebackupError::Server(e.into()))? 
{ self.add_dbdir(spcnode, dbnode, has_relmap_file).await?; @@ -289,7 +325,8 @@ where let rels = self .timeline .list_rels(spcnode, dbnode, Version::Lsn(self.lsn), self.ctx) - .await?; + .await + .map_err(|e| BasebackupError::Server(e.into()))?; for &rel in rels.iter() { // Send init fork as main fork to provide well formed empty // contents of UNLOGGED relations. Postgres copies it in @@ -312,7 +349,12 @@ where } } - for (path, content) in self.timeline.list_aux_files(self.lsn, self.ctx).await? { + for (path, content) in self + .timeline + .list_aux_files(self.lsn, self.ctx) + .await + .map_err(|e| BasebackupError::Server(e.into()))? + { if path.starts_with("pg_replslot") { let offs = pg_constants::REPL_SLOT_ON_DISK_OFFSETOF_RESTART_LSN; let restart_lsn = Lsn(u64::from_le_bytes( @@ -343,34 +385,41 @@ where for xid in self .timeline .list_twophase_files(self.lsn, self.ctx) - .await? + .await + .map_err(|e| BasebackupError::Server(e.into()))? { self.add_twophase_file(xid).await?; } fail_point!("basebackup-before-control-file", |_| { - bail!("failpoint basebackup-before-control-file") + Err(BasebackupError::Server(anyhow!( + "failpoint basebackup-before-control-file" + ))) }); // Generate pg_control and bootstrap WAL segment. self.add_pgcontrol_file().await?; - self.ar.finish().await?; + self.ar.finish().await.map_err(BasebackupError::Client)?; debug!("all tarred up!"); Ok(()) } /// Add contents of relfilenode `src`, naming it as `dst`. 
- async fn add_rel(&mut self, src: RelTag, dst: RelTag) -> anyhow::Result<()> { + async fn add_rel(&mut self, src: RelTag, dst: RelTag) -> Result<(), BasebackupError> { let nblocks = self .timeline - .get_rel_size(src, Version::Lsn(self.lsn), false, self.ctx) - .await?; + .get_rel_size(src, Version::Lsn(self.lsn), self.ctx) + .await + .map_err(|e| BasebackupError::Server(e.into()))?; // If the relation is empty, create an empty file if nblocks == 0 { let file_name = dst.to_segfile_name(0); let header = new_tar_header(&file_name, 0)?; - self.ar.append(&header, &mut io::empty()).await?; + self.ar + .append(&header, &mut io::empty()) + .await + .map_err(BasebackupError::Client)?; return Ok(()); } @@ -384,14 +433,18 @@ where for blknum in startblk..endblk { let img = self .timeline - .get_rel_page_at_lsn(src, blknum, Version::Lsn(self.lsn), false, self.ctx) - .await?; + .get_rel_page_at_lsn(src, blknum, Version::Lsn(self.lsn), self.ctx) + .await + .map_err(|e| BasebackupError::Server(e.into()))?; segment_data.extend_from_slice(&img[..]); } let file_name = dst.to_segfile_name(seg as u32); let header = new_tar_header(&file_name, segment_data.len() as u64)?; - self.ar.append(&header, segment_data.as_slice()).await?; + self.ar + .append(&header, segment_data.as_slice()) + .await + .map_err(BasebackupError::Client)?; seg += 1; startblk = endblk; @@ -411,20 +464,22 @@ where spcnode: u32, dbnode: u32, has_relmap_file: bool, - ) -> anyhow::Result<()> { + ) -> Result<(), BasebackupError> { let relmap_img = if has_relmap_file { let img = self .timeline .get_relmap_file(spcnode, dbnode, Version::Lsn(self.lsn), self.ctx) - .await?; + .await + .map_err(|e| BasebackupError::Server(e.into()))?; - ensure!( - img.len() - == dispatch_pgversion!( - self.timeline.pg_version, - pgv::bindings::SIZEOF_RELMAPFILE - ) - ); + if img.len() + != dispatch_pgversion!(self.timeline.pg_version, pgv::bindings::SIZEOF_RELMAPFILE) + { + return Err(BasebackupError::Server(anyhow!( + "img.len() != 
SIZE_OF_RELMAPFILE, img.len()={}", + img.len(), + ))); + } Some(img) } else { @@ -437,14 +492,20 @@ where ver => format!("{ver}\x0A"), }; let header = new_tar_header("PG_VERSION", pg_version_str.len() as u64)?; - self.ar.append(&header, pg_version_str.as_bytes()).await?; + self.ar + .append(&header, pg_version_str.as_bytes()) + .await + .map_err(BasebackupError::Client)?; info!("timeline.pg_version {}", self.timeline.pg_version); if let Some(img) = relmap_img { // filenode map for global tablespace let header = new_tar_header("global/pg_filenode.map", img.len() as u64)?; - self.ar.append(&header, &img[..]).await?; + self.ar + .append(&header, &img[..]) + .await + .map_err(BasebackupError::Client)?; } else { warn!("global/pg_filenode.map is missing"); } @@ -463,18 +524,26 @@ where && self .timeline .list_rels(spcnode, dbnode, Version::Lsn(self.lsn), self.ctx) - .await? + .await + .map_err(|e| BasebackupError::Server(e.into()))? .is_empty() { return Ok(()); } // User defined tablespaces are not supported - ensure!(spcnode == DEFAULTTABLESPACE_OID); + if spcnode != DEFAULTTABLESPACE_OID { + return Err(BasebackupError::Server(anyhow!( + "spcnode != DEFAULTTABLESPACE_OID, spcnode={spcnode}" + ))); + } // Append dir path for each database let path = format!("base/{}", dbnode); let header = new_tar_header_dir(&path)?; - self.ar.append(&header, &mut io::empty()).await?; + self.ar + .append(&header, &mut io::empty()) + .await + .map_err(BasebackupError::Client)?; if let Some(img) = relmap_img { let dst_path = format!("base/{}/PG_VERSION", dbnode); @@ -484,11 +553,17 @@ where ver => format!("{ver}\x0A"), }; let header = new_tar_header(&dst_path, pg_version_str.len() as u64)?; - self.ar.append(&header, pg_version_str.as_bytes()).await?; + self.ar + .append(&header, pg_version_str.as_bytes()) + .await + .map_err(BasebackupError::Client)?; let relmap_path = format!("base/{}/pg_filenode.map", dbnode); let header = new_tar_header(&relmap_path, img.len() as u64)?; - 
self.ar.append(&header, &img[..]).await?; + self.ar + .append(&header, &img[..]) + .await + .map_err(BasebackupError::Client)?; } }; Ok(()) @@ -497,11 +572,12 @@ where // // Extract twophase state files // - async fn add_twophase_file(&mut self, xid: TransactionId) -> anyhow::Result<()> { + async fn add_twophase_file(&mut self, xid: TransactionId) -> Result<(), BasebackupError> { let img = self .timeline .get_twophase_file(xid, self.lsn, self.ctx) - .await?; + .await + .map_err(|e| BasebackupError::Server(e.into()))?; let mut buf = BytesMut::new(); buf.extend_from_slice(&img[..]); @@ -509,7 +585,10 @@ where buf.put_u32_le(crc); let path = format!("pg_twophase/{:>08X}", xid); let header = new_tar_header(&path, buf.len() as u64)?; - self.ar.append(&header, &buf[..]).await?; + self.ar + .append(&header, &buf[..]) + .await + .map_err(BasebackupError::Client)?; Ok(()) } @@ -518,24 +597,28 @@ where // Add generated pg_control file and bootstrap WAL segment. // Also send zenith.signal file with extra bootstrap data. 
// - async fn add_pgcontrol_file(&mut self) -> anyhow::Result<()> { + async fn add_pgcontrol_file(&mut self) -> Result<(), BasebackupError> { // add zenith.signal file let mut zenith_signal = String::new(); if self.prev_record_lsn == Lsn(0) { if self.lsn == self.timeline.get_ancestor_lsn() { - write!(zenith_signal, "PREV LSN: none")?; + write!(zenith_signal, "PREV LSN: none") + .map_err(|e| BasebackupError::Server(e.into()))?; } else { - write!(zenith_signal, "PREV LSN: invalid")?; + write!(zenith_signal, "PREV LSN: invalid") + .map_err(|e| BasebackupError::Server(e.into()))?; } } else { - write!(zenith_signal, "PREV LSN: {}", self.prev_record_lsn)?; + write!(zenith_signal, "PREV LSN: {}", self.prev_record_lsn) + .map_err(|e| BasebackupError::Server(e.into()))?; } self.ar .append( &new_tar_header("zenith.signal", zenith_signal.len() as u64)?, zenith_signal.as_bytes(), ) - .await?; + .await + .map_err(BasebackupError::Client)?; let checkpoint_bytes = self .timeline @@ -557,7 +640,10 @@ where //send pg_control let header = new_tar_header("global/pg_control", pg_control_bytes.len() as u64)?; - self.ar.append(&header, &pg_control_bytes[..]).await?; + self.ar + .append(&header, &pg_control_bytes[..]) + .await + .map_err(BasebackupError::Client)?; //send wal segment let segno = self.lsn.segment_number(WAL_SEGMENT_SIZE); @@ -572,8 +658,16 @@ where self.lsn, ) .map_err(|e| anyhow!(e).context("Failed generating wal segment"))?; - ensure!(wal_seg.len() == WAL_SEGMENT_SIZE); - self.ar.append(&header, &wal_seg[..]).await?; + if wal_seg.len() != WAL_SEGMENT_SIZE { + return Err(BasebackupError::Server(anyhow!( + "wal_seg.len() != WAL_SEGMENT_SIZE, wal_seg.len()={}", + wal_seg.len() + ))); + } + self.ar + .append(&header, &wal_seg[..]) + .await + .map_err(BasebackupError::Client)?; Ok(()) } } diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs index 0903b206ff..1345223a43 100644 --- a/pageserver/src/bin/pageserver.rs +++ 
b/pageserver/src/bin/pageserver.rs @@ -121,8 +121,10 @@ fn main() -> anyhow::Result<()> { &[("node_id", &conf.id.to_string())], ); - // after setting up logging, log the effective IO engine choice + // after setting up logging, log the effective IO engine choice and read path implementations info!(?conf.virtual_file_io_engine, "starting with virtual_file IO engine"); + info!(?conf.get_impl, "starting with get page implementation"); + info!(?conf.get_vectored_impl, "starting with vectored get page implementation"); let tenants_path = conf.tenants_path(); if !tenants_path.exists() { @@ -285,6 +287,7 @@ fn start_pageserver( )) .unwrap(); pageserver::preinitialize_metrics(); + pageserver::metrics::wal_redo::set_process_kind_metric(conf.walredo_process_kind); // If any failpoints were set from FAILPOINTS environment variable, // print them to the log for debugging purposes diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs index 1837da34ce..258eed0b12 100644 --- a/pageserver/src/config.rs +++ b/pageserver/src/config.rs @@ -9,7 +9,7 @@ use pageserver_api::shard::TenantShardId; use remote_storage::{RemotePath, RemoteStorageConfig}; use serde; use serde::de::IntoDeserializer; -use std::{collections::HashMap, env}; +use std::env; use storage_broker::Uri; use utils::crashsafe::path_with_suffix_extension; use utils::id::ConnectionId; @@ -30,9 +30,9 @@ use utils::{ logging::LogFormat, }; -use crate::tenant::config::TenantConfOpt; use crate::tenant::timeline::GetVectoredImpl; use crate::tenant::vectored_blob_io::MaxVectoredReadBytes; +use crate::tenant::{config::TenantConfOpt, timeline::GetImpl}; use crate::tenant::{ TENANTS_SEGMENT_NAME, TENANT_DELETED_MARKER_FILE_NAME, TIMELINES_SEGMENT_NAME, }; @@ -51,7 +51,7 @@ pub mod defaults { use crate::tenant::config::defaults::*; use const_format::formatcp; - pub use pageserver_api::{ + pub use pageserver_api::config::{ DEFAULT_HTTP_LISTEN_ADDR, DEFAULT_HTTP_LISTEN_PORT, DEFAULT_PG_LISTEN_ADDR, DEFAULT_PG_LISTEN_PORT, }; 
@@ -91,12 +91,16 @@ pub mod defaults { pub const DEFAULT_GET_VECTORED_IMPL: &str = "sequential"; + pub const DEFAULT_GET_IMPL: &str = "legacy"; + pub const DEFAULT_MAX_VECTORED_READ_BYTES: usize = 128 * 1024; // 128 KiB pub const DEFAULT_VALIDATE_VECTORED_GET: bool = true; pub const DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB: usize = 0; + pub const DEFAULT_WALREDO_PROCESS_KIND: &str = "sync"; + /// /// Default built-in configuration file. /// @@ -136,10 +140,14 @@ pub mod defaults { #get_vectored_impl = '{DEFAULT_GET_VECTORED_IMPL}' +#get_impl = '{DEFAULT_GET_IMPL}' + #max_vectored_read_bytes = '{DEFAULT_MAX_VECTORED_READ_BYTES}' #validate_vectored_get = '{DEFAULT_VALIDATE_VECTORED_GET}' +#walredo_process_kind = '{DEFAULT_WALREDO_PROCESS_KIND}' + [tenant_config] #checkpoint_distance = {DEFAULT_CHECKPOINT_DISTANCE} # in bytes #checkpoint_timeout = {DEFAULT_CHECKPOINT_TIMEOUT} @@ -280,6 +288,8 @@ pub struct PageServerConf { pub get_vectored_impl: GetVectoredImpl, + pub get_impl: GetImpl, + pub max_vectored_read_bytes: MaxVectoredReadBytes, pub validate_vectored_get: bool, @@ -290,6 +300,8 @@ pub struct PageServerConf { /// /// Setting this to zero disables limits on total ephemeral layer size. pub ephemeral_bytes_per_memory_kb: usize, + + pub walredo_process_kind: crate::walredo::ProcessKind, } /// We do not want to store this in a PageServerConf because the latter may be logged @@ -323,26 +335,6 @@ impl BuilderValue { } } -// Certain metadata (e.g. externally-addressable name, AZ) is delivered -// as a separate structure. This information is not neeed by the pageserver -// itself, it is only used for registering the pageserver with the control -// plane and/or storage controller. 
-// -#[derive(serde::Deserialize)] -pub(crate) struct NodeMetadata { - #[serde(rename = "host")] - pub(crate) postgres_host: String, - #[serde(rename = "port")] - pub(crate) postgres_port: u16, - pub(crate) http_host: String, - pub(crate) http_port: u16, - - // Deployment tools may write fields to the metadata file beyond what we - // use in this type: this type intentionally only names fields that require. - #[serde(flatten)] - pub(crate) other: HashMap, -} - // needed to simplify config construction #[derive(Default)] struct PageServerConfigBuilder { @@ -408,11 +400,15 @@ struct PageServerConfigBuilder { get_vectored_impl: BuilderValue, + get_impl: BuilderValue, + max_vectored_read_bytes: BuilderValue, validate_vectored_get: BuilderValue, ephemeral_bytes_per_memory_kb: BuilderValue, + + walredo_process_kind: BuilderValue, } impl PageServerConfigBuilder { @@ -495,11 +491,14 @@ impl PageServerConfigBuilder { virtual_file_io_engine: Set(DEFAULT_VIRTUAL_FILE_IO_ENGINE.parse().unwrap()), get_vectored_impl: Set(DEFAULT_GET_VECTORED_IMPL.parse().unwrap()), + get_impl: Set(DEFAULT_GET_IMPL.parse().unwrap()), max_vectored_read_bytes: Set(MaxVectoredReadBytes( NonZeroUsize::new(DEFAULT_MAX_VECTORED_READ_BYTES).unwrap(), )), validate_vectored_get: Set(DEFAULT_VALIDATE_VECTORED_GET), ephemeral_bytes_per_memory_kb: Set(DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB), + + walredo_process_kind: Set(DEFAULT_WALREDO_PROCESS_KIND.parse().unwrap()), } } } @@ -671,6 +670,10 @@ impl PageServerConfigBuilder { self.get_vectored_impl = BuilderValue::Set(value); } + pub fn get_impl(&mut self, value: GetImpl) { + self.get_impl = BuilderValue::Set(value); + } + pub fn get_max_vectored_read_bytes(&mut self, value: MaxVectoredReadBytes) { self.max_vectored_read_bytes = BuilderValue::Set(value); } @@ -683,6 +686,10 @@ impl PageServerConfigBuilder { self.ephemeral_bytes_per_memory_kb = BuilderValue::Set(value); } + pub fn get_walredo_process_kind(&mut self, value: crate::walredo::ProcessKind) { + 
self.walredo_process_kind = BuilderValue::Set(value); + } + pub fn build(self) -> anyhow::Result { let default = Self::default_values(); @@ -736,9 +743,11 @@ impl PageServerConfigBuilder { secondary_download_concurrency, ingest_batch_size, get_vectored_impl, + get_impl, max_vectored_read_bytes, validate_vectored_get, ephemeral_bytes_per_memory_kb, + walredo_process_kind, } CUSTOM LOGIC { @@ -1020,6 +1029,9 @@ impl PageServerConf { "get_vectored_impl" => { builder.get_vectored_impl(parse_toml_from_str("get_vectored_impl", item)?) } + "get_impl" => { + builder.get_impl(parse_toml_from_str("get_impl", item)?) + } "max_vectored_read_bytes" => { let bytes = parse_toml_u64("max_vectored_read_bytes", item)? as usize; builder.get_max_vectored_read_bytes( @@ -1032,6 +1044,9 @@ impl PageServerConf { "ephemeral_bytes_per_memory_kb" => { builder.get_ephemeral_bytes_per_memory_kb(parse_toml_u64("ephemeral_bytes_per_memory_kb", item)? as usize) } + "walredo_process_kind" => { + builder.get_walredo_process_kind(parse_toml_from_str("walredo_process_kind", item)?) 
+ } _ => bail!("unrecognized pageserver option '{key}'"), } } @@ -1108,12 +1123,14 @@ impl PageServerConf { ingest_batch_size: defaults::DEFAULT_INGEST_BATCH_SIZE, virtual_file_io_engine: DEFAULT_VIRTUAL_FILE_IO_ENGINE.parse().unwrap(), get_vectored_impl: defaults::DEFAULT_GET_VECTORED_IMPL.parse().unwrap(), + get_impl: defaults::DEFAULT_GET_IMPL.parse().unwrap(), max_vectored_read_bytes: MaxVectoredReadBytes( NonZeroUsize::new(defaults::DEFAULT_MAX_VECTORED_READ_BYTES) .expect("Invalid default constant"), ), validate_vectored_get: defaults::DEFAULT_VALIDATE_VECTORED_GET, ephemeral_bytes_per_memory_kb: defaults::DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB, + walredo_process_kind: defaults::DEFAULT_WALREDO_PROCESS_KIND.parse().unwrap(), } } } @@ -1346,12 +1363,14 @@ background_task_maximum_delay = '334 s' ingest_batch_size: defaults::DEFAULT_INGEST_BATCH_SIZE, virtual_file_io_engine: DEFAULT_VIRTUAL_FILE_IO_ENGINE.parse().unwrap(), get_vectored_impl: defaults::DEFAULT_GET_VECTORED_IMPL.parse().unwrap(), + get_impl: defaults::DEFAULT_GET_IMPL.parse().unwrap(), max_vectored_read_bytes: MaxVectoredReadBytes( NonZeroUsize::new(defaults::DEFAULT_MAX_VECTORED_READ_BYTES) .expect("Invalid default constant") ), validate_vectored_get: defaults::DEFAULT_VALIDATE_VECTORED_GET, - ephemeral_bytes_per_memory_kb: defaults::DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB + ephemeral_bytes_per_memory_kb: defaults::DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB, + walredo_process_kind: defaults::DEFAULT_WALREDO_PROCESS_KIND.parse().unwrap(), }, "Correct defaults should be used when no config values are provided" ); @@ -1418,12 +1437,14 @@ background_task_maximum_delay = '334 s' ingest_batch_size: 100, virtual_file_io_engine: DEFAULT_VIRTUAL_FILE_IO_ENGINE.parse().unwrap(), get_vectored_impl: defaults::DEFAULT_GET_VECTORED_IMPL.parse().unwrap(), + get_impl: defaults::DEFAULT_GET_IMPL.parse().unwrap(), max_vectored_read_bytes: MaxVectoredReadBytes( NonZeroUsize::new(defaults::DEFAULT_MAX_VECTORED_READ_BYTES) 
.expect("Invalid default constant") ), validate_vectored_get: defaults::DEFAULT_VALIDATE_VECTORED_GET, - ephemeral_bytes_per_memory_kb: defaults::DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB + ephemeral_bytes_per_memory_kb: defaults::DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB, + walredo_process_kind: defaults::DEFAULT_WALREDO_PROCESS_KIND.parse().unwrap(), }, "Should be able to parse all basic config values correctly" ); @@ -1536,6 +1557,7 @@ broker_endpoint = '{broker_endpoint}' endpoint: Some(endpoint.clone()), concurrency_limit: s3_concurrency_limit, max_keys_per_list_response: None, + upload_storage_class: None, }), timeout: RemoteStorageConfig::DEFAULT_TIMEOUT, }, diff --git a/pageserver/src/consumption_metrics.rs b/pageserver/src/consumption_metrics.rs index f5540e896f..62bbde42f4 100644 --- a/pageserver/src/consumption_metrics.rs +++ b/pageserver/src/consumption_metrics.rs @@ -304,7 +304,7 @@ async fn calculate_synthetic_size_worker( continue; } - if !tenant_shard_id.is_zero() { + if !tenant_shard_id.is_shard_zero() { // We only send consumption metrics from shard 0, so don't waste time calculating // synthetic size on other shards. 
continue; diff --git a/pageserver/src/consumption_metrics/metrics.rs b/pageserver/src/consumption_metrics/metrics.rs index 6740c1360b..7ba2d04c4f 100644 --- a/pageserver/src/consumption_metrics/metrics.rs +++ b/pageserver/src/consumption_metrics/metrics.rs @@ -199,7 +199,7 @@ pub(super) async fn collect_all_metrics( }; let tenants = futures::stream::iter(tenants).filter_map(|(id, state, _)| async move { - if state != TenantState::Active || !id.is_zero() { + if state != TenantState::Active || !id.is_shard_zero() { None } else { tenant_manager diff --git a/pageserver/src/control_plane_client.rs b/pageserver/src/control_plane_client.rs index f0ed46ce23..26e7cc7ef8 100644 --- a/pageserver/src/control_plane_client.rs +++ b/pageserver/src/control_plane_client.rs @@ -14,10 +14,8 @@ use tokio_util::sync::CancellationToken; use url::Url; use utils::{backoff, failpoint_support, generation::Generation, id::NodeId}; -use crate::{ - config::{NodeMetadata, PageServerConf}, - virtual_file::on_fatal_io_error, -}; +use crate::{config::PageServerConf, virtual_file::on_fatal_io_error}; +use pageserver_api::config::NodeMetadata; /// The Pageserver's client for using the control plane API: this is a small subset /// of the overall control plane API, for dealing with generations (see docs/rfcs/025-generation-numbers.md) @@ -65,7 +63,7 @@ impl ControlPlaneClient { let mut client = reqwest::ClientBuilder::new(); if let Some(jwt) = &conf.control_plane_api_token { - let mut headers = hyper::HeaderMap::new(); + let mut headers = reqwest::header::HeaderMap::new(); headers.insert( "Authorization", format!("Bearer {}", jwt.get_contents()).parse().unwrap(), diff --git a/pageserver/src/http/openapi_spec.yml b/pageserver/src/http/openapi_spec.yml index 2713309824..c425f3e628 100644 --- a/pageserver/src/http/openapi_spec.yml +++ b/pageserver/src/http/openapi_spec.yml @@ -58,24 +58,6 @@ paths: responses: "200": description: The reload completed successfully. 
- "401": - description: Unauthorized Error - content: - application/json: - schema: - $ref: "#/components/schemas/UnauthorizedError" - "403": - description: Forbidden Error - content: - application/json: - schema: - $ref: "#/components/schemas/ForbiddenError" - "500": - description: Generic operation error (also hits if no keys were found) - content: - application/json: - schema: - $ref: "#/components/schemas/Error" /v1/tenant/{tenant_id}: parameters: @@ -93,62 +75,14 @@ paths: application/json: schema: $ref: "#/components/schemas/TenantInfo" - "400": - description: Error when no tenant id found in path or no timeline id - content: - application/json: - schema: - $ref: "#/components/schemas/Error" - "401": - description: Unauthorized Error - content: - application/json: - schema: - $ref: "#/components/schemas/UnauthorizedError" - "403": - description: Forbidden Error - content: - application/json: - schema: - $ref: "#/components/schemas/ForbiddenError" - "500": - description: Generic operation error - content: - application/json: - schema: - $ref: "#/components/schemas/Error" - "503": - description: Temporarily unavailable, please retry. - content: - application/json: - schema: - $ref: "#/components/schemas/ServiceUnavailableError" delete: description: | Attempts to delete specified tenant. 500, 503 and 409 errors should be retried until 404 is retrieved. 404 means that deletion successfully finished" responses: - "400": - description: Error when no tenant id found in path - content: - application/json: - schema: - $ref: "#/components/schemas/Error" - "401": - description: Unauthorized Error - content: - application/json: - schema: - $ref: "#/components/schemas/UnauthorizedError" - "403": - description: Forbidden Error - content: - application/json: - schema: - $ref: "#/components/schemas/ForbiddenError" "404": - description: Tenant not found + description: Tenant not found. This is the success path. 
content: application/json: schema: @@ -165,18 +99,6 @@ paths: application/json: schema: $ref: "#/components/schemas/PreconditionFailedError" - "500": - description: Generic operation error - content: - application/json: - schema: - $ref: "#/components/schemas/Error" - "503": - description: Temporarily unavailable, please retry. - content: - application/json: - schema: - $ref: "#/components/schemas/ServiceUnavailableError" /v1/tenant/{tenant_id}/time_travel_remote_storage: parameters: @@ -206,36 +128,6 @@ paths: application/json: schema: type: string - "400": - description: Error when no tenant id found in path or invalid timestamp - content: - application/json: - schema: - $ref: "#/components/schemas/Error" - "401": - description: Unauthorized Error - content: - application/json: - schema: - $ref: "#/components/schemas/UnauthorizedError" - "403": - description: Forbidden Error - content: - application/json: - schema: - $ref: "#/components/schemas/ForbiddenError" - "500": - description: Generic operation error - content: - application/json: - schema: - $ref: "#/components/schemas/Error" - "503": - description: Temporarily unavailable, please retry. - content: - application/json: - schema: - $ref: "#/components/schemas/ServiceUnavailableError" /v1/tenant/{tenant_id}/timeline: parameters: @@ -255,36 +147,6 @@ paths: type: array items: $ref: "#/components/schemas/TimelineInfo" - "400": - description: Error when no tenant id found in path - content: - application/json: - schema: - $ref: "#/components/schemas/Error" - "401": - description: Unauthorized Error - content: - application/json: - schema: - $ref: "#/components/schemas/UnauthorizedError" - "403": - description: Forbidden Error - content: - application/json: - schema: - $ref: "#/components/schemas/ForbiddenError" - "500": - description: Generic operation error - content: - application/json: - schema: - $ref: "#/components/schemas/Error" - "503": - description: Temporarily unavailable, please retry. 
- content: - application/json: - schema: - $ref: "#/components/schemas/ServiceUnavailableError" /v1/tenant/{tenant_id}/timeline/{timeline_id}: @@ -309,60 +171,12 @@ paths: application/json: schema: $ref: "#/components/schemas/TimelineInfo" - "400": - description: Error when no tenant id found in path or no timeline id - content: - application/json: - schema: - $ref: "#/components/schemas/Error" - "401": - description: Unauthorized Error - content: - application/json: - schema: - $ref: "#/components/schemas/UnauthorizedError" - "403": - description: Forbidden Error - content: - application/json: - schema: - $ref: "#/components/schemas/ForbiddenError" - "500": - description: Generic operation error - content: - application/json: - schema: - $ref: "#/components/schemas/Error" - "503": - description: Temporarily unavailable, please retry. - content: - application/json: - schema: - $ref: "#/components/schemas/ServiceUnavailableError" delete: description: "Attempts to delete specified timeline. 500 and 409 errors should be retried" responses: - "400": - description: Error when no tenant id found in path or no timeline id - content: - application/json: - schema: - $ref: "#/components/schemas/Error" - "401": - description: Unauthorized Error - content: - application/json: - schema: - $ref: "#/components/schemas/UnauthorizedError" - "403": - description: Forbidden Error - content: - application/json: - schema: - $ref: "#/components/schemas/ForbiddenError" "404": - description: Timeline not found + description: Timeline not found. This is the success path. content: application/json: schema: @@ -379,18 +193,6 @@ paths: application/json: schema: $ref: "#/components/schemas/PreconditionFailedError" - "500": - description: Generic operation error - content: - application/json: - schema: - $ref: "#/components/schemas/Error" - "503": - description: Temporarily unavailable, please retry. 
- content: - application/json: - schema: - $ref: "#/components/schemas/ServiceUnavailableError" /v1/tenant/{tenant_id}/timeline/{timeline_id}/get_timestamp_of_lsn: parameters: @@ -423,36 +225,6 @@ paths: schema: type: string format: date-time - "400": - description: Error when no tenant id found in path, no timeline id or invalid timestamp - content: - application/json: - schema: - $ref: "#/components/schemas/Error" - "401": - description: Unauthorized Error - content: - application/json: - schema: - $ref: "#/components/schemas/UnauthorizedError" - "403": - description: Forbidden Error - content: - application/json: - schema: - $ref: "#/components/schemas/ForbiddenError" - "404": - description: Timeline not found, or there is no timestamp information for the given lsn - content: - application/json: - schema: - $ref: "#/components/schemas/NotFoundError" - "500": - description: Generic operation error - content: - application/json: - schema: - $ref: "#/components/schemas/Error" /v1/tenant/{tenant_id}/timeline/{timeline_id}/get_lsn_by_timestamp: parameters: @@ -484,36 +256,6 @@ paths: application/json: schema: $ref: "#/components/schemas/LsnByTimestampResponse" - "400": - description: Error when no tenant id found in path, no timeline id or invalid timestamp - content: - application/json: - schema: - $ref: "#/components/schemas/Error" - "401": - description: Unauthorized Error - content: - application/json: - schema: - $ref: "#/components/schemas/UnauthorizedError" - "403": - description: Forbidden Error - content: - application/json: - schema: - $ref: "#/components/schemas/ForbiddenError" - "500": - description: Generic operation error - content: - application/json: - schema: - $ref: "#/components/schemas/Error" - "503": - description: Temporarily unavailable, please retry. 
- content: - application/json: - schema: - $ref: "#/components/schemas/ServiceUnavailableError" /v1/tenant/{tenant_id}/timeline/{timeline_id}/do_gc: parameters: @@ -537,36 +279,6 @@ paths: application/json: schema: type: string - "400": - description: Error when no tenant id found in path, no timeline id or invalid timestamp - content: - application/json: - schema: - $ref: "#/components/schemas/Error" - "401": - description: Unauthorized Error - content: - application/json: - schema: - $ref: "#/components/schemas/UnauthorizedError" - "403": - description: Forbidden Error - content: - application/json: - schema: - $ref: "#/components/schemas/ForbiddenError" - "500": - description: Generic operation error - content: - application/json: - schema: - $ref: "#/components/schemas/Error" - "503": - description: Temporarily unavailable, please retry. - content: - application/json: - schema: - $ref: "#/components/schemas/ServiceUnavailableError" /v1/tenant/{tenant_shard_id}/location_config: parameters: - name: tenant_shard_id @@ -628,24 +340,6 @@ paths: application/json: schema: $ref: "#/components/schemas/TenantLocationConfigResponse" - "503": - description: Tenant's state cannot be changed right now. Wait a few seconds and retry. 
- content: - application/json: - schema: - $ref: "#/components/schemas/Error" - "401": - description: Unauthorized Error - content: - application/json: - schema: - $ref: "#/components/schemas/UnauthorizedError" - "403": - description: Forbidden Error - content: - application/json: - schema: - $ref: "#/components/schemas/ForbiddenError" "409": description: | The tenant is already known to Pageserver in some way, @@ -662,12 +356,6 @@ paths: application/json: schema: $ref: "#/components/schemas/ConflictError" - "500": - description: Generic operation error - content: - application/json: - schema: - $ref: "#/components/schemas/Error" /v1/tenant/{tenant_id}/ignore: parameters: - name: tenant_id @@ -684,36 +372,6 @@ paths: responses: "200": description: Tenant ignored - "400": - description: Error when no tenant id found in path parameters - content: - application/json: - schema: - $ref: "#/components/schemas/Error" - "401": - description: Unauthorized Error - content: - application/json: - schema: - $ref: "#/components/schemas/UnauthorizedError" - "403": - description: Forbidden Error - content: - application/json: - schema: - $ref: "#/components/schemas/ForbiddenError" - "500": - description: Generic operation error - content: - application/json: - schema: - $ref: "#/components/schemas/Error" - "503": - description: Temporarily unavailable, please retry. 
- content: - application/json: - schema: - $ref: "#/components/schemas/ServiceUnavailableError" /v1/tenant/{tenant_id}/load: @@ -740,36 +398,6 @@ paths: responses: "202": description: Tenant scheduled to load successfully - "400": - description: Error when no tenant id found in path parameters - content: - application/json: - schema: - $ref: "#/components/schemas/Error" - "401": - description: Unauthorized Error - content: - application/json: - schema: - $ref: "#/components/schemas/UnauthorizedError" - "403": - description: Forbidden Error - content: - application/json: - schema: - $ref: "#/components/schemas/ForbiddenError" - "500": - description: Generic operation error - content: - application/json: - schema: - $ref: "#/components/schemas/Error" - "503": - description: Temporarily unavailable, please retry. - content: - application/json: - schema: - $ref: "#/components/schemas/ServiceUnavailableError" /v1/tenant/{tenant_id}/{timeline_id}/preserve_initdb_archive: parameters: @@ -790,37 +418,6 @@ paths: responses: "202": description: Tenant scheduled to load successfully - "404": - description: No tenant or timeline found for the specified ids - content: - application/json: - schema: - $ref: "#/components/schemas/Error" - "401": - description: Unauthorized Error - content: - application/json: - schema: - $ref: "#/components/schemas/UnauthorizedError" - "403": - description: Forbidden Error - content: - application/json: - schema: - $ref: "#/components/schemas/ForbiddenError" - "500": - description: Generic operation error - content: - application/json: - schema: - $ref: "#/components/schemas/Error" - "503": - description: Temporarily unavailable, please retry. 
- content: - application/json: - schema: - $ref: "#/components/schemas/ServiceUnavailableError" - /v1/tenant/{tenant_id}/synthetic_size: parameters: @@ -839,31 +436,8 @@ paths: application/json: schema: $ref: "#/components/schemas/SyntheticSizeResponse" - "401": - description: Unauthorized Error - content: - application/json: - schema: - $ref: "#/components/schemas/UnauthorizedError" - "403": - description: Forbidden Error - content: - application/json: - schema: - $ref: "#/components/schemas/ForbiddenError" - "500": - description: Generic operation error - content: - application/json: - schema: - $ref: "#/components/schemas/Error" - "503": - description: Temporarily unavailable, please retry. - content: - application/json: - schema: - $ref: "#/components/schemas/ServiceUnavailableError" + # This route has no handler. TODO: remove? /v1/tenant/{tenant_id}/size: parameters: - name: tenant_id @@ -945,18 +519,6 @@ paths: responses: "200": description: Success - "500": - description: Generic operation error - content: - application/json: - schema: - $ref: "#/components/schemas/Error" - "503": - description: Temporarily unavailable, please retry. - content: - application/json: - schema: - $ref: "#/components/schemas/ServiceUnavailableError" /v1/tenant/{tenant_shard_id}/secondary/download: parameters: @@ -987,20 +549,6 @@ paths: application/json: schema: $ref: "#/components/schemas/SecondaryProgress" - "500": - description: Generic operation error - content: - application/json: - schema: - $ref: "#/components/schemas/Error" - "503": - description: Temporarily unavailable, please retry. 
- content: - application/json: - schema: - $ref: "#/components/schemas/ServiceUnavailableError" - - /v1/tenant/{tenant_id}/timeline/: parameters: @@ -1043,24 +591,6 @@ paths: application/json: schema: $ref: "#/components/schemas/TimelineInfo" - "400": - description: Malformed timeline create request - content: - application/json: - schema: - $ref: "#/components/schemas/Error" - "401": - description: Unauthorized Error - content: - application/json: - schema: - $ref: "#/components/schemas/UnauthorizedError" - "403": - description: Forbidden Error - content: - application/json: - schema: - $ref: "#/components/schemas/ForbiddenError" "406": description: Permanently unsatisfiable request, don't retry. content: @@ -1079,18 +609,6 @@ paths: application/json: schema: $ref: "#/components/schemas/Error" - "500": - description: Generic operation error - content: - application/json: - schema: - $ref: "#/components/schemas/Error" - "503": - description: Temporarily unavailable, please retry. - content: - application/json: - schema: - $ref: "#/components/schemas/ServiceUnavailableError" /v1/tenant/: get: @@ -1104,30 +622,6 @@ paths: type: array items: $ref: "#/components/schemas/TenantInfo" - "401": - description: Unauthorized Error - content: - application/json: - schema: - $ref: "#/components/schemas/UnauthorizedError" - "403": - description: Forbidden Error - content: - application/json: - schema: - $ref: "#/components/schemas/ForbiddenError" - "500": - description: Generic operation error - content: - application/json: - schema: - $ref: "#/components/schemas/Error" - "503": - description: Temporarily unavailable, please retry. 
- content: - application/json: - schema: - $ref: "#/components/schemas/ServiceUnavailableError" post: description: | @@ -1148,43 +642,12 @@ paths: application/json: schema: type: string - "400": - description: Malformed tenant create request - content: - application/json: - schema: - $ref: "#/components/schemas/Error" - "401": - description: Unauthorized Error - content: - application/json: - schema: - $ref: "#/components/schemas/UnauthorizedError" - "403": - description: Forbidden Error - content: - application/json: - schema: - $ref: "#/components/schemas/ForbiddenError" "409": description: Tenant already exists, creation skipped content: application/json: schema: $ref: "#/components/schemas/ConflictError" - "500": - description: Generic operation error - content: - application/json: - schema: - $ref: "#/components/schemas/Error" - "503": - description: Temporarily unavailable, please retry. - content: - application/json: - schema: - $ref: "#/components/schemas/ServiceUnavailableError" - /v1/tenant/config: put: @@ -1206,36 +669,6 @@ paths: type: array items: $ref: "#/components/schemas/TenantInfo" - "400": - description: Malformed tenant config request - content: - application/json: - schema: - $ref: "#/components/schemas/Error" - "401": - description: Unauthorized Error - content: - application/json: - schema: - $ref: "#/components/schemas/UnauthorizedError" - "403": - description: Forbidden Error - content: - application/json: - schema: - $ref: "#/components/schemas/ForbiddenError" - "500": - description: Generic operation error - content: - application/json: - schema: - $ref: "#/components/schemas/Error" - "503": - description: Temporarily unavailable, please retry. 
- content: - application/json: - schema: - $ref: "#/components/schemas/ServiceUnavailableError" /v1/tenant/{tenant_id}/config/: parameters: @@ -1255,42 +688,6 @@ paths: application/json: schema: $ref: "#/components/schemas/TenantConfigResponse" - "400": - description: Malformed get tenanant config request - content: - application/json: - schema: - $ref: "#/components/schemas/Error" - "401": - description: Unauthorized Error - content: - application/json: - schema: - $ref: "#/components/schemas/UnauthorizedError" - "403": - description: Forbidden Error - content: - application/json: - schema: - $ref: "#/components/schemas/ForbiddenError" - "404": - description: Tenand or timeline were not found - content: - application/json: - schema: - $ref: "#/components/schemas/NotFoundError" - "500": - description: Generic operation error - content: - application/json: - schema: - $ref: "#/components/schemas/Error" - "503": - description: Temporarily unavailable, please retry. - content: - application/json: - schema: - $ref: "#/components/schemas/ServiceUnavailableError" /v1/utilization: get: @@ -1304,12 +701,6 @@ paths: application/json: schema: $ref: "#/components/schemas/PageserverUtilization" - "500": - description: Generic operation error - content: - application/json: - schema: - $ref: "#/components/schemas/Error" components: securitySchemes: @@ -1391,9 +782,6 @@ components: required: - mode properties: - tenant_id: - type: string - description: Not used, scheduled for removal. 
mode: type: string enum: ["AttachedSingle", "AttachedMulti", "AttachedStale", "Secondary", "Detached"] diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index 47d8ae1148..cf526940f4 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -19,6 +19,8 @@ use pageserver_api::models::LocationConfigListResponse; use pageserver_api::models::ShardParameters; use pageserver_api::models::TenantDetails; use pageserver_api::models::TenantLocationConfigResponse; +use pageserver_api::models::TenantScanRemoteStorageResponse; +use pageserver_api::models::TenantScanRemoteStorageShard; use pageserver_api::models::TenantShardLocation; use pageserver_api::models::TenantShardSplitRequest; use pageserver_api::models::TenantShardSplitResponse; @@ -29,6 +31,7 @@ use pageserver_api::models::{ }; use pageserver_api::shard::ShardCount; use pageserver_api::shard::TenantShardId; +use remote_storage::DownloadError; use remote_storage::GenericRemoteStorage; use remote_storage::TimeTravelError; use tenant_size_model::{SizeResult, StorageModel}; @@ -54,6 +57,9 @@ use crate::tenant::mgr::{ }; use crate::tenant::mgr::{TenantSlot, UpsertLocationError}; use crate::tenant::remote_timeline_client; +use crate::tenant::remote_timeline_client::download_index_part; +use crate::tenant::remote_timeline_client::list_remote_tenant_shards; +use crate::tenant::remote_timeline_client::list_remote_timelines; use crate::tenant::secondary::SecondaryController; use crate::tenant::size::ModelInputs; use crate::tenant::storage_layer::LayerAccessStatsReset; @@ -160,6 +166,9 @@ impl From for ApiError { fn from(pre: PageReconstructError) -> ApiError { match pre { PageReconstructError::Other(pre) => ApiError::InternalServerError(pre), + PageReconstructError::MissingKey(e) => { + ApiError::InternalServerError(anyhow::anyhow!("{e}")) + } PageReconstructError::Cancelled => { ApiError::InternalServerError(anyhow::anyhow!("request was cancelled")) } @@ -457,8 +466,12 @@ async 
fn reload_auth_validation_keys_handler( json_response(StatusCode::OK, ()) } Err(e) => { + let err_msg = "Error reloading public keys"; warn!("Error reloading public keys from {key_path:?}: {e:}"); - json_response(StatusCode::INTERNAL_SERVER_ERROR, ()) + json_response( + StatusCode::INTERNAL_SERVER_ERROR, + HttpErrorBody::from_msg(err_msg.to_string()), + ) } } } @@ -696,7 +709,7 @@ async fn get_lsn_by_timestamp_handler( check_permission(&request, Some(tenant_shard_id.tenant_id))?; let state = get_state(&request); - if !tenant_shard_id.is_zero() { + if !tenant_shard_id.is_shard_zero() { // Requires SLRU contents, which are only stored on shard zero return Err(ApiError::BadRequest(anyhow!( "Size calculations are only available on shard zero" @@ -747,7 +760,7 @@ async fn get_timestamp_of_lsn_handler( check_permission(&request, Some(tenant_shard_id.tenant_id))?; let state = get_state(&request); - if !tenant_shard_id.is_zero() { + if !tenant_shard_id.is_shard_zero() { // Requires SLRU contents, which are only stored on shard zero return Err(ApiError::BadRequest(anyhow!( "Size calculations are only available on shard zero" @@ -772,7 +785,9 @@ async fn get_timestamp_of_lsn_handler( let time = format_rfc3339(postgres_ffi::from_pg_timestamp(time)).to_string(); json_response(StatusCode::OK, time) } - None => json_response(StatusCode::NOT_FOUND, ()), + None => Err(ApiError::NotFound( + anyhow::anyhow!("Timestamp for lsn {} not found", lsn).into(), + )), } } @@ -1086,7 +1101,7 @@ async fn tenant_size_handler( let headers = request.headers(); let state = get_state(&request); - if !tenant_shard_id.is_zero() { + if !tenant_shard_id.is_shard_zero() { return Err(ApiError::BadRequest(anyhow!( "Size calculations are only available on shard zero" ))); @@ -1903,12 +1918,14 @@ async fn timeline_collect_keyspace( let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download); let timeline = active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, 
timeline_id).await?; let at_lsn = at_lsn.unwrap_or_else(|| timeline.get_last_record_lsn()); - let keys = timeline + let (dense_ks, sparse_ks) = timeline .collect_keyspace(at_lsn, &ctx) .await .map_err(|e| ApiError::InternalServerError(e.into()))?; - let res = pageserver_api::models::partitioning::Partitioning { keys, at_lsn }; + // This API is currently used by pagebench. Pagebench will iterate all keys within the keyspace. + // Therefore, we split dense/sparse keys in this API. + let res = pageserver_api::models::partitioning::Partitioning { keys: dense_ks, sparse_keys: sparse_ks, at_lsn }; json_response(StatusCode::OK, res) } @@ -2026,6 +2043,79 @@ async fn secondary_upload_handler( json_response(StatusCode::OK, ()) } +async fn tenant_scan_remote_handler( + request: Request, + cancel: CancellationToken, +) -> Result, ApiError> { + let state = get_state(&request); + let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?; + + let Some(remote_storage) = state.remote_storage.as_ref() else { + return Err(ApiError::BadRequest(anyhow::anyhow!( + "Remote storage not configured" + ))); + }; + + let mut response = TenantScanRemoteStorageResponse::default(); + + let (shards, _other_keys) = + list_remote_tenant_shards(remote_storage, tenant_id, cancel.clone()) + .await + .map_err(|e| ApiError::InternalServerError(anyhow::anyhow!(e)))?; + + for tenant_shard_id in shards { + let (timeline_ids, _other_keys) = + list_remote_timelines(remote_storage, tenant_shard_id, cancel.clone()) + .await + .map_err(|e| ApiError::InternalServerError(anyhow::anyhow!(e)))?; + + let mut generation = Generation::none(); + for timeline_id in timeline_ids { + match download_index_part( + remote_storage, + &tenant_shard_id, + &timeline_id, + Generation::MAX, + &cancel, + ) + .instrument(info_span!("download_index_part", + tenant_id=%tenant_shard_id.tenant_id, + shard_id=%tenant_shard_id.shard_slug(), + %timeline_id)) + .await + { + Ok((index_part, index_generation)) => { + 
tracing::info!("Found timeline {tenant_shard_id}/{timeline_id} metadata (gen {index_generation:?}, {} layers, {} consistent LSN)", + index_part.layer_metadata.len(), index_part.get_disk_consistent_lsn()); + generation = std::cmp::max(generation, index_generation); + } + Err(DownloadError::NotFound) => { + // This is normal for tenants that were created with multiple shards: they have an unsharded path + // containing the timeline's initdb tarball but no index. Otherwise it is a bit strange. + tracing::info!("Timeline path {tenant_shard_id}/{timeline_id} exists in remote storage but has no index, skipping"); + continue; + } + Err(e) => { + return Err(ApiError::InternalServerError(anyhow::anyhow!(e))); + } + }; + } + + response.shards.push(TenantScanRemoteStorageShard { + tenant_shard_id, + generation: generation.into(), + }); + } + + if response.shards.is_empty() { + return Err(ApiError::NotFound( + anyhow::anyhow!("No shards found for tenant ID {tenant_id}").into(), + )); + } + + json_response(StatusCode::OK, response) +} + async fn secondary_download_handler( request: Request, _cancel: CancellationToken, @@ -2070,6 +2160,27 @@ async fn secondary_download_handler( json_response(status, progress) } +async fn secondary_status_handler( + request: Request, + _cancel: CancellationToken, +) -> Result, ApiError> { + let state = get_state(&request); + let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?; + + let Some(secondary_tenant) = state + .tenant_manager + .get_secondary_tenant_shard(tenant_shard_id) + else { + return Err(ApiError::NotFound( + anyhow::anyhow!("Shard {} not found", tenant_shard_id).into(), + )); + }; + + let progress = secondary_tenant.progress.lock().unwrap().clone(); + + json_response(StatusCode::OK, progress) +} + async fn handler_404(_: Request) -> Result, ApiError> { json_response( StatusCode::NOT_FOUND, @@ -2422,12 +2533,18 @@ pub fn make_router( .post("/v1/tenant/:tenant_shard_id/heatmap_upload", |r| { 
api_handler(r, secondary_upload_handler) }) + .get("/v1/tenant/:tenant_id/scan_remote_storage", |r| { + api_handler(r, tenant_scan_remote_handler) + }) .put("/v1/disk_usage_eviction/run", |r| { api_handler(r, disk_usage_eviction_run) }) .put("/v1/deletion_queue/flush", |r| { api_handler(r, deletion_queue_flush) }) + .get("/v1/tenant/:tenant_shard_id/secondary/status", |r| { + api_handler(r, secondary_status_handler) + }) .post("/v1/tenant/:tenant_shard_id/secondary/download", |r| { api_handler(r, secondary_download_handler) }) diff --git a/pageserver/src/lib.rs b/pageserver/src/lib.rs index f947a75f61..930700e50c 100644 --- a/pageserver/src/lib.rs +++ b/pageserver/src/lib.rs @@ -12,6 +12,7 @@ pub mod disk_usage_eviction_task; pub mod http; pub mod import_datadir; pub use pageserver_api::keyspace; +pub mod aux_file; pub mod metrics; pub mod page_cache; pub mod page_service; diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs index 3160f204e2..903bad34cc 100644 --- a/pageserver/src/metrics.rs +++ b/pageserver/src/metrics.rs @@ -51,6 +51,9 @@ pub(crate) enum StorageTimeOperation { #[strum(serialize = "gc")] Gc, + #[strum(serialize = "find gc cutoffs")] + FindGcCutoffs, + #[strum(serialize = "create tenant")] CreateTenant, } @@ -86,41 +89,58 @@ pub(crate) static STORAGE_TIME_GLOBAL: Lazy = Lazy::new(|| { .expect("failed to define a metric") }); -pub(crate) static READ_NUM_FS_LAYERS: Lazy = Lazy::new(|| { +pub(crate) static READ_NUM_LAYERS_VISITED: Lazy = Lazy::new(|| { register_histogram!( - "pageserver_read_num_fs_layers", - "Number of persistent layers accessed for processing a read request, including those in the cache", - vec![1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 10.0, 20.0, 50.0, 100.0], + "pageserver_layers_visited_per_read_global", + "Number of layers visited to reconstruct one key", + vec![1.0, 4.0, 8.0, 16.0, 32.0, 64.0, 128.0, 256.0, 512.0, 1024.0], + ) + .expect("failed to define a metric") +}); + +pub(crate) static VEC_READ_NUM_LAYERS_VISITED: Lazy 
= Lazy::new(|| { + register_histogram!( + "pageserver_layers_visited_per_vectored_read_global", + "Average number of layers visited to reconstruct one key", + vec![1.0, 4.0, 8.0, 16.0, 32.0, 64.0, 128.0, 256.0, 512.0, 1024.0], ) .expect("failed to define a metric") }); // Metrics collected on operations on the storage repository. +#[derive( + Clone, Copy, enum_map::Enum, strum_macros::EnumString, strum_macros::Display, IntoStaticStr, +)] +pub(crate) enum GetKind { + Singular, + Vectored, +} pub(crate) struct ReconstructTimeMetrics { - ok: Histogram, - err: Histogram, + singular: Histogram, + vectored: Histogram, } pub(crate) static RECONSTRUCT_TIME: Lazy = Lazy::new(|| { let inner = register_histogram_vec!( "pageserver_getpage_reconstruct_seconds", "Time spent in reconstruct_value (reconstruct a page from deltas)", - &["result"], + &["get_kind"], CRITICAL_OP_BUCKETS.into(), ) .expect("failed to define a metric"); + ReconstructTimeMetrics { - ok: inner.get_metric_with_label_values(&["ok"]).unwrap(), - err: inner.get_metric_with_label_values(&["err"]).unwrap(), + singular: inner.with_label_values(&[GetKind::Singular.into()]), + vectored: inner.with_label_values(&[GetKind::Vectored.into()]), } }); impl ReconstructTimeMetrics { - pub(crate) fn for_result(&self, result: &Result) -> &Histogram { - match result { - Ok(_) => &self.ok, - Err(_) => &self.err, + pub(crate) fn for_get_kind(&self, get_kind: GetKind) -> &Histogram { + match get_kind { + GetKind::Singular => &self.singular, + GetKind::Vectored => &self.vectored, } } } @@ -133,13 +153,33 @@ pub(crate) static MATERIALIZED_PAGE_CACHE_HIT_DIRECT: Lazy = Lazy::n .expect("failed to define a metric") }); -pub(crate) static GET_RECONSTRUCT_DATA_TIME: Lazy = Lazy::new(|| { - register_histogram!( +pub(crate) struct ReconstructDataTimeMetrics { + singular: Histogram, + vectored: Histogram, +} + +impl ReconstructDataTimeMetrics { + pub(crate) fn for_get_kind(&self, get_kind: GetKind) -> &Histogram { + match get_kind { + 
GetKind::Singular => &self.singular, + GetKind::Vectored => &self.vectored, + } + } +} + +pub(crate) static GET_RECONSTRUCT_DATA_TIME: Lazy = Lazy::new(|| { + let inner = register_histogram_vec!( "pageserver_getpage_get_reconstruct_data_seconds", "Time spent in get_reconstruct_value_data", + &["get_kind"], CRITICAL_OP_BUCKETS.into(), ) - .expect("failed to define a metric") + .expect("failed to define a metric"); + + ReconstructDataTimeMetrics { + singular: inner.with_label_values(&[GetKind::Singular.into()]), + vectored: inner.with_label_values(&[GetKind::Vectored.into()]), + } }); pub(crate) static MATERIALIZED_PAGE_CACHE_HIT: Lazy = Lazy::new(|| { @@ -154,6 +194,11 @@ pub(crate) struct GetVectoredLatency { map: EnumMap>, } +#[allow(dead_code)] +pub(crate) struct ScanLatency { + map: EnumMap>, +} + impl GetVectoredLatency { // Only these task types perform vectored gets. Filter all other tasks out to reduce total // cardinality of the metric. @@ -164,6 +209,48 @@ impl GetVectoredLatency { } } +impl ScanLatency { + // Only these task types perform vectored gets. Filter all other tasks out to reduce total + // cardinality of the metric. 
+ const TRACKED_TASK_KINDS: [TaskKind; 1] = [TaskKind::PageRequestHandler]; + + pub(crate) fn for_task_kind(&self, task_kind: TaskKind) -> Option<&Histogram> { + self.map[task_kind].as_ref() + } +} + +pub(crate) struct ScanLatencyOngoingRecording<'a> { + parent: &'a Histogram, + start: std::time::Instant, +} + +impl<'a> ScanLatencyOngoingRecording<'a> { + pub(crate) fn start_recording(parent: &'a Histogram) -> ScanLatencyOngoingRecording<'a> { + let start = Instant::now(); + ScanLatencyOngoingRecording { parent, start } + } + + pub(crate) fn observe(self, throttled: Option) { + let elapsed = self.start.elapsed(); + let ex_throttled = if let Some(throttled) = throttled { + elapsed.checked_sub(throttled) + } else { + Some(elapsed) + }; + if let Some(ex_throttled) = ex_throttled { + self.parent.observe(ex_throttled.as_secs_f64()); + } else { + use utils::rate_limit::RateLimit; + static LOGGED: Lazy> = + Lazy::new(|| Mutex::new(RateLimit::new(Duration::from_secs(10)))); + let mut rate_limit = LOGGED.lock().unwrap(); + rate_limit.call(|| { + warn!("error deducting time spent throttled; this message is logged at a global rate limit"); + }); + } + } +} + pub(crate) static GET_VECTORED_LATENCY: Lazy = Lazy::new(|| { let inner = register_histogram_vec!( "pageserver_get_vectored_seconds", @@ -187,6 +274,29 @@ pub(crate) static GET_VECTORED_LATENCY: Lazy = Lazy::new(|| } }); +pub(crate) static SCAN_LATENCY: Lazy = Lazy::new(|| { + let inner = register_histogram_vec!( + "pageserver_scan_seconds", + "Time spent in scan, excluding time spent in timeline_get_throttle.", + &["task_kind"], + CRITICAL_OP_BUCKETS.into(), + ) + .expect("failed to define a metric"); + + ScanLatency { + map: EnumMap::from_array(std::array::from_fn(|task_kind_idx| { + let task_kind = ::from_usize(task_kind_idx); + + if ScanLatency::TRACKED_TASK_KINDS.contains(&task_kind) { + let task_kind = task_kind.into(); + Some(inner.with_label_values(&[task_kind])) + } else { + None + } + })), + } +}); + pub(crate) 
struct PageCacheMetricsForTaskKind { pub read_accesses_materialized_page: IntCounter, pub read_accesses_immutable: IntCounter, @@ -1482,35 +1592,6 @@ pub(crate) static DELETION_QUEUE: Lazy = Lazy::new(|| { } }); -pub(crate) struct WalIngestMetrics { - pub(crate) bytes_received: IntCounter, - pub(crate) records_received: IntCounter, - pub(crate) records_committed: IntCounter, - pub(crate) records_filtered: IntCounter, -} - -pub(crate) static WAL_INGEST: Lazy = Lazy::new(|| WalIngestMetrics { - bytes_received: register_int_counter!( - "pageserver_wal_ingest_bytes_received", - "Bytes of WAL ingested from safekeepers", - ) - .unwrap(), - records_received: register_int_counter!( - "pageserver_wal_ingest_records_received", - "Number of WAL records received from safekeepers" - ) - .expect("failed to define a metric"), - records_committed: register_int_counter!( - "pageserver_wal_ingest_records_committed", - "Number of WAL records which resulted in writes to pageserver storage" - ) - .expect("failed to define a metric"), - records_filtered: register_int_counter!( - "pageserver_wal_ingest_records_filtered", - "Number of WAL records filtered out due to sharding" - ) - .expect("failed to define a metric"), -}); pub(crate) struct SecondaryModeMetrics { pub(crate) upload_heatmap: IntCounter, pub(crate) upload_heatmap_errors: IntCounter, @@ -1518,7 +1599,8 @@ pub(crate) struct SecondaryModeMetrics { pub(crate) download_heatmap: IntCounter, pub(crate) download_layer: IntCounter, } -pub(crate) static SECONDARY_MODE: Lazy = Lazy::new(|| SecondaryModeMetrics { +pub(crate) static SECONDARY_MODE: Lazy = Lazy::new(|| { + SecondaryModeMetrics { upload_heatmap: register_int_counter!( "pageserver_secondary_upload_heatmap", "Number of heatmaps written to remote storage by attached tenants" @@ -1536,7 +1618,7 @@ pub(crate) static SECONDARY_MODE: Lazy = Lazy::new(|| Seco .expect("failed to define a metric"), download_heatmap: register_int_counter!( "pageserver_secondary_download_heatmap", - 
"Number of downloads of heatmaps by secondary mode locations" + "Number of downloads of heatmaps by secondary mode locations, including when it hasn't changed" ) .expect("failed to define a metric"), download_layer: register_int_counter!( @@ -1544,6 +1626,7 @@ pub(crate) static SECONDARY_MODE: Lazy = Lazy::new(|| Seco "Number of downloads of layers by secondary mode locations" ) .expect("failed to define a metric"), +} }); #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] @@ -1710,6 +1793,43 @@ macro_rules! redo_bytes_histogram_count_buckets { }; } +pub(crate) struct WalIngestMetrics { + pub(crate) bytes_received: IntCounter, + pub(crate) records_received: IntCounter, + pub(crate) records_committed: IntCounter, + pub(crate) records_filtered: IntCounter, + pub(crate) time_spent_on_ingest: Histogram, +} + +pub(crate) static WAL_INGEST: Lazy = Lazy::new(|| WalIngestMetrics { + bytes_received: register_int_counter!( + "pageserver_wal_ingest_bytes_received", + "Bytes of WAL ingested from safekeepers", + ) + .unwrap(), + records_received: register_int_counter!( + "pageserver_wal_ingest_records_received", + "Number of WAL records received from safekeepers" + ) + .expect("failed to define a metric"), + records_committed: register_int_counter!( + "pageserver_wal_ingest_records_committed", + "Number of WAL records which resulted in writes to pageserver storage" + ) + .expect("failed to define a metric"), + records_filtered: register_int_counter!( + "pageserver_wal_ingest_records_filtered", + "Number of WAL records filtered out due to sharding" + ) + .expect("failed to define a metric"), + time_spent_on_ingest: register_histogram!( + "pageserver_wal_ingest_put_value_seconds", + "Actual time spent on ingesting a record", + redo_histogram_time_buckets!(), + ) + .expect("failed to define a metric"), +}); + pub(crate) static WAL_REDO_TIME: Lazy = Lazy::new(|| { register_histogram!( "pageserver_wal_redo_seconds", @@ -1819,6 +1939,29 @@ impl Default for WalRedoProcessCounters { 
pub(crate) static WAL_REDO_PROCESS_COUNTERS: Lazy = Lazy::new(WalRedoProcessCounters::default); +#[cfg(not(test))] +pub mod wal_redo { + use super::*; + + static PROCESS_KIND: Lazy> = Lazy::new(|| { + std::sync::Mutex::new( + register_uint_gauge_vec!( + "pageserver_wal_redo_process_kind", + "The configured process kind for walredo", + &["kind"], + ) + .unwrap(), + ) + }); + + pub fn set_process_kind_metric(kind: crate::walredo::ProcessKind) { + // use guard to avoid races around the next two steps + let guard = PROCESS_KIND.lock().unwrap(); + guard.reset(); + guard.with_label_values(&[&format!("{kind}")]).set(1); + } +} + /// Similar to `prometheus::HistogramTimer` but does not record on drop. pub(crate) struct StorageTimeMetricsTimer { metrics: StorageTimeMetrics, @@ -1840,6 +1983,22 @@ impl StorageTimeMetricsTimer { self.metrics.timeline_count.inc(); self.metrics.global_histogram.observe(duration); } + + /// Turns this timer into a timer, which will always record -- usually this means recording + /// regardless an early `?` path was taken in a function. 
+ pub(crate) fn record_on_drop(self) -> AlwaysRecordingStorageTimeMetricsTimer { + AlwaysRecordingStorageTimeMetricsTimer(Some(self)) + } +} + +pub(crate) struct AlwaysRecordingStorageTimeMetricsTimer(Option); + +impl Drop for AlwaysRecordingStorageTimeMetricsTimer { + fn drop(&mut self) { + if let Some(inner) = self.0.take() { + inner.stop_and_record(); + } + } } /// Timing facilities for an globally histogrammed metric, which is supported by per tenant and @@ -1900,6 +2059,7 @@ pub(crate) struct TimelineMetrics { pub imitate_logical_size_histo: StorageTimeMetrics, pub load_layer_map_histo: StorageTimeMetrics, pub garbage_collect_histo: StorageTimeMetrics, + pub find_gc_cutoffs_histo: StorageTimeMetrics, pub last_record_gauge: IntGauge, resident_physical_size_gauge: UIntGauge, /// copy of LayeredTimeline.current_logical_size @@ -1960,6 +2120,12 @@ impl TimelineMetrics { &shard_id, &timeline_id, ); + let find_gc_cutoffs_histo = StorageTimeMetrics::new( + StorageTimeOperation::FindGcCutoffs, + &tenant_id, + &shard_id, + &timeline_id, + ); let last_record_gauge = LAST_RECORD_LSN .get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id]) .unwrap(); @@ -2002,6 +2168,7 @@ impl TimelineMetrics { logical_size_histo, imitate_logical_size_histo, garbage_collect_histo, + find_gc_cutoffs_histo, load_layer_map_histo, last_record_gauge, resident_physical_size_gauge, @@ -2089,7 +2256,7 @@ impl TimelineMetrics { pub(crate) fn remove_tenant_metrics(tenant_shard_id: &TenantShardId) { // Only shard zero deals in synthetic sizes - if tenant_shard_id.is_zero() { + if tenant_shard_id.is_shard_zero() { let tid = tenant_shard_id.tenant_id.to_string(); let _ = TENANT_SYNTHETIC_SIZE_METRIC.remove_label_values(&[&tid]); } @@ -2746,7 +2913,8 @@ pub fn preinitialize_metrics() { // histograms [ - &READ_NUM_FS_LAYERS, + &READ_NUM_LAYERS_VISITED, + &VEC_READ_NUM_LAYERS_VISITED, &WAIT_LSN_TIME, &WAL_REDO_TIME, &WAL_REDO_RECORDS_HISTOGRAM, diff --git a/pageserver/src/page_service.rs 
b/pageserver/src/page_service.rs index 5956b3c322..748c4b18e3 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -1,13 +1,5 @@ -// //! The Page Service listens for client connections and serves their GetPage@LSN //! requests. -// -// It is possible to connect here using usual psql/pgbench/libpq. Following -// commands are supported now: -// *status* -- show actual info about this pageserver, -// *pagestream* -- enter mode where smgr and pageserver talk with their -// custom protocol. -// use anyhow::Context; use async_compression::tokio::write::GzipEncoder; @@ -23,7 +15,7 @@ use pageserver_api::models::{ PagestreamErrorResponse, PagestreamExistsRequest, PagestreamExistsResponse, PagestreamFeMessage, PagestreamGetPageRequest, PagestreamGetPageResponse, PagestreamGetSlruSegmentRequest, PagestreamGetSlruSegmentResponse, PagestreamNblocksRequest, - PagestreamNblocksResponse, + PagestreamNblocksResponse, PagestreamProtocolVersion, }; use pageserver_api::shard::ShardIndex; use pageserver_api::shard::ShardNumber; @@ -56,6 +48,7 @@ use utils::{ use crate::auth::check_permission; use crate::basebackup; +use crate::basebackup::BasebackupError; use crate::config::PageServerConf; use crate::context::{DownloadBehavior, RequestContext}; use crate::import_datadir::import_wal_from_tar; @@ -551,6 +544,7 @@ impl PageServerHandler { pgb: &mut PostgresBackend, tenant_id: TenantId, timeline_id: TimelineId, + protocol_version: PagestreamProtocolVersion, ctx: RequestContext, ) -> Result<(), QueryError> where @@ -613,14 +607,15 @@ impl PageServerHandler { t.trace(©_data_bytes) } - let neon_fe_msg = PagestreamFeMessage::parse(&mut copy_data_bytes.reader())?; + let neon_fe_msg = + PagestreamFeMessage::parse(&mut copy_data_bytes.reader(), protocol_version)?; // TODO: We could create a new per-request context here, with unique ID. 
// Currently we use the same per-timeline context for all requests let (response, span) = match neon_fe_msg { PagestreamFeMessage::Exists(req) => { - let span = tracing::info_span!("handle_get_rel_exists_request", rel = %req.rel, req_lsn = %req.lsn); + let span = tracing::info_span!("handle_get_rel_exists_request", rel = %req.rel, req_lsn = %req.request_lsn); ( self.handle_get_rel_exists_request(tenant_id, timeline_id, &req, &ctx) .instrument(span.clone()) @@ -629,7 +624,7 @@ impl PageServerHandler { ) } PagestreamFeMessage::Nblocks(req) => { - let span = tracing::info_span!("handle_get_nblocks_request", rel = %req.rel, req_lsn = %req.lsn); + let span = tracing::info_span!("handle_get_nblocks_request", rel = %req.rel, req_lsn = %req.request_lsn); ( self.handle_get_nblocks_request(tenant_id, timeline_id, &req, &ctx) .instrument(span.clone()) @@ -639,7 +634,7 @@ impl PageServerHandler { } PagestreamFeMessage::GetPage(req) => { // shard_id is filled in by the handler - let span = tracing::info_span!("handle_get_page_at_lsn_request", rel = %req.rel, blkno = %req.blkno, req_lsn = %req.lsn); + let span = tracing::info_span!("handle_get_page_at_lsn_request", rel = %req.rel, blkno = %req.blkno, req_lsn = %req.request_lsn); ( self.handle_get_page_at_lsn_request(tenant_id, timeline_id, &req, &ctx) .instrument(span.clone()) @@ -648,7 +643,7 @@ impl PageServerHandler { ) } PagestreamFeMessage::DbSize(req) => { - let span = tracing::info_span!("handle_db_size_request", dbnode = %req.dbnode, req_lsn = %req.lsn); + let span = tracing::info_span!("handle_db_size_request", dbnode = %req.dbnode, req_lsn = %req.request_lsn); ( self.handle_db_size_request(tenant_id, timeline_id, &req, &ctx) .instrument(span.clone()) @@ -657,7 +652,7 @@ impl PageServerHandler { ) } PagestreamFeMessage::GetSlruSegment(req) => { - let span = tracing::info_span!("handle_get_slru_segment_request", kind = %req.kind, segno = %req.segno, req_lsn = %req.lsn); + let span = 
tracing::info_span!("handle_get_slru_segment_request", kind = %req.kind, segno = %req.segno, req_lsn = %req.request_lsn); ( self.handle_get_slru_segment_request(tenant_id, timeline_id, &req, &ctx) .instrument(span.clone()) @@ -838,78 +833,80 @@ impl PageServerHandler { /// Helper function to handle the LSN from client request. /// /// Each GetPage (and Exists and Nblocks) request includes information about - /// which version of the page is being requested. The client can request the - /// latest version of the page, or the version that's valid at a particular - /// LSN. The primary compute node will always request the latest page - /// version, while a standby will request a version at the LSN that it's - /// currently caught up to. + /// which version of the page is being requested. The primary compute node + /// will always request the latest page version, by setting 'request_lsn' to + /// the last inserted or flushed WAL position, while a standby will request + /// a version at the LSN that it's currently caught up to. /// /// In either case, if the page server hasn't received the WAL up to the /// requested LSN yet, we will wait for it to arrive. The return value is /// the LSN that should be used to look up the page versions. + /// + /// In addition to the request LSN, each request carries another LSN, + /// 'not_modified_since', which is a hint to the pageserver that the client + /// knows that the page has not been modified between 'not_modified_since' + /// and the request LSN. This allows skipping the wait, as long as the WAL + /// up to 'not_modified_since' has arrived. If the client doesn't have any + /// information about when the page was modified, it will use + /// not_modified_since == lsn. If the client lies and sends a too low + /// not_modified_hint such that there are in fact later page versions, the + /// behavior is undefined: the pageserver may return any of the page versions + /// or an error. 
async fn wait_or_get_last_lsn( timeline: &Timeline, - mut lsn: Lsn, - latest: bool, + request_lsn: Lsn, + not_modified_since: Lsn, latest_gc_cutoff_lsn: &RcuReadGuard, ctx: &RequestContext, ) -> Result { - if latest { - // Latest page version was requested. If LSN is given, it is a hint - // to the page server that there have been no modifications to the - // page after that LSN. If we haven't received WAL up to that point, - // wait until it arrives. - let last_record_lsn = timeline.get_last_record_lsn(); + let last_record_lsn = timeline.get_last_record_lsn(); - // Note: this covers the special case that lsn == Lsn(0). That - // special case means "return the latest version whatever it is", - // and it's used for bootstrapping purposes, when the page server is - // connected directly to the compute node. That is needed because - // when you connect to the compute node, to receive the WAL, the - // walsender process will do a look up in the pg_authid catalog - // table for authentication. That poses a deadlock problem: the - // catalog table lookup will send a GetPage request, but the GetPage - // request will block in the page server because the recent WAL - // hasn't been received yet, and it cannot be received until the - // walsender completes the authentication and starts streaming the - // WAL. 
- if lsn <= last_record_lsn { - lsn = last_record_lsn; + // Sanity check the request + if request_lsn < not_modified_since { + return Err(PageStreamError::BadRequest( + format!( + "invalid request with request LSN {} and not_modified_since {}", + request_lsn, not_modified_since, + ) + .into(), + )); + } + + if request_lsn < **latest_gc_cutoff_lsn { + // Check explicitly for INVALID just to get a less scary error message if the + // request is obviously bogus + return Err(if request_lsn == Lsn::INVALID { + PageStreamError::BadRequest("invalid LSN(0) in request".into()) } else { - timeline - .wait_lsn( - lsn, - crate::tenant::timeline::WaitLsnWaiter::PageService, - ctx, - ) - .await?; - // Since we waited for 'lsn' to arrive, that is now the last - // record LSN. (Or close enough for our purposes; the - // last-record LSN can advance immediately after we return - // anyway) - } - } else { - if lsn == Lsn(0) { - return Err(PageStreamError::BadRequest( - "invalid LSN(0) in request".into(), - )); - } + PageStreamError::BadRequest(format!( + "tried to request a page version that was garbage collected. requested at {} gc cutoff {}", + request_lsn, **latest_gc_cutoff_lsn + ).into()) + }); + } + + // Wait for WAL up to 'not_modified_since' to arrive, if necessary + if not_modified_since > last_record_lsn { timeline .wait_lsn( - lsn, + not_modified_since, crate::tenant::timeline::WaitLsnWaiter::PageService, ctx, ) .await?; + // Since we waited for 'not_modified_since' to arrive, that is now the last + // record LSN. (Or close enough for our purposes; the last-record LSN can + // advance immediately after we return anyway) + Ok(not_modified_since) + } else { + // It might be better to use max(not_modified_since, latest_gc_cutoff_lsn) + // here instead. That would give the same result, since we know that there + // haven't been any modifications since 'not_modified_since'. 
Using an older + // LSN might be faster, because that could allow skipping recent layers when + // finding the page. However, we have historically used 'last_record_lsn', so + // stick to that for now. + Ok(std::cmp::min(last_record_lsn, request_lsn)) } - - if lsn < **latest_gc_cutoff_lsn { - return Err(PageStreamError::BadRequest(format!( - "tried to request a page version that was garbage collected. requested at {} gc cutoff {}", - lsn, **latest_gc_cutoff_lsn - ).into())); - } - Ok(lsn) } #[instrument(skip_all, fields(shard_id))] @@ -926,12 +923,17 @@ impl PageServerHandler { .start_timer(metrics::SmgrQueryType::GetRelExists, ctx); let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn(); - let lsn = - Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest, &latest_gc_cutoff_lsn, ctx) - .await?; + let lsn = Self::wait_or_get_last_lsn( + timeline, + req.request_lsn, + req.not_modified_since, + &latest_gc_cutoff_lsn, + ctx, + ) + .await?; let exists = timeline - .get_rel_exists(req.rel, Version::Lsn(lsn), req.latest, ctx) + .get_rel_exists(req.rel, Version::Lsn(lsn), ctx) .await?; Ok(PagestreamBeMessage::Exists(PagestreamExistsResponse { @@ -954,12 +956,17 @@ impl PageServerHandler { .start_timer(metrics::SmgrQueryType::GetRelSize, ctx); let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn(); - let lsn = - Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest, &latest_gc_cutoff_lsn, ctx) - .await?; + let lsn = Self::wait_or_get_last_lsn( + timeline, + req.request_lsn, + req.not_modified_since, + &latest_gc_cutoff_lsn, + ctx, + ) + .await?; let n_blocks = timeline - .get_rel_size(req.rel, Version::Lsn(lsn), req.latest, ctx) + .get_rel_size(req.rel, Version::Lsn(lsn), ctx) .await?; Ok(PagestreamBeMessage::Nblocks(PagestreamNblocksResponse { @@ -982,18 +989,17 @@ impl PageServerHandler { .start_timer(metrics::SmgrQueryType::GetDbSize, ctx); let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn(); - let lsn = - 
Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest, &latest_gc_cutoff_lsn, ctx) - .await?; + let lsn = Self::wait_or_get_last_lsn( + timeline, + req.request_lsn, + req.not_modified_since, + &latest_gc_cutoff_lsn, + ctx, + ) + .await?; let total_blocks = timeline - .get_db_size( - DEFAULTTABLESPACE_OID, - req.dbnode, - Version::Lsn(lsn), - req.latest, - ctx, - ) + .get_db_size(DEFAULTTABLESPACE_OID, req.dbnode, Version::Lsn(lsn), ctx) .await?; let db_size = total_blocks as i64 * BLCKSZ as i64; @@ -1160,12 +1166,17 @@ impl PageServerHandler { .start_timer(metrics::SmgrQueryType::GetPageAtLsn, ctx); let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn(); - let lsn = - Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest, &latest_gc_cutoff_lsn, ctx) - .await?; + let lsn = Self::wait_or_get_last_lsn( + timeline, + req.request_lsn, + req.not_modified_since, + &latest_gc_cutoff_lsn, + ctx, + ) + .await?; let page = timeline - .get_rel_page_at_lsn(req.rel, req.blkno, Version::Lsn(lsn), req.latest, ctx) + .get_rel_page_at_lsn(req.rel, req.blkno, Version::Lsn(lsn), ctx) .await?; Ok(PagestreamBeMessage::GetPage(PagestreamGetPageResponse { @@ -1188,9 +1199,14 @@ impl PageServerHandler { .start_timer(metrics::SmgrQueryType::GetSlruSegment, ctx); let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn(); - let lsn = - Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest, &latest_gc_cutoff_lsn, ctx) - .await?; + let lsn = Self::wait_or_get_last_lsn( + timeline, + req.request_lsn, + req.not_modified_since, + &latest_gc_cutoff_lsn, + ctx, + ) + .await?; let kind = SlruKind::from_repr(req.kind) .ok_or(PageStreamError::BadRequest("invalid SLRU kind".into()))?; @@ -1201,6 +1217,10 @@ impl PageServerHandler { )) } + /// Note on "fullbackup": + /// Full basebackups should only be used for debugging purposes. + /// Originally, it was introduced to enable breaking storage format changes, + /// but that is not applicable anymore. 
#[allow(clippy::too_many_arguments)] #[instrument(skip_all, fields(shard_id, ?lsn, ?prev_lsn, %full_backup))] async fn handle_basebackup_request( @@ -1217,6 +1237,13 @@ impl PageServerHandler { where IO: AsyncRead + AsyncWrite + Send + Sync + Unpin, { + fn map_basebackup_error(err: BasebackupError) -> QueryError { + match err { + BasebackupError::Client(e) => QueryError::Disconnected(ConnectionError::Io(e)), + BasebackupError::Server(e) => QueryError::Other(e), + } + } + let started = std::time::Instant::now(); // check that the timeline exists @@ -1242,7 +1269,8 @@ impl PageServerHandler { let lsn_awaited_after = started.elapsed(); // switch client to COPYOUT - pgb.write_message_noflush(&BeMessage::CopyOutResponse)?; + pgb.write_message_noflush(&BeMessage::CopyOutResponse) + .map_err(QueryError::Disconnected)?; self.flush_cancellable(pgb, &timeline.cancel).await?; // Send a tarball of the latest layer on the timeline. Compress if not @@ -1257,7 +1285,8 @@ impl PageServerHandler { full_backup, ctx, ) - .await?; + .await + .map_err(map_basebackup_error)?; } else { let mut writer = pgb.copyout_writer(); if gzip { @@ -1278,9 +1307,13 @@ impl PageServerHandler { full_backup, ctx, ) - .await?; + .await + .map_err(map_basebackup_error)?; // shutdown the encoder to ensure the gzip footer is written - encoder.shutdown().await?; + encoder + .shutdown() + .await + .map_err(|e| QueryError::Disconnected(ConnectionError::Io(e)))?; } else { basebackup::send_basebackup_tarball( &mut writer, @@ -1290,11 +1323,13 @@ impl PageServerHandler { full_backup, ctx, ) - .await?; + .await + .map_err(map_basebackup_error)?; } } - pgb.write_message_noflush(&BeMessage::CopyDone)?; + pgb.write_message_noflush(&BeMessage::CopyDone) + .map_err(QueryError::Disconnected)?; self.flush_cancellable(pgb, &timeline.cancel).await?; let basebackup_after = started @@ -1403,7 +1438,34 @@ where let ctx = self.connection_ctx.attached_child(); debug!("process query {query_string:?}"); - if 
query_string.starts_with("pagestream ") { + if query_string.starts_with("pagestream_v2 ") { + let (_, params_raw) = query_string.split_at("pagestream_v2 ".len()); + let params = params_raw.split(' ').collect::>(); + if params.len() != 2 { + return Err(QueryError::Other(anyhow::anyhow!( + "invalid param number for pagestream command" + ))); + } + let tenant_id = TenantId::from_str(params[0]) + .with_context(|| format!("Failed to parse tenant id from {}", params[0]))?; + let timeline_id = TimelineId::from_str(params[1]) + .with_context(|| format!("Failed to parse timeline id from {}", params[1]))?; + + tracing::Span::current() + .record("tenant_id", field::display(tenant_id)) + .record("timeline_id", field::display(timeline_id)); + + self.check_permission(Some(tenant_id))?; + + self.handle_pagerequests( + pgb, + tenant_id, + timeline_id, + PagestreamProtocolVersion::V2, + ctx, + ) + .await?; + } else if query_string.starts_with("pagestream ") { let (_, params_raw) = query_string.split_at("pagestream ".len()); let params = params_raw.split(' ').collect::>(); if params.len() != 2 { @@ -1422,8 +1484,14 @@ where self.check_permission(Some(tenant_id))?; - self.handle_pagerequests(pgb, tenant_id, timeline_id, ctx) - .await?; + self.handle_pagerequests( + pgb, + tenant_id, + timeline_id, + PagestreamProtocolVersion::V1, + ctx, + ) + .await?; } else if query_string.starts_with("basebackup ") { let (_, params_raw) = query_string.split_at("basebackup ".len()); let params = params_raw.split_whitespace().collect::>(); diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs index 6f7d74bdee..12314c5961 100644 --- a/pageserver/src/pgdatadir_mapping.rs +++ b/pageserver/src/pgdatadir_mapping.rs @@ -9,6 +9,7 @@ use super::tenant::{PageReconstructError, Timeline}; use crate::context::RequestContext; use crate::keyspace::{KeySpace, KeySpaceAccum}; +use crate::metrics::WAL_INGEST; use crate::repository::*; use 
crate::span::debug_assert_current_span_has_tenant_and_timeline_id_no_shard_id; use crate::walrecord::NeonWalRecord; @@ -22,6 +23,7 @@ use pageserver_api::key::{ slru_segment_key_range, slru_segment_size_to_key, twophase_file_key, twophase_key_range, AUX_FILES_KEY, CHECKPOINT_KEY, CONTROLFILE_KEY, DBDIR_KEY, TWOPHASEDIR_KEY, }; +use pageserver_api::keyspace::SparseKeySpace; use pageserver_api::reltag::{BlockNumber, RelTag, SlruKind}; use postgres_ffi::relfile_utils::{FSM_FORKNUM, VISIBILITYMAP_FORKNUM}; use postgres_ffi::BLCKSZ; @@ -175,7 +177,6 @@ impl Timeline { tag: RelTag, blknum: BlockNumber, version: Version<'_>, - latest: bool, ctx: &RequestContext, ) -> Result { if tag.relnode == 0 { @@ -184,7 +185,7 @@ impl Timeline { )); } - let nblocks = self.get_rel_size(tag, version, latest, ctx).await?; + let nblocks = self.get_rel_size(tag, version, ctx).await?; if blknum >= nblocks { debug!( "read beyond EOF at {} blk {} at {}, size is {}: returning all-zeros page", @@ -206,7 +207,6 @@ impl Timeline { spcnode: Oid, dbnode: Oid, version: Version<'_>, - latest: bool, ctx: &RequestContext, ) -> Result { let mut total_blocks = 0; @@ -214,7 +214,7 @@ impl Timeline { let rels = self.list_rels(spcnode, dbnode, version, ctx).await?; for rel in rels { - let n_blocks = self.get_rel_size(rel, version, latest, ctx).await?; + let n_blocks = self.get_rel_size(rel, version, ctx).await?; total_blocks += n_blocks as usize; } Ok(total_blocks) @@ -225,7 +225,6 @@ impl Timeline { &self, tag: RelTag, version: Version<'_>, - latest: bool, ctx: &RequestContext, ) -> Result { if tag.relnode == 0 { @@ -239,7 +238,7 @@ impl Timeline { } if (tag.forknum == FSM_FORKNUM || tag.forknum == VISIBILITYMAP_FORKNUM) - && !self.get_rel_exists(tag, version, latest, ctx).await? + && !self.get_rel_exists(tag, version, ctx).await? 
{ // FIXME: Postgres sometimes calls smgrcreate() to create // FSM, and smgrnblocks() on it immediately afterwards, @@ -252,16 +251,8 @@ impl Timeline { let mut buf = version.get(self, key, ctx).await?; let nblocks = buf.get_u32_le(); - if latest { - // Update relation size cache only if "latest" flag is set. - // This flag is set by compute when it is working with most recent version of relation. - // Typically master compute node always set latest=true. - // Please notice, that even if compute node "by mistake" specifies old LSN but set - // latest=true, then it can not cause cache corruption, because with latest=true - // pageserver choose max(request_lsn, last_written_lsn) and so cached value will be - // associated with most recent value of LSN. - self.update_cached_rel_size(tag, version.get_lsn(), nblocks); - } + self.update_cached_rel_size(tag, version.get_lsn(), nblocks); + Ok(nblocks) } @@ -270,7 +261,6 @@ impl Timeline { &self, tag: RelTag, version: Version<'_>, - _latest: bool, ctx: &RequestContext, ) -> Result { if tag.relnode == 0 { @@ -289,7 +279,7 @@ impl Timeline { match RelDirectory::des(&buf).context("deserialization failure") { Ok(dir) => { - let exists = dir.rels.get(&(tag.relnode, tag.forknum)).is_some(); + let exists = dir.rels.contains(&(tag.relnode, tag.forknum)); Ok(exists) } Err(e) => Err(PageReconstructError::from(e)), @@ -389,7 +379,7 @@ impl Timeline { match SlruSegmentDirectory::des(&buf).context("deserialization failure") { Ok(dir) => { - let exists = dir.segments.get(&segno).is_some(); + let exists = dir.segments.contains(&segno); Ok(exists) } Err(e) => Err(PageReconstructError::from(e)), @@ -466,6 +456,12 @@ impl Timeline { // Didn't find any commit timestamps smaller than the request Ok(LsnForTimestamp::Past(min_lsn)) } + (true, _) if commit_lsn < min_lsn => { + // the search above did set found_smaller to true but it never increased the lsn. 
+ // Then, low is still the old min_lsn, and the subtraction above gave a value + // below the min_lsn. We should never do that. + Ok(LsnForTimestamp::Past(min_lsn)) + } (true, false) => { // Only found commits with timestamps smaller than the request. // It's still a valid case for branch creation, return it. @@ -735,11 +731,13 @@ impl Timeline { /// Get a KeySpace that covers all the Keys that are in use at the given LSN. /// Anything that's not listed maybe removed from the underlying storage (from /// that LSN forwards). + /// + /// The return value is (dense keyspace, sparse keyspace). pub(crate) async fn collect_keyspace( &self, lsn: Lsn, ctx: &RequestContext, - ) -> Result { + ) -> Result<(KeySpace, SparseKeySpace), CollectKeySpaceError> { // Iterate through key ranges, greedily packing them into partitions let mut result = KeySpaceAccum::new(); @@ -811,13 +809,18 @@ impl Timeline { if self.get(AUX_FILES_KEY, lsn, ctx).await.is_ok() { result.add_key(AUX_FILES_KEY); } - Ok(result.to_keyspace()) + + Ok(( + result.to_keyspace(), + /* AUX sparse key space */ + SparseKeySpace(KeySpace::single(Key::metadata_aux_key_range())), + )) } /// Get cached size of relation if it not updated after specified LSN pub fn get_cached_rel_size(&self, tag: &RelTag, lsn: Lsn) -> Option { let rel_size_cache = self.rel_size_cache.read().unwrap(); - if let Some((cached_lsn, nblocks)) = rel_size_cache.get(tag) { + if let Some((cached_lsn, nblocks)) = rel_size_cache.map.get(tag) { if lsn >= *cached_lsn { return Some(*nblocks); } @@ -828,7 +831,16 @@ impl Timeline { /// Update cached relation size if there is no more recent update pub fn update_cached_rel_size(&self, tag: RelTag, lsn: Lsn, nblocks: BlockNumber) { let mut rel_size_cache = self.rel_size_cache.write().unwrap(); - match rel_size_cache.entry(tag) { + + if lsn < rel_size_cache.complete_as_of { + // Do not cache old values. 
It's safe to cache the size on read, as long as + // the read was at an LSN since we started the WAL ingestion. Reasoning: we + // never evict values from the cache, so if the relation size changed after + // 'lsn', the new value is already in the cache. + return; + } + + match rel_size_cache.map.entry(tag) { hash_map::Entry::Occupied(mut entry) => { let cached_lsn = entry.get_mut(); if lsn >= cached_lsn.0 { @@ -844,13 +856,13 @@ impl Timeline { /// Store cached relation size pub fn set_cached_rel_size(&self, tag: RelTag, lsn: Lsn, nblocks: BlockNumber) { let mut rel_size_cache = self.rel_size_cache.write().unwrap(); - rel_size_cache.insert(tag, (lsn, nblocks)); + rel_size_cache.map.insert(tag, (lsn, nblocks)); } /// Remove cached relation size pub fn remove_cached_rel_size(&self, tag: &RelTag) { let mut rel_size_cache = self.rel_size_cache.write().unwrap(); - rel_size_cache.remove(tag); + rel_size_cache.map.remove(tag); } } @@ -1088,7 +1100,7 @@ impl<'a> DatadirModification<'a> { ) -> anyhow::Result<()> { let total_blocks = self .tline - .get_db_size(spcnode, dbnode, Version::Modified(self), true, ctx) + .get_db_size(spcnode, dbnode, Version::Modified(self), ctx) .await?; // Remove entry from dbdir @@ -1131,21 +1143,22 @@ impl<'a> DatadirModification<'a> { let mut dbdir = DbDirectory::des(&self.get(DBDIR_KEY, ctx).await.context("read db")?) .context("deserialize db")?; let rel_dir_key = rel_dir_to_key(rel.spcnode, rel.dbnode); - let mut rel_dir = if dbdir.dbdirs.get(&(rel.spcnode, rel.dbnode)).is_none() { - // Didn't exist. Update dbdir - dbdir.dbdirs.insert((rel.spcnode, rel.dbnode), false); - let buf = DbDirectory::ser(&dbdir).context("serialize db")?; - self.pending_directory_entries - .push((DirectoryKind::Db, dbdir.dbdirs.len())); - self.put(DBDIR_KEY, Value::Image(buf.into())); + let mut rel_dir = + if let hash_map::Entry::Vacant(e) = dbdir.dbdirs.entry((rel.spcnode, rel.dbnode)) { + // Didn't exist. 
Update dbdir + e.insert(false); + let buf = DbDirectory::ser(&dbdir).context("serialize db")?; + self.pending_directory_entries + .push((DirectoryKind::Db, dbdir.dbdirs.len())); + self.put(DBDIR_KEY, Value::Image(buf.into())); - // and create the RelDirectory - RelDirectory::default() - } else { - // reldir already exists, fetch it - RelDirectory::des(&self.get(rel_dir_key, ctx).await.context("read db")?) - .context("deserialize db")? - }; + // and create the RelDirectory + RelDirectory::default() + } else { + // reldir already exists, fetch it + RelDirectory::des(&self.get(rel_dir_key, ctx).await.context("read db")?) + .context("deserialize db")? + }; // Add the new relation to the rel directory entry, and write it back if !rel_dir.rels.insert((rel.relnode, rel.forknum)) { @@ -1187,7 +1200,7 @@ impl<'a> DatadirModification<'a> { anyhow::ensure!(rel.relnode != 0, RelationError::InvalidRelnode); if self .tline - .get_rel_exists(rel, Version::Modified(self), true, ctx) + .get_rel_exists(rel, Version::Modified(self), ctx) .await? { let size_key = rel_size_to_key(rel); @@ -1401,7 +1414,7 @@ impl<'a> DatadirModification<'a> { let n_files; let mut aux_files = self.tline.aux_files.lock().await; if let Some(mut dir) = aux_files.dir.take() { - // We already updated aux files in `self`: emit a delta and update our latest value + // We already updated aux files in `self`: emit a delta and update our latest value. dir.upsert(file_path.clone(), content.clone()); n_files = dir.files.len(); if aux_files.n_deltas == MAX_AUX_FILE_DELTAS { @@ -1446,10 +1459,14 @@ impl<'a> DatadirModification<'a> { // reset the map. return Err(e.into()); } - // FIXME: PageReconstructError doesn't have an explicit variant for key-not-found, so - // we are assuming that all _other_ possible errors represents a missing key. If some - // other error occurs, we may incorrectly reset the map of aux files. 
- Err(PageReconstructError::Other(_) | PageReconstructError::WalRedo(_)) => { + // Note: we added missing key error variant in https://github.com/neondatabase/neon/pull/7393 but + // the original code assumes all other errors are missing keys. Therefore, we keep the code path + // the same for now, though in theory, we should only match the `MissingKey` variant. + Err( + PageReconstructError::Other(_) + | PageReconstructError::WalRedo(_) + | PageReconstructError::MissingKey { .. }, + ) => { // Key is missing, we must insert an image as the basis for subsequent deltas. let mut dir = AuxFilesDirectory { @@ -1541,6 +1558,8 @@ impl<'a> DatadirModification<'a> { pub async fn commit(&mut self, ctx: &RequestContext) -> anyhow::Result<()> { let mut writer = self.tline.writer().await; + let timer = WAL_INGEST.time_spent_on_ingest.start_timer(); + let pending_nblocks = self.pending_nblocks; self.pending_nblocks = 0; @@ -1580,6 +1599,8 @@ impl<'a> DatadirModification<'a> { writer.update_directory_entries_count(kind, count as u64); } + timer.observe_duration(); + Ok(()) } diff --git a/pageserver/src/repository.rs b/pageserver/src/repository.rs index 9959d105eb..0a9ac50aad 100644 --- a/pageserver/src/repository.rs +++ b/pageserver/src/repository.rs @@ -33,6 +33,52 @@ impl Value { } } +#[cfg(test)] +#[derive(Debug, PartialEq)] +pub(crate) enum InvalidInput { + TooShortValue, + TooShortPostgresRecord, +} + +/// We could have a ValueRef where everything is `serde(borrow)`. Before implementing that, lets +/// use this type for querying if a slice looks some particular way. 
+#[cfg(test)] +pub(crate) struct ValueBytes; + +#[cfg(test)] +impl ValueBytes { + pub(crate) fn will_init(raw: &[u8]) -> Result { + if raw.len() < 12 { + return Err(InvalidInput::TooShortValue); + } + + let value_discriminator = &raw[0..4]; + + if value_discriminator == [0, 0, 0, 0] { + // Value::Image always initializes + return Ok(true); + } + + if value_discriminator != [0, 0, 0, 1] { + // not a Value::WalRecord(..) + return Ok(false); + } + + let walrecord_discriminator = &raw[4..8]; + + if walrecord_discriminator != [0, 0, 0, 0] { + // only NeonWalRecord::Postgres can have will_init + return Ok(false); + } + + if raw.len() < 17 { + return Err(InvalidInput::TooShortPostgresRecord); + } + + Ok(raw[8] == 1) + } +} + #[cfg(test)] mod test { use super::*; @@ -70,6 +116,8 @@ mod test { ]; roundtrip!(image, expected); + + assert!(ValueBytes::will_init(&expected).unwrap()); } #[test] @@ -93,6 +141,96 @@ mod test { ]; roundtrip!(rec, expected); + + assert!(ValueBytes::will_init(&expected).unwrap()); + } + + #[test] + fn bytes_inspection_too_short_image() { + let rec = Value::Image(Bytes::from_static(b"")); + + #[rustfmt::skip] + let expected = [ + // top level discriminator of 4 bytes + 0x00, 0x00, 0x00, 0x00, + // 8 byte length + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + ]; + + roundtrip!(rec, expected); + + assert!(ValueBytes::will_init(&expected).unwrap()); + assert_eq!(expected.len(), 12); + for len in 0..12 { + assert_eq!( + ValueBytes::will_init(&expected[..len]).unwrap_err(), + InvalidInput::TooShortValue + ); + } + } + + #[test] + fn bytes_inspection_too_short_postgres_record() { + let rec = NeonWalRecord::Postgres { + will_init: false, + rec: Bytes::from_static(b""), + }; + let rec = Value::WalRecord(rec); + + #[rustfmt::skip] + let expected = [ + // flattened discriminator of total 8 bytes + 0x00, 0x00, 0x00, 0x01, + 0x00, 0x00, 0x00, 0x00, + // will_init + 0x00, + // 8 byte length + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + ]; + + 
roundtrip!(rec, expected); + + assert!(!ValueBytes::will_init(&expected).unwrap()); + assert_eq!(expected.len(), 17); + for len in 12..17 { + assert_eq!( + ValueBytes::will_init(&expected[..len]).unwrap_err(), + InvalidInput::TooShortPostgresRecord + ) + } + for len in 0..12 { + assert_eq!( + ValueBytes::will_init(&expected[..len]).unwrap_err(), + InvalidInput::TooShortValue + ) + } + } + + #[test] + fn clear_visibility_map_flags_example() { + let rec = NeonWalRecord::ClearVisibilityMapFlags { + new_heap_blkno: Some(0x11), + old_heap_blkno: None, + flags: 0x03, + }; + let rec = Value::WalRecord(rec); + + #[rustfmt::skip] + let expected = [ + // discriminators + 0x00, 0x00, 0x00, 0x01, + 0x00, 0x00, 0x00, 0x01, + // Some == 1 followed by 4 bytes + 0x01, 0x00, 0x00, 0x00, 0x11, + // None == 0 + 0x00, + // flags + 0x03 + ]; + + roundtrip!(rec, expected); + + assert!(!ValueBytes::will_init(&expected).unwrap()); } } diff --git a/pageserver/src/task_mgr.rs b/pageserver/src/task_mgr.rs index 9a1e354ecf..0c245580ee 100644 --- a/pageserver/src/task_mgr.rs +++ b/pageserver/src/task_mgr.rs @@ -361,6 +361,10 @@ pub enum TaskKind { DebugTool, + EphemeralFilePreWarmPageCache, + + LayerDownload, + #[cfg(test)] UnitTest, } diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 17ff033e00..fdc49ae295 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -64,6 +64,7 @@ use self::timeline::uninit::UninitializedTimeline; use self::timeline::EvictionTaskTenantState; use self::timeline::TimelineResources; use self::timeline::WaitLsnError; +use self::timeline::{GcCutoffs, GcInfo}; use crate::config::PageServerConf; use crate::context::{DownloadBehavior, RequestContext}; use crate::deletion_queue::DeletionQueueClient; @@ -86,7 +87,6 @@ use crate::tenant::remote_timeline_client::INITDB_PATH; use crate::tenant::storage_layer::DeltaLayer; use crate::tenant::storage_layer::ImageLayer; use crate::InitializationOrder; -use std::cmp::min; use 
std::collections::hash_map::Entry; use std::collections::BTreeSet; use std::collections::HashMap; @@ -386,7 +386,7 @@ impl WalRedoManager { pub(crate) fn status(&self) -> Option { match self { - WalRedoManager::Prod(m) => m.status(), + WalRedoManager::Prod(m) => Some(m.status()), #[cfg(test)] WalRedoManager::Test(_) => None, } @@ -559,9 +559,10 @@ impl Tenant { // By doing what we do here, the index part upload is retried. // If control plane retries timeline creation in the meantime, the mgmt API handler // for timeline creation will coalesce on the upload we queue here. + // FIXME: this branch should be dead code as we no longer write local metadata. let rtc = timeline.remote_client.as_ref().unwrap(); rtc.init_upload_queue_for_empty_remote(&metadata)?; - rtc.schedule_index_upload_for_metadata_update(&metadata)?; + rtc.schedule_index_upload_for_full_metadata_update(&metadata)?; } timeline @@ -887,7 +888,7 @@ impl Tenant { #[instrument(skip_all)] pub(crate) async fn preload( - self: &Arc, + self: &Arc, remote_storage: &GenericRemoteStorage, cancel: CancellationToken, ) -> anyhow::Result { @@ -917,9 +918,13 @@ impl Tenant { Ok(TenantPreload { deleting, - timelines: self - .load_timeline_metadata(remote_timeline_ids, remote_storage, cancel) - .await?, + timelines: Self::load_timeline_metadata( + self, + remote_timeline_ids, + remote_storage, + cancel, + ) + .await?, }) } @@ -2807,7 +2812,48 @@ impl Tenant { cancel: &CancellationToken, ctx: &RequestContext, ) -> anyhow::Result>> { - // grab mutex to prevent new timelines from being created here. + // before taking the gc_cs lock, do the heavier weight finding of gc_cutoff points for + // currently visible timelines. 
+ let timelines = self + .timelines + .lock() + .unwrap() + .values() + .filter(|tl| match target_timeline_id.as_ref() { + Some(target) => &tl.timeline_id == target, + None => true, + }) + .cloned() + .collect::>(); + + let mut gc_cutoffs: HashMap = + HashMap::with_capacity(timelines.len()); + + for timeline in timelines.iter() { + let cutoff = timeline + .get_last_record_lsn() + .checked_sub(horizon) + .unwrap_or(Lsn(0)); + + let res = timeline.find_gc_cutoffs(cutoff, pitr, cancel, ctx).await; + + match res { + Ok(cutoffs) => { + let old = gc_cutoffs.insert(timeline.timeline_id, cutoffs); + assert!(old.is_none()); + } + Err(e) => { + tracing::warn!(timeline_id = %timeline.timeline_id, "ignoring failure to find gc cutoffs: {e:#}"); + } + } + } + + if !self.is_active() { + anyhow::bail!("shutting down"); + } + + // grab mutex to prevent new timelines from being created here; avoid doing long operations + // because that will stall branch creation. let gc_cs = self.gc_cs.lock().await; // Scan all timelines. 
For each timeline, remember the timeline ID and @@ -2869,20 +2915,36 @@ impl Tenant { } } - if let Some(cutoff) = timeline.get_last_record_lsn().checked_sub(horizon) { - let branchpoints: Vec = all_branchpoints - .range(( - Included((timeline_id, Lsn(0))), - Included((timeline_id, Lsn(u64::MAX))), - )) - .map(|&x| x.1) - .collect(); - timeline - .update_gc_info(branchpoints, cutoff, pitr, cancel, ctx) - .await?; + let branchpoints: Vec = all_branchpoints + .range(( + Included((timeline_id, Lsn(0))), + Included((timeline_id, Lsn(u64::MAX))), + )) + .map(|&x| x.1) + .collect(); - gc_timelines.push(timeline); + { + let mut target = timeline.gc_info.write().unwrap(); + + match gc_cutoffs.remove(&timeline_id) { + Some(cutoffs) => { + *target = GcInfo { + retain_lsns: branchpoints, + cutoffs, + }; + } + None => { + // reasons for this being unavailable: + // - this timeline was created while we were finding cutoffs + // - lsn for timestamp search fails for this timeline repeatedly + // + // in both cases, refreshing the branchpoints is correct. 
+ target.retain_lsns = branchpoints; + } + }; } + + gc_timelines.push(timeline); } drop(gc_cs); Ok(gc_timelines) @@ -2969,7 +3031,7 @@ impl Tenant { // and then the planned GC cutoff { let gc_info = src_timeline.gc_info.read().unwrap(); - let cutoff = min(gc_info.pitr_cutoff, gc_info.horizon_cutoff); + let cutoff = gc_info.min_cutoff(); if start_lsn < cutoff { return Err(CreateTimelineError::AncestorLsn(anyhow::anyhow!( "invalid branch start lsn: less than planned GC cutoff {cutoff}" @@ -3027,7 +3089,7 @@ impl Tenant { // See also https://github.com/neondatabase/neon/issues/3865 if let Some(remote_client) = new_timeline.remote_client.as_ref() { remote_client - .schedule_index_upload_for_metadata_update(&metadata) + .schedule_index_upload_for_full_metadata_update(&metadata) .context("branch initial metadata upload")?; } @@ -3190,7 +3252,7 @@ impl Tenant { run_initdb(self.conf, &pgdata_path, pg_version, &self.cancel).await?; // Upload the created data dir to S3 - if self.tenant_shard_id().is_zero() { + if self.tenant_shard_id().is_shard_zero() { self.upload_initdb(&timelines_path, &pgdata_path, &timeline_id) .await?; } @@ -3398,7 +3460,11 @@ impl Tenant { // is in progress (which is not a common case). // // See more for on the issue #2748 condenced out of the initial PR review. - let mut shared_cache = self.cached_logical_sizes.lock().await; + let mut shared_cache = tokio::select! 
{ + locked = self.cached_logical_sizes.lock() => locked, + _ = cancel.cancelled() => anyhow::bail!("cancelled"), + _ = self.cancel.cancelled() => anyhow::bail!("tenant is shutting down"), + }; size::gather_inputs( self, @@ -3437,7 +3503,7 @@ impl Tenant { .store(size, Ordering::Relaxed); // Only shard zero should be calculating synthetic sizes - debug_assert!(self.shard_identity.is_zero()); + debug_assert!(self.shard_identity.is_shard_zero()); TENANT_SYNTHETIC_SIZE_METRIC .get_metric_with_label_values(&[&self.tenant_shard_id.tenant_id.to_string()]) @@ -3660,6 +3726,7 @@ pub(crate) mod harness { image_layer_creation_check_threshold: Some( tenant_conf.image_layer_creation_check_threshold, ), + switch_to_aux_file_v2: Some(tenant_conf.switch_to_aux_file_v2), } } } @@ -3848,6 +3915,8 @@ pub(crate) mod harness { #[cfg(test)] mod tests { + use std::collections::BTreeMap; + use super::*; use crate::keyspace::KeySpaceAccum; use crate::repository::{Key, Value}; @@ -3856,9 +3925,12 @@ mod tests { use crate::DEFAULT_PG_VERSION; use bytes::BytesMut; use hex_literal::hex; + use pageserver_api::key::{AUX_KEY_PREFIX, NON_INHERITED_RANGE}; use pageserver_api::keyspace::KeySpace; + use pageserver_api::models::CompactionAlgorithm; use rand::{thread_rng, Rng}; - use tests::timeline::ShutdownMode; + use tests::storage_layer::ValuesReconstructState; + use tests::timeline::{GetVectoredError, ShutdownMode}; static TEST_KEY: Lazy = Lazy::new(|| Key::from_slice(&hex!("010000000033333333444444445500000001"))); @@ -4495,11 +4567,25 @@ mod tests { } async fn bulk_insert_compact_gc( - timeline: Arc, + tenant: &Tenant, + timeline: &Arc, + ctx: &RequestContext, + lsn: Lsn, + repeat: usize, + key_count: usize, + ) -> anyhow::Result<()> { + let compact = true; + bulk_insert_maybe_compact_gc(tenant, timeline, ctx, lsn, repeat, key_count, compact).await + } + + async fn bulk_insert_maybe_compact_gc( + tenant: &Tenant, + timeline: &Arc, ctx: &RequestContext, mut lsn: Lsn, repeat: usize, key_count: 
usize, + compact: bool, ) -> anyhow::Result<()> { let mut test_key = Key::from_hex("010000000033333333444444445500000000").unwrap(); let mut blknum = 0; @@ -4507,6 +4593,8 @@ mod tests { // Enforce that key range is monotonously increasing let mut keyspace = KeySpaceAccum::new(); + let cancel = CancellationToken::new(); + for _ in 0..repeat { for _ in 0..key_count { test_key.field6 = blknum; @@ -4528,22 +4616,19 @@ mod tests { blknum += 1; } - let cutoff = timeline.get_last_record_lsn(); - - timeline - .update_gc_info( - Vec::new(), - cutoff, - Duration::ZERO, - &CancellationToken::new(), - ctx, - ) - .await?; timeline.freeze_and_flush().await?; - timeline - .compact(&CancellationToken::new(), EnumSet::empty(), ctx) + if compact { + // this requires timeline to be &Arc + timeline.compact(&cancel, EnumSet::empty(), ctx).await?; + } + + // this doesn't really need to use the timeline_id target, but it is closer to what it + // originally was. + let res = tenant + .gc_iteration(Some(timeline.timeline_id), 0, Duration::ZERO, &cancel, ctx) .await?; - timeline.gc().await?; + + assert_eq!(res.layers_removed, 0, "this never removes anything"); } Ok(()) @@ -4562,7 +4647,7 @@ mod tests { .await?; let lsn = Lsn(0x10); - bulk_insert_compact_gc(tline.clone(), &ctx, lsn, 50, 10000).await?; + bulk_insert_compact_gc(&tenant, &tline, &ctx, lsn, 50, 10000).await?; Ok(()) } @@ -4593,7 +4678,7 @@ mod tests { .await?; let lsn = Lsn(0x10); - bulk_insert_compact_gc(tline.clone(), &ctx, lsn, 50, 10000).await?; + bulk_insert_compact_gc(&tenant, &tline, &ctx, lsn, 50, 10000).await?; let guard = tline.layers.read().await; guard.layer_map().dump(true, &ctx).await?; @@ -4646,7 +4731,9 @@ mod tests { for read in reads { info!("Doing vectored read on {:?}", read); - let vectored_res = tline.get_vectored_impl(read.clone(), reads_lsn, &ctx).await; + let vectored_res = tline + .get_vectored_impl(read.clone(), reads_lsn, ValuesReconstructState::new(), &ctx) + .await; tline 
.validate_get_vectored_impl(&vectored_res, read, reads_lsn, &ctx) .await; @@ -4655,6 +4742,59 @@ mod tests { Ok(()) } + #[tokio::test] + async fn test_get_vectored_aux_files() -> anyhow::Result<()> { + let harness = TenantHarness::create("test_get_vectored_aux_files")?; + + let (tenant, ctx) = harness.load().await; + let tline = tenant + .create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION, &ctx) + .await?; + let tline = tline.raw_timeline().unwrap(); + + let mut modification = tline.begin_modification(Lsn(0x1000)); + modification.put_file("foo/bar1", b"content1", &ctx).await?; + modification.set_lsn(Lsn(0x1008))?; + modification.put_file("foo/bar2", b"content2", &ctx).await?; + modification.commit(&ctx).await?; + + let child_timeline_id = TimelineId::generate(); + tenant + .branch_timeline_test( + tline, + child_timeline_id, + Some(tline.get_last_record_lsn()), + &ctx, + ) + .await?; + + let child_timeline = tenant + .get_timeline(child_timeline_id, true) + .expect("Should have the branched timeline"); + + let aux_keyspace = KeySpace { + ranges: vec![NON_INHERITED_RANGE], + }; + let read_lsn = child_timeline.get_last_record_lsn(); + + let vectored_res = child_timeline + .get_vectored_impl( + aux_keyspace.clone(), + read_lsn, + ValuesReconstructState::new(), + &ctx, + ) + .await; + + child_timeline + .validate_get_vectored_impl(&vectored_res, aux_keyspace, read_lsn, &ctx) + .await; + + let images = vectored_res?; + assert!(images.is_empty()); + Ok(()) + } + // Test that vectored get handles layer gaps correctly // by advancing into the next ancestor timeline if required. 
// @@ -4783,7 +4923,12 @@ mod tests { ranges: vec![key_near_gap..gap_at_key.next(), key_near_end..current_key], }; let results = child_timeline - .get_vectored_impl(read.clone(), current_lsn, &ctx) + .get_vectored_impl( + read.clone(), + current_lsn, + ValuesReconstructState::new(), + &ctx, + ) .await?; for (key, img_res) in results { @@ -4794,15 +4939,192 @@ mod tests { Ok(()) } + // Test that vectored get descends into ancestor timelines correctly and + // does not return an image that's newer than requested. + // + // The diagram below ilustrates an interesting case. We have a parent timeline + // (top of the Lsn range) and a child timeline. The request key cannot be reconstructed + // from the child timeline, so the parent timeline must be visited. When advacing into + // the child timeline, the read path needs to remember what the requested Lsn was in + // order to avoid returning an image that's too new. The test below constructs such + // a timeline setup and does a few queries around the Lsn of each page image. 
+ // ``` + // LSN + // ^ + // | + // | + // 500 | --------------------------------------> branch point + // 400 | X + // 300 | X + // 200 | --------------------------------------> requested lsn + // 100 | X + // |---------------------------------------> Key + // | + // ------> requested key + // + // Legend: + // * X - page images + // ``` + #[tokio::test] + async fn test_get_vectored_ancestor_descent() -> anyhow::Result<()> { + let harness = TenantHarness::create("test_get_vectored_on_lsn_axis")?; + let (tenant, ctx) = harness.load().await; + + let start_key = Key::from_hex("010000000033333333444444445500000000").unwrap(); + let end_key = start_key.add(1000); + let child_gap_at_key = start_key.add(500); + let mut parent_gap_lsns: BTreeMap = BTreeMap::new(); + + let mut current_lsn = Lsn(0x10); + + let timeline_id = TimelineId::generate(); + let parent_timeline = tenant + .create_test_timeline(timeline_id, current_lsn, DEFAULT_PG_VERSION, &ctx) + .await?; + + current_lsn += 0x100; + + for _ in 0..3 { + let mut key = start_key; + while key < end_key { + current_lsn += 0x10; + + let image_value = format!("{} at {}", child_gap_at_key, current_lsn); + + let mut writer = parent_timeline.writer().await; + writer + .put( + key, + current_lsn, + &Value::Image(test_img(&image_value)), + &ctx, + ) + .await?; + writer.finish_write(current_lsn); + + if key == child_gap_at_key { + parent_gap_lsns.insert(current_lsn, image_value); + } + + key = key.next(); + } + + parent_timeline.freeze_and_flush().await?; + } + + let child_timeline_id = TimelineId::generate(); + + let child_timeline = tenant + .branch_timeline_test(&parent_timeline, child_timeline_id, Some(current_lsn), &ctx) + .await?; + + let mut key = start_key; + while key < end_key { + if key == child_gap_at_key { + key = key.next(); + continue; + } + + current_lsn += 0x10; + + let mut writer = child_timeline.writer().await; + writer + .put( + key, + current_lsn, + &Value::Image(test_img(&format!("{} at {}", key, 
current_lsn))), + &ctx, + ) + .await?; + writer.finish_write(current_lsn); + + key = key.next(); + } + + child_timeline.freeze_and_flush().await?; + + let lsn_offsets: [i64; 5] = [-10, -1, 0, 1, 10]; + let mut query_lsns = Vec::new(); + for image_lsn in parent_gap_lsns.keys().rev() { + for offset in lsn_offsets { + query_lsns.push(Lsn(image_lsn + .0 + .checked_add_signed(offset) + .expect("Shouldn't overflow"))); + } + } + + for query_lsn in query_lsns { + let results = child_timeline + .get_vectored_impl( + KeySpace { + ranges: vec![child_gap_at_key..child_gap_at_key.next()], + }, + query_lsn, + ValuesReconstructState::new(), + &ctx, + ) + .await; + + let expected_item = parent_gap_lsns + .iter() + .rev() + .find(|(lsn, _)| **lsn <= query_lsn); + + info!( + "Doing vectored read at LSN {}. Expecting image to be: {:?}", + query_lsn, expected_item + ); + + match expected_item { + Some((_, img_value)) => { + let key_results = results.expect("No vectored get error expected"); + let key_result = &key_results[&child_gap_at_key]; + let returned_img = key_result + .as_ref() + .expect("No page reconstruct error expected"); + + info!( + "Vectored read at LSN {} returned image {}", + query_lsn, + std::str::from_utf8(returned_img)? 
+ ); + assert_eq!(*returned_img, test_img(img_value)); + } + None => { + assert!(matches!(results, Err(GetVectoredError::MissingKey(_)))); + } + } + } + + Ok(()) + } + #[tokio::test] async fn test_random_updates() -> anyhow::Result<()> { - let harness = TenantHarness::create("test_random_updates")?; + let names_algorithms = [ + ("test_random_updates_legacy", CompactionAlgorithm::Legacy), + ("test_random_updates_tiered", CompactionAlgorithm::Tiered), + ]; + for (name, algorithm) in names_algorithms { + test_random_updates_algorithm(name, algorithm).await?; + } + Ok(()) + } + + async fn test_random_updates_algorithm( + name: &'static str, + compaction_algorithm: CompactionAlgorithm, + ) -> anyhow::Result<()> { + let mut harness = TenantHarness::create(name)?; + harness.tenant_conf.compaction_algorithm = compaction_algorithm; let (tenant, ctx) = harness.load().await; let tline = tenant .create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx) .await?; const NUM_KEYS: usize = 1000; + let cancel = CancellationToken::new(); let mut test_key = Key::from_hex("010000000033333333444444445500000000").unwrap(); @@ -4861,22 +5183,11 @@ mod tests { ); } - // Perform a cycle of flush, compact, and GC - let cutoff = tline.get_last_record_lsn(); - tline - .update_gc_info( - Vec::new(), - cutoff, - Duration::ZERO, - &CancellationToken::new(), - &ctx, - ) - .await?; + // Perform a cycle of flush, and GC tline.freeze_and_flush().await?; - tline - .compact(&CancellationToken::new(), EnumSet::empty(), &ctx) + tenant + .gc_iteration(Some(tline.timeline_id), 0, Duration::ZERO, &cancel, &ctx) .await?; - tline.gc().await?; } Ok(()) @@ -4897,6 +5208,8 @@ mod tests { let mut keyspace = KeySpaceAccum::new(); + let cancel = CancellationToken::new(); + // Track when each page was last modified. Used to assert that // a read sees the latest page version. 
let mut updated = [Lsn(0); NUM_KEYS]; @@ -4960,21 +5273,11 @@ mod tests { } // Perform a cycle of flush, compact, and GC - let cutoff = tline.get_last_record_lsn(); - tline - .update_gc_info( - Vec::new(), - cutoff, - Duration::ZERO, - &CancellationToken::new(), - &ctx, - ) - .await?; tline.freeze_and_flush().await?; - tline - .compact(&CancellationToken::new(), EnumSet::empty(), &ctx) + tline.compact(&cancel, EnumSet::empty(), &ctx).await?; + tenant + .gc_iteration(Some(tline.timeline_id), 0, Duration::ZERO, &cancel, &ctx) .await?; - tline.gc().await?; } Ok(()) @@ -5156,19 +5459,140 @@ mod tests { #[tokio::test] async fn test_read_at_max_lsn() -> anyhow::Result<()> { - let harness = TenantHarness::create("test_read_at_max_lsn")?; + let names_algorithms = [ + ("test_read_at_max_lsn_legacy", CompactionAlgorithm::Legacy), + ("test_read_at_max_lsn_tiered", CompactionAlgorithm::Tiered), + ]; + for (name, algorithm) in names_algorithms { + test_read_at_max_lsn_algorithm(name, algorithm).await?; + } + Ok(()) + } + + async fn test_read_at_max_lsn_algorithm( + name: &'static str, + compaction_algorithm: CompactionAlgorithm, + ) -> anyhow::Result<()> { + let mut harness = TenantHarness::create(name)?; + harness.tenant_conf.compaction_algorithm = compaction_algorithm; let (tenant, ctx) = harness.load().await; let tline = tenant .create_test_timeline(TIMELINE_ID, Lsn(0x08), DEFAULT_PG_VERSION, &ctx) .await?; let lsn = Lsn(0x10); - bulk_insert_compact_gc(tline.clone(), &ctx, lsn, 50, 10000).await?; + let compact = false; + bulk_insert_maybe_compact_gc(&tenant, &tline, &ctx, lsn, 50, 10000, compact).await?; let test_key = Key::from_hex("010000000033333333444444445500000000").unwrap(); let read_lsn = Lsn(u64::MAX - 1); - assert!(tline.get(test_key, read_lsn, &ctx).await.is_ok()); + let result = tline.get(test_key, read_lsn, &ctx).await; + assert!(result.is_ok(), "result is not Ok: {}", result.unwrap_err()); + + Ok(()) + } + + #[tokio::test] + async fn test_metadata_scan() -> 
anyhow::Result<()> { + let harness = TenantHarness::create("test_metadata_scan")?; + let (tenant, ctx) = harness.load().await; + let tline = tenant + .create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx) + .await?; + + const NUM_KEYS: usize = 1000; + const STEP: usize = 100; // random update + scan base_key + idx * STEP + + let cancel = CancellationToken::new(); + + let mut base_key = Key::from_hex("000000000033333333444444445500000000").unwrap(); + base_key.field1 = AUX_KEY_PREFIX; + let mut test_key = base_key; + + // Track when each page was last modified. Used to assert that + // a read sees the latest page version. + let mut updated = [Lsn(0); NUM_KEYS]; + + let mut lsn = Lsn(0x10); + #[allow(clippy::needless_range_loop)] + for blknum in 0..NUM_KEYS { + lsn = Lsn(lsn.0 + 0x10); + test_key.field6 = (blknum * STEP) as u32; + let mut writer = tline.writer().await; + writer + .put( + test_key, + lsn, + &Value::Image(test_img(&format!("{} at {}", blknum, lsn))), + &ctx, + ) + .await?; + writer.finish_write(lsn); + updated[blknum] = lsn; + drop(writer); + } + + let keyspace = KeySpace::single(base_key..base_key.add((NUM_KEYS * STEP) as u32)); + + for _ in 0..10 { + // Read all the blocks + for (blknum, last_lsn) in updated.iter().enumerate() { + test_key.field6 = (blknum * STEP) as u32; + assert_eq!( + tline.get(test_key, lsn, &ctx).await?, + test_img(&format!("{} at {}", blknum, last_lsn)) + ); + } + + let mut cnt = 0; + for (key, value) in tline + .get_vectored_impl( + keyspace.clone(), + lsn, + ValuesReconstructState::default(), + &ctx, + ) + .await? 
+ { + let blknum = key.field6 as usize; + let value = value?; + assert!(blknum % STEP == 0); + let blknum = blknum / STEP; + assert_eq!( + value, + test_img(&format!("{} at {}", blknum, updated[blknum])) + ); + cnt += 1; + } + + assert_eq!(cnt, NUM_KEYS); + + for _ in 0..NUM_KEYS { + lsn = Lsn(lsn.0 + 0x10); + let blknum = thread_rng().gen_range(0..NUM_KEYS); + test_key.field6 = (blknum * STEP) as u32; + let mut writer = tline.writer().await; + writer + .put( + test_key, + lsn, + &Value::Image(test_img(&format!("{} at {}", blknum, lsn))), + &ctx, + ) + .await?; + writer.finish_write(lsn); + drop(writer); + updated[blknum] = lsn; + } + + // Perform a cycle of flush, compact, and GC + tline.freeze_and_flush().await?; + tline.compact(&cancel, EnumSet::empty(), &ctx).await?; + tenant + .gc_iteration(Some(tline.timeline_id), 0, Duration::ZERO, &cancel, &ctx) + .await?; + } Ok(()) } diff --git a/pageserver/src/tenant/blob_io.rs b/pageserver/src/tenant/blob_io.rs index 0d33100ead..1dc451f5c9 100644 --- a/pageserver/src/tenant/blob_io.rs +++ b/pageserver/src/tenant/blob_io.rs @@ -121,7 +121,7 @@ impl BlobWriter { self.offset } - const CAPACITY: usize = if BUFFERED { PAGE_SZ } else { 0 }; + const CAPACITY: usize = if BUFFERED { 64 * 1024 } else { 0 }; /// Writes the given buffer directly to the underlying `VirtualFile`. /// You need to make sure that the internal buffer is empty, otherwise @@ -130,8 +130,9 @@ impl BlobWriter { async fn write_all_unbuffered, Buf: IoBuf + Send>( &mut self, src_buf: B, + ctx: &RequestContext, ) -> (B::Buf, Result<(), Error>) { - let (src_buf, res) = self.inner.write_all(src_buf).await; + let (src_buf, res) = self.inner.write_all(src_buf, ctx).await; let nbytes = match res { Ok(nbytes) => nbytes, Err(e) => return (src_buf, Err(e)), @@ -142,9 +143,9 @@ impl BlobWriter { #[inline(always)] /// Flushes the internal buffer to the underlying `VirtualFile`. 
- pub async fn flush_buffer(&mut self) -> Result<(), Error> { + pub async fn flush_buffer(&mut self, ctx: &RequestContext) -> Result<(), Error> { let buf = std::mem::take(&mut self.buf); - let (mut buf, res) = self.inner.write_all(buf).await; + let (mut buf, res) = self.inner.write_all(buf, ctx).await; res?; buf.clear(); self.buf = buf; @@ -165,10 +166,11 @@ impl BlobWriter { async fn write_all, Buf: IoBuf + Send>( &mut self, src_buf: B, + ctx: &RequestContext, ) -> (B::Buf, Result<(), Error>) { if !BUFFERED { assert!(self.buf.is_empty()); - return self.write_all_unbuffered(src_buf).await; + return self.write_all_unbuffered(src_buf, ctx).await; } let remaining = Self::CAPACITY - self.buf.len(); let src_buf_len = src_buf.bytes_init(); @@ -183,7 +185,7 @@ impl BlobWriter { } // Then, if the buffer is full, flush it out if self.buf.len() == Self::CAPACITY { - if let Err(e) = self.flush_buffer().await { + if let Err(e) = self.flush_buffer(ctx).await { return (Slice::into_inner(src_buf), Err(e)); } } @@ -199,7 +201,7 @@ impl BlobWriter { assert_eq!(copied, src_buf.len()); Slice::into_inner(src_buf) } else { - let (src_buf, res) = self.write_all_unbuffered(src_buf).await; + let (src_buf, res) = self.write_all_unbuffered(src_buf, ctx).await; if let Err(e) = res { return (src_buf, Err(e)); } @@ -216,6 +218,7 @@ impl BlobWriter { pub async fn write_blob, Buf: IoBuf + Send>( &mut self, srcbuf: B, + ctx: &RequestContext, ) -> (B::Buf, Result) { let offset = self.offset; @@ -227,7 +230,7 @@ impl BlobWriter { if len < 128 { // Short blob. 
Write a 1-byte length header io_buf.put_u8(len as u8); - self.write_all(io_buf).await + self.write_all(io_buf, ctx).await } else { // Write a 4-byte length header if len > 0x7fff_ffff { @@ -242,7 +245,7 @@ impl BlobWriter { let mut len_buf = (len as u32).to_be_bytes(); len_buf[0] |= 0x80; io_buf.extend_from_slice(&len_buf[..]); - self.write_all(io_buf).await + self.write_all(io_buf, ctx).await } } .await; @@ -251,7 +254,7 @@ impl BlobWriter { Ok(_) => (), Err(e) => return (Slice::into_inner(srcbuf.slice(..)), Err(e)), } - let (srcbuf, res) = self.write_all(srcbuf).await; + let (srcbuf, res) = self.write_all(srcbuf, ctx).await; (srcbuf, res.map(|_| offset)) } } @@ -261,8 +264,8 @@ impl BlobWriter { /// /// This function flushes the internal buffer before giving access /// to the underlying `VirtualFile`. - pub async fn into_inner(mut self) -> Result { - self.flush_buffer().await?; + pub async fn into_inner(mut self, ctx: &RequestContext) -> Result { + self.flush_buffer(ctx).await?; Ok(self.inner) } @@ -299,16 +302,16 @@ mod tests { let file = VirtualFile::create(pathbuf.as_path()).await?; let mut wtr = BlobWriter::::new(file, 0); for blob in blobs.iter() { - let (_, res) = wtr.write_blob(blob.clone()).await; + let (_, res) = wtr.write_blob(blob.clone(), &ctx).await; let offs = res?; offsets.push(offs); } // Write out one page worth of zeros so that we can // read again with read_blk - let (_, res) = wtr.write_blob(vec![0; PAGE_SZ]).await; + let (_, res) = wtr.write_blob(vec![0; PAGE_SZ], &ctx).await; let offs = res?; println!("Writing final blob at offs={offs}"); - wtr.flush_buffer().await?; + wtr.flush_buffer(&ctx).await?; } let file = VirtualFile::open(pathbuf.as_path()).await?; diff --git a/pageserver/src/tenant/config.rs b/pageserver/src/tenant/config.rs index a2bb479f63..9975c9edbc 100644 --- a/pageserver/src/tenant/config.rs +++ b/pageserver/src/tenant/config.rs @@ -369,6 +369,10 @@ pub struct TenantConf { // How much WAL must be ingested before checking again 
whether a new image layer is required. // Expresed in multiples of checkpoint distance. pub image_layer_creation_check_threshold: u8, + + /// Switch to aux file v2. Switching this flag requires the user has not written any aux file into + /// the storage before, and this flag cannot be switched back. Otherwise there will be data corruptions. + pub switch_to_aux_file_v2: bool, } /// Same as TenantConf, but this struct preserves the information about @@ -464,6 +468,10 @@ pub struct TenantConfOpt { #[serde(skip_serializing_if = "Option::is_none")] pub image_layer_creation_check_threshold: Option, + + #[serde(skip_serializing_if = "Option::is_none")] + #[serde(default)] + pub switch_to_aux_file_v2: Option, } impl TenantConfOpt { @@ -521,6 +529,9 @@ impl TenantConfOpt { image_layer_creation_check_threshold: self .image_layer_creation_check_threshold .unwrap_or(global_conf.image_layer_creation_check_threshold), + switch_to_aux_file_v2: self + .switch_to_aux_file_v2 + .unwrap_or(global_conf.switch_to_aux_file_v2), } } } @@ -562,6 +573,7 @@ impl Default for TenantConf { lazy_slru_download: false, timeline_get_throttle: crate::tenant::throttle::Config::disabled(), image_layer_creation_check_threshold: DEFAULT_IMAGE_LAYER_CREATION_CHECK_THRESHOLD, + switch_to_aux_file_v2: false, } } } @@ -636,6 +648,7 @@ impl From for models::TenantConfig { lazy_slru_download: value.lazy_slru_download, timeline_get_throttle: value.timeline_get_throttle.map(ThrottleConfig::from), image_layer_creation_check_threshold: value.image_layer_creation_check_threshold, + switch_to_aux_file_v2: value.switch_to_aux_file_v2, } } } diff --git a/pageserver/src/tenant/delete.rs b/pageserver/src/tenant/delete.rs index d1881f3897..33d0f677e5 100644 --- a/pageserver/src/tenant/delete.rs +++ b/pageserver/src/tenant/delete.rs @@ -436,6 +436,11 @@ impl DeleteTenantFlow { .await } + /// Check whether background deletion of this tenant is currently in progress + pub(crate) fn is_in_progress(tenant: &Tenant) -> bool 
{ + tenant.delete_progress.try_lock().is_err() + } + async fn prepare( tenant: &Arc, ) -> Result, DeleteTenantError> { diff --git a/pageserver/src/tenant/ephemeral_file.rs b/pageserver/src/tenant/ephemeral_file.rs index b27230db03..8b815a1885 100644 --- a/pageserver/src/tenant/ephemeral_file.rs +++ b/pageserver/src/tenant/ephemeral_file.rs @@ -3,36 +3,26 @@ use crate::config::PageServerConf; use crate::context::RequestContext; -use crate::page_cache::{self, PAGE_SZ}; +use crate::page_cache; use crate::tenant::block_io::{BlockCursor, BlockLease, BlockReader}; use crate::virtual_file::{self, VirtualFile}; -use bytes::BytesMut; use camino::Utf8PathBuf; use pageserver_api::shard::TenantShardId; -use std::cmp::min; -use std::io::{self, ErrorKind}; -use std::ops::DerefMut; +use std::io; use std::sync::atomic::AtomicU64; -use tracing::*; use utils::id::TimelineId; pub struct EphemeralFile { - page_cache_file_id: page_cache::FileId, - _tenant_shard_id: TenantShardId, _timeline_id: TimelineId, - file: VirtualFile, - len: u64, - /// An ephemeral file is append-only. - /// We keep the last page, which can still be modified, in [`Self::mutable_tail`]. - /// The other pages, which can no longer be modified, are accessed through the page cache. - /// - /// None <=> IO is ongoing. - /// Size is fixed to PAGE_SZ at creation time and must not be changed. 
- mutable_tail: Option, + + rw: page_caching::RW, } +mod page_caching; +mod zero_padded_read_write; + impl EphemeralFile { pub async fn create( conf: &PageServerConf, @@ -59,21 +49,18 @@ impl EphemeralFile { .await?; Ok(EphemeralFile { - page_cache_file_id: page_cache::next_file_id(), _tenant_shard_id: tenant_shard_id, _timeline_id: timeline_id, - file, - len: 0, - mutable_tail: Some(BytesMut::zeroed(PAGE_SZ)), + rw: page_caching::RW::new(file), }) } pub(crate) fn len(&self) -> u64 { - self.len + self.rw.bytes_written() } - pub(crate) fn id(&self) -> page_cache::FileId { - self.page_cache_file_id + pub(crate) fn page_cache_file_id(&self) -> page_cache::FileId { + self.rw.page_cache_file_id() } pub(crate) async fn read_blk( @@ -81,44 +68,7 @@ impl EphemeralFile { blknum: u32, ctx: &RequestContext, ) -> Result { - let flushed_blknums = 0..self.len / PAGE_SZ as u64; - if flushed_blknums.contains(&(blknum as u64)) { - let cache = page_cache::get(); - match cache - .read_immutable_buf(self.page_cache_file_id, blknum, ctx) - .await - .map_err(|e| { - std::io::Error::new( - std::io::ErrorKind::Other, - // order path before error because error is anyhow::Error => might have many contexts - format!( - "ephemeral file: read immutable page #{}: {}: {:#}", - blknum, self.file.path, e, - ), - ) - })? 
{ - page_cache::ReadBufResult::Found(guard) => { - return Ok(BlockLease::PageReadGuard(guard)) - } - page_cache::ReadBufResult::NotFound(write_guard) => { - let write_guard = self - .file - .read_exact_at_page(write_guard, blknum as u64 * PAGE_SZ as u64) - .await?; - let read_guard = write_guard.mark_valid(); - return Ok(BlockLease::PageReadGuard(read_guard)); - } - }; - } else { - debug_assert_eq!(blknum as u64, self.len / PAGE_SZ as u64); - Ok(BlockLease::EphemeralFileMutableTail( - self.mutable_tail - .as_deref() - .expect("we're not doing IO, it must be Some()") - .try_into() - .expect("we ensure that it's always PAGE_SZ"), - )) - } + self.rw.read_blk(blknum, ctx).await } pub(crate) async fn write_blob( @@ -126,137 +76,22 @@ impl EphemeralFile { srcbuf: &[u8], ctx: &RequestContext, ) -> Result { - struct Writer<'a> { - ephemeral_file: &'a mut EphemeralFile, - /// The block to which the next [`push_bytes`] will write. - blknum: u32, - /// The offset inside the block identified by [`blknum`] to which [`push_bytes`] will write. 
- off: usize, - } - impl<'a> Writer<'a> { - fn new(ephemeral_file: &'a mut EphemeralFile) -> io::Result> { - Ok(Writer { - blknum: (ephemeral_file.len / PAGE_SZ as u64) as u32, - off: (ephemeral_file.len % PAGE_SZ as u64) as usize, - ephemeral_file, - }) - } - #[inline(always)] - async fn push_bytes( - &mut self, - src: &[u8], - ctx: &RequestContext, - ) -> Result<(), io::Error> { - let mut src_remaining = src; - while !src_remaining.is_empty() { - let dst_remaining = &mut self - .ephemeral_file - .mutable_tail - .as_deref_mut() - .expect("IO is not yet ongoing")[self.off..]; - let n = min(dst_remaining.len(), src_remaining.len()); - dst_remaining[..n].copy_from_slice(&src_remaining[..n]); - self.off += n; - src_remaining = &src_remaining[n..]; - if self.off == PAGE_SZ { - let mutable_tail = std::mem::take(&mut self.ephemeral_file.mutable_tail) - .expect("IO is not yet ongoing"); - let (mutable_tail, res) = self - .ephemeral_file - .file - .write_all_at(mutable_tail, self.blknum as u64 * PAGE_SZ as u64) - .await; - // TODO: If we panic before we can put the mutable_tail back, subsequent calls will fail. - // I.e., the IO isn't retryable if we panic. - self.ephemeral_file.mutable_tail = Some(mutable_tail); - match res { - Ok(_) => { - // Pre-warm the page cache with what we just wrote. - // This isn't necessary for coherency/correctness, but it's how we've always done it. - let cache = page_cache::get(); - match cache - .read_immutable_buf( - self.ephemeral_file.page_cache_file_id, - self.blknum, - ctx, - ) - .await - { - Ok(page_cache::ReadBufResult::Found(_guard)) => { - // This function takes &mut self, so, it shouldn't be possible to reach this point. 
- unreachable!("we just wrote blknum {} and this function takes &mut self, so, no concurrent read_blk is possible", self.blknum); - } - Ok(page_cache::ReadBufResult::NotFound(mut write_guard)) => { - let buf: &mut [u8] = write_guard.deref_mut(); - debug_assert_eq!(buf.len(), PAGE_SZ); - buf.copy_from_slice( - self.ephemeral_file - .mutable_tail - .as_deref() - .expect("IO is not ongoing"), - ); - let _ = write_guard.mark_valid(); - // pre-warm successful - } - Err(e) => { - error!("ephemeral_file write_blob failed to get immutable buf to pre-warm page cache: {e:?}"); - // fail gracefully, it's not the end of the world if we can't pre-warm the cache here - } - } - // Zero the buffer for re-use. - // Zeroing is critical for correcntess because the write_blob code below - // and similarly read_blk expect zeroed pages. - self.ephemeral_file - .mutable_tail - .as_deref_mut() - .expect("IO is not ongoing") - .fill(0); - // This block is done, move to next one. - self.blknum += 1; - self.off = 0; - } - Err(e) => { - return Err(std::io::Error::new( - ErrorKind::Other, - // order error before path because path is long and error is short - format!( - "ephemeral_file: write_blob: write-back full tail blk #{}: {:#}: {}", - self.blknum, - e, - self.ephemeral_file.file.path, - ), - )); - } - } - } - } - Ok(()) - } - } - - let pos = self.len; - let mut writer = Writer::new(self)?; + let pos = self.rw.bytes_written(); // Write the length field if srcbuf.len() < 0x80 { // short one-byte length header let len_buf = [srcbuf.len() as u8]; - writer.push_bytes(&len_buf, ctx).await?; + + self.rw.write_all_borrowed(&len_buf, ctx).await?; } else { let mut len_buf = u32::to_be_bytes(srcbuf.len() as u32); len_buf[0] |= 0x80; - writer.push_bytes(&len_buf, ctx).await?; + self.rw.write_all_borrowed(&len_buf, ctx).await?; } // Write the payload - writer.push_bytes(srcbuf, ctx).await?; - - if srcbuf.len() < 0x80 { - self.len += 1; - } else { - self.len += 4; - } - self.len += srcbuf.len() as u64; 
+ self.rw.write_all_borrowed(srcbuf, ctx).await?; Ok(pos) } @@ -271,28 +106,6 @@ pub fn is_ephemeral_file(filename: &str) -> bool { } } -impl Drop for EphemeralFile { - fn drop(&mut self) { - // There might still be pages in the [`crate::page_cache`] for this file. - // We leave them there, [`crate::page_cache::PageCache::find_victim`] will evict them when needed. - - // unlink the file - let res = std::fs::remove_file(&self.file.path); - if let Err(e) = res { - if e.kind() != std::io::ErrorKind::NotFound { - // just never log the not found errors, we cannot do anything for them; on detach - // the tenant directory is already gone. - // - // not found files might also be related to https://github.com/neondatabase/neon/issues/2442 - error!( - "could not remove ephemeral file '{}': {}", - self.file.path, e - ); - } - } - } -} - impl BlockReader for EphemeralFile { fn block_cursor(&self) -> super::block_io::BlockCursor<'_> { BlockCursor::new(super::block_io::BlockReaderRef::EphemeralFile(self)) diff --git a/pageserver/src/tenant/ephemeral_file/page_caching.rs b/pageserver/src/tenant/ephemeral_file/page_caching.rs new file mode 100644 index 0000000000..42def8858e --- /dev/null +++ b/pageserver/src/tenant/ephemeral_file/page_caching.rs @@ -0,0 +1,223 @@ +//! Wrapper around [`super::zero_padded_read_write::RW`] that uses the +//! [`crate::page_cache`] to serve reads that need to go to the underlying [`VirtualFile`]. + +use crate::context::RequestContext; +use crate::page_cache::{self, PAGE_SZ}; +use crate::tenant::block_io::BlockLease; +use crate::virtual_file::VirtualFile; + +use once_cell::sync::Lazy; +use std::io::{self, ErrorKind}; +use tokio_epoll_uring::BoundedBuf; +use tracing::*; + +use super::zero_padded_read_write; + +/// See module-level comment. 
+pub struct RW { + page_cache_file_id: page_cache::FileId, + rw: super::zero_padded_read_write::RW, +} + +impl RW { + pub fn new(file: VirtualFile) -> Self { + let page_cache_file_id = page_cache::next_file_id(); + Self { + page_cache_file_id, + rw: super::zero_padded_read_write::RW::new(PreWarmingWriter::new( + page_cache_file_id, + file, + )), + } + } + + pub fn page_cache_file_id(&self) -> page_cache::FileId { + self.page_cache_file_id + } + + pub(crate) async fn write_all_borrowed( + &mut self, + srcbuf: &[u8], + ctx: &RequestContext, + ) -> Result { + // It doesn't make sense to proactively fill the page cache on the Pageserver write path + // because Compute is unlikely to access recently written data. + self.rw.write_all_borrowed(srcbuf, ctx).await + } + + pub(crate) fn bytes_written(&self) -> u64 { + self.rw.bytes_written() + } + + pub(crate) async fn read_blk( + &self, + blknum: u32, + ctx: &RequestContext, + ) -> Result { + match self.rw.read_blk(blknum).await? { + zero_padded_read_write::ReadResult::NeedsReadFromWriter { writer } => { + let cache = page_cache::get(); + match cache + .read_immutable_buf(self.page_cache_file_id, blknum, ctx) + .await + .map_err(|e| { + std::io::Error::new( + std::io::ErrorKind::Other, + // order path before error because error is anyhow::Error => might have many contexts + format!( + "ephemeral file: read immutable page #{}: {}: {:#}", + blknum, + self.rw.as_writer().file.path, + e, + ), + ) + })? 
{ + page_cache::ReadBufResult::Found(guard) => { + return Ok(BlockLease::PageReadGuard(guard)) + } + page_cache::ReadBufResult::NotFound(write_guard) => { + let write_guard = writer + .file + .read_exact_at_page(write_guard, blknum as u64 * PAGE_SZ as u64) + .await?; + let read_guard = write_guard.mark_valid(); + return Ok(BlockLease::PageReadGuard(read_guard)); + } + } + } + zero_padded_read_write::ReadResult::ServedFromZeroPaddedMutableTail { buffer } => { + Ok(BlockLease::EphemeralFileMutableTail(buffer)) + } + } + } +} + +impl Drop for RW { + fn drop(&mut self) { + // There might still be pages in the [`crate::page_cache`] for this file. + // We leave them there, [`crate::page_cache::PageCache::find_victim`] will evict them when needed. + + // unlink the file + let res = std::fs::remove_file(&self.rw.as_writer().file.path); + if let Err(e) = res { + if e.kind() != std::io::ErrorKind::NotFound { + // just never log the not found errors, we cannot do anything for them; on detach + // the tenant directory is already gone. 
+ // + // not found files might also be related to https://github.com/neondatabase/neon/issues/2442 + error!( + "could not remove ephemeral file '{}': {}", + self.rw.as_writer().file.path, + e + ); + } + } + } +} + +struct PreWarmingWriter { + nwritten_blocks: u32, + page_cache_file_id: page_cache::FileId, + file: VirtualFile, +} + +impl PreWarmingWriter { + fn new(page_cache_file_id: page_cache::FileId, file: VirtualFile) -> Self { + Self { + nwritten_blocks: 0, + page_cache_file_id, + file, + } + } +} + +impl crate::virtual_file::owned_buffers_io::write::OwnedAsyncWriter for PreWarmingWriter { + async fn write_all< + B: tokio_epoll_uring::BoundedBuf, + Buf: tokio_epoll_uring::IoBuf + Send, + >( + &mut self, + buf: B, + ctx: &RequestContext, + ) -> std::io::Result<(usize, B::Buf)> { + let buf = buf.slice(..); + let saved_bounds = buf.bounds(); // save for reconstructing the Slice from iobuf after the IO is done + let check_bounds_stuff_works = if cfg!(test) && cfg!(debug_assertions) { + Some(buf.to_vec()) + } else { + None + }; + let buflen = buf.len(); + assert_eq!( + buflen % PAGE_SZ, + 0, + "{buflen} ; we know TAIL_SZ is a PAGE_SZ multiple, and write_buffered_borrowed is used" + ); + + // Do the IO. 
+ let iobuf = match self.file.write_all(buf, ctx).await { + (iobuf, Ok(nwritten)) => { + assert_eq!(nwritten, buflen); + iobuf + } + (_, Err(e)) => { + return Err(std::io::Error::new( + ErrorKind::Other, + // order error before path because path is long and error is short + format!( + "ephemeral_file: write_blob: write-back tail self.nwritten_blocks={}, buflen={}, {:#}: {}", + self.nwritten_blocks, buflen, e, self.file.path, + ), + )); + } + }; + + // Reconstruct the Slice (the write path consumed the Slice and returned us the underlying IoBuf) + let buf = tokio_epoll_uring::Slice::from_buf_bounds(iobuf, saved_bounds); + if let Some(check_bounds_stuff_works) = check_bounds_stuff_works { + assert_eq!(&check_bounds_stuff_works, &*buf); + } + + // Pre-warm page cache with the contents. + // At least in isolated bulk ingest benchmarks (test_bulk_insert.py), the pre-warming + // benefits the code that writes InMemoryLayer=>L0 layers. + let nblocks = buflen / PAGE_SZ; + let nblocks32 = u32::try_from(nblocks).unwrap(); + let cache = page_cache::get(); + static CTX: Lazy = Lazy::new(|| { + RequestContext::new( + crate::task_mgr::TaskKind::EphemeralFilePreWarmPageCache, + crate::context::DownloadBehavior::Error, + ) + }); + for blknum_in_buffer in 0..nblocks { + let blk_in_buffer = &buf[blknum_in_buffer * PAGE_SZ..(blknum_in_buffer + 1) * PAGE_SZ]; + let blknum = self + .nwritten_blocks + .checked_add(blknum_in_buffer as u32) + .unwrap(); + match cache + .read_immutable_buf(self.page_cache_file_id, blknum, &CTX) + .await + { + Err(e) => { + error!("ephemeral_file write_blob failed to get immutable buf to pre-warm page cache: {e:?}"); + // fail gracefully, it's not the end of the world if we can't pre-warm the cache here + } + Ok(v) => match v { + page_cache::ReadBufResult::Found(_guard) => { + // This function takes &mut self, so, it shouldn't be possible to reach this point. 
+ unreachable!("we just wrote block {blknum} to the VirtualFile, which is owned by Self, \ + and this function takes &mut self, so, no concurrent read_blk is possible"); + } + page_cache::ReadBufResult::NotFound(mut write_guard) => { + write_guard.copy_from_slice(blk_in_buffer); + let _ = write_guard.mark_valid(); + } + }, + } + } + self.nwritten_blocks = self.nwritten_blocks.checked_add(nblocks32).unwrap(); + Ok((buflen, buf.into_inner())) + } +} diff --git a/pageserver/src/tenant/ephemeral_file/zero_padded_read_write.rs b/pageserver/src/tenant/ephemeral_file/zero_padded_read_write.rs new file mode 100644 index 0000000000..b37eafb52c --- /dev/null +++ b/pageserver/src/tenant/ephemeral_file/zero_padded_read_write.rs @@ -0,0 +1,130 @@ +//! The heart of how [`super::EphemeralFile`] does its reads and writes. +//! +//! # Writes +//! +//! [`super::EphemeralFile`] writes small, borrowed buffers using [`RW::write_all_borrowed`]. +//! The [`RW`] batches these into [`TAIL_SZ`] bigger writes, using [`owned_buffers_io::write::BufferedWriter`]. +//! +//! # Reads +//! +//! [`super::EphemeralFile`] always reads full [`PAGE_SZ`]ed blocks using [`RW::read_blk`]. +//! +//! The [`RW`] serves these reads either from the buffered writer's in-memory buffer +//! or redirects the caller to read from the underlying [`OwnedAsyncWriter`] +//! if the read is for the prefix that has already been flushed. +//! +//! # Current Usage +//! +//! The current user of this module is [`super::page_caching::RW`]. + +mod zero_padded; + +use crate::{ + context::RequestContext, + page_cache::PAGE_SZ, + virtual_file::owned_buffers_io::{ + self, + write::{Buffer, OwnedAsyncWriter}, + }, +}; + +const TAIL_SZ: usize = 64 * 1024; + +/// See module-level comment. 
+pub struct RW { + buffered_writer: owned_buffers_io::write::BufferedWriter< + zero_padded::Buffer, + owned_buffers_io::util::size_tracking_writer::Writer, + >, +} + +pub enum ReadResult<'a, W> { + NeedsReadFromWriter { writer: &'a W }, + ServedFromZeroPaddedMutableTail { buffer: &'a [u8; PAGE_SZ] }, +} + +impl RW +where + W: OwnedAsyncWriter, +{ + pub fn new(writer: W) -> Self { + let bytes_flushed_tracker = + owned_buffers_io::util::size_tracking_writer::Writer::new(writer); + let buffered_writer = owned_buffers_io::write::BufferedWriter::new( + bytes_flushed_tracker, + zero_padded::Buffer::default(), + ); + Self { buffered_writer } + } + + pub(crate) fn as_writer(&self) -> &W { + self.buffered_writer.as_inner().as_inner() + } + + pub async fn write_all_borrowed( + &mut self, + buf: &[u8], + ctx: &RequestContext, + ) -> std::io::Result { + self.buffered_writer.write_buffered_borrowed(buf, ctx).await + } + + pub fn bytes_written(&self) -> u64 { + let flushed_offset = self.buffered_writer.as_inner().bytes_written(); + let buffer: &zero_padded::Buffer = self.buffered_writer.inspect_buffer(); + flushed_offset + u64::try_from(buffer.pending()).unwrap() + } + + pub(crate) async fn read_blk(&self, blknum: u32) -> Result, std::io::Error> { + let flushed_offset = self.buffered_writer.as_inner().bytes_written(); + let buffer: &zero_padded::Buffer = self.buffered_writer.inspect_buffer(); + let buffered_offset = flushed_offset + u64::try_from(buffer.pending()).unwrap(); + let read_offset = (blknum as u64) * (PAGE_SZ as u64); + + // The trailing page ("block") might only be partially filled, + // yet the blob_io code relies on us to return a full PAGE_SZed slice anyway. + // Moreover, it has to be zero-padded, because when we still had + // a write-back page cache, it provided pre-zeroed pages, and blob_io came to rely on it. + // DeltaLayer probably has the same issue, not sure why it needs no special treatment. 
+ // => check here that the read doesn't go beyond this potentially trailing + // => the zero-padding is done in the `else` branch below + let blocks_written = if buffered_offset % (PAGE_SZ as u64) == 0 { + buffered_offset / (PAGE_SZ as u64) + } else { + (buffered_offset / (PAGE_SZ as u64)) + 1 + }; + if (blknum as u64) >= blocks_written { + return Err(std::io::Error::new(std::io::ErrorKind::Other, anyhow::anyhow!("read past end of ephemeral_file: read=0x{read_offset:x} buffered=0x{buffered_offset:x} flushed=0x{flushed_offset}"))); + } + + // assertions for the `if-else` below + assert_eq!( + flushed_offset % (TAIL_SZ as u64), 0, + "we only use write_buffered_borrowed to write to the buffered writer, so it's guaranteed that flushes happen buffer.cap()-sized chunks" + ); + assert_eq!( + flushed_offset % (PAGE_SZ as u64), + 0, + "the logic below can't handle if the page is spread across the flushed part and the buffer" + ); + + if read_offset < flushed_offset { + assert!(read_offset + (PAGE_SZ as u64) <= flushed_offset); + Ok(ReadResult::NeedsReadFromWriter { + writer: self.as_writer(), + }) + } else { + let read_offset_in_buffer = read_offset + .checked_sub(flushed_offset) + .expect("would have taken `if` branch instead of this one"); + let read_offset_in_buffer = usize::try_from(read_offset_in_buffer).unwrap(); + let zero_padded_slice = buffer.as_zero_padded_slice(); + let page = &zero_padded_slice[read_offset_in_buffer..(read_offset_in_buffer + PAGE_SZ)]; + Ok(ReadResult::ServedFromZeroPaddedMutableTail { + buffer: page + .try_into() + .expect("the slice above got it as page-size slice"), + }) + } + } +} diff --git a/pageserver/src/tenant/ephemeral_file/zero_padded_read_write/zero_padded.rs b/pageserver/src/tenant/ephemeral_file/zero_padded_read_write/zero_padded.rs new file mode 100644 index 0000000000..f90291bbf8 --- /dev/null +++ b/pageserver/src/tenant/ephemeral_file/zero_padded_read_write/zero_padded.rs @@ -0,0 +1,108 @@ +//! 
A [`crate::virtual_file::owned_buffers_io::write::Buffer`] whose +//! unwritten range is guaranteed to be zero-initialized. +//! This is used by [`crate::tenant::ephemeral_file::zero_padded_read_write::RW::read_blk`] +//! to serve page-sized reads of the trailing page when the trailing page has only been partially filled. + +use std::mem::MaybeUninit; + +/// See module-level comment. +pub struct Buffer { + allocation: Box<[u8; N]>, + written: usize, +} + +impl Default for Buffer { + fn default() -> Self { + Self { + allocation: Box::new( + // SAFETY: zeroed memory is a valid [u8; N] + unsafe { MaybeUninit::zeroed().assume_init() }, + ), + written: 0, + } + } +} + +impl Buffer { + #[inline(always)] + fn invariants(&self) { + // don't check by default, unoptimized is too expensive even for debug mode + if false { + debug_assert!(self.written <= N, "{}", self.written); + debug_assert!(self.allocation[self.written..N].iter().all(|v| *v == 0)); + } + } + + pub fn as_zero_padded_slice(&self) -> &[u8; N] { + &self.allocation + } +} + +impl crate::virtual_file::owned_buffers_io::write::Buffer for Buffer { + type IoBuf = Self; + + fn cap(&self) -> usize { + self.allocation.len() + } + + fn extend_from_slice(&mut self, other: &[u8]) { + self.invariants(); + let remaining = self.allocation.len() - self.written; + if other.len() > remaining { + panic!("calling extend_from_slice() with insufficient remaining capacity"); + } + self.allocation[self.written..(self.written + other.len())].copy_from_slice(other); + self.written += other.len(); + self.invariants(); + } + + fn pending(&self) -> usize { + self.written + } + + fn flush(self) -> tokio_epoll_uring::Slice { + self.invariants(); + let written = self.written; + tokio_epoll_uring::BoundedBuf::slice(self, 0..written) + } + + fn reuse_after_flush(iobuf: Self::IoBuf) -> Self { + let Self { + mut allocation, + written, + } = iobuf; + allocation[0..written].fill(0); + let new = Self { + allocation, + written: 0, + }; + 
new.invariants(); + new + } +} + +/// We have this trait impl so that the `flush` method in the `Buffer` impl above can produce a +/// [`tokio_epoll_uring::BoundedBuf::slice`] of the [`Self::written`] range of the data. +/// +/// Remember that bytes_init is generally _not_ a tracker of the amount +/// of valid data in the io buffer; we use `Slice` for that. +/// The `IoBuf` is _only_ for keeping track of uninitialized memory, a bit like MaybeUninit. +/// +/// SAFETY: +/// +/// The [`Self::allocation`] is stable becauses boxes are stable. +/// The memory is zero-initialized, so, bytes_init is always N. +unsafe impl tokio_epoll_uring::IoBuf for Buffer { + fn stable_ptr(&self) -> *const u8 { + self.allocation.as_ptr() + } + + fn bytes_init(&self) -> usize { + // Yes, N, not self.written; Read the full comment of this impl block! + N + } + + fn bytes_total(&self) -> usize { + N + } +} diff --git a/pageserver/src/tenant/layer_map.rs b/pageserver/src/tenant/layer_map.rs index 4c4cd90c99..2724a5cc07 100644 --- a/pageserver/src/tenant/layer_map.rs +++ b/pageserver/src/tenant/layer_map.rs @@ -588,7 +588,7 @@ impl LayerMap { let kr = Key::from_i128(current_key)..Key::from_i128(change_key); coverage.push((kr, current_val.take())); current_key = change_key; - current_val = change_val.clone(); + current_val.clone_from(&change_val); } // Add the final interval @@ -672,12 +672,12 @@ impl LayerMap { // Loop through the delta coverage and recurse on each part for (change_key, change_val) in version.delta_coverage.range(start..end) { // If there's a relevant delta in this part, add 1 and recurse down - if let Some(val) = current_val { + if let Some(val) = ¤t_val { if val.get_lsn_range().end > lsn.start { let kr = Key::from_i128(current_key)..Key::from_i128(change_key); let lr = lsn.start..val.get_lsn_range().start; if !kr.is_empty() { - let base_count = Self::is_reimage_worthy(&val, key) as usize; + let base_count = Self::is_reimage_worthy(val, key) as usize; let new_limit = 
limit.map(|l| l - base_count); let max_stacked_deltas_underneath = self.count_deltas(&kr, &lr, new_limit); max_stacked_deltas = std::cmp::max( @@ -689,17 +689,17 @@ impl LayerMap { } current_key = change_key; - current_val = change_val.clone(); + current_val.clone_from(&change_val); } // Consider the last part - if let Some(val) = current_val { + if let Some(val) = ¤t_val { if val.get_lsn_range().end > lsn.start { let kr = Key::from_i128(current_key)..Key::from_i128(end); let lr = lsn.start..val.get_lsn_range().start; if !kr.is_empty() { - let base_count = Self::is_reimage_worthy(&val, key) as usize; + let base_count = Self::is_reimage_worthy(val, key) as usize; let new_limit = limit.map(|l| l - base_count); let max_stacked_deltas_underneath = self.count_deltas(&kr, &lr, new_limit); max_stacked_deltas = std::cmp::max( @@ -916,6 +916,7 @@ mod tests { assert_eq!(lhs, rhs); } + #[cfg(test)] fn brute_force_range_search( layer_map: &LayerMap, key_range: Range, diff --git a/pageserver/src/tenant/metadata.rs b/pageserver/src/tenant/metadata.rs index 1736950d1f..39da713479 100644 --- a/pageserver/src/tenant/metadata.rs +++ b/pageserver/src/tenant/metadata.rs @@ -235,6 +235,12 @@ impl TimelineMetadata { let bytes = instance.to_bytes().unwrap(); Self::from_bytes(&bytes).unwrap() } + + pub(crate) fn apply(&mut self, update: &MetadataUpdate) { + self.body.disk_consistent_lsn = update.disk_consistent_lsn; + self.body.prev_record_lsn = update.prev_record_lsn; + self.body.latest_gc_cutoff_lsn = update.latest_gc_cutoff_lsn; + } } impl<'de> Deserialize<'de> for TimelineMetadata { @@ -259,6 +265,27 @@ impl Serialize for TimelineMetadata { } } +/// Parts of the metadata which are regularly modified. 
+pub(crate) struct MetadataUpdate { + disk_consistent_lsn: Lsn, + prev_record_lsn: Option, + latest_gc_cutoff_lsn: Lsn, +} + +impl MetadataUpdate { + pub(crate) fn new( + disk_consistent_lsn: Lsn, + prev_record_lsn: Option, + latest_gc_cutoff_lsn: Lsn, + ) -> Self { + Self { + disk_consistent_lsn, + prev_record_lsn, + latest_gc_cutoff_lsn, + } + } +} + #[cfg(test)] mod tests { use super::*; diff --git a/pageserver/src/tenant/mgr.rs b/pageserver/src/tenant/mgr.rs index b1b46d487b..006d501daa 100644 --- a/pageserver/src/tenant/mgr.rs +++ b/pageserver/src/tenant/mgr.rs @@ -2,6 +2,7 @@ //! page server. use camino::{Utf8DirEntry, Utf8Path, Utf8PathBuf}; +use futures::StreamExt; use itertools::Itertools; use pageserver_api::key::Key; use pageserver_api::models::LocationConfigMode; @@ -253,17 +254,15 @@ impl TenantsMap { } } +/// Precursor to deletion of a tenant dir: we do a fast rename to a tmp path, and then +/// the slower actual deletion in the background. +/// /// This is "safe" in that that it won't leave behind a partially deleted directory /// at the original path, because we rename with TEMP_FILE_SUFFIX before starting deleting /// the contents. /// /// This is pageserver-specific, as it relies on future processes after a crash to check /// for TEMP_FILE_SUFFIX when loading things. 
-async fn safe_remove_tenant_dir_all(path: impl AsRef) -> std::io::Result<()> { - let tmp_path = safe_rename_tenant_dir(path).await?; - fs::remove_dir_all(tmp_path).await -} - async fn safe_rename_tenant_dir(path: impl AsRef) -> std::io::Result { let parent = path .as_ref() @@ -286,6 +285,28 @@ async fn safe_rename_tenant_dir(path: impl AsRef) -> std::io::Result> = Lazy::new(|| std::sync::RwLock::new(TenantsMap::Initializing)); @@ -570,7 +591,11 @@ pub async fn init_tenant_mgr( ); TENANT.startup_scheduled.inc_by(tenant_configs.len() as u64); - // Construct `Tenant` objects and start them running + // Accumulate futures for writing tenant configs, so that we can execute in parallel + let mut config_write_futs = Vec::new(); + + // Update the location configs according to the re-attach response and persist them to disk + tracing::info!("Updating {} location configs", tenant_configs.len()); for (tenant_shard_id, location_conf) in tenant_configs { let tenant_dir_path = conf.tenant_path(&tenant_shard_id); @@ -597,18 +622,22 @@ pub async fn init_tenant_mgr( const DEFAULT_SECONDARY_CONF: SecondaryLocationConfig = SecondaryLocationConfig { warm: true }; - // Update the location config according to the re-attach response if let Some(tenant_modes) = &tenant_modes { // We have a generation map: treat it as the authority for whether // this tenant is really attached. 
match tenant_modes.get(&tenant_shard_id) { None => { info!(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), "Detaching tenant, control plane omitted it in re-attach response"); - if let Err(e) = safe_remove_tenant_dir_all(&tenant_dir_path).await { - error!(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), - "Failed to remove detached tenant directory '{tenant_dir_path}': {e:?}", - ); - } + + match safe_rename_tenant_dir(&tenant_dir_path).await { + Ok(tmp_path) => { + spawn_background_purge(tmp_path); + } + Err(e) => { + error!(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), + "Failed to move detached tenant directory '{tenant_dir_path}': {e:?}"); + } + }; // We deleted local content: move on to next tenant, don't try and spawn this one. continue; @@ -654,8 +683,32 @@ pub async fn init_tenant_mgr( // Presence of a generation number implies attachment: attach the tenant // if it wasn't already, and apply the generation number. 
- Tenant::persist_tenant_config(conf, &tenant_shard_id, &location_conf).await?; + config_write_futs.push(async move { + let r = Tenant::persist_tenant_config(conf, &tenant_shard_id, &location_conf).await; + (tenant_shard_id, location_conf, r) + }); + } + // Execute config writes with concurrency, to avoid bottlenecking on local FS write latency + tracing::info!( + "Writing {} location config files...", + config_write_futs.len() + ); + let config_write_results = futures::stream::iter(config_write_futs) + .buffer_unordered(16) + .collect::>() + .await; + + tracing::info!( + "Spawning {} tenant shard locations...", + config_write_results.len() + ); + // For those shards that have live configurations, construct `Tenant` or `SecondaryTenant` objects and start them running + for (tenant_shard_id, location_conf, config_write_result) in config_write_results { + // Errors writing configs are fatal + config_write_result?; + + let tenant_dir_path = conf.tenant_path(&tenant_shard_id); let shard_identity = location_conf.shard; let slot = match location_conf.mode { LocationMode::Attached(attached_conf) => { @@ -678,12 +731,19 @@ pub async fn init_tenant_mgr( } } } - LocationMode::Secondary(secondary_conf) => TenantSlot::Secondary(SecondaryTenant::new( - tenant_shard_id, - shard_identity, - location_conf.tenant_conf, - &secondary_conf, - )), + LocationMode::Secondary(secondary_conf) => { + info!( + tenant_id = %tenant_shard_id.tenant_id, + shard_id = %tenant_shard_id.shard_slug(), + "Starting secondary tenant" + ); + TenantSlot::Secondary(SecondaryTenant::new( + tenant_shard_id, + shard_identity, + location_conf.tenant_conf, + &secondary_conf, + )) + } }; tenants.insert(tenant_shard_id, slot); @@ -1410,9 +1470,15 @@ impl TenantManager { match tenant.current_state() { TenantState::Broken { .. } | TenantState::Stopping { .. 
} => { - // If a tenant is broken or stopping, DeleteTenantFlow can - // handle it: broken tenants proceed to delete, stopping tenants - // are checked for deletion already in progress. + // If deletion is already in progress, return success (the semantics of this + // function are to rerturn success afterr deletion is spawned in background). + // Otherwise fall through and let [`DeleteTenantFlow`] handle this state. + if DeleteTenantFlow::is_in_progress(&tenant) { + // The `delete_progress` lock is held: deletion is already happening + // in the bacckground + slot_guard.revert(); + return Ok(()); + } } _ => { tenant @@ -1686,7 +1752,7 @@ impl TenantManager { let tmp_path = safe_rename_tenant_dir(&local_tenant_directory) .await .with_context(|| format!("local tenant directory {local_tenant_directory:?} rename"))?; - self.spawn_background_purge(tmp_path); + spawn_background_purge(tmp_path); fail::fail_point!("shard-split-pre-finish", |_| Err(anyhow::anyhow!( "failpoint" @@ -1841,28 +1907,6 @@ impl TenantManager { shutdown_all_tenants0(self.tenants).await } - /// When we have moved a tenant's content to a temporary directory, we may delete it lazily in - /// the background, and thereby avoid blocking any API requests on this deletion completing. - fn spawn_background_purge(&self, tmp_path: Utf8PathBuf) { - // Although we are cleaning up the tenant, this task is not meant to be bound by the lifetime of the tenant in memory. - // After a tenant is detached, there are no more task_mgr tasks for that tenant_id. 
- let task_tenant_id = None; - - task_mgr::spawn( - task_mgr::BACKGROUND_RUNTIME.handle(), - TaskKind::MgmtRequest, - task_tenant_id, - None, - "tenant_files_delete", - false, - async move { - fs::remove_dir_all(tmp_path.as_path()) - .await - .with_context(|| format!("tenant directory {:?} deletion", tmp_path)) - }, - ); - } - pub(crate) async fn detach_tenant( &self, conf: &'static PageServerConf, @@ -1879,7 +1923,7 @@ impl TenantManager { deletion_queue_client, ) .await?; - self.spawn_background_purge(tmp_path); + spawn_background_purge(tmp_path); Ok(()) } diff --git a/pageserver/src/tenant/remote_timeline_client.rs b/pageserver/src/tenant/remote_timeline_client.rs index 3879135f26..a54e93c96b 100644 --- a/pageserver/src/tenant/remote_timeline_client.rs +++ b/pageserver/src/tenant/remote_timeline_client.rs @@ -202,12 +202,15 @@ use std::sync::atomic::{AtomicU32, Ordering}; use std::sync::{Arc, Mutex}; use std::time::Duration; -use remote_storage::{DownloadError, GenericRemoteStorage, RemotePath, TimeoutOrCancel}; +use remote_storage::{ + DownloadError, GenericRemoteStorage, ListingMode, RemotePath, TimeoutOrCancel, +}; use std::ops::DerefMut; use tracing::{debug, error, info, instrument, warn}; use tracing::{info_span, Instrument}; use utils::lsn::Lsn; +use crate::context::RequestContext; use crate::deletion_queue::{DeletionQueueClient, DeletionQueueError}; use crate::metrics::{ MeasureRemoteOp, RemoteOpFileKind, RemoteOpKind, RemoteTimelineClientMetrics, @@ -236,11 +239,14 @@ use utils::id::{TenantId, TimelineId}; use self::index::IndexPart; +use super::metadata::MetadataUpdate; use super::storage_layer::{Layer, LayerFileName, ResidentLayer}; use super::upload_queue::SetDeletedFlagProgress; use super::Generation; -pub(crate) use download::{is_temp_download_file, list_remote_timelines}; +pub(crate) use download::{ + download_index_part, is_temp_download_file, list_remote_tenant_shards, list_remote_timelines, +}; pub(crate) use index::LayerFileMetadata; // 
Occasional network issues and such can cause remote operations to fail, and @@ -469,7 +475,7 @@ impl RemoteTimelineClient { }, ); - let index_part = download::download_index_part( + let (index_part, _index_generation) = download::download_index_part( &self.storage_impl, &self.tenant_shard_id, &self.timeline_id, @@ -500,6 +506,7 @@ impl RemoteTimelineClient { layer_file_name: &LayerFileName, layer_metadata: &LayerFileMetadata, cancel: &CancellationToken, + ctx: &RequestContext, ) -> anyhow::Result { let downloaded_size = { let _unfinished_gauge_guard = self.metrics.call_begin( @@ -517,6 +524,7 @@ impl RemoteTimelineClient { layer_file_name, layer_metadata, cancel, + ctx, ) .measure_remote_op( RemoteOpFileKind::Layer, @@ -536,9 +544,10 @@ impl RemoteTimelineClient { // Upload operations. // - /// /// Launch an index-file upload operation in the background, with - /// updated metadata. + /// fully updated metadata. + /// + /// This should only be used to upload initial metadata to remote storage. /// /// The upload will be added to the queue immediately, but it /// won't be performed until all previously scheduled layer file @@ -550,7 +559,7 @@ impl RemoteTimelineClient { /// If there were any changes to the list of files, i.e. if any /// layer file uploads were scheduled, since the last index file /// upload, those will be included too. - pub fn schedule_index_upload_for_metadata_update( + pub fn schedule_index_upload_for_full_metadata_update( self: &Arc, metadata: &TimelineMetadata, ) -> anyhow::Result<()> { @@ -566,6 +575,27 @@ impl RemoteTimelineClient { Ok(()) } + /// Launch an index-file upload operation in the background, with only parts of the metadata + /// updated. + /// + /// This is the regular way of updating metadata on layer flushes or Gc. + /// + /// Using this lighter update mechanism allows for reparenting and detaching without changes to + /// `index_part.json`, while being more clear on what values update regularly. 
+ pub(crate) fn schedule_index_upload_for_metadata_update( + self: &Arc, + update: &MetadataUpdate, + ) -> anyhow::Result<()> { + let mut guard = self.upload_queue.lock().unwrap(); + let upload_queue = guard.initialized_mut()?; + + upload_queue.latest_metadata.apply(update); + + self.schedule_index_upload(upload_queue, upload_queue.latest_metadata.clone()); + + Ok(()) + } + /// /// Launch an index-file upload operation in the background, if necessary. /// @@ -1122,7 +1152,7 @@ impl RemoteTimelineClient { // and retry will arrive to different pageserver there wont be any traces of it on remote storage let timeline_storage_path = remote_timeline_path(&self.tenant_shard_id, &self.timeline_id); - // Execute all pending deletions, so that when we proceed to do a list_prefixes below, we aren't + // Execute all pending deletions, so that when we proceed to do a listing below, we aren't // taking the burden of listing all the layers that we already know we should delete. self.flush_deletion_queue().await?; @@ -1131,14 +1161,20 @@ impl RemoteTimelineClient { let remaining = download_retry( || async { self.storage_impl - .list_files(Some(&timeline_storage_path), None, &cancel) + .list( + Some(&timeline_storage_path), + ListingMode::NoDelimiter, + None, + &cancel, + ) .await }, "list remaining files", &cancel, ) .await - .context("list files remaining files")?; + .context("list files remaining files")? 
+ .keys; // We will delete the current index_part object last, since it acts as a deletion // marker via its deleted_at attribute @@ -1685,6 +1721,11 @@ impl RemoteTimelineClient { } } +pub fn remote_tenant_path(tenant_shard_id: &TenantShardId) -> RemotePath { + let path = format!("tenants/{tenant_shard_id}"); + RemotePath::from_string(&path).expect("Failed to construct path") +} + pub fn remote_timelines_path(tenant_shard_id: &TenantShardId) -> RemotePath { let path = format!("tenants/{tenant_shard_id}/{TIMELINES_SEGMENT_NAME}"); RemotePath::from_string(&path).expect("Failed to construct path") @@ -2024,7 +2065,7 @@ mod tests { // Schedule upload of index. Check that it is queued let metadata = dummy_metadata(Lsn(0x20)); client - .schedule_index_upload_for_metadata_update(&metadata) + .schedule_index_upload_for_full_metadata_update(&metadata) .unwrap(); { let mut guard = client.upload_queue.lock().unwrap(); diff --git a/pageserver/src/tenant/remote_timeline_client/download.rs b/pageserver/src/tenant/remote_timeline_client/download.rs index 6ee8ad7155..b038f264f5 100644 --- a/pageserver/src/tenant/remote_timeline_client/download.rs +++ b/pageserver/src/tenant/remote_timeline_client/download.rs @@ -5,6 +5,7 @@ use std::collections::HashSet; use std::future::Future; +use std::str::FromStr; use anyhow::{anyhow, Context}; use camino::{Utf8Path, Utf8PathBuf}; @@ -17,6 +18,7 @@ use tracing::warn; use utils::backoff; use crate::config::PageServerConf; +use crate::context::RequestContext; use crate::span::debug_assert_current_span_has_tenant_and_timeline_id; use crate::tenant::remote_timeline_client::{remote_layer_path, remote_timelines_path}; use crate::tenant::storage_layer::LayerFileName; @@ -25,13 +27,13 @@ use crate::virtual_file::{on_fatal_io_error, MaybeFatalIo, VirtualFile}; use crate::TEMP_FILE_SUFFIX; use remote_storage::{DownloadError, GenericRemoteStorage, ListingMode, RemotePath}; use utils::crashsafe::path_with_suffix_extension; -use utils::id::TimelineId; 
+use utils::id::{TenantId, TimelineId}; use super::index::{IndexPart, LayerFileMetadata}; use super::{ parse_remote_index_path, remote_index_path, remote_initdb_archive_path, - remote_initdb_preserved_archive_path, FAILED_DOWNLOAD_WARN_THRESHOLD, FAILED_REMOTE_OP_RETRIES, - INITDB_PATH, + remote_initdb_preserved_archive_path, remote_tenant_path, FAILED_DOWNLOAD_WARN_THRESHOLD, + FAILED_REMOTE_OP_RETRIES, INITDB_PATH, }; /// @@ -39,6 +41,7 @@ use super::{ /// in the metadata. (In the future, we might do more cross-checks, like CRC validation) /// /// Returns the size of the downloaded file. +#[allow(clippy::too_many_arguments)] pub async fn download_layer_file<'a>( conf: &'static PageServerConf, storage: &'a GenericRemoteStorage, @@ -47,6 +50,7 @@ pub async fn download_layer_file<'a>( layer_file_name: &'a LayerFileName, layer_metadata: &'a LayerFileMetadata, cancel: &CancellationToken, + ctx: &RequestContext, ) -> Result { debug_assert_current_span_has_tenant_and_timeline_id(); @@ -74,7 +78,7 @@ pub async fn download_layer_file<'a>( let temp_file_path = path_with_suffix_extension(&local_path, TEMP_DOWNLOAD_EXTENSION); let bytes_amount = download_retry( - || async { download_object(storage, &remote_path, &temp_file_path, cancel).await }, + || async { download_object(storage, &remote_path, &temp_file_path, cancel, ctx).await }, &format!("download {remote_path:?}"), cancel, ) @@ -132,6 +136,7 @@ async fn download_object<'a>( src_path: &RemotePath, dst_path: &Utf8PathBuf, cancel: &CancellationToken, + #[cfg_attr(target_os = "macos", allow(unused_variables))] ctx: &RequestContext, ) -> Result { let res = match crate::virtual_file::io_engine::get() { crate::virtual_file::io_engine::IoEngine::NotSet => panic!("unset"), @@ -182,6 +187,7 @@ async fn download_object<'a>( #[cfg(target_os = "linux")] crate::virtual_file::io_engine::IoEngine::TokioEpollUring => { use crate::virtual_file::owned_buffers_io::{self, util::size_tracking_writer}; + use bytes::BytesMut; async { let 
destination_file = VirtualFile::create(dst_path) .await @@ -194,10 +200,10 @@ async fn download_object<'a>( // There's chunks_vectored() on the stream. let (bytes_amount, destination_file) = async { let size_tracking = size_tracking_writer::Writer::new(destination_file); - let mut buffered = owned_buffers_io::write::BufferedWriter::< - { super::BUFFER_SIZE }, - _, - >::new(size_tracking); + let mut buffered = owned_buffers_io::write::BufferedWriter::::new( + size_tracking, + BytesMut::with_capacity(super::BUFFER_SIZE), + ); while let Some(res) = futures::StreamExt::next(&mut download.download_stream).await { @@ -206,10 +212,10 @@ async fn download_object<'a>( Err(e) => return Err(e), }; buffered - .write_buffered(tokio_epoll_uring::BoundedBuf::slice_full(chunk)) + .write_buffered(tokio_epoll_uring::BoundedBuf::slice_full(chunk), ctx) .await?; } - let size_tracking = buffered.flush_and_into_inner().await?; + let size_tracking = buffered.flush_and_into_inner(ctx).await?; Ok(size_tracking.into_inner()) } .await?; @@ -252,42 +258,31 @@ pub(crate) fn is_temp_download_file(path: &Utf8Path) -> bool { } } -/// List timelines of given tenant in remote storage -pub async fn list_remote_timelines( +async fn list_identifiers( storage: &GenericRemoteStorage, - tenant_shard_id: TenantShardId, + prefix: RemotePath, cancel: CancellationToken, -) -> anyhow::Result<(HashSet, HashSet)> { - let remote_path = remote_timelines_path(&tenant_shard_id); - - fail::fail_point!("storage-sync-list-remote-timelines", |_| { - anyhow::bail!("storage-sync-list-remote-timelines"); - }); - +) -> anyhow::Result<(HashSet, HashSet)> +where + T: FromStr + Eq + std::hash::Hash, +{ let listing = download_retry_forever( - || { - storage.list( - Some(&remote_path), - ListingMode::WithDelimiter, - None, - &cancel, - ) - }, - &format!("list timelines for {tenant_shard_id}"), + || storage.list(Some(&prefix), ListingMode::WithDelimiter, None, &cancel), + &format!("list identifiers in prefix {prefix}"), &cancel, 
) .await?; - let mut timeline_ids = HashSet::new(); + let mut parsed_ids = HashSet::new(); let mut other_prefixes = HashSet::new(); - for timeline_remote_storage_key in listing.prefixes { - let object_name = timeline_remote_storage_key.object_name().ok_or_else(|| { - anyhow::anyhow!("failed to get timeline id for remote tenant {tenant_shard_id}") + for id_remote_storage_key in listing.prefixes { + let object_name = id_remote_storage_key.object_name().ok_or_else(|| { + anyhow::anyhow!("failed to get object name for key {id_remote_storage_key}") })?; - match object_name.parse::() { - Ok(t) => timeline_ids.insert(t), + match object_name.parse::() { + Ok(t) => parsed_ids.insert(t), Err(_) => other_prefixes.insert(object_name.to_string()), }; } @@ -299,7 +294,31 @@ pub async fn list_remote_timelines( other_prefixes.insert(object_name.to_string()); } - Ok((timeline_ids, other_prefixes)) + Ok((parsed_ids, other_prefixes)) +} + +/// List shards of given tenant in remote storage +pub(crate) async fn list_remote_tenant_shards( + storage: &GenericRemoteStorage, + tenant_id: TenantId, + cancel: CancellationToken, +) -> anyhow::Result<(HashSet, HashSet)> { + let remote_path = remote_tenant_path(&TenantShardId::unsharded(tenant_id)); + list_identifiers::(storage, remote_path, cancel).await +} + +/// List timelines of given tenant shard in remote storage +pub async fn list_remote_timelines( + storage: &GenericRemoteStorage, + tenant_shard_id: TenantShardId, + cancel: CancellationToken, +) -> anyhow::Result<(HashSet, HashSet)> { + fail::fail_point!("storage-sync-list-remote-timelines", |_| { + anyhow::bail!("storage-sync-list-remote-timelines"); + }); + + let remote_path = remote_timelines_path(&tenant_shard_id).add_trailing_slash(); + list_identifiers::(storage, remote_path, cancel).await } async fn do_download_index_part( @@ -308,7 +327,7 @@ async fn do_download_index_part( timeline_id: &TimelineId, index_generation: Generation, cancel: &CancellationToken, -) -> Result { +) -> 
Result<(IndexPart, Generation), DownloadError> { let remote_path = remote_index_path(tenant_shard_id, timeline_id, index_generation); let index_part_bytes = download_retry_forever( @@ -333,7 +352,7 @@ async fn do_download_index_part( .with_context(|| format!("deserialize index part file at {remote_path:?}")) .map_err(DownloadError::Other)?; - Ok(index_part) + Ok((index_part, index_generation)) } /// index_part.json objects are suffixed with a generation number, so we cannot @@ -342,13 +361,13 @@ async fn do_download_index_part( /// In this function we probe for the most recent index in a generation <= our current generation. /// See "Finding the remote indices for timelines" in docs/rfcs/025-generation-numbers.md #[tracing::instrument(skip_all, fields(generation=?my_generation))] -pub(super) async fn download_index_part( +pub(crate) async fn download_index_part( storage: &GenericRemoteStorage, tenant_shard_id: &TenantShardId, timeline_id: &TimelineId, my_generation: Generation, cancel: &CancellationToken, -) -> Result { +) -> Result<(IndexPart, Generation), DownloadError> { debug_assert_current_span_has_tenant_and_timeline_id(); if my_generation.is_none() { @@ -417,11 +436,16 @@ pub(super) async fn download_index_part( let index_prefix = remote_index_path(tenant_shard_id, timeline_id, Generation::none()); let indices = download_retry( - || async { storage.list_files(Some(&index_prefix), None, cancel).await }, + || async { + storage + .list(Some(&index_prefix), ListingMode::NoDelimiter, None, cancel) + .await + }, "list index_part files", cancel, ) - .await?; + .await? + .keys; // General case logic for which index to use: the latest index whose generation // is <= our own. 
See "Finding the remote indices for timelines" in docs/rfcs/025-generation-numbers.md diff --git a/pageserver/src/tenant/remote_timeline_client/upload.rs b/pageserver/src/tenant/remote_timeline_client/upload.rs index 137fe48b73..0227331953 100644 --- a/pageserver/src/tenant/remote_timeline_client/upload.rs +++ b/pageserver/src/tenant/remote_timeline_client/upload.rs @@ -167,7 +167,7 @@ pub(crate) async fn time_travel_recover_tenant( let warn_after = 3; let max_attempts = 10; let mut prefixes = Vec::with_capacity(2); - if tenant_shard_id.is_zero() { + if tenant_shard_id.is_shard_zero() { // Also recover the unsharded prefix for a shard of zero: // - if the tenant is totally unsharded, the unsharded prefix contains all the data // - if the tenant is sharded, we still want to recover the initdb data, but we only diff --git a/pageserver/src/tenant/secondary.rs b/pageserver/src/tenant/secondary.rs index 19f36c722e..5c46df268a 100644 --- a/pageserver/src/tenant/secondary.rs +++ b/pageserver/src/tenant/secondary.rs @@ -7,6 +7,7 @@ use std::{sync::Arc, time::SystemTime}; use crate::{ config::PageServerConf, + context::RequestContext, disk_usage_eviction_task::DiskUsageEvictionInfo, task_mgr::{self, TaskKind, BACKGROUND_RUNTIME}, virtual_file::MaybeFatalIo, @@ -316,9 +317,13 @@ pub fn spawn_tasks( let (upload_req_tx, upload_req_rx) = tokio::sync::mpsc::channel::>(16); + let downloader_task_ctx = RequestContext::new( + TaskKind::SecondaryDownloads, + crate::context::DownloadBehavior::Download, + ); task_mgr::spawn( BACKGROUND_RUNTIME.handle(), - TaskKind::SecondaryDownloads, + downloader_task_ctx.task_kind(), None, None, "secondary tenant downloads", @@ -330,6 +335,7 @@ pub fn spawn_tasks( download_req_rx, bg_jobs_clone, cancel_clone, + downloader_task_ctx, ) .await; diff --git a/pageserver/src/tenant/secondary/downloader.rs b/pageserver/src/tenant/secondary/downloader.rs index 5b29c126d1..fb8907b5a8 100644 --- a/pageserver/src/tenant/secondary/downloader.rs +++ 
b/pageserver/src/tenant/secondary/downloader.rs @@ -8,6 +8,7 @@ use std::{ use crate::{ config::PageServerConf, + context::RequestContext, disk_usage_eviction_task::{ finite_f32, DiskUsageEvictionInfo, EvictionCandidate, EvictionLayer, EvictionSecondaryLayer, }, @@ -30,7 +31,10 @@ use crate::{ use super::{ heatmap::HeatMapLayer, - scheduler::{self, Completion, JobGenerator, SchedulingResult, TenantBackgroundJobs}, + scheduler::{ + self, period_jitter, period_warmup, Completion, JobGenerator, SchedulingResult, + TenantBackgroundJobs, + }, SecondaryTenant, }; @@ -44,7 +48,6 @@ use chrono::format::{DelayedFormat, StrftimeItems}; use futures::Future; use pageserver_api::models::SecondaryProgress; use pageserver_api::shard::TenantShardId; -use rand::Rng; use remote_storage::{DownloadError, Etag, GenericRemoteStorage}; use tokio_util::sync::CancellationToken; @@ -74,12 +77,14 @@ pub(super) async fn downloader_task( command_queue: tokio::sync::mpsc::Receiver>, background_jobs_can_start: Barrier, cancel: CancellationToken, + root_ctx: RequestContext, ) { let concurrency = tenant_manager.get_conf().secondary_download_concurrency; let generator = SecondaryDownloader { tenant_manager, remote_storage, + root_ctx, }; let mut scheduler = Scheduler::new(generator, concurrency); @@ -92,6 +97,7 @@ pub(super) async fn downloader_task( struct SecondaryDownloader { tenant_manager: Arc, remote_storage: GenericRemoteStorage, + root_ctx: RequestContext, } #[derive(Debug, Clone)] @@ -270,7 +276,7 @@ impl JobGenerator SchedulingResult { @@ -301,18 +307,16 @@ impl JobGenerator next_download { Some(PendingDownload { secondary_state: secondary_tenant, last_download, @@ -367,11 +371,12 @@ impl JobGenerator { @@ -485,7 +490,7 @@ impl<'a> TenantDownloader<'a> { } } - async fn download(&self) -> Result<(), UpdateError> { + async fn download(&self, ctx: &RequestContext) -> Result<(), UpdateError> { debug_assert_current_span_has_tenant_id(); // For the duration of a download, we must hold the 
SecondaryTenant::gate, to ensure @@ -560,7 +565,7 @@ impl<'a> TenantDownloader<'a> { } let timeline_id = timeline.timeline_id; - self.download_timeline(timeline) + self.download_timeline(timeline, ctx) .instrument(tracing::info_span!( "secondary_download_timeline", tenant_id=%tenant_shard_id.tenant_id, @@ -647,6 +652,12 @@ impl<'a> TenantDownloader<'a> { progress.bytes_downloaded += layer_byte_count; progress.layers_downloaded += layer_count; } + + for delete_timeline in &delete_timelines { + // We haven't removed from disk yet, but optimistically remove from in-memory state: if removal + // from disk fails that will be a fatal error. + detail.timelines.remove(delete_timeline); + } } // Execute accumulated deletions @@ -710,13 +721,14 @@ impl<'a> TenantDownloader<'a> { .await .map_err(UpdateError::from)?; + SECONDARY_MODE.download_heatmap.inc(); + if Some(&download.etag) == prev_etag { Ok(HeatMapDownload::Unmodified) } else { let mut heatmap_bytes = Vec::new(); let mut body = tokio_util::io::StreamReader::new(download.download_stream); let _size = tokio::io::copy_buf(&mut body, &mut heatmap_bytes).await?; - SECONDARY_MODE.download_heatmap.inc(); Ok(HeatMapDownload::Modified(HeatMapModified { etag: download.etag, last_modified: download.last_modified, @@ -735,7 +747,11 @@ impl<'a> TenantDownloader<'a> { .and_then(|x| x) } - async fn download_timeline(&self, timeline: HeatMapTimeline) -> Result<(), UpdateError> { + async fn download_timeline( + &self, + timeline: HeatMapTimeline, + ctx: &RequestContext, + ) -> Result<(), UpdateError> { debug_assert_current_span_has_tenant_and_timeline_id(); let tenant_shard_id = self.secondary_state.get_tenant_shard_id(); let timeline_path = self @@ -868,6 +884,7 @@ impl<'a> TenantDownloader<'a> { &layer.name, &LayerFileMetadata::from(&layer.metadata), &self.secondary_state.cancel, + ctx, ) .await { diff --git a/pageserver/src/tenant/secondary/heatmap_uploader.rs b/pageserver/src/tenant/secondary/heatmap_uploader.rs index 
39d088ffc3..352409f5fc 100644 --- a/pageserver/src/tenant/secondary/heatmap_uploader.rs +++ b/pageserver/src/tenant/secondary/heatmap_uploader.rs @@ -20,12 +20,14 @@ use crate::{ use futures::Future; use pageserver_api::shard::TenantShardId; -use rand::Rng; use remote_storage::{GenericRemoteStorage, TimeoutOrCancel}; use super::{ heatmap::HeatMapTenant, - scheduler::{self, JobGenerator, RunningJob, SchedulingResult, TenantBackgroundJobs}, + scheduler::{ + self, period_jitter, period_warmup, JobGenerator, RunningJob, SchedulingResult, + TenantBackgroundJobs, + }, CommandRequest, UploadCommand, }; use tokio_util::sync::CancellationToken; @@ -181,15 +183,11 @@ impl JobGenerator let state = self .tenants .entry(*tenant.get_tenant_shard_id()) - .or_insert_with(|| { - let jittered_period = rand::thread_rng().gen_range(Duration::ZERO..period); - - UploaderTenantState { - tenant: Arc::downgrade(&tenant), - last_upload: None, - next_upload: Some(now.checked_add(jittered_period).unwrap_or(now)), - last_digest: None, - } + .or_insert_with(|| UploaderTenantState { + tenant: Arc::downgrade(&tenant), + last_upload: None, + next_upload: Some(now.checked_add(period_warmup(period)).unwrap_or(now)), + last_digest: None, }); // Decline to do the upload if insufficient time has passed @@ -274,7 +272,7 @@ impl JobGenerator let next_upload = tenant .get_heatmap_period() - .and_then(|period| now.checked_add(period)); + .and_then(|period| now.checked_add(period_jitter(period, 5))); WriteComplete { tenant_shard_id: *tenant.get_tenant_shard_id(), diff --git a/pageserver/src/tenant/secondary/scheduler.rs b/pageserver/src/tenant/secondary/scheduler.rs index 3bd7be782e..3d042f4513 100644 --- a/pageserver/src/tenant/secondary/scheduler.rs +++ b/pageserver/src/tenant/secondary/scheduler.rs @@ -1,4 +1,5 @@ use futures::Future; +use rand::Rng; use std::{ collections::HashMap, marker::PhantomData, @@ -19,6 +20,26 @@ use super::{CommandRequest, CommandResponse}; const MAX_SCHEDULING_INTERVAL: 
Duration = Duration::from_secs(10); const MIN_SCHEDULING_INTERVAL: Duration = Duration::from_secs(1); +/// Jitter a Duration by an integer percentage. Returned values are uniform +/// in the range 100-pct..100+pct (i.e. a 5% jitter is 5% either way: a ~10% range) +pub(super) fn period_jitter(d: Duration, pct: u32) -> Duration { + if d == Duration::ZERO { + d + } else { + rand::thread_rng().gen_range((d * (100 - pct)) / 100..(d * (100 + pct)) / 100) + } +} + +/// When a periodic task first starts, it should wait for some time in the range 0..period, so +/// that starting many such tasks at the same time spreads them across the time range. +pub(super) fn period_warmup(period: Duration) -> Duration { + if period == Duration::ZERO { + period + } else { + rand::thread_rng().gen_range(Duration::ZERO..period) + } +} + /// Scheduling helper for background work across many tenants. /// /// Systems that need to run background work across many tenants may use this type diff --git a/pageserver/src/tenant/size.rs b/pageserver/src/tenant/size.rs index ad79b74d8b..64fff5536c 100644 --- a/pageserver/src/tenant/size.rs +++ b/pageserver/src/tenant/size.rs @@ -189,7 +189,9 @@ pub(super) async fn gather_inputs( // than a space bound (horizon cutoff). This means that if someone drops a database and waits for their // PITR interval, they will see synthetic size decrease, even if we are still storing data inside // horizon_cutoff. - let mut next_gc_cutoff = gc_info.pitr_cutoff; + let pitr_cutoff = gc_info.cutoffs.pitr; + let horizon_cutoff = gc_info.cutoffs.horizon; + let mut next_gc_cutoff = pitr_cutoff; // If the caller provided a shorter retention period, use that instead of the GC cutoff. 
let retention_param_cutoff = if let Some(max_retention_period) = max_retention_period { @@ -216,6 +218,8 @@ pub(super) async fn gather_inputs( .map(|lsn| (lsn, LsnKind::BranchPoint)) .collect::>(); + drop(gc_info); + // Add branch points we collected earlier, just in case there were any that were // not present in retain_lsns. We will remove any duplicates below later. if let Some(this_branchpoints) = branchpoints.get(&timeline_id) { @@ -294,8 +298,8 @@ pub(super) async fn gather_inputs( last_record: last_record_lsn, // this is not used above, because it might not have updated recently enough latest_gc_cutoff: *timeline.get_latest_gc_cutoff_lsn(), - horizon_cutoff: gc_info.horizon_cutoff, - pitr_cutoff: gc_info.pitr_cutoff, + horizon_cutoff, + pitr_cutoff, next_gc_cutoff, retention_param_cutoff, }); diff --git a/pageserver/src/tenant/storage_layer.rs b/pageserver/src/tenant/storage_layer.rs index 9a2b086828..4f1b56ef9f 100644 --- a/pageserver/src/tenant/storage_layer.rs +++ b/pageserver/src/tenant/storage_layer.rs @@ -118,6 +118,7 @@ pub(crate) struct ValuesReconstructState { pub(crate) keys: HashMap>, keys_done: KeySpaceRandomAccum, + layers_visited: u32, } impl ValuesReconstructState { @@ -125,6 +126,7 @@ impl ValuesReconstructState { Self { keys: HashMap::new(), keys_done: KeySpaceRandomAccum::new(), + layers_visited: 0, } } @@ -138,6 +140,37 @@ impl ValuesReconstructState { } } + pub(crate) fn on_layer_visited(&mut self) { + self.layers_visited += 1; + } + + pub(crate) fn get_layers_visited(&self) -> u32 { + self.layers_visited + } + + /// This function is called after reading a keyspace from a layer. + /// It checks if the read path has now moved past the cached Lsn for any keys. + /// + /// Implementation note: We intentionally iterate over the keys for which we've + /// already collected some reconstruct data. This avoids scaling complexity with + /// the size of the search space. 
+ pub(crate) fn on_lsn_advanced(&mut self, keyspace: &KeySpace, advanced_to: Lsn) { + for (key, value) in self.keys.iter_mut() { + if !keyspace.contains(key) { + continue; + } + + if let Ok(state) = value { + if state.situation != ValueReconstructSituation::Complete + && state.get_cached_lsn() >= Some(advanced_to) + { + state.situation = ValueReconstructSituation::Complete; + self.keys_done.add_key(*key); + } + } + } + } + /// Update the state collected for a given key. /// Returns true if this was the last value needed for the key and false otherwise. /// @@ -162,11 +195,18 @@ impl ValuesReconstructState { true } Value::WalRecord(rec) => { - let reached_cache = - state.get_cached_lsn().map(|clsn| clsn + 1) == Some(lsn); + debug_assert!( + Some(lsn) > state.get_cached_lsn(), + "Attempt to collect a record below cached LSN for walredo: {} < {}", + lsn, + state + .get_cached_lsn() + .expect("Assertion can only fire if a cached lsn is present") + ); + let will_init = rec.will_init(); state.records.push((lsn, rec)); - will_init || reached_cache + will_init } }, }; diff --git a/pageserver/src/tenant/storage_layer/delta_layer.rs b/pageserver/src/tenant/storage_layer/delta_layer.rs index 466d95f46d..b5538dff3a 100644 --- a/pageserver/src/tenant/storage_layer/delta_layer.rs +++ b/pageserver/src/tenant/storage_layer/delta_layer.rs @@ -20,8 +20,8 @@ //! 000000067F000032BE0000400000000020B6-000000067F000032BE0000400000000030B6__000000578C6B29-0000000057A50051 //! ``` //! -//! Every delta file consists of three parts: "summary", "index", and -//! "values". The summary is a fixed size header at the beginning of the file, +//! Every delta file consists of three parts: "summary", "values", and +//! "index". The summary is a fixed size header at the beginning of the file, //! and it contains basic information about the layer, and offsets to the other //! parts. The "index" is a B-tree, mapping from Key and LSN to an offset in the //! "values" part. 
The actual page images and WAL records are stored in the @@ -217,6 +217,7 @@ pub struct DeltaLayerInner { // values copied from summary index_start_blk: u32, index_root_blk: u32, + lsn_range: Range, file: VirtualFile, file_id: FileId, @@ -427,9 +428,15 @@ impl DeltaLayerWriterInner { /// /// The values must be appended in key, lsn order. /// - async fn put_value(&mut self, key: Key, lsn: Lsn, val: Value) -> anyhow::Result<()> { + async fn put_value( + &mut self, + key: Key, + lsn: Lsn, + val: Value, + ctx: &RequestContext, + ) -> anyhow::Result<()> { let (_, res) = self - .put_value_bytes(key, lsn, Value::ser(&val)?, val.will_init()) + .put_value_bytes(key, lsn, Value::ser(&val)?, val.will_init(), ctx) .await; res } @@ -440,9 +447,10 @@ impl DeltaLayerWriterInner { lsn: Lsn, val: Vec, will_init: bool, + ctx: &RequestContext, ) -> (Vec, anyhow::Result<()>) { assert!(self.lsn_range.start <= lsn); - let (val, res) = self.blob_writer.write_blob(val).await; + let (val, res) = self.blob_writer.write_blob(val, ctx).await; let off = match res { Ok(off) => off, Err(e) => return (val, Err(anyhow::anyhow!(e))), @@ -462,18 +470,23 @@ impl DeltaLayerWriterInner { /// /// Finish writing the delta layer. 
/// - async fn finish(self, key_end: Key, timeline: &Arc) -> anyhow::Result { + async fn finish( + self, + key_end: Key, + timeline: &Arc, + ctx: &RequestContext, + ) -> anyhow::Result { let index_start_blk = ((self.blob_writer.size() + PAGE_SZ as u64 - 1) / PAGE_SZ as u64) as u32; - let mut file = self.blob_writer.into_inner().await?; + let mut file = self.blob_writer.into_inner(ctx).await?; // Write out the index let (index_root_blk, block_buf) = self.tree.finish()?; file.seek(SeekFrom::Start(index_start_blk as u64 * PAGE_SZ as u64)) .await?; for buf in block_buf.blocks { - let (_buf, res) = file.write_all(buf).await; + let (_buf, res) = file.write_all(buf, ctx).await; res?; } assert!(self.lsn_range.start < self.lsn_range.end); @@ -493,7 +506,7 @@ impl DeltaLayerWriterInner { // TODO: could use smallvec here but it's a pain with Slice Summary::ser_into(&summary, &mut buf)?; file.seek(SeekFrom::Start(0)).await?; - let (_buf, res) = file.write_all(buf).await; + let (_buf, res) = file.write_all(buf, ctx).await; res?; let metadata = file @@ -591,8 +604,18 @@ impl DeltaLayerWriter { /// /// The values must be appended in key, lsn order. 
/// - pub async fn put_value(&mut self, key: Key, lsn: Lsn, val: Value) -> anyhow::Result<()> { - self.inner.as_mut().unwrap().put_value(key, lsn, val).await + pub async fn put_value( + &mut self, + key: Key, + lsn: Lsn, + val: Value, + ctx: &RequestContext, + ) -> anyhow::Result<()> { + self.inner + .as_mut() + .unwrap() + .put_value(key, lsn, val, ctx) + .await } pub async fn put_value_bytes( @@ -601,11 +624,12 @@ impl DeltaLayerWriter { lsn: Lsn, val: Vec, will_init: bool, + ctx: &RequestContext, ) -> (Vec, anyhow::Result<()>) { self.inner .as_mut() .unwrap() - .put_value_bytes(key, lsn, val, will_init) + .put_value_bytes(key, lsn, val, will_init, ctx) .await } @@ -620,10 +644,11 @@ impl DeltaLayerWriter { mut self, key_end: Key, timeline: &Arc, + ctx: &RequestContext, ) -> anyhow::Result { let inner = self.inner.take().unwrap(); let temp_path = inner.path.clone(); - let result = inner.finish(key_end, timeline).await; + let result = inner.finish(key_end, timeline, ctx).await; // The delta layer files can sometimes be really large. Clean them up. if result.is_err() { tracing::warn!( @@ -691,7 +716,7 @@ impl DeltaLayer { // TODO: could use smallvec here, but it's a pain with Slice Summary::ser_into(&new_summary, &mut buf).context("serialize")?; file.seek(SeekFrom::Start(0)).await?; - let (_buf, res) = file.write_all(buf).await; + let (_buf, res) = file.write_all(buf, ctx).await; res?; Ok(()) } @@ -728,6 +753,9 @@ impl DeltaLayerInner { // production code path expected_summary.index_start_blk = actual_summary.index_start_blk; expected_summary.index_root_blk = actual_summary.index_root_blk; + // mask out the timeline_id, but still require the layers to be from the same tenant + expected_summary.timeline_id = actual_summary.timeline_id; + if actual_summary != expected_summary { bail!( "in-file summary does not match expected summary. 
actual = {:?} expected = {:?}", @@ -742,6 +770,7 @@ impl DeltaLayerInner { file_id, index_start_blk: actual_summary.index_start_blk, index_root_blk: actual_summary.index_root_blk, + lsn_range: actual_summary.lsn_range, max_vectored_read_bytes, })) } @@ -863,10 +892,10 @@ impl DeltaLayerInner { .into(), ); - let data_end_offset = self.index_start_blk as u64 * PAGE_SZ as u64; + let data_end_offset = self.index_start_offset(); let reads = Self::plan_reads( - keyspace, + &keyspace, lsn_range, data_end_offset, index_reader, @@ -880,11 +909,13 @@ impl DeltaLayerInner { self.do_reads_and_update_state(reads, reconstruct_state) .await; + reconstruct_state.on_lsn_advanced(&keyspace, self.lsn_range.start); + Ok(()) } async fn plan_reads( - keyspace: KeySpace, + keyspace: &KeySpace, lsn_range: Range, data_end_offset: u64, index_reader: DiskBtreeReader, @@ -939,7 +970,7 @@ impl DeltaLayerInner { } if !range_end_handled { - tracing::info!("Handling range end fallback at {}", data_end_offset); + tracing::debug!("Handling range end fallback at {}", data_end_offset); planner.handle_range_end(data_end_offset); } } @@ -1103,11 +1134,201 @@ impl DeltaLayerInner { if let Some(last) = all_keys.last_mut() { // Last key occupies all space till end of value storage, // which corresponds to beginning of the index - last.size = self.index_start_blk as u64 * PAGE_SZ as u64 - last.size; + last.size = self.index_start_offset() - last.size; } Ok(all_keys) } + /// Using the given writer, write out a truncated version, where LSNs higher than the + /// truncate_at are missing. 
+ #[cfg(test)] + pub(super) async fn copy_prefix( + &self, + writer: &mut DeltaLayerWriter, + truncate_at: Lsn, + ctx: &RequestContext, + ) -> anyhow::Result<()> { + use crate::tenant::vectored_blob_io::{ + BlobMeta, VectoredReadBuilder, VectoredReadExtended, + }; + use futures::stream::TryStreamExt; + + #[derive(Debug)] + enum Item { + Actual(Key, Lsn, BlobRef), + Sentinel, + } + + impl From for Option<(Key, Lsn, BlobRef)> { + fn from(value: Item) -> Self { + match value { + Item::Actual(key, lsn, blob) => Some((key, lsn, blob)), + Item::Sentinel => None, + } + } + } + + impl Item { + fn offset(&self) -> Option { + match self { + Item::Actual(_, _, blob) => Some(*blob), + Item::Sentinel => None, + } + } + + fn is_last(&self) -> bool { + matches!(self, Item::Sentinel) + } + } + + let block_reader = FileBlockReader::new(&self.file, self.file_id); + let tree_reader = DiskBtreeReader::<_, DELTA_KEY_SIZE>::new( + self.index_start_blk, + self.index_root_blk, + block_reader, + ); + + let stream = self.stream_index_forwards(&tree_reader, &[0u8; DELTA_KEY_SIZE], ctx); + let stream = stream.map_ok(|(key, lsn, pos)| Item::Actual(key, lsn, pos)); + // put in a sentinel value for getting the end offset for last item, and not having to + // repeat the whole read part + let stream = stream.chain(futures::stream::once(futures::future::ready(Ok( + Item::Sentinel, + )))); + let mut stream = std::pin::pin!(stream); + + let mut prev: Option<(Key, Lsn, BlobRef)> = None; + + let mut read_builder: Option = None; + + let max_read_size = self + .max_vectored_read_bytes + .map(|x| x.0.get()) + .unwrap_or(8192); + + let mut buffer = Some(BytesMut::with_capacity(max_read_size)); + + // FIXME: buffering of DeltaLayerWriter + let mut per_blob_copy = Vec::new(); + + while let Some(item) = stream.try_next().await? 
{ + tracing::debug!(?item, "popped"); + let offset = item + .offset() + .unwrap_or(BlobRef::new(self.index_start_offset(), false)); + + let actionable = if let Some((key, lsn, start_offset)) = prev.take() { + let end_offset = offset; + + Some((BlobMeta { key, lsn }, start_offset..end_offset)) + } else { + None + }; + + let is_last = item.is_last(); + + prev = Option::from(item); + + let actionable = actionable.filter(|x| x.0.lsn < truncate_at); + + let builder = if let Some((meta, offsets)) = actionable { + // extend or create a new builder + if read_builder + .as_mut() + .map(|x| x.extend(offsets.start.pos(), offsets.end.pos(), meta)) + .unwrap_or(VectoredReadExtended::No) + == VectoredReadExtended::Yes + { + None + } else { + read_builder.replace(VectoredReadBuilder::new( + offsets.start.pos(), + offsets.end.pos(), + meta, + max_read_size, + )) + } + } else { + // nothing to do, except perhaps flush any existing for the last element + None + }; + + // flush the possible older builder and also the new one if the item was the last one + let builders = builder.into_iter(); + let builders = if is_last { + builders.chain(read_builder.take()) + } else { + builders.chain(None) + }; + + for builder in builders { + let read = builder.build(); + + let reader = VectoredBlobReader::new(&self.file); + + let mut buf = buffer.take().unwrap(); + + buf.clear(); + buf.reserve(read.size()); + let res = reader.read_blobs(&read, buf).await?; + + for blob in res.blobs { + let key = blob.meta.key; + let lsn = blob.meta.lsn; + let data = &res.buf[blob.start..blob.end]; + + #[cfg(debug_assertions)] + Value::des(data) + .with_context(|| { + format!( + "blob failed to deserialize for {}@{}, {}..{}: {:?}", + blob.meta.key, + blob.meta.lsn, + blob.start, + blob.end, + utils::Hex(data) + ) + }) + .unwrap(); + + // is it an image or will_init walrecord? 
+ // FIXME: this could be handled by threading the BlobRef to the + // VectoredReadBuilder + let will_init = crate::repository::ValueBytes::will_init(data) + .inspect_err(|_e| { + #[cfg(feature = "testing")] + tracing::error!(data=?utils::Hex(data), err=?_e, "failed to parse will_init out of serialized value"); + }) + .unwrap_or(false); + + per_blob_copy.clear(); + per_blob_copy.extend_from_slice(data); + + let (tmp, res) = writer + .put_value_bytes( + key, + lsn, + std::mem::take(&mut per_blob_copy), + will_init, + ctx, + ) + .await; + per_blob_copy = tmp; + res?; + } + + buffer = Some(res.buf); + } + } + + assert!( + read_builder.is_none(), + "with the sentinel above loop should had handled all" + ); + + Ok(()) + } + pub(super) async fn dump(&self, ctx: &RequestContext) -> anyhow::Result<()> { println!( "index_start_blk: {}, root {}", @@ -1177,6 +1398,44 @@ impl DeltaLayerInner { Ok(()) } + + #[cfg(test)] + fn stream_index_forwards<'a, R>( + &'a self, + reader: &'a DiskBtreeReader, + start: &'a [u8; DELTA_KEY_SIZE], + ctx: &'a RequestContext, + ) -> impl futures::stream::Stream< + Item = Result<(Key, Lsn, BlobRef), crate::tenant::disk_btree::DiskBtreeError>, + > + 'a + where + R: BlockReader, + { + use futures::stream::TryStreamExt; + let stream = reader.get_stream_from(start, ctx); + stream.map_ok(|(key, value)| { + let key = DeltaKey::from_slice(&key); + let (key, lsn) = (key.key(), key.lsn()); + let offset = BlobRef(value); + + (key, lsn, offset) + }) + } + + /// The file offset to the first block of index. + /// + /// The file structure is summary, values, and index. We often need this for the size of last blob. 
+ fn index_start_offset(&self) -> u64 { + let offset = self.index_start_blk as u64 * PAGE_SZ as u64; + let bref = BlobRef(offset); + tracing::debug!( + index_start_blk = self.index_start_blk, + offset, + pos = bref.pos(), + "index_start_offset" + ); + offset + } } /// A set of data associated with a delta layer key and its value @@ -1310,7 +1569,7 @@ mod test { // Plan and validate let vectored_reads = DeltaLayerInner::plan_reads( - keyspace.clone(), + &keyspace, lsn_range.clone(), disk_offset, reader, @@ -1531,14 +1790,16 @@ mod test { for entry in entries { let (_, res) = writer - .put_value_bytes(entry.key, entry.lsn, entry.value, false) + .put_value_bytes(entry.key, entry.lsn, entry.value, false, &ctx) .await; res?; } - let resident = writer.finish(entries_meta.key_range.end, &timeline).await?; + let resident = writer + .finish(entries_meta.key_range.end, &timeline, &ctx) + .await?; - let inner = resident.get_inner_delta(&ctx).await?; + let inner = resident.as_delta(&ctx).await?; let file_size = inner.file.metadata().await?.len(); tracing::info!( @@ -1562,7 +1823,7 @@ mod test { let data_end_offset = inner.index_start_blk as u64 * PAGE_SZ as u64; let vectored_reads = DeltaLayerInner::plan_reads( - keyspace.clone(), + &keyspace, entries_meta.lsn_range.clone(), data_end_offset, index_reader, @@ -1594,4 +1855,217 @@ mod test { Ok(()) } + + #[tokio::test] + async fn copy_delta_prefix_smoke() { + use crate::walrecord::NeonWalRecord; + use bytes::Bytes; + + let h = crate::tenant::harness::TenantHarness::create("truncate_delta_smoke").unwrap(); + let (tenant, ctx) = h.load().await; + let ctx = &ctx; + let timeline = tenant + .create_test_timeline(TimelineId::generate(), Lsn(0x10), 14, ctx) + .await + .unwrap(); + + let initdb_layer = timeline + .layers + .read() + .await + .likely_resident_layers() + .next() + .unwrap(); + + { + let mut writer = timeline.writer().await; + + let data = [ + (0x20, 12, Value::Image(Bytes::from_static(b"foobar"))), + ( + 0x30, + 12, + 
Value::WalRecord(NeonWalRecord::Postgres { + will_init: false, + rec: Bytes::from_static(b"1"), + }), + ), + ( + 0x40, + 12, + Value::WalRecord(NeonWalRecord::Postgres { + will_init: true, + rec: Bytes::from_static(b"2"), + }), + ), + // build an oversized value so we cannot extend and existing read over + // this + ( + 0x50, + 12, + Value::WalRecord(NeonWalRecord::Postgres { + will_init: true, + rec: { + let mut buf = + vec![0u8; tenant.conf.max_vectored_read_bytes.0.get() + 1024]; + buf.iter_mut() + .enumerate() + .for_each(|(i, slot)| *slot = (i % 256) as u8); + Bytes::from(buf) + }, + }), + ), + // because the oversized read cannot be extended further, we are sure to exercise the + // builder created on the last round with this: + ( + 0x60, + 12, + Value::WalRecord(NeonWalRecord::Postgres { + will_init: true, + rec: Bytes::from_static(b"3"), + }), + ), + ( + 0x60, + 9, + Value::Image(Bytes::from_static(b"something for a different key")), + ), + ]; + + let mut last_lsn = None; + + for (lsn, key, value) in data { + let key = Key::from_i128(key); + writer.put(key, Lsn(lsn), &value, ctx).await.unwrap(); + last_lsn = Some(lsn); + } + + writer.finish_write(Lsn(last_lsn.unwrap())); + } + timeline.freeze_and_flush().await.unwrap(); + + let new_layer = timeline + .layers + .read() + .await + .likely_resident_layers() + .find(|x| x != &initdb_layer) + .unwrap(); + + // create a copy for the timeline, so we don't overwrite the file + let branch = tenant + .branch_timeline_test(&timeline, TimelineId::generate(), None, ctx) + .await + .unwrap(); + + assert_eq!(branch.get_ancestor_lsn(), Lsn(0x60)); + + // truncating at 0x61 gives us a full copy, otherwise just go backwards until there's just + // a single key + + for truncate_at in [0x61, 0x51, 0x41, 0x31, 0x21] { + let truncate_at = Lsn(truncate_at); + + let mut writer = DeltaLayerWriter::new( + tenant.conf, + branch.timeline_id, + tenant.tenant_shard_id, + Key::MIN, + Lsn(0x11)..truncate_at, + ) + .await + .unwrap(); + + 
let new_layer = new_layer.download_and_keep_resident().await.unwrap(); + + new_layer + .copy_delta_prefix(&mut writer, truncate_at, ctx) + .await + .unwrap(); + + let copied_layer = writer.finish(Key::MAX, &branch, ctx).await.unwrap(); + + copied_layer.as_delta(ctx).await.unwrap(); + + assert_keys_and_values_eq( + new_layer.as_delta(ctx).await.unwrap(), + copied_layer.as_delta(ctx).await.unwrap(), + truncate_at, + ctx, + ) + .await; + } + } + + async fn assert_keys_and_values_eq( + source: &DeltaLayerInner, + truncated: &DeltaLayerInner, + truncated_at: Lsn, + ctx: &RequestContext, + ) { + use futures::future::ready; + use futures::stream::TryStreamExt; + + let start_key = [0u8; DELTA_KEY_SIZE]; + + let source_reader = FileBlockReader::new(&source.file, source.file_id); + let source_tree = DiskBtreeReader::<_, DELTA_KEY_SIZE>::new( + source.index_start_blk, + source.index_root_blk, + &source_reader, + ); + let source_stream = source.stream_index_forwards(&source_tree, &start_key, ctx); + let source_stream = source_stream.filter(|res| match res { + Ok((_, lsn, _)) => ready(lsn < &truncated_at), + _ => ready(true), + }); + let mut source_stream = std::pin::pin!(source_stream); + + let truncated_reader = FileBlockReader::new(&truncated.file, truncated.file_id); + let truncated_tree = DiskBtreeReader::<_, DELTA_KEY_SIZE>::new( + truncated.index_start_blk, + truncated.index_root_blk, + &truncated_reader, + ); + let truncated_stream = truncated.stream_index_forwards(&truncated_tree, &start_key, ctx); + let mut truncated_stream = std::pin::pin!(truncated_stream); + + let mut scratch_left = Vec::new(); + let mut scratch_right = Vec::new(); + + loop { + let (src, truncated) = (source_stream.try_next(), truncated_stream.try_next()); + let (src, truncated) = tokio::try_join!(src, truncated).unwrap(); + + if src.is_none() { + assert!(truncated.is_none()); + break; + } + + let (src, truncated) = (src.unwrap(), truncated.unwrap()); + + // because we've filtered the source with 
Lsn, we should always have the same keys from both. + assert_eq!(src.0, truncated.0); + assert_eq!(src.1, truncated.1); + + // if this is needed for something else, just drop this assert. + assert!( + src.2.pos() >= truncated.2.pos(), + "value position should not go backwards {} vs. {}", + src.2.pos(), + truncated.2.pos() + ); + + scratch_left.clear(); + let src_cursor = source_reader.block_cursor(); + let left = src_cursor.read_blob_into_buf(src.2.pos(), &mut scratch_left, ctx); + scratch_right.clear(); + let trunc_cursor = truncated_reader.block_cursor(); + let right = trunc_cursor.read_blob_into_buf(truncated.2.pos(), &mut scratch_right, ctx); + + tokio::try_join!(left, right).unwrap(); + + assert_eq!(utils::Hex(&scratch_left), utils::Hex(&scratch_right)); + } + } } diff --git a/pageserver/src/tenant/storage_layer/image_layer.rs b/pageserver/src/tenant/storage_layer/image_layer.rs index 5b44d2bc2c..1477a1fc33 100644 --- a/pageserver/src/tenant/storage_layer/image_layer.rs +++ b/pageserver/src/tenant/storage_layer/image_layer.rs @@ -357,7 +357,7 @@ impl ImageLayer { // TODO: could use smallvec here but it's a pain with Slice Summary::ser_into(&new_summary, &mut buf).context("serialize")?; file.seek(SeekFrom::Start(0)).await?; - let (_buf, res) = file.write_all(buf).await; + let (_buf, res) = file.write_all(buf, ctx).await; res?; Ok(()) } @@ -396,6 +396,8 @@ impl ImageLayerInner { // production code path expected_summary.index_start_blk = actual_summary.index_start_blk; expected_summary.index_root_blk = actual_summary.index_root_blk; + // mask out the timeline_id, but still require the layers to be from the same tenant + expected_summary.timeline_id = actual_summary.timeline_id; if actual_summary != expected_summary { bail!( @@ -675,9 +677,14 @@ impl ImageLayerWriterInner { /// /// The page versions must be appended in blknum order. 
/// - async fn put_image(&mut self, key: Key, img: Bytes) -> anyhow::Result<()> { + async fn put_image( + &mut self, + key: Key, + img: Bytes, + ctx: &RequestContext, + ) -> anyhow::Result<()> { ensure!(self.key_range.contains(&key)); - let (_img, res) = self.blob_writer.write_blob(img).await; + let (_img, res) = self.blob_writer.write_blob(img, ctx).await; // TODO: re-use the buffer for `img` further upstack let off = res?; @@ -691,7 +698,11 @@ impl ImageLayerWriterInner { /// /// Finish writing the image layer. /// - async fn finish(self, timeline: &Arc) -> anyhow::Result { + async fn finish( + self, + timeline: &Arc, + ctx: &RequestContext, + ) -> anyhow::Result { let index_start_blk = ((self.blob_writer.size() + PAGE_SZ as u64 - 1) / PAGE_SZ as u64) as u32; @@ -702,7 +713,7 @@ impl ImageLayerWriterInner { .await?; let (index_root_blk, block_buf) = self.tree.finish()?; for buf in block_buf.blocks { - let (_buf, res) = file.write_all(buf).await; + let (_buf, res) = file.write_all(buf, ctx).await; res?; } @@ -722,7 +733,7 @@ impl ImageLayerWriterInner { // TODO: could use smallvec here but it's a pain with Slice Summary::ser_into(&summary, &mut buf)?; file.seek(SeekFrom::Start(0)).await?; - let (_buf, res) = file.write_all(buf).await; + let (_buf, res) = file.write_all(buf, ctx).await; res?; let metadata = file @@ -804,8 +815,13 @@ impl ImageLayerWriter { /// /// The page versions must be appended in blknum order. 
/// - pub async fn put_image(&mut self, key: Key, img: Bytes) -> anyhow::Result<()> { - self.inner.as_mut().unwrap().put_image(key, img).await + pub async fn put_image( + &mut self, + key: Key, + img: Bytes, + ctx: &RequestContext, + ) -> anyhow::Result<()> { + self.inner.as_mut().unwrap().put_image(key, img, ctx).await } /// @@ -814,8 +830,9 @@ impl ImageLayerWriter { pub(crate) async fn finish( mut self, timeline: &Arc, + ctx: &RequestContext, ) -> anyhow::Result { - self.inner.take().unwrap().finish(timeline).await + self.inner.take().unwrap().finish(timeline, ctx).await } } diff --git a/pageserver/src/tenant/storage_layer/inmemory_layer.rs b/pageserver/src/tenant/storage_layer/inmemory_layer.rs index 43942ba2db..4dacbec2f3 100644 --- a/pageserver/src/tenant/storage_layer/inmemory_layer.rs +++ b/pageserver/src/tenant/storage_layer/inmemory_layer.rs @@ -17,15 +17,16 @@ use anyhow::{anyhow, ensure, Result}; use pageserver_api::keyspace::KeySpace; use pageserver_api::models::InMemoryLayerInfo; use pageserver_api::shard::TenantShardId; -use std::collections::{BinaryHeap, HashMap, HashSet}; +use std::collections::{BTreeMap, BinaryHeap, HashSet}; use std::sync::{Arc, OnceLock}; +use std::time::Instant; use tracing::*; use utils::{bin_ser::BeSer, id::TimelineId, lsn::Lsn, vec_map::VecMap}; // avoid binding to Write (conflicts with std::io::Write) // while being able to use std::fmt::Write's methods use crate::metrics::TIMELINE_EPHEMERAL_BYTES; use std::cmp::Ordering; -use std::fmt::Write as _; +use std::fmt::Write; use std::ops::Range; use std::sync::atomic::Ordering as AtomicOrdering; use std::sync::atomic::{AtomicU64, AtomicUsize}; @@ -53,6 +54,14 @@ pub struct InMemoryLayer { /// Writes are only allowed when this is `None`. end_lsn: OnceLock, + /// Used for traversal path. Cached representation of the in-memory layer before frozen. + local_path_str: Arc, + + /// Used for traversal path. Cached representation of the in-memory layer after frozen. 
+ frozen_local_path_str: OnceLock>, + + opened_at: Instant, + /// The above fields never change, except for `end_lsn`, which is only set once. /// All other changing parts are in `inner`, and protected by a mutex. inner: RwLock, @@ -69,10 +78,10 @@ impl std::fmt::Debug for InMemoryLayer { } pub struct InMemoryLayerInner { - /// All versions of all pages in the layer are kept here. Indexed + /// All versions of all pages in the layer are kept here. Indexed /// by block number and LSN. The value is an offset into the /// ephemeral file where the page version is stored. - index: HashMap>, + index: BTreeMap>, /// The values are stored in a serialized format in this file. /// Each serialized Value is preceded by a 'u32' length field. @@ -238,6 +247,12 @@ impl InMemoryLayer { self.start_lsn..self.end_lsn_or_max() } + pub(crate) fn local_path_str(&self) -> &Arc { + self.frozen_local_path_str + .get() + .unwrap_or(&self.local_path_str) + } + /// debugging function to print out the contents of the layer /// /// this is likely completly unused @@ -369,29 +384,24 @@ impl InMemoryLayer { let mut planned_block_reads = BinaryHeap::new(); for range in keyspace.ranges.iter() { - let mut key = range.start; - while key < range.end { - if let Some(vec_map) = inner.index.get(&key) { - let lsn_range = match reconstruct_state.get_cached_lsn(&key) { - Some(cached_lsn) => (cached_lsn + 1)..end_lsn, - None => self.start_lsn..end_lsn, - }; + for (key, vec_map) in inner.index.range(range.start..range.end) { + let lsn_range = match reconstruct_state.get_cached_lsn(key) { + Some(cached_lsn) => (cached_lsn + 1)..end_lsn, + None => self.start_lsn..end_lsn, + }; - let slice = vec_map.slice_range(lsn_range); - for (entry_lsn, pos) in slice.iter().rev() { - planned_block_reads.push(BlockRead { - key, - lsn: *entry_lsn, - block_offset: *pos, - }); - } + let slice = vec_map.slice_range(lsn_range); + for (entry_lsn, pos) in slice.iter().rev() { + planned_block_reads.push(BlockRead { + key: *key, + 
lsn: *entry_lsn, + block_offset: *pos, + }); } - - key = key.next(); } } - let keyspace_size = keyspace.total_size(); + let keyspace_size = keyspace.total_raw_size(); let mut completed_keys = HashSet::new(); while completed_keys.len() < keyspace_size && !planned_block_reads.is_empty() { @@ -423,14 +433,30 @@ impl InMemoryLayer { } } + reconstruct_state.on_lsn_advanced(&keyspace, self.start_lsn); + Ok(()) } } +fn inmem_layer_display(mut f: impl Write, start_lsn: Lsn, end_lsn: Lsn) -> std::fmt::Result { + write!(f, "inmem-{:016X}-{:016X}", start_lsn.0, end_lsn.0) +} + +fn inmem_layer_log_display( + mut f: impl Write, + timeline: TimelineId, + start_lsn: Lsn, + end_lsn: Lsn, +) -> std::fmt::Result { + write!(f, "timeline {} in-memory ", timeline)?; + inmem_layer_display(f, start_lsn, end_lsn) +} + impl std::fmt::Display for InMemoryLayer { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { let end_lsn = self.end_lsn_or_max(); - write!(f, "inmem-{:016X}-{:016X}", self.start_lsn.0, end_lsn.0) + inmem_layer_display(f, self.start_lsn, end_lsn) } } @@ -451,17 +477,24 @@ impl InMemoryLayer { trace!("initializing new empty InMemoryLayer for writing on timeline {timeline_id} at {start_lsn}"); let file = EphemeralFile::create(conf, tenant_shard_id, timeline_id).await?; - let key = InMemoryLayerFileId(file.id()); + let key = InMemoryLayerFileId(file.page_cache_file_id()); Ok(InMemoryLayer { file_id: key, + local_path_str: { + let mut buf = String::new(); + inmem_layer_log_display(&mut buf, timeline_id, start_lsn, Lsn::MAX).unwrap(); + buf.into() + }, + frozen_local_path_str: OnceLock::new(), conf, timeline_id, tenant_shard_id, start_lsn, end_lsn: OnceLock::new(), + opened_at: Instant::now(), inner: RwLock::new(InMemoryLayerInner { - index: HashMap::new(), + index: BTreeMap::new(), file, resource_units: GlobalResourceUnits::new(), }), @@ -520,6 +553,10 @@ impl InMemoryLayer { Ok(()) } + pub(crate) fn get_opened_at(&self) -> Instant { + self.opened_at + } + 
pub(crate) async fn tick(&self) -> Option { let mut inner = self.inner.write().await; let size = inner.file.len(); @@ -544,6 +581,15 @@ impl InMemoryLayer { ); self.end_lsn.set(end_lsn).expect("end_lsn set only once"); + self.frozen_local_path_str + .set({ + let mut buf = String::new(); + inmem_layer_log_display(&mut buf, self.get_timeline_id(), self.start_lsn, end_lsn) + .unwrap(); + buf.into() + }) + .expect("frozen_local_path_str set only once"); + for vec_map in inner.index.values() { for (lsn, _pos) in vec_map.as_slice() { assert!(*lsn < end_lsn); @@ -551,14 +597,17 @@ impl InMemoryLayer { } } - /// Write this frozen in-memory layer to disk. + /// Write this frozen in-memory layer to disk. If `key_range` is set, the delta + /// layer will only contain the key range the user specifies, and may return `None` + /// if there are no matching keys. /// /// Returns a new delta layer with all the same data as this in-memory layer pub(crate) async fn write_to_disk( &self, timeline: &Arc, ctx: &RequestContext, - ) -> Result { + key_range: Option>, + ) -> Result> { // Grab the lock in read-mode. We hold it over the I/O, but because this // layer is not writeable anymore, no one should be trying to acquire the // write lock on it, so we shouldn't block anyone. There's one exception @@ -572,6 +621,21 @@ impl InMemoryLayer { let end_lsn = *self.end_lsn.get().unwrap(); + let keys: Vec<_> = if let Some(key_range) = key_range { + inner + .index + .iter() + .filter(|(k, _)| key_range.contains(k)) + .map(|(k, m)| (k.to_i128(), m)) + .collect() + } else { + inner.index.iter().map(|(k, m)| (k.to_i128(), m)).collect() + }; + + if keys.is_empty() { + return Ok(None); + } + let mut delta_layer_writer = DeltaLayerWriter::new( self.conf, self.timeline_id, @@ -585,33 +649,24 @@ impl InMemoryLayer { let cursor = inner.file.block_cursor(); - // Sort the keys because delta layer writer expects them sorted. 
- // - // NOTE: this sort can take up significant time if the layer has millions of - // keys. To speed up all the comparisons we convert the key to i128 and - // keep the value as a reference. - let mut keys: Vec<_> = inner.index.iter().map(|(k, m)| (k.to_i128(), m)).collect(); - keys.sort_unstable_by_key(|k| k.0); - let ctx = RequestContextBuilder::extend(ctx) .page_content_kind(PageContentKind::InMemoryLayer) .build(); - for (key, vec_map) in keys.iter() { - let key = Key::from_i128(*key); + for (key, vec_map) in inner.index.iter() { // Write all page versions for (lsn, pos) in vec_map.as_slice() { cursor.read_blob_into_buf(*pos, &mut buf, &ctx).await?; let will_init = Value::des(&buf)?.will_init(); let res; (buf, res) = delta_layer_writer - .put_value_bytes(key, *lsn, buf, will_init) + .put_value_bytes(*key, *lsn, buf, will_init, &ctx) .await; res?; } } // MAX is used here because we identify L0 layers by full key range - let delta_layer = delta_layer_writer.finish(Key::MAX, timeline).await?; - Ok(delta_layer) + let delta_layer = delta_layer_writer.finish(Key::MAX, timeline, &ctx).await?; + Ok(Some(delta_layer)) } } diff --git a/pageserver/src/tenant/storage_layer/layer.rs b/pageserver/src/tenant/storage_layer/layer.rs index 27e60f783c..ebc0cbf9a4 100644 --- a/pageserver/src/tenant/storage_layer/layer.rs +++ b/pageserver/src/tenant/storage_layer/layer.rs @@ -14,9 +14,10 @@ use utils::lsn::Lsn; use utils::sync::heavier_once_cell; use crate::config::PageServerConf; -use crate::context::RequestContext; +use crate::context::{DownloadBehavior, RequestContext}; use crate::repository::Key; use crate::span::debug_assert_current_span_has_tenant_and_timeline_id; +use crate::task_mgr::TaskKind; use crate::tenant::timeline::GetVectoredError; use crate::tenant::{remote_timeline_client::LayerFileMetadata, Timeline}; @@ -116,6 +117,12 @@ impl AsLayerDesc for Layer { } } +impl PartialEq for Layer { + fn eq(&self, other: &Self) -> bool { + Arc::as_ptr(&self.0) == 
Arc::as_ptr(&other.0) + } +} + impl Layer { /// Creates a layer value for a file we know to not be resident. pub(crate) fn for_evicted( @@ -330,6 +337,12 @@ impl Layer { .get_values_reconstruct_data(keyspace, lsn_range, reconstruct_data, &self.0, ctx) .instrument(tracing::debug_span!("get_values_reconstruct_data", layer=%self)) .await + .map_err(|err| match err { + GetVectoredError::Other(err) => GetVectoredError::Other( + err.context(format!("get_values_reconstruct_data for layer {self}")), + ), + err => err, + }) } /// Download the layer if evicted. @@ -389,6 +402,10 @@ impl Layer { &self.0.path } + pub(crate) fn debug_str(&self) -> &Arc { + &self.0.debug_str + } + pub(crate) fn metadata(&self) -> LayerFileMetadata { self.0.metadata() } @@ -511,6 +528,9 @@ struct LayerInner { /// Full path to the file; unclear if this should exist anymore. path: Utf8PathBuf, + /// String representation of the layer, used for traversal id. + debug_str: Arc, + desc: PersistentLayerDesc, /// Timeline access is needed for remote timeline client and metrics. @@ -604,9 +624,17 @@ enum Status { impl Drop for LayerInner { fn drop(&mut self) { + // if there was a pending eviction, mark it cancelled here to balance metrics + if let Some((ResidentOrWantedEvicted::WantedEvicted(..), _)) = self.inner.take_and_deinit() + { + // eviction has already been started + LAYER_IMPL_METRICS.inc_eviction_cancelled(EvictionCancelled::LayerGone); + + // eviction request is intentionally not honored as no one is present to wait for it + // and we could be delaying shutdown for nothing. + } + if !*self.wanted_deleted.get_mut() { - // should we try to evict if the last wish was for eviction? seems more like a hazard - // than a clear win. 
return; } @@ -708,6 +736,7 @@ impl LayerInner { LayerInner { conf, + debug_str: { format!("timelines/{}/{}", timeline.timeline_id, desc.filename()).into() }, path, desc, timeline: Arc::downgrade(timeline), @@ -911,11 +940,20 @@ impl LayerInner { return Err(DownloadError::DownloadRequired); } + let download_ctx = ctx + .map(|ctx| ctx.detached_child(TaskKind::LayerDownload, DownloadBehavior::Download)) + .unwrap_or(RequestContext::new( + TaskKind::LayerDownload, + DownloadBehavior::Download, + )); + async move { tracing::info!(%reason, "downloading on-demand"); let init_cancelled = scopeguard::guard((), |_| LAYER_IMPL_METRICS.inc_init_cancelled()); - let res = self.download_init_and_wait(timeline, permit).await?; + let res = self + .download_init_and_wait(timeline, permit, download_ctx) + .await?; scopeguard::ScopeGuard::into_inner(init_cancelled); Ok(res) } @@ -954,6 +992,7 @@ impl LayerInner { self: &Arc, timeline: Arc, permit: heavier_once_cell::InitPermit, + ctx: RequestContext, ) -> Result, DownloadError> { debug_assert_current_span_has_tenant_and_timeline_id(); @@ -983,7 +1022,7 @@ impl LayerInner { .await .unwrap(); - let res = this.download_and_init(timeline, permit).await; + let res = this.download_and_init(timeline, permit, &ctx).await; if let Err(res) = tx.send(res) { match res { @@ -1026,6 +1065,7 @@ impl LayerInner { self: &Arc, timeline: Arc, permit: heavier_once_cell::InitPermit, + ctx: &RequestContext, ) -> anyhow::Result> { let client = timeline .remote_client @@ -1033,7 +1073,12 @@ impl LayerInner { .expect("checked before download_init_and_wait"); let result = client - .download_layer_file(&self.desc.filename(), &self.metadata(), &timeline.cancel) + .download_layer_file( + &self.desc.filename(), + &self.metadata(), + &timeline.cancel, + ctx, + ) .await; match result { @@ -1552,8 +1597,8 @@ impl Drop for DownloadedLayer { if let Some(owner) = self.owner.upgrade() { owner.on_downloaded_layer_drop(self.version); } else { - // no need to do anything, 
we are shutting down - LAYER_IMPL_METRICS.inc_eviction_cancelled(EvictionCancelled::LayerGone); + // Layer::drop will handle cancelling the eviction; because of drop order and + // `DownloadedLayer` never leaking, we cannot know here if eviction was requested. } } } @@ -1752,6 +1797,28 @@ impl ResidentLayer { } } + /// FIXME: truncate is bad name because we are not truncating anything, but copying the + /// filtered parts. + #[cfg(test)] + pub(super) async fn copy_delta_prefix( + &self, + writer: &mut super::delta_layer::DeltaLayerWriter, + truncate_at: Lsn, + ctx: &RequestContext, + ) -> anyhow::Result<()> { + use LayerKind::*; + + let owner = &self.owner.0; + + match self.downloaded.get(owner, ctx).await? { + Delta(ref d) => d + .copy_prefix(writer, truncate_at, ctx) + .await + .with_context(|| format!("truncate {self}")), + Image(_) => anyhow::bail!(format!("cannot truncate image layer {self}")), + } + } + pub(crate) fn local_path(&self) -> &Utf8Path { &self.owner.0.path } @@ -1761,14 +1828,14 @@ impl ResidentLayer { } #[cfg(test)] - pub(crate) async fn get_inner_delta<'a>( - &'a self, + pub(crate) async fn as_delta( + &self, ctx: &RequestContext, - ) -> anyhow::Result<&'a delta_layer::DeltaLayerInner> { - let owner = &self.owner.0; - match self.downloaded.get(owner, ctx).await? { - LayerKind::Delta(d) => Ok(d), - LayerKind::Image(_) => Err(anyhow::anyhow!("Expected a delta layer")), + ) -> anyhow::Result<&delta_layer::DeltaLayerInner> { + use LayerKind::*; + match self.downloaded.get(&self.owner.0, ctx).await? 
{ + Delta(ref d) => Ok(d), + Image(_) => Err(anyhow::anyhow!("image layer")), } } } diff --git a/pageserver/src/tenant/storage_layer/layer/tests.rs b/pageserver/src/tenant/storage_layer/layer/tests.rs index 247ff123b5..52f62faa8d 100644 --- a/pageserver/src/tenant/storage_layer/layer/tests.rs +++ b/pageserver/src/tenant/storage_layer/layer/tests.rs @@ -721,11 +721,110 @@ async fn evict_and_wait_does_not_wait_for_download() { layer.evict_and_wait(FOREVER).await.unwrap(); } +/// Asserts that there is no miscalculation when Layer is dropped while it is being kept resident, +/// which is the last value. +/// +/// Also checks that the same does not happen on a non-evicted layer (regression test). +#[tokio::test(start_paused = true)] +async fn eviction_cancellation_on_drop() { + use crate::repository::Value; + use bytes::Bytes; + + // this is the runtime on which Layer spawns the blocking tasks on + let handle = tokio::runtime::Handle::current(); + + let h = TenantHarness::create("eviction_cancellation_on_drop").unwrap(); + utils::logging::replace_panic_hook_with_tracing_panic_hook().forget(); + let (tenant, ctx) = h.load().await; + + let timeline = tenant + .create_test_timeline(TimelineId::generate(), Lsn(0x10), 14, &ctx) + .await + .unwrap(); + + { + // create_test_timeline wrote us one layer, write another + let mut writer = timeline.writer().await; + writer + .put( + Key::from_i128(5), + Lsn(0x20), + &Value::Image(Bytes::from_static(b"this does not matter either")), + &ctx, + ) + .await + .unwrap(); + + writer.finish_write(Lsn(0x20)); + } + + timeline.freeze_and_flush().await.unwrap(); + + // wait for the upload to complete so our Arc::strong_count assertion holds + timeline + .remote_client + .as_ref() + .unwrap() + .wait_completion() + .await + .unwrap(); + + let (evicted_layer, not_evicted) = { + let mut layers = { + let mut guard = timeline.layers.write().await; + let layers = guard.likely_resident_layers().collect::>(); + // remove the layers from layermap + 
guard.finish_gc_timeline(&layers); + + layers + }; + + assert_eq!(layers.len(), 2); + + (layers.pop().unwrap(), layers.pop().unwrap()) + }; + + let victims = [(evicted_layer, true), (not_evicted, false)]; + + for (victim, evict) in victims { + let resident = victim.keep_resident().await.unwrap(); + drop(victim); + + assert_eq!(Arc::strong_count(&resident.owner.0), 1); + + if evict { + let evict_and_wait = resident.owner.evict_and_wait(FOREVER); + + // drive the future to await on the status channel, and then drop it + tokio::time::timeout(ADVANCE, evict_and_wait) + .await + .expect_err("should had been a timeout since we are holding the layer resident"); + } + + // 1 == we only evict one of the layers + assert_eq!(1, LAYER_IMPL_METRICS.started_evictions.get()); + + drop(resident); + + // run any spawned + tokio::time::sleep(ADVANCE).await; + + SpawnBlockingPoolHelper::consume_and_release_all_of_spawn_blocking_threads(&handle).await; + + assert_eq!( + 1, + LAYER_IMPL_METRICS.cancelled_evictions[EvictionCancelled::LayerGone].get() + ); + } +} + +/// A test case to remind you the cost of these structures. You can bump the size limit +/// below if it is really necessary to add more fields to the structures. 
#[test] fn layer_size() { assert_eq!(std::mem::size_of::(), 2040); assert_eq!(std::mem::size_of::(), 104); - assert_eq!(std::mem::size_of::(), 2328); + assert_eq!(std::mem::size_of::(), 2344); // it also has the utf8 path } diff --git a/pageserver/src/tenant/tasks.rs b/pageserver/src/tenant/tasks.rs index 74ed677ffe..41b77c1f4a 100644 --- a/pageserver/src/tenant/tasks.rs +++ b/pageserver/src/tenant/tasks.rs @@ -62,7 +62,7 @@ impl BackgroundLoopKind { pub(crate) async fn concurrent_background_tasks_rate_limit_permit( loop_kind: BackgroundLoopKind, _ctx: &RequestContext, -) -> impl Drop { +) -> tokio::sync::SemaphorePermit<'static> { let _guard = crate::metrics::BACKGROUND_LOOP_SEMAPHORE_WAIT_GAUGE .with_label_values(&[loop_kind.as_static_str()]) .guard(); diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index d3c8c5f66c..c7a5598cec 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -16,14 +16,17 @@ use enumset::EnumSet; use fail::fail_point; use once_cell::sync::Lazy; use pageserver_api::{ - key::AUX_FILES_KEY, - keyspace::KeySpaceAccum, + key::{ + AUX_FILES_KEY, METADATA_KEY_BEGIN_PREFIX, METADATA_KEY_END_PREFIX, NON_INHERITED_RANGE, + NON_INHERITED_SPARSE_RANGE, + }, + keyspace::{KeySpaceAccum, SparseKeyPartitioning}, models::{ CompactionAlgorithm, DownloadRemoteLayersTaskInfo, DownloadRemoteLayersTaskSpawnRequest, EvictionPolicy, InMemoryLayerInfo, LayerMapInfo, TimelineState, }, reltag::BlockNumber, - shard::{ShardIdentity, TenantShardId}, + shard::{ShardIdentity, ShardNumber, TenantShardId}, }; use rand::Rng; use serde_with::serde_as; @@ -55,8 +58,6 @@ use std::{ ops::ControlFlow, }; -use crate::deletion_queue::DeletionQueueClient; -use crate::tenant::timeline::logical_size::CurrentLogicalSize; use crate::tenant::{ layer_map::{LayerMap, SearchResult}, metadata::TimelineMetadata, @@ -66,6 +67,7 @@ use crate::{ disk_usage_eviction_task::DiskUsageEvictionInfo, 
pgdatadir_mapping::CollectKeySpaceError, }; +use crate::{deletion_queue::DeletionQueueClient, metrics::GetKind}; use crate::{ disk_usage_eviction_task::finite_f32, tenant::storage_layer::{ @@ -77,6 +79,9 @@ use crate::{ use crate::{ disk_usage_eviction_task::EvictionCandidate, tenant::storage_layer::delta_layer::DeltaEntry, }; +use crate::{ + metrics::ScanLatencyOngoingRecording, tenant::timeline::logical_size::CurrentLogicalSize, +}; use crate::{pgdatadir_mapping::LsnForTimestamp, tenant::tasks::BackgroundLoopKind}; use crate::{ pgdatadir_mapping::{AuxFilesDirectory, DirectoryKind}, @@ -119,8 +124,8 @@ use self::layer_manager::LayerManager; use self::logical_size::LogicalSize; use self::walreceiver::{WalReceiver, WalReceiverConf}; -use super::config::TenantConf; use super::secondary::heatmap::{HeatMapLayer, HeatMapTimeline}; +use super::{config::TenantConf, storage_layer::VectoredValueReconstructState}; use super::{debug_assert_current_span_has_tenant_and_timeline_id, AttachedTenantConf}; use super::{remote_timeline_client::index::IndexPart, storage_layer::LayerFringe}; use super::{remote_timeline_client::RemoteTimelineClient, storage_layer::ReadableLayer}; @@ -137,6 +142,25 @@ pub(super) enum FlushLoopState { Exited, } +#[derive(Debug, Copy, Clone, PartialEq, Eq)] +pub enum ImageLayerCreationMode { + /// Try to create image layers based on `time_for_new_image_layer`. Used in compaction code path. + Try, + /// Force creating the image layers if possible. For now, no image layers will be created + /// for metadata keys. Used in compaction code path with force flag enabled. + Force, + /// Initial ingestion of the data, and no data should be dropped in this function. This + /// means that no metadata keys should be included in the partitions. Used in flush frozen layer + /// code path. 
+ Initial, +} + +impl std::fmt::Display for ImageLayerCreationMode { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "{:?}", self) + } +} + /// Wrapper for key range to provide reverse ordering by range length for BinaryHeap #[derive(Debug, Clone, PartialEq, Eq)] pub(crate) struct Hole { @@ -182,6 +206,16 @@ pub(crate) struct AuxFilesState { pub(crate) n_deltas: usize, } +/// The relation size cache caches relation sizes at the end of the timeline. It speeds up WAL +/// ingestion considerably, because WAL ingestion needs to check on most records if the record +/// implicitly extends the relation. At startup, `complete_as_of` is initialized to the current end +/// of the timeline (disk_consistent_lsn). It's used on reads of relation sizes to check if the +/// value can be used to also update the cache, see [`Timeline::update_cached_rel_size`]. +pub(crate) struct RelSizeCache { + pub(crate) complete_as_of: Lsn, + pub(crate) map: HashMap, +} + pub struct Timeline { conf: &'static PageServerConf, tenant_conf: Arc>, @@ -296,7 +330,7 @@ pub struct Timeline { // List of child timelines and their branch points. This is needed to avoid // garbage collecting data that is still needed by the child timelines. - pub gc_info: std::sync::RwLock, + pub(crate) gc_info: std::sync::RwLock, // It may change across major versions so for simplicity // keep it after running initdb for a timeline. @@ -307,7 +341,7 @@ pub struct Timeline { pub initdb_lsn: Lsn, /// When did we last calculate the partitioning? - partitioning: tokio::sync::Mutex<(KeyPartitioning, Lsn)>, + partitioning: tokio::sync::Mutex<((KeyPartitioning, SparseKeyPartitioning), Lsn)>, /// Configuration: how often should the partitioning be recalculated. 
repartition_threshold: u64, @@ -324,7 +358,7 @@ pub struct Timeline { pub walreceiver: Mutex>, /// Relation size cache - pub rel_size_cache: RwLock>, + pub(crate) rel_size_cache: RwLock, download_all_remote_layers_task_info: RwLock>, @@ -380,33 +414,59 @@ pub struct WalReceiverInfo { pub last_received_msg_ts: u128, } -/// /// Information about how much history needs to be retained, needed by /// Garbage Collection. -/// -pub struct GcInfo { +#[derive(Default)] +pub(crate) struct GcInfo { /// Specific LSNs that are needed. /// /// Currently, this includes all points where child branches have /// been forked off from. In the future, could also include /// explicit user-defined snapshot points. - pub retain_lsns: Vec, + pub(crate) retain_lsns: Vec, - /// In addition to 'retain_lsns', keep everything newer than this - /// point. + /// The cutoff coordinates, which are combined by selecting the minimum. + pub(crate) cutoffs: GcCutoffs, +} + +impl GcInfo { + pub(crate) fn min_cutoff(&self) -> Lsn { + self.cutoffs.select_min() + } +} + +/// The `GcInfo` component describing which Lsns need to be retained. +#[derive(Debug)] +pub(crate) struct GcCutoffs { + /// Keep everything newer than this point. /// /// This is calculated by subtracting 'gc_horizon' setting from /// last-record LSN /// /// FIXME: is this inclusive or exclusive? - pub horizon_cutoff: Lsn, + pub(crate) horizon: Lsn, /// In addition to 'retain_lsns' and 'horizon_cutoff', keep everything newer than this /// point. /// /// This is calculated by finding a number such that a record is needed for PITR /// if only if its LSN is larger than 'pitr_cutoff'. - pub pitr_cutoff: Lsn, + pub(crate) pitr: Lsn, +} + +impl Default for GcCutoffs { + fn default() -> Self { + Self { + horizon: Lsn::INVALID, + pitr: Lsn::INVALID, + } + } +} + +impl GcCutoffs { + fn select_min(&self) -> Lsn { + std::cmp::min(self.horizon, self.pitr) + } } /// An error happened in a get() operation. 
@@ -428,6 +488,51 @@ pub(crate) enum PageReconstructError { /// An error happened replaying WAL records #[error(transparent)] WalRedo(anyhow::Error), + + #[error("{0}")] + MissingKey(MissingKeyError), +} + +#[derive(Debug)] +pub struct MissingKeyError { + key: Key, + shard: ShardNumber, + cont_lsn: Lsn, + request_lsn: Lsn, + ancestor_lsn: Option, + traversal_path: Vec, + backtrace: Option, +} + +impl std::fmt::Display for MissingKeyError { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!( + f, + "could not find data for key {} (shard {:?}) at LSN {}, request LSN {}", + self.key, self.shard, self.cont_lsn, self.request_lsn + )?; + if let Some(ref ancestor_lsn) = self.ancestor_lsn { + write!(f, ", ancestor {}", ancestor_lsn)?; + } + + if !self.traversal_path.is_empty() { + writeln!(f)?; + } + + for (r, c, l) in &self.traversal_path { + writeln!( + f, + "layer traversal: result {:?}, cont_lsn {}, layer: {}", + r, c, l, + )?; + } + + if let Some(ref backtrace) = self.backtrace { + write!(f, "\n{}", backtrace)?; + } + + Ok(()) + } } impl PageReconstructError { @@ -439,6 +544,7 @@ impl PageReconstructError { AncestorLsnTimeout(_) => false, Cancelled | AncestorStopping(_) => true, WalRedo(_) => false, + MissingKey { .. 
} => false, } } } @@ -482,8 +588,8 @@ pub(crate) enum GetVectoredError { #[error("Requested at invalid LSN: {0}")] InvalidLsn(Lsn), - #[error("Requested key {0} not found")] - MissingKey(Key), + #[error("Requested key not found: {0}")] + MissingKey(MissingKeyError), #[error(transparent)] GetReadyAncestorError(GetReadyAncestorError), @@ -586,6 +692,19 @@ impl From for CreateImageLayersError { } } +impl From for PageReconstructError { + fn from(e: GetVectoredError) -> Self { + match e { + GetVectoredError::Cancelled => PageReconstructError::Cancelled, + GetVectoredError::InvalidLsn(_) => PageReconstructError::Other(anyhow!("Invalid LSN")), + err @ GetVectoredError::Oversized(_) => PageReconstructError::Other(err.into()), + GetVectoredError::MissingKey(err) => PageReconstructError::MissingKey(err), + GetVectoredError::GetReadyAncestorError(err) => PageReconstructError::from(err), + GetVectoredError::Other(err) => PageReconstructError::Other(err), + } + } +} + impl From for PageReconstructError { fn from(e: GetReadyAncestorError) -> Self { use GetReadyAncestorError::*; @@ -615,6 +734,23 @@ pub enum GetVectoredImpl { Vectored, } +#[derive( + Eq, + PartialEq, + Debug, + Copy, + Clone, + strum_macros::EnumString, + strum_macros::Display, + serde_with::DeserializeFromStr, + serde_with::SerializeDisplay, +)] +#[strum(serialize_all = "kebab-case")] +pub enum GetImpl { + Legacy, + Vectored, +} + pub(crate) enum WaitLsnWaiter<'a> { Timeline(&'a Timeline), Tenant, @@ -676,16 +812,6 @@ impl Timeline { key: Key, lsn: Lsn, ctx: &RequestContext, - ) -> Result { - self.timeline_get_throttle.throttle(ctx, 1).await; - self.get_impl(key, lsn, ctx).await - } - /// Not subject to [`Self::timeline_get_throttle`]. - async fn get_impl( - &self, - key: Key, - lsn: Lsn, - ctx: &RequestContext, ) -> Result { if !lsn.is_valid() { return Err(PageReconstructError::Other(anyhow::anyhow!("Invalid LSN"))); @@ -696,13 +822,7 @@ impl Timeline { // page_service. 
debug_assert!(!self.shard_identity.is_key_disposable(&key)); - // XXX: structured stats collection for layer eviction here. - trace!( - "get page request for {}@{} from task kind {:?}", - key, - lsn, - ctx.task_kind() - ); + self.timeline_get_throttle.throttle(ctx, 1).await; // Check the page cache. We will get back the most recent page with lsn <= `lsn`. // The cached image can be returned directly if there is no WAL between the cached image @@ -725,12 +845,84 @@ impl Timeline { None => None, }; - let mut reconstruct_state = ValueReconstructState { - records: Vec::new(), - img: cached_page_img, - }; + match self.conf.get_impl { + GetImpl::Legacy => { + let reconstruct_state = ValueReconstructState { + records: Vec::new(), + img: cached_page_img, + }; - let timer = crate::metrics::GET_RECONSTRUCT_DATA_TIME.start_timer(); + self.get_impl(key, lsn, reconstruct_state, ctx).await + } + GetImpl::Vectored => { + let keyspace = KeySpace { + ranges: vec![key..key.next()], + }; + + // Initialise the reconstruct state for the key with the cache + // entry returned above. 
+ let mut reconstruct_state = ValuesReconstructState::new(); + let mut key_state = VectoredValueReconstructState::default(); + key_state.img = cached_page_img; + reconstruct_state.keys.insert(key, Ok(key_state)); + + let vectored_res = self + .get_vectored_impl(keyspace.clone(), lsn, reconstruct_state, ctx) + .await; + + if self.conf.validate_vectored_get { + self.validate_get_vectored_impl(&vectored_res, keyspace, lsn, ctx) + .await; + } + + let key_value = vectored_res?.pop_first(); + match key_value { + Some((got_key, value)) => { + if got_key != key { + error!( + "Expected {}, but singular vectored get returned {}", + key, got_key + ); + Err(PageReconstructError::Other(anyhow!( + "Singular vectored get returned wrong key" + ))) + } else { + value + } + } + None => Err(PageReconstructError::MissingKey(MissingKeyError { + key, + shard: self.shard_identity.get_shard_number(&key), + cont_lsn: Lsn(0), + request_lsn: lsn, + ancestor_lsn: None, + traversal_path: Vec::new(), + backtrace: None, + })), + } + } + } + } + + /// Not subject to [`Self::timeline_get_throttle`]. + async fn get_impl( + &self, + key: Key, + lsn: Lsn, + mut reconstruct_state: ValueReconstructState, + ctx: &RequestContext, + ) -> Result { + // XXX: structured stats collection for layer eviction here. 
+ trace!( + "get page request for {}@{} from task kind {:?}", + key, + lsn, + ctx.task_kind() + ); + + let timer = crate::metrics::GET_RECONSTRUCT_DATA_TIME + .for_get_kind(GetKind::Singular) + .start_timer(); let path = self .get_reconstruct_data(key, lsn, &mut reconstruct_state, ctx) .await?; @@ -740,7 +932,7 @@ impl Timeline { let res = self.reconstruct_value(key, lsn, reconstruct_state).await; let elapsed = start.elapsed(); crate::metrics::RECONSTRUCT_TIME - .for_result(&res) + .for_get_kind(GetKind::Singular) .observe(elapsed.as_secs_f64()); if cfg!(feature = "testing") && res.is_err() { @@ -753,7 +945,7 @@ impl Timeline { writeln!( msg, "- layer traversal: result {res:?}, cont_lsn {cont_lsn}, layer: {}", - layer(), + layer, ) .expect("string grows") }); @@ -782,7 +974,7 @@ impl Timeline { return Err(GetVectoredError::InvalidLsn(lsn)); } - let key_count = keyspace.total_size().try_into().unwrap(); + let key_count = keyspace.total_raw_size().try_into().unwrap(); if key_count > Timeline::MAX_GET_VECTORED_KEYS { return Err(GetVectoredError::Oversized(key_count)); } @@ -819,7 +1011,9 @@ impl Timeline { self.get_vectored_sequential_impl(keyspace, lsn, ctx).await } GetVectoredImpl::Vectored => { - let vectored_res = self.get_vectored_impl(keyspace.clone(), lsn, ctx).await; + let vectored_res = self + .get_vectored_impl(keyspace.clone(), lsn, ValuesReconstructState::new(), ctx) + .await; if self.conf.validate_vectored_get { self.validate_get_vectored_impl(&vectored_res, keyspace, lsn, ctx) @@ -854,6 +1048,70 @@ impl Timeline { res } + /// Scan the keyspace and return all existing key-values in the keyspace. This currently uses vectored + /// get underlying. Normal vectored get would throw an error when a key in the keyspace is not found + /// during the search, but for the scan interface, it returns all existing key-value pairs, and does + /// not expect each single key in the key space will be found. 
The semantics is closer to the RocksDB + /// scan iterator interface. We could optimize this interface later to avoid some checks in the vectored + /// get path to maintain and split the probing and to-be-probe keyspace. We also need to ensure that + /// the scan operation will not cause OOM in the future. + #[allow(dead_code)] + pub(crate) async fn scan( + &self, + keyspace: KeySpace, + lsn: Lsn, + ctx: &RequestContext, + ) -> Result>, GetVectoredError> { + if !lsn.is_valid() { + return Err(GetVectoredError::InvalidLsn(lsn)); + } + + trace!( + "key-value scan request for {:?}@{} from task kind {:?}", + keyspace, + lsn, + ctx.task_kind() + ); + + // We should generalize this into Keyspace::contains in the future. + for range in &keyspace.ranges { + if range.start.field1 < METADATA_KEY_BEGIN_PREFIX + || range.end.field1 >= METADATA_KEY_END_PREFIX + { + return Err(GetVectoredError::Other(anyhow::anyhow!( + "only metadata keyspace can be scanned" + ))); + } + } + + let start = crate::metrics::SCAN_LATENCY + .for_task_kind(ctx.task_kind()) + .map(ScanLatencyOngoingRecording::start_recording); + + // start counting after throttle so that throttle time + // is always less than observation time + let throttled = self + .timeline_get_throttle + // assume scan = 1 quota for now until we find a better way to process this + .throttle(ctx, 1) + .await; + + let vectored_res = self + .get_vectored_impl( + keyspace.clone(), + lsn, + ValuesReconstructState::default(), + ctx, + ) + .await; + + if let Some(recording) = start { + recording.observe(throttled); + } + + vectored_res + } + /// Not subject to [`Self::timeline_get_throttle`]. 
pub(super) async fn get_vectored_sequential_impl( &self, @@ -862,18 +1120,47 @@ impl Timeline { ctx: &RequestContext, ) -> Result>, GetVectoredError> { let mut values = BTreeMap::new(); + for range in keyspace.ranges { let mut key = range.start; while key != range.end { - let block = self.get_impl(key, lsn, ctx).await; + let block = self + .get_impl(key, lsn, ValueReconstructState::default(), ctx) + .await; use PageReconstructError::*; match block { Err(Cancelled | AncestorStopping(_)) => { return Err(GetVectoredError::Cancelled) } - Err(Other(err)) if err.to_string().contains("could not find data for key") => { - return Err(GetVectoredError::MissingKey(key)) + Err(MissingKey(_)) + if NON_INHERITED_RANGE.contains(&key) + || NON_INHERITED_SPARSE_RANGE.contains(&key) => + { + // Ignore missing key error for aux key range. TODO: currently, we assume non_inherited_range == aux_key_range. + // When we add more types of keys into the page server, we should revisit this part of code and throw errors + // accordingly. + key = key.next(); + } + Err(MissingKey(err)) => { + return Err(GetVectoredError::MissingKey(err)); + } + Err(Other(err)) + if err + .to_string() + .contains("downloading evicted layer file failed") => + { + return Err(GetVectoredError::Other(err)) + } + Err(Other(err)) + if err + .chain() + .any(|cause| cause.to_string().contains("layer loading failed")) => + { + // The intent here is to achieve error parity with the vectored read path. + // When vectored read fails to load a layer it fails the whole read, hence + // we mimic this behaviour here to keep the validation happy. 
+ return Err(GetVectoredError::Other(err)); } _ => { values.insert(key, block); @@ -890,14 +1177,27 @@ impl Timeline { &self, keyspace: KeySpace, lsn: Lsn, + mut reconstruct_state: ValuesReconstructState, ctx: &RequestContext, ) -> Result>, GetVectoredError> { - let mut reconstruct_state = ValuesReconstructState::new(); + let get_kind = if keyspace.total_raw_size() == 1 { + GetKind::Singular + } else { + GetKind::Vectored + }; + let get_data_timer = crate::metrics::GET_RECONSTRUCT_DATA_TIME + .for_get_kind(get_kind) + .start_timer(); self.get_vectored_reconstruct_data(keyspace, lsn, &mut reconstruct_state, ctx) .await?; + get_data_timer.stop_and_record(); + let reconstruct_timer = crate::metrics::RECONSTRUCT_TIME + .for_get_kind(get_kind) + .start_timer(); let mut results: BTreeMap> = BTreeMap::new(); + let layers_visited = reconstruct_state.get_layers_visited(); for (key, res) in reconstruct_state.keys { match res { Err(err) => { @@ -911,6 +1211,13 @@ impl Timeline { } } } + reconstruct_timer.stop_and_record(); + + // Note that this is an approximation. Tracking the exact number of layers visited + // per key requires virtually unbounded memory usage and is inefficient + // (i.e. 
segment tree tracking each range queried from a layer) + crate::metrics::VEC_READ_NUM_LAYERS_VISITED + .observe(layers_visited as f64 / results.len() as f64); Ok(results) } @@ -923,6 +1230,11 @@ impl Timeline { lsn: Lsn, ctx: &RequestContext, ) { + if keyspace.overlaps(&Key::metadata_key_range()) { + // skip validation for metadata key range + return; + } + let sequential_res = self .get_vectored_sequential_impl(keyspace.clone(), lsn, ctx) .await; @@ -932,7 +1244,7 @@ impl Timeline { match (lhs, rhs) { (Oversized(l), Oversized(r)) => l == r, (InvalidLsn(l), InvalidLsn(r)) => l == r, - (MissingKey(l), MissingKey(r)) => l == r, + (MissingKey(l), MissingKey(r)) => l.key == r.key, (GetReadyAncestorError(_), GetReadyAncestorError(_)) => true, (Other(_), Other(_)) => true, _ => false, @@ -946,6 +1258,11 @@ impl Timeline { panic!(concat!("Sequential get failed with {}, but vectored get did not", " - keyspace={:?} lsn={}"), seq_err, keyspace, lsn) }, + (Ok(_), Err(GetVectoredError::GetReadyAncestorError(GetReadyAncestorError::AncestorLsnTimeout(_)))) => { + // Sequential get runs after vectored get, so it is possible for the later + // to time out while waiting for its ancestor's Lsn to become ready and for the + // former to succeed (it essentially has a doubled wait time). + }, (Ok(_), Err(vec_err)) => { panic!(concat!("Vectored get failed with {}, but sequential get did not", " - keyspace={:?} lsn={}"), @@ -1026,6 +1343,12 @@ impl Timeline { self.last_record_lsn.load() } + /// Subscribe to callers of wait_lsn(). The value of the channel is None if there are no + /// wait_lsn() calls in progress, and Some(Lsn) if there is an active waiter for wait_lsn(). 
+ pub(crate) fn subscribe_for_wait_lsn_updates(&self) -> watch::Receiver> { + self.last_record_lsn.status_receiver() + } + pub(crate) fn get_disk_consistent_lsn(&self) -> Lsn { self.disk_consistent_lsn.load() } @@ -1257,7 +1580,7 @@ impl Timeline { checkpoint_distance, self.get_last_record_lsn(), self.last_freeze_at.load(), - *self.last_freeze_ts.read().unwrap(), + open_layer.get_opened_at(), ) { match open_layer.info() { InMemoryLayerInfo::Frozen { lsn_start, lsn_end } => { @@ -1344,7 +1667,7 @@ impl Timeline { background_jobs_can_start: Option<&completion::Barrier>, ctx: &RequestContext, ) { - if self.tenant_shard_id.is_zero() { + if self.tenant_shard_id.is_shard_zero() { // Logical size is only maintained accurately on shard zero. self.spawn_initial_logical_size_computation_task(ctx); } @@ -1622,7 +1945,7 @@ impl Timeline { checkpoint_distance: u64, projected_lsn: Lsn, last_freeze_at: Lsn, - last_freeze_ts: Instant, + opened_at: Instant, ) -> bool { let distance = projected_lsn.widening_sub(last_freeze_at); @@ -1648,13 +1971,13 @@ impl Timeline { ); true - } else if distance > 0 && last_freeze_ts.elapsed() >= self.get_checkpoint_timeout() { + } else if distance > 0 && opened_at.elapsed() >= self.get_checkpoint_timeout() { info!( - "Will roll layer at {} with layer size {} due to time since last flush ({:?})", - projected_lsn, - layer_size, - last_freeze_ts.elapsed() - ); + "Will roll layer at {} with layer size {} due to time since first write to the layer ({:?})", + projected_lsn, + layer_size, + opened_at.elapsed() + ); true } else { @@ -1668,6 +1991,15 @@ const REPARTITION_FREQ_IN_CHECKPOINT_DISTANCE: u64 = 10; // Private functions impl Timeline { + #[allow(dead_code)] + pub(crate) fn get_switch_to_aux_file_v2(&self) -> bool { + let tenant_conf = self.tenant_conf.load(); + tenant_conf + .tenant_conf + .switch_to_aux_file_v2 + .unwrap_or(self.conf.default_tenant_conf.switch_to_aux_file_v2) + } + pub(crate) fn get_lazy_slru_download(&self) -> bool { let 
tenant_conf = self.tenant_conf.load(); tenant_conf @@ -1869,11 +2201,7 @@ impl Timeline { write_lock: tokio::sync::Mutex::new(None), - gc_info: std::sync::RwLock::new(GcInfo { - retain_lsns: Vec::new(), - horizon_cutoff: Lsn(0), - pitr_cutoff: Lsn(0), - }), + gc_info: std::sync::RwLock::new(GcInfo::default()), latest_gc_cutoff_lsn: Rcu::new(metadata.latest_gc_cutoff_lsn()), initdb_lsn: metadata.initdb_lsn(), @@ -1887,12 +2215,18 @@ impl Timeline { // initial logical size is 0. LogicalSize::empty_initial() }, - partitioning: tokio::sync::Mutex::new((KeyPartitioning::new(), Lsn(0))), + partitioning: tokio::sync::Mutex::new(( + (KeyPartitioning::new(), KeyPartitioning::new().into_sparse()), + Lsn(0), + )), repartition_threshold: 0, last_image_layer_creation_check_at: AtomicLsn::new(0), last_received_wal: Mutex::new(None), - rel_size_cache: RwLock::new(HashMap::new()), + rel_size_cache: RwLock::new(RelSizeCache { + complete_as_of: disk_consistent_lsn, + map: HashMap::new(), + }), download_all_remote_layers_task_info: RwLock::new(None), @@ -2237,7 +2571,7 @@ impl Timeline { priority: GetLogicalSizePriority, ctx: &RequestContext, ) -> logical_size::CurrentLogicalSize { - if !self.tenant_shard_id.is_zero() { + if !self.tenant_shard_id.is_shard_zero() { // Logical size is only accurately maintained on shard zero: when called elsewhere, for example // when HTTP API is serving a GET for timeline zero, return zero return logical_size::CurrentLogicalSize::Approximate(logical_size::Approximate::zero()); @@ -2533,7 +2867,7 @@ impl Timeline { crate::span::debug_assert_current_span_has_tenant_and_timeline_id(); // We should never be calculating logical sizes on shard !=0, because these shards do not have // accurate relation sizes, and they do not emit consumption metrics. 
- debug_assert!(self.tenant_shard_id.is_zero()); + debug_assert!(self.tenant_shard_id.is_shard_zero()); let guard = self .gate @@ -2692,7 +3026,7 @@ impl Timeline { } } -type TraversalId = String; +type TraversalId = Arc; trait TraversalLayerExt { fn traversal_id(&self) -> TraversalId; @@ -2700,13 +3034,13 @@ trait TraversalLayerExt { impl TraversalLayerExt for Layer { fn traversal_id(&self) -> TraversalId { - self.local_path().to_string() + Arc::clone(self.debug_str()) } } impl TraversalLayerExt for Arc { fn traversal_id(&self) -> TraversalId { - format!("timeline {} in-memory {self}", self.get_timeline_id()) + Arc::clone(self.local_path_str()) } } @@ -2735,7 +3069,7 @@ impl Timeline { let mut timeline = self; let mut read_count = scopeguard::guard(0, |cnt| { - crate::metrics::READ_NUM_FS_LAYERS.observe(cnt as f64) + crate::metrics::READ_NUM_LAYERS_VISITED.observe(cnt as f64) }); // For debugging purposes, collect the path of layers that we traversed @@ -2775,32 +3109,33 @@ impl Timeline { if prev <= cont_lsn { // Didn't make any progress in last iteration. Error out to avoid // getting stuck in the loop. 
- return Err(layer_traversal_error(format!( - "could not find layer with more data for key {} at LSN {}, request LSN {}, ancestor {}", + return Err(PageReconstructError::MissingKey(MissingKeyError { key, - Lsn(cont_lsn.0 - 1), + shard: self.shard_identity.get_shard_number(&key), + cont_lsn: Lsn(cont_lsn.0 - 1), request_lsn, - timeline.ancestor_lsn - ), traversal_path)); + ancestor_lsn: Some(timeline.ancestor_lsn), + traversal_path, + backtrace: None, + })); } } prev_lsn = Some(cont_lsn); } ValueReconstructResult::Missing => { - return Err(layer_traversal_error( - if cfg!(test) { - format!( - "could not find data for key {} (shard {:?}) at LSN {}, for request at LSN {}\n{}", - key, self.shard_identity.get_shard_number(&key), cont_lsn, request_lsn, std::backtrace::Backtrace::force_capture(), - ) - } else { - format!( - "could not find data for key {} (shard {:?}) at LSN {}, for request at LSN {}", - key, self.shard_identity.get_shard_number(&key), cont_lsn, request_lsn - ) - }, + return Err(PageReconstructError::MissingKey(MissingKeyError { + key, + shard: self.shard_identity.get_shard_number(&key), + cont_lsn, + request_lsn, + ancestor_lsn: None, traversal_path, - )); + backtrace: if cfg!(test) { + Some(std::backtrace::Backtrace::force_capture()) + } else { + None + }, + })); } } @@ -2847,12 +3182,8 @@ impl Timeline { Err(e) => return Err(PageReconstructError::from(e)), }; cont_lsn = lsn_floor; - // metrics: open_layer does not count as fs access, so we are not updating `read_count` - traversal_path.push(( - result, - cont_lsn, - Box::new(move || open_layer.traversal_id()), - )); + *read_count += 1; + traversal_path.push((result, cont_lsn, open_layer.traversal_id())); continue 'outer; } } @@ -2878,12 +3209,8 @@ impl Timeline { Err(e) => return Err(PageReconstructError::from(e)), }; cont_lsn = lsn_floor; - // metrics: open_layer does not count as fs access, so we are not updating `read_count` - traversal_path.push(( - result, - cont_lsn, - Box::new(move || 
frozen_layer.traversal_id()), - )); + *read_count += 1; + traversal_path.push((result, cont_lsn, frozen_layer.traversal_id())); continue 'outer; } } @@ -2891,7 +3218,6 @@ impl Timeline { if let Some(SearchResult { lsn_floor, layer }) = layers.search(key, cont_lsn) { let layer = guard.get_from_desc(&layer); drop(guard); - // Get all the data needed to reconstruct the page version from this layer. // But if we have an older cached page image, no need to go past that. let lsn_floor = max(cached_lsn + 1, lsn_floor); @@ -2904,14 +3230,7 @@ impl Timeline { }; cont_lsn = lsn_floor; *read_count += 1; - traversal_path.push(( - result, - cont_lsn, - Box::new({ - let layer = layer.to_owned(); - move || layer.traversal_id() - }), - )); + traversal_path.push((result, cont_lsn, layer.traversal_id())); continue 'outer; } else if timeline.ancestor_timeline.is_some() { // Nothing on this timeline. Traverse to parent @@ -2964,11 +3283,22 @@ impl Timeline { .await?; keyspace.remove_overlapping_with(&completed); - if keyspace.total_size() == 0 || timeline.ancestor_timeline.is_none() { + + // Do not descend into the ancestor timeline for aux files. + // We don't return a blanket [`GetVectoredError::MissingKey`] to avoid + // stalling compaction. + keyspace.remove_overlapping_with(&KeySpace { + ranges: vec![NON_INHERITED_RANGE, NON_INHERITED_SPARSE_RANGE], + }); + + // Keyspace is fully retrieved, no ancestor timeline, or metadata scan (where we do not look + // into ancestor timelines). TODO: is there any other metadata which we want to inherit? + if keyspace.total_raw_size() == 0 || timeline.ancestor_timeline.is_none() { break; } - cont_lsn = Lsn(timeline.ancestor_lsn.0 + 1); + // Take the min to avoid reconstructing a page with data newer than request Lsn. 
+ cont_lsn = std::cmp::min(Lsn(request_lsn.0 + 1), Lsn(timeline.ancestor_lsn.0 + 1)); timeline_owned = timeline .get_ready_ancestor_timeline(ctx) .await @@ -2976,14 +3306,24 @@ impl Timeline { timeline = &*timeline_owned; } - if keyspace.total_size() != 0 { - return Err(GetVectoredError::MissingKey(keyspace.start().unwrap())); + if keyspace.total_raw_size() != 0 { + return Err(GetVectoredError::MissingKey(MissingKeyError { + key: keyspace.start().unwrap(), /* better if we can store the full keyspace */ + shard: self + .shard_identity + .get_shard_number(&keyspace.start().unwrap()), + cont_lsn, + request_lsn, + ancestor_lsn: Some(timeline.ancestor_lsn), + traversal_path: vec![], + backtrace: None, + })); } Ok(()) } - /// Collect the reconstruct data for a ketspace from the specified timeline. + /// Collect the reconstruct data for a keyspace from the specified timeline. /// /// Maintain a fringe [`LayerFringe`] which tracks all the layers that intersect /// the current keyspace. The current keyspace of the search at any given timeline @@ -3018,55 +3358,61 @@ impl Timeline { unmapped_keyspace.remove_overlapping_with(&keys_done_last_step); completed_keyspace.merge(&keys_done_last_step); - let guard = timeline.layers.read().await; - let layers = guard.layer_map(); + // Do not descent any further if the last layer we visited + // completed all keys in the keyspace it inspected. This is not + // required for correctness, but avoids visiting extra layers + // which turns out to be a perf bottleneck in some cases. 
+ if !unmapped_keyspace.is_empty() { + let guard = timeline.layers.read().await; + let layers = guard.layer_map(); - let in_memory_layer = layers.find_in_memory_layer(|l| { - let start_lsn = l.get_lsn_range().start; - cont_lsn > start_lsn - }); + let in_memory_layer = layers.find_in_memory_layer(|l| { + let start_lsn = l.get_lsn_range().start; + cont_lsn > start_lsn + }); - match in_memory_layer { - Some(l) => { - let lsn_range = l.get_lsn_range().start..cont_lsn; - fringe.update( - ReadableLayer::InMemoryLayer(l), - unmapped_keyspace.clone(), - lsn_range, - ); - } - None => { - for range in unmapped_keyspace.ranges.iter() { - let results = layers.range_search(range.clone(), cont_lsn); + match in_memory_layer { + Some(l) => { + let lsn_range = l.get_lsn_range().start..cont_lsn; + fringe.update( + ReadableLayer::InMemoryLayer(l), + unmapped_keyspace.clone(), + lsn_range, + ); + } + None => { + for range in unmapped_keyspace.ranges.iter() { + let results = layers.range_search(range.clone(), cont_lsn); - results - .found - .into_iter() - .map(|(SearchResult { layer, lsn_floor }, keyspace_accum)| { - ( - ReadableLayer::PersistentLayer(guard.get_from_desc(&layer)), - keyspace_accum.to_keyspace(), - lsn_floor..cont_lsn, - ) - }) - .for_each(|(layer, keyspace, lsn_range)| { - fringe.update(layer, keyspace, lsn_range) - }); + results + .found + .into_iter() + .map(|(SearchResult { layer, lsn_floor }, keyspace_accum)| { + ( + ReadableLayer::PersistentLayer(guard.get_from_desc(&layer)), + keyspace_accum.to_keyspace(), + lsn_floor..cont_lsn, + ) + }) + .for_each(|(layer, keyspace, lsn_range)| { + fringe.update(layer, keyspace, lsn_range) + }); + } } } - } - // It's safe to drop the layer map lock after planning the next round of reads. - // The fringe keeps readable handles for the layers which are safe to read even - // if layers were compacted or flushed. 
- // - // The more interesting consideration is: "Why is the read algorithm still correct - // if the layer map changes while it is operating?". Doing a vectored read on a - // timeline boils down to pushing an imaginary lsn boundary downwards for each range - // covered by the read. The layer map tells us how to move the lsn downwards for a - // range at *a particular point in time*. It is fine for the answer to be different - // at two different time points. - drop(guard); + // It's safe to drop the layer map lock after planning the next round of reads. + // The fringe keeps readable handles for the layers which are safe to read even + // if layers were compacted or flushed. + // + // The more interesting consideration is: "Why is the read algorithm still correct + // if the layer map changes while it is operating?". Doing a vectored read on a + // timeline boils down to pushing an imaginary lsn boundary downwards for each range + // covered by the read. The layer map tells us how to move the lsn downwards for a + // range at *a particular point in time*. It is fine for the answer to be different + // at two different time points. + drop(guard); + } if let Some((layer_to_read, keyspace_to_read, lsn_range)) = fringe.next_layer() { let next_cont_lsn = lsn_range.start; @@ -3081,6 +3427,8 @@ impl Timeline { unmapped_keyspace = keyspace_to_read; cont_lsn = next_cont_lsn; + + reconstruct_state.on_layer_visited(); } else { break; } @@ -3404,66 +3752,103 @@ impl Timeline { // files instead. This is possible as long as *all* the data imported into the // repository have the same LSN. 
let lsn_range = frozen_layer.get_lsn_range(); - let (layers_to_upload, delta_layer_to_add) = - if lsn_range.start == self.initdb_lsn && lsn_range.end == Lsn(self.initdb_lsn.0 + 1) { - #[cfg(test)] - match &mut *self.flush_loop_state.lock().unwrap() { - FlushLoopState::NotStarted | FlushLoopState::Exited => { - panic!("flush loop not running") - } - FlushLoopState::Running { - initdb_optimization_count, - .. - } => { + + // Whether to directly create image layers for this flush, or flush them as delta layers + let create_image_layer = + lsn_range.start == self.initdb_lsn && lsn_range.end == Lsn(self.initdb_lsn.0 + 1); + + #[cfg(test)] + { + match &mut *self.flush_loop_state.lock().unwrap() { + FlushLoopState::NotStarted | FlushLoopState::Exited => { + panic!("flush loop not running") + } + FlushLoopState::Running { + expect_initdb_optimization, + initdb_optimization_count, + .. + } => { + if create_image_layer { *initdb_optimization_count += 1; - } - } - // Note: The 'ctx' in use here has DownloadBehavior::Error. We should not - // require downloading anything during initial import. - let (partitioning, _lsn) = self - .repartition( - self.initdb_lsn, - self.get_compaction_target_size(), - EnumSet::empty(), - ctx, - ) - .await?; - - if self.cancel.is_cancelled() { - return Err(FlushLayerError::Cancelled); - } - - // For image layers, we add them immediately into the layer map. - ( - self.create_image_layers(&partitioning, self.initdb_lsn, true, ctx) - .await?, - None, - ) - } else { - #[cfg(test)] - match &mut *self.flush_loop_state.lock().unwrap() { - FlushLoopState::NotStarted | FlushLoopState::Exited => { - panic!("flush loop not running") - } - FlushLoopState::Running { - expect_initdb_optimization, - .. - } => { + } else { assert!(!*expect_initdb_optimization, "expected initdb optimization"); } } - // Normal case, write out a L0 delta layer file. - // `create_delta_layer` will not modify the layer map. 
- // We will remove frozen layer and add delta layer in one atomic operation later. - let layer = self.create_delta_layer(&frozen_layer, ctx).await?; - ( - // FIXME: even though we have a single image and single delta layer assumption - // we push them to vec - vec![layer.clone()], - Some(layer), + } + } + + let (layers_to_upload, delta_layer_to_add) = if create_image_layer { + // Note: The 'ctx' in use here has DownloadBehavior::Error. We should not + // require downloading anything during initial import. + let ((rel_partition, metadata_partition), _lsn) = self + .repartition( + self.initdb_lsn, + self.get_compaction_target_size(), + EnumSet::empty(), + ctx, ) + .await?; + + if self.cancel.is_cancelled() { + return Err(FlushLayerError::Cancelled); + } + + // For metadata, always create delta layers. + let delta_layer = if !metadata_partition.parts.is_empty() { + assert_eq!( + metadata_partition.parts.len(), + 1, + "currently sparse keyspace should only contain a single aux file keyspace" + ); + let metadata_keyspace = &metadata_partition.parts[0]; + assert_eq!( + metadata_keyspace.0.ranges.len(), + 1, + "aux file keyspace should be a single range" + ); + self.create_delta_layer( + &frozen_layer, + ctx, + Some(metadata_keyspace.0.ranges[0].clone()), + ) + .await? + } else { + None }; + // For image layers, we add them immediately into the layer map. + let mut layers_to_upload = Vec::new(); + layers_to_upload.extend( + self.create_image_layers( + &rel_partition, + self.initdb_lsn, + ImageLayerCreationMode::Initial, + ctx, + ) + .await?, + ); + + if let Some(delta_layer) = delta_layer { + layers_to_upload.push(delta_layer.clone()); + (layers_to_upload, Some(delta_layer)) + } else { + (layers_to_upload, None) + } + } else { + // Normal case, write out a L0 delta layer file. + // `create_delta_layer` will not modify the layer map. + // We will remove frozen layer and add delta layer in one atomic operation later. 
+ let Some(layer) = self.create_delta_layer(&frozen_layer, ctx, None).await? else { + panic!("delta layer cannot be empty if no filter is applied"); + }; + ( + // FIXME: even though we have a single image and single delta layer assumption + // we push them to vec + vec![layer.clone()], + Some(layer), + ) + }; + pausable_failpoint!("flush-layer-cancel-after-writing-layer-out-pausable"); if self.cancel.is_cancelled() { @@ -3524,7 +3909,7 @@ impl Timeline { &self, disk_consistent_lsn: Lsn, layers_to_upload: impl IntoIterator, - ) -> anyhow::Result { + ) -> anyhow::Result<()> { // We can only save a valid 'prev_record_lsn' value on disk if we // flushed *all* in-memory changes to disk. We only track // 'prev_record_lsn' in memory for the latest processed record, so we @@ -3541,19 +3926,10 @@ impl Timeline { None }; - let ancestor_timeline_id = self - .ancestor_timeline - .as_ref() - .map(|ancestor| ancestor.timeline_id); - - let metadata = TimelineMetadata::new( + let update = crate::tenant::metadata::MetadataUpdate::new( disk_consistent_lsn, ondisk_prev_record_lsn, - ancestor_timeline_id, - self.ancestor_lsn, *self.latest_gc_cutoff_lsn.read(), - self.initdb_lsn, - self.pg_version, ); fail_point!("checkpoint-before-saving-metadata", |x| bail!( @@ -3565,10 +3941,10 @@ impl Timeline { for layer in layers_to_upload { remote_client.schedule_layer_file_upload(layer)?; } - remote_client.schedule_index_upload_for_metadata_update(&metadata)?; + remote_client.schedule_index_upload_for_metadata_update(&update)?; } - Ok(metadata) + Ok(()) } pub(crate) async fn preserve_initdb_archive(&self) -> anyhow::Result<()> { @@ -3592,12 +3968,18 @@ impl Timeline { self: &Arc, frozen_layer: &Arc, ctx: &RequestContext, - ) -> anyhow::Result { + key_range: Option>, + ) -> anyhow::Result> { let self_clone = Arc::clone(self); let frozen_layer = Arc::clone(frozen_layer); let ctx = ctx.attached_child(); let work = async move { - let new_delta = frozen_layer.write_to_disk(&self_clone, &ctx).await?; 
+ let Some(new_delta) = frozen_layer + .write_to_disk(&self_clone, &ctx, key_range) + .await? + else { + return Ok(None); + }; // The write_to_disk() above calls writer.finish() which already did the fsync of the inodes. // We just need to fsync the directory in which these inodes are linked, // which we know to be the timeline directory. @@ -3616,7 +3998,7 @@ impl Timeline { .sync_all() .await .fatal_err("VirtualFile::sync_all timeline dir"); - anyhow::Ok(new_delta) + anyhow::Ok(Some(new_delta)) }; // Before tokio-epoll-uring, we ran write_to_disk & the sync_all inside spawn_blocking. // Preserve that behavior to maintain the same behavior for `virtual_file_io_engine=std-fs`. @@ -3643,19 +4025,20 @@ impl Timeline { partition_size: u64, flags: EnumSet, ctx: &RequestContext, - ) -> anyhow::Result<(KeyPartitioning, Lsn)> { + ) -> anyhow::Result<((KeyPartitioning, SparseKeyPartitioning), Lsn)> { let Ok(mut partitioning_guard) = self.partitioning.try_lock() else { // NB: there are two callers, one is the compaction task, of which there is only one per struct Tenant and hence Timeline. // The other is the initdb optimization in flush_frozen_layer, used by `boostrap_timeline`, which runs before `.activate()` // and hence before the compaction task starts. 
anyhow::bail!("repartition() called concurrently, this should not happen"); }; - if lsn < partitioning_guard.1 { + let ((dense_partition, sparse_partition), partition_lsn) = &*partitioning_guard; + if lsn < *partition_lsn { anyhow::bail!("repartition() called with LSN going backwards, this should not happen"); } - let distance = lsn.0 - partitioning_guard.1 .0; - if partitioning_guard.1 != Lsn(0) + let distance = lsn.0 - partition_lsn.0; + if *partition_lsn != Lsn(0) && distance <= self.repartition_threshold && !flags.contains(CompactFlags::ForceRepartition) { @@ -3664,37 +4047,24 @@ impl Timeline { threshold = self.repartition_threshold, "no repartitioning needed" ); - return Ok((partitioning_guard.0.clone(), partitioning_guard.1)); + return Ok(( + (dense_partition.clone(), sparse_partition.clone()), + *partition_lsn, + )); } - let keyspace = self.collect_keyspace(lsn, ctx).await?; - let partitioning = keyspace.partition(partition_size); - - *partitioning_guard = (partitioning, lsn); + let (dense_ks, sparse_ks) = self.collect_keyspace(lsn, ctx).await?; + let dense_partitioning = dense_ks.partition(&self.shard_identity, partition_size); + let sparse_partitioning = SparseKeyPartitioning { + parts: vec![sparse_ks], + }; // no partitioning for metadata keys for now + *partitioning_guard = ((dense_partitioning, sparse_partitioning), lsn); Ok((partitioning_guard.0.clone(), partitioning_guard.1)) } // Is it time to create a new image layer for the given partition? async fn time_for_new_image_layer(&self, partition: &KeySpace, lsn: Lsn) -> bool { - let last = self.last_image_layer_creation_check_at.load(); - if lsn != Lsn(0) { - let distance = lsn - .checked_sub(last) - .expect("Attempt to compact with LSN going backwards"); - - let min_distance = self.get_image_layer_creation_check_threshold() as u64 - * self.get_checkpoint_distance(); - - // Skip the expensive delta layer counting below if we've not ingested - // sufficient WAL since the last check. 
- if distance.0 < min_distance { - return false; - } - } - - self.last_image_layer_creation_check_at.store(lsn); - let threshold = self.get_image_creation_threshold(); let guard = self.layers.read().await; @@ -3744,12 +4114,12 @@ impl Timeline { false } - #[tracing::instrument(skip_all, fields(%lsn, %force))] + #[tracing::instrument(skip_all, fields(%lsn, %mode))] async fn create_image_layers( self: &Arc, partitioning: &KeyPartitioning, lsn: Lsn, - force: bool, + mode: ImageLayerCreationMode, ctx: &RequestContext, ) -> Result, CreateImageLayersError> { let timer = self.metrics.create_images_time_histo.start_timer(); @@ -3766,11 +4136,46 @@ impl Timeline { // image layers <100000000..100000099> and <200000000..200000199> are not completely covering it. let mut start = Key::MIN; + let check_for_image_layers = { + let last_checks_at = self.last_image_layer_creation_check_at.load(); + let distance = lsn + .checked_sub(last_checks_at) + .expect("Attempt to compact with LSN going backwards"); + let min_distance = self.get_image_layer_creation_check_threshold() as u64 + * self.get_checkpoint_distance(); + + // Skip the expensive delta layer counting if this timeline has not ingested sufficient + // WAL since the last check. + distance.0 >= min_distance + }; + + if check_for_image_layers { + self.last_image_layer_creation_check_at.store(lsn); + } + for partition in partitioning.parts.iter() { let img_range = start..partition.ranges.last().unwrap().end; - if !force && !self.time_for_new_image_layer(partition, lsn).await { - start = img_range.end; - continue; + + if partition.overlaps(&Key::metadata_key_range()) { + // TODO(chi): The next patch will correctly create image layers for metadata keys, and it would be a + // rather big change. Keep this patch small for now. + match mode { + ImageLayerCreationMode::Force | ImageLayerCreationMode::Try => { + // skip image layer creation anyways for metadata keys. 
+ start = img_range.end; + continue; + } + ImageLayerCreationMode::Initial => { + return Err(CreateImageLayersError::Other(anyhow::anyhow!("no image layer should be created for metadata keys when flushing frozen layers"))); + } + } + } else if let ImageLayerCreationMode::Try = mode { + // check_for_image_layers = false -> skip + // check_for_image_layers = true -> check time_for_new_image_layer -> skip/generate + if !check_for_image_layers || !self.time_for_new_image_layer(partition, lsn).await { + start = img_range.end; + continue; + } } let mut image_layer_writer = ImageLayerWriter::new( @@ -3811,7 +4216,7 @@ impl Timeline { key = key.next(); // Maybe flush `key_rest_accum` - if key_request_accum.size() >= Timeline::MAX_GET_VECTORED_KEYS + if key_request_accum.raw_size() >= Timeline::MAX_GET_VECTORED_KEYS || last_key_in_range { let results = self @@ -3850,7 +4255,7 @@ impl Timeline { }; // Write all the keys we just read into our new image layer. - image_layer_writer.put_image(img_key, img).await?; + image_layer_writer.put_image(img_key, img, ctx).await?; wrote_keys = true; } } @@ -3861,7 +4266,7 @@ impl Timeline { // Normal path: we have written some data into the new image layer for this // partition, so flush it to disk. start = img_range.end; - let image_layer = image_layer_writer.finish(self).await?; + let image_layer = image_layer_writer.finish(self, ctx).await?; image_layers.push(image_layer); } else { // Special case: the image layer may be empty if this is a sharded tenant and the @@ -4054,7 +4459,7 @@ impl Timeline { Ok(()) } - /// Update information about which layer files need to be retained on + /// Find the Lsns above which layer files need to be retained on /// garbage collection. This is separate from actually performing the GC, /// and is updated more frequently, so that compaction can remove obsolete /// page versions more aggressively. 
@@ -4062,17 +4467,6 @@ impl Timeline { /// TODO: that's wishful thinking, compaction doesn't actually do that /// currently. /// - /// The caller specifies how much history is needed with the 3 arguments: - /// - /// retain_lsns: keep a version of each page at these LSNs - /// cutoff_horizon: also keep everything newer than this LSN - /// pitr: the time duration required to keep data for PITR - /// - /// The 'retain_lsns' list is currently used to prevent removing files that - /// are needed by child timelines. In the future, the user might be able to - /// name additional points in time to retain. The caller is responsible for - /// collecting that information. - /// /// The 'cutoff_horizon' point is used to retain recent versions that might still be /// needed by read-only nodes. (As of this writing, the caller just passes /// the latest LSN subtracted by a constant, and doesn't do anything smart @@ -4080,23 +4474,22 @@ impl Timeline { /// /// The 'pitr' duration is used to calculate a 'pitr_cutoff', which can be used to determine /// whether a record is needed for PITR. - /// - /// NOTE: This function holds a short-lived lock to protect the 'gc_info' - /// field, so that the three values passed as argument are stored - /// atomically. But the caller is responsible for ensuring that no new - /// branches are created that would need to be included in 'retain_lsns', - /// for example. The caller should hold `Tenant::gc_cs` lock to ensure - /// that. 
- /// #[instrument(skip_all, fields(timeline_id=%self.timeline_id))] - pub(super) async fn update_gc_info( + pub(super) async fn find_gc_cutoffs( &self, - retain_lsns: Vec, cutoff_horizon: Lsn, pitr: Duration, cancel: &CancellationToken, ctx: &RequestContext, - ) -> anyhow::Result<()> { + ) -> anyhow::Result { + let _timer = self + .metrics + .find_gc_cutoffs_histo + .start_timer() + .record_on_drop(); + + pausable_failpoint!("Timeline::find_gc_cutoffs-pausable"); + // First, calculate pitr_cutoff_timestamp and then convert it to LSN. // // Some unit tests depend on garbage-collection working even when @@ -4142,19 +4535,14 @@ impl Timeline { *self.get_latest_gc_cutoff_lsn() } } else { - // No time-based retention was configured. Set time-based cutoff to - // same as LSN based. - cutoff_horizon + // No time-based retention was configured. Interpret this as "keep no history". + self.get_last_record_lsn() }; - // Grab the lock and update the values - *self.gc_info.write().unwrap() = GcInfo { - retain_lsns, - horizon_cutoff: cutoff_horizon, - pitr_cutoff, - }; - - Ok(()) + Ok(GcCutoffs { + horizon: cutoff_horizon, + pitr: pitr_cutoff, + }) } /// Garbage collect layer files on a timeline that are no longer needed. @@ -4183,8 +4571,8 @@ impl Timeline { let (horizon_cutoff, pitr_cutoff, retain_lsns) = { let gc_info = self.gc_info.read().unwrap(); - let horizon_cutoff = min(gc_info.horizon_cutoff, self.get_disk_consistent_lsn()); - let pitr_cutoff = gc_info.pitr_cutoff; + let horizon_cutoff = min(gc_info.cutoffs.horizon, self.get_disk_consistent_lsn()); + let pitr_cutoff = gc_info.cutoffs.pitr; let retain_lsns = gc_info.retain_lsns.clone(); (horizon_cutoff, pitr_cutoff, retain_lsns) }; @@ -4664,35 +5052,7 @@ impl Timeline { } } -type TraversalPathItem = ( - ValueReconstructResult, - Lsn, - Box TraversalId>, -); - -/// Helper function for get_reconstruct_data() to add the path of layers traversed -/// to an error, as anyhow context information. 
-fn layer_traversal_error(msg: String, path: Vec) -> PageReconstructError { - // We want the original 'msg' to be the outermost context. The outermost context - // is the most high-level information, which also gets propagated to the client. - let mut msg_iter = path - .into_iter() - .map(|(r, c, l)| { - format!( - "layer traversal: result {:?}, cont_lsn {}, layer: {}", - r, - c, - l(), - ) - }) - .chain(std::iter::once(msg)); - // Construct initial message from the first traversed layer - let err = anyhow!(msg_iter.next().unwrap()); - - // Append all subsequent traversals, and the error message 'msg', as contexts. - let msg = msg_iter.fold(err, |err, msg| err.context(msg)); - PageReconstructError::from(msg) -} +type TraversalPathItem = (ValueReconstructResult, Lsn, TraversalId); struct TimelineWriterState { open_layer: Arc, @@ -4703,23 +5063,16 @@ struct TimelineWriterState { max_lsn: Option, // Cached details of the last freeze. Avoids going trough the atomic/lock on every put. cached_last_freeze_at: Lsn, - cached_last_freeze_ts: Instant, } impl TimelineWriterState { - fn new( - open_layer: Arc, - current_size: u64, - last_freeze_at: Lsn, - last_freeze_ts: Instant, - ) -> Self { + fn new(open_layer: Arc, current_size: u64, last_freeze_at: Lsn) -> Self { Self { open_layer, current_size, prev_lsn: None, max_lsn: None, cached_last_freeze_at: last_freeze_at, - cached_last_freeze_ts: last_freeze_ts, } } } @@ -4818,12 +5171,10 @@ impl<'a> TimelineWriter<'a> { let initial_size = layer.size().await?; let last_freeze_at = self.last_freeze_at.load(); - let last_freeze_ts = *self.last_freeze_ts.read().unwrap(); self.write_guard.replace(TimelineWriterState::new( layer, initial_size, last_freeze_at, - last_freeze_ts, )); Ok(()) @@ -4870,7 +5221,7 @@ impl<'a> TimelineWriter<'a> { self.get_checkpoint_distance(), lsn, state.cached_last_freeze_at, - state.cached_last_freeze_ts, + state.open_layer.get_opened_at(), ) { OpenLayerAction::Roll } else { diff --git 
a/pageserver/src/tenant/timeline/compaction.rs b/pageserver/src/tenant/timeline/compaction.rs index 8075775bbc..1088101a13 100644 --- a/pageserver/src/tenant/timeline/compaction.rs +++ b/pageserver/src/tenant/timeline/compaction.rs @@ -9,13 +9,13 @@ use std::ops::{Deref, Range}; use std::sync::Arc; use super::layer_manager::LayerManager; -use super::{CompactFlags, DurationRecorder, RecordedDuration, Timeline}; +use super::{CompactFlags, DurationRecorder, ImageLayerCreationMode, RecordedDuration, Timeline}; use anyhow::{anyhow, Context}; use enumset::EnumSet; use fail::fail_point; use itertools::Itertools; -use pageserver_api::shard::TenantShardId; +use pageserver_api::shard::{ShardIdentity, TenantShardId}; use tokio_util::sync::CancellationToken; use tracing::{debug, info, info_span, trace, warn, Instrument}; use utils::id::TimelineId; @@ -102,7 +102,7 @@ impl Timeline { ) .await { - Ok((partitioning, lsn)) => { + Ok(((dense_partitioning, sparse_partitioning), lsn)) => { // Disables access_stats updates, so that the files we read remain candidates for eviction after we're done with them let image_ctx = RequestContextBuilder::extend(ctx) .access_stats_behavior(AccessStatsBehavior::Skip) @@ -115,17 +115,37 @@ impl Timeline { // 3. Create new image layers for partitions that have been modified // "enough". - let layers = self + let dense_layers = self .create_image_layers( - &partitioning, + &dense_partitioning, lsn, - flags.contains(CompactFlags::ForceImageLayerCreation), + if flags.contains(CompactFlags::ForceImageLayerCreation) { + ImageLayerCreationMode::Force + } else { + ImageLayerCreationMode::Try + }, &image_ctx, ) .await .map_err(anyhow::Error::from)?; - self.upload_new_image_layers(layers)?; + // For now, nothing will be produced... 
+ let sparse_layers = self + .create_image_layers( + &sparse_partitioning.clone().into_dense(), + lsn, + if flags.contains(CompactFlags::ForceImageLayerCreation) { + ImageLayerCreationMode::Force + } else { + ImageLayerCreationMode::Try + }, + &image_ctx, + ) + .await + .map_err(anyhow::Error::from)?; + assert!(sparse_layers.is_empty()); + + self.upload_new_image_layers(dense_layers)?; } Err(err) => { // no partitioning? This is normal, if the timeline was just created @@ -500,7 +520,7 @@ impl Timeline { writer .take() .unwrap() - .finish(prev_key.unwrap().next(), self) + .finish(prev_key.unwrap().next(), self, ctx) .await?, ); writer = None; @@ -542,7 +562,11 @@ impl Timeline { ); } - writer.as_mut().unwrap().put_value(key, lsn, value).await?; + writer + .as_mut() + .unwrap() + .put_value(key, lsn, value, ctx) + .await?; } else { debug!( "Dropping key {} during compaction (it belongs on shard {:?})", @@ -558,7 +582,7 @@ impl Timeline { prev_key = Some(key); } if let Some(writer) = writer { - new_layers.push(writer.finish(prev_key.unwrap().next(), self).await?); + new_layers.push(writer.finish(prev_key.unwrap().next(), self, ctx).await?); } // Sync layers @@ -758,8 +782,9 @@ impl Timeline { return Err(CompactionError::ShuttingDown); } - let keyspace = self.collect_keyspace(end_lsn, ctx).await?; - let mut adaptor = TimelineAdaptor::new(self, (end_lsn, keyspace)); + let (dense_ks, _sparse_ks) = self.collect_keyspace(end_lsn, ctx).await?; + // TODO(chi): ignore sparse_keyspace for now, compact it in the future. 
+ let mut adaptor = TimelineAdaptor::new(self, (end_lsn, dense_ks)); pageserver_compaction::compact_tiered::compact_tiered( &mut adaptor, @@ -831,6 +856,10 @@ impl CompactionJobExecutor for TimelineAdaptor { type RequestContext = crate::context::RequestContext; + fn get_shard_identity(&self) -> &ShardIdentity { + self.timeline.get_shard_identity() + } + async fn get_layers( &mut self, key_range: &Range, @@ -947,7 +976,7 @@ impl CompactionJobExecutor for TimelineAdaptor { let value = val.load(ctx).await?; - writer.put_value(key, lsn, value).await?; + writer.put_value(key, lsn, value, ctx).await?; prev = Some((key, lsn)); } @@ -963,7 +992,7 @@ impl CompactionJobExecutor for TimelineAdaptor { }); let new_delta_layer = writer - .finish(prev.unwrap().0.next(), &self.timeline) + .finish(prev.unwrap().0.next(), &self.timeline, ctx) .await?; self.new_deltas.push(new_delta_layer); @@ -1033,11 +1062,11 @@ impl TimelineAdaptor { } } }; - image_layer_writer.put_image(key, img).await?; + image_layer_writer.put_image(key, img, ctx).await?; key = key.next(); } } - let image_layer = image_layer_writer.finish(&self.timeline).await?; + let image_layer = image_layer_writer.finish(&self.timeline, ctx).await?; self.new_images.push(image_layer); diff --git a/pageserver/src/tenant/timeline/eviction_task.rs b/pageserver/src/tenant/timeline/eviction_task.rs index 522c5b57de..3567761b9a 100644 --- a/pageserver/src/tenant/timeline/eviction_task.rs +++ b/pageserver/src/tenant/timeline/eviction_task.rs @@ -188,24 +188,10 @@ impl Timeline { ) -> ControlFlow<()> { let now = SystemTime::now(); - let acquire_permit = crate::tenant::tasks::concurrent_background_tasks_rate_limit_permit( - BackgroundLoopKind::Eviction, - ctx, - ); + let permit = self.acquire_imitation_permit(cancel, ctx).await?; - let _permit = tokio::select! 
{ - permit = acquire_permit => permit, - _ = cancel.cancelled() => return ControlFlow::Break(()), - _ = self.cancel.cancelled() => return ControlFlow::Break(()), - }; - - match self - .imitate_layer_accesses(tenant, p, cancel, gate, ctx) - .await - { - ControlFlow::Break(()) => return ControlFlow::Break(()), - ControlFlow::Continue(()) => (), - } + self.imitate_layer_accesses(tenant, p, cancel, gate, permit, ctx) + .await?; #[derive(Debug, Default)] struct EvictionStats { @@ -330,19 +316,27 @@ impl Timeline { gate: &GateGuard, ctx: &RequestContext, ) -> ControlFlow<()> { + let permit = self.acquire_imitation_permit(cancel, ctx).await?; + + self.imitate_layer_accesses(tenant, p, cancel, gate, permit, ctx) + .await + } + + async fn acquire_imitation_permit( + &self, + cancel: &CancellationToken, + ctx: &RequestContext, + ) -> ControlFlow<(), tokio::sync::SemaphorePermit<'static>> { let acquire_permit = crate::tenant::tasks::concurrent_background_tasks_rate_limit_permit( BackgroundLoopKind::Eviction, ctx, ); - let _permit = tokio::select! { - permit = acquire_permit => permit, - _ = cancel.cancelled() => return ControlFlow::Break(()), - _ = self.cancel.cancelled() => return ControlFlow::Break(()), - }; - - self.imitate_layer_accesses(tenant, p, cancel, gate, ctx) - .await + tokio::select! 
{ + permit = acquire_permit => ControlFlow::Continue(permit), + _ = cancel.cancelled() => ControlFlow::Break(()), + _ = self.cancel.cancelled() => ControlFlow::Break(()), + } } /// If we evict layers but keep cached values derived from those layers, then @@ -376,9 +370,10 @@ impl Timeline { p: &EvictionPolicyLayerAccessThreshold, cancel: &CancellationToken, gate: &GateGuard, + permit: tokio::sync::SemaphorePermit<'static>, ctx: &RequestContext, ) -> ControlFlow<()> { - if !self.tenant_shard_id.is_zero() { + if !self.tenant_shard_id.is_shard_zero() { // Shards !=0 do not maintain accurate relation sizes, and do not need to calculate logical size // for consumption metrics (consumption metrics are only sent from shard 0). We may therefore // skip imitating logical size accesses for eviction purposes. @@ -408,7 +403,28 @@ impl Timeline { // Make one of the tenant's timelines draw the short straw and run the calculation. // The others wait until the calculation is done so that they take into account the // imitated accesses that the winner made. - let mut state = tenant.eviction_task_tenant_state.lock().await; + let (mut state, _permit) = { + if let Ok(locked) = tenant.eviction_task_tenant_state.try_lock() { + (locked, permit) + } else { + // we might need to wait for a long time here in case of pathological synthetic + // size calculation performance + drop(permit); + let locked = tokio::select! { + locked = tenant.eviction_task_tenant_state.lock() => locked, + _ = self.cancel.cancelled() => { + return ControlFlow::Break(()) + }, + _ = cancel.cancelled() => { + return ControlFlow::Break(()) + } + }; + // then reacquire -- this will be bad if there is a lot of traffic, but because we + // released the permit, the overall latency will be much better. 
+ let permit = self.acquire_imitation_permit(cancel, ctx).await?; + (locked, permit) + } + }; match state.last_layer_access_imitation { Some(ts) if ts.elapsed() < inter_imitate_period => { /* no need to run */ } _ => { diff --git a/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs b/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs index dae31934ad..991e4ac045 100644 --- a/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs +++ b/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs @@ -22,10 +22,12 @@ use crate::tenant::{debug_assert_current_span_has_tenant_and_timeline_id, Timeli use anyhow::Context; use chrono::{NaiveDateTime, Utc}; use pageserver_api::models::TimelineState; -use storage_broker::proto::subscribe_safekeeper_info_request::SubscriptionKey; -use storage_broker::proto::SafekeeperTimelineInfo; -use storage_broker::proto::SubscribeSafekeeperInfoRequest; + use storage_broker::proto::TenantTimelineId as ProtoTenantTimelineId; +use storage_broker::proto::{ + FilterTenantTimelineId, MessageType, SafekeeperDiscoveryRequest, SafekeeperDiscoveryResponse, + SubscribeByFilterRequest, TypeSubscription, TypedMessage, +}; use storage_broker::{BrokerClientChannel, Code, Streaming}; use tokio_util::sync::CancellationToken; use tracing::*; @@ -89,6 +91,14 @@ pub(super) async fn connection_manager_loop_step( .timeline .subscribe_for_state_updates(); + let mut wait_lsn_status = connection_manager_state + .timeline + .subscribe_for_wait_lsn_updates(); + + // TODO: create a separate config option for discovery request interval + let discovery_request_interval = connection_manager_state.conf.lagging_wal_timeout; + let mut last_discovery_ts: Option = None; + // Subscribe to the broker updates. Stream shares underlying TCP connection // with other streams on this client (other connection managers). When // object goes out of scope, stream finishes in drop() automatically. 
@@ -97,10 +107,12 @@ pub(super) async fn connection_manager_loop_step( loop { let time_until_next_retry = connection_manager_state.time_until_next_retry(); + let any_activity = connection_manager_state.wal_connection.is_some() + || !connection_manager_state.wal_stream_candidates.is_empty(); // These things are happening concurrently: // - // - cancellation request + // - cancellation request // - keep receiving WAL on the current connection // - if the shared state says we need to change connection, disconnect and return // - this runs in a separate task and we receive updates via a watch channel @@ -108,6 +120,7 @@ pub(super) async fn connection_manager_loop_step( // - receive updates from broker // - this might change the current desired connection // - timeline state changes to something that does not allow walreceiver to run concurrently + // - if there's no connection and no candidates, try to send a discovery request // NB: make sure each of the select expressions are cancellation-safe // (no need for arms to be cancellation-safe). @@ -214,6 +227,65 @@ pub(super) async fn connection_manager_loop_step( } } } => debug!("Waking up for the next retry after waiting for {time_until_next_retry:?}"), + + Some(()) = async { + // Reminder: this match arm needs to be cancellation-safe. + // Calculating time needed to wait until sending the next discovery request. + // Current implementation is conservative and sends discovery requests only when there are no candidates. + + if any_activity { + // No need to send discovery requests if there is an active connection or candidates. + return None; + } + + // Waiting for an active wait_lsn request. + while wait_lsn_status.borrow().is_none() { + if wait_lsn_status.changed().await.is_err() { + // wait_lsn_status channel was closed, exiting + warn!("wait_lsn_status channel was closed in connection_manager_loop_step"); + return None; + } + } + + // All preconditions met, preparing to send a discovery request. 
+ let now = std::time::Instant::now(); + let next_discovery_ts = last_discovery_ts + .map(|ts| ts + discovery_request_interval) + .unwrap_or_else(|| now); + + if next_discovery_ts > now { + // Prevent sending discovery requests too frequently. + tokio::time::sleep(next_discovery_ts - now).await; + } + + let tenant_timeline_id = Some(ProtoTenantTimelineId { + tenant_id: id.tenant_id.as_ref().to_owned(), + timeline_id: id.timeline_id.as_ref().to_owned(), + }); + let request = SafekeeperDiscoveryRequest { tenant_timeline_id }; + let msg = TypedMessage { + r#type: MessageType::SafekeeperDiscoveryRequest as i32, + safekeeper_timeline_info: None, + safekeeper_discovery_request: Some(request), + safekeeper_discovery_response: None, + }; + + last_discovery_ts = Some(std::time::Instant::now()); + debug!("No active connection and no candidates, sending discovery request to the broker"); + + // Cancellation safety: we want to send a message to the broker, but publish_one() + // function can get cancelled by the other select! arm. This is absolutely fine, because + // we just want to receive broker updates and discovery is not important if we already + // receive updates. + // + // It is possible that `last_discovery_ts` will be updated, but the message will not be sent. + // This is totally fine because of the reason above. 
+ + // This is a fire-and-forget request, we don't care about the response + let _ = broker_client.publish_one(msg).await; + debug!("Discovery request sent to the broker"); + None + } => {} } if let Some(new_candidate) = connection_manager_state.next_connection_candidate() { @@ -231,7 +303,7 @@ async fn subscribe_for_timeline_updates( broker_client: &mut BrokerClientChannel, id: TenantTimelineId, cancel: &CancellationToken, -) -> Result, Cancelled> { +) -> Result, Cancelled> { let mut attempt = 0; loop { exponential_backoff( @@ -244,17 +316,27 @@ async fn subscribe_for_timeline_updates( attempt += 1; // subscribe to the specific timeline - let key = SubscriptionKey::TenantTimelineId(ProtoTenantTimelineId { - tenant_id: id.tenant_id.as_ref().to_owned(), - timeline_id: id.timeline_id.as_ref().to_owned(), - }); - let request = SubscribeSafekeeperInfoRequest { - subscription_key: Some(key), + let request = SubscribeByFilterRequest { + types: vec![ + TypeSubscription { + r#type: MessageType::SafekeeperTimelineInfo as i32, + }, + TypeSubscription { + r#type: MessageType::SafekeeperDiscoveryResponse as i32, + }, + ], + tenant_timeline_id: Some(FilterTenantTimelineId { + enabled: true, + tenant_timeline_id: Some(ProtoTenantTimelineId { + tenant_id: id.tenant_id.as_ref().to_owned(), + timeline_id: id.timeline_id.as_ref().to_owned(), + }), + }), }; match { tokio::select! { - r = broker_client.subscribe_safekeeper_info(request) => { r } + r = broker_client.subscribe_by_filter(request) => { r } _ = cancel.cancelled() => { return Err(Cancelled); } } } { @@ -398,7 +480,7 @@ struct RetryInfo { /// Data about the timeline to connect to, received from the broker. #[derive(Debug, Clone)] struct BrokerSkTimeline { - timeline: SafekeeperTimelineInfo, + timeline: SafekeeperDiscoveryResponse, /// Time at which the data was fetched from the broker last time, to track the stale data. 
latest_update: NaiveDateTime, } @@ -606,7 +688,41 @@ impl ConnectionManagerState { } /// Adds another broker timeline into the state, if its more recent than the one already added there for the same key. - fn register_timeline_update(&mut self, timeline_update: SafekeeperTimelineInfo) { + fn register_timeline_update(&mut self, typed_msg: TypedMessage) { + let mut is_discovery = false; + let timeline_update = match typed_msg.r#type() { + MessageType::SafekeeperTimelineInfo => { + let info = match typed_msg.safekeeper_timeline_info { + Some(info) => info, + None => { + warn!("bad proto message from broker: no safekeeper_timeline_info"); + return; + } + }; + SafekeeperDiscoveryResponse { + safekeeper_id: info.safekeeper_id, + tenant_timeline_id: info.tenant_timeline_id, + commit_lsn: info.commit_lsn, + safekeeper_connstr: info.safekeeper_connstr, + availability_zone: info.availability_zone, + } + } + MessageType::SafekeeperDiscoveryResponse => { + is_discovery = true; + match typed_msg.safekeeper_discovery_response { + Some(response) => response, + None => { + warn!("bad proto message from broker: no safekeeper_discovery_response"); + return; + } + } + } + _ => { + // unexpected message + return; + } + }; + WALRECEIVER_BROKER_UPDATES.inc(); let new_safekeeper_id = NodeId(timeline_update.safekeeper_id); @@ -619,7 +735,11 @@ impl ConnectionManagerState { ); if old_entry.is_none() { - info!("New SK node was added: {new_safekeeper_id}"); + info!( + ?is_discovery, + %new_safekeeper_id, + "New SK node was added", + ); WALRECEIVER_CANDIDATES_ADDED.inc(); } } @@ -818,7 +938,7 @@ impl ConnectionManagerState { fn select_connection_candidate( &self, node_to_omit: Option, - ) -> Option<(NodeId, &SafekeeperTimelineInfo, PgConnectionConfig)> { + ) -> Option<(NodeId, &SafekeeperDiscoveryResponse, PgConnectionConfig)> { self.applicable_connection_candidates() .filter(|&(sk_id, _, _)| Some(sk_id) != node_to_omit) .max_by_key(|(_, info, _)| info.commit_lsn) @@ -828,7 +948,7 @@ impl 
ConnectionManagerState { /// Some safekeepers are filtered by the retry cooldown. fn applicable_connection_candidates( &self, - ) -> impl Iterator { + ) -> impl Iterator { let now = Utc::now().naive_utc(); self.wal_stream_candidates @@ -968,19 +1088,11 @@ mod tests { latest_update: NaiveDateTime, ) -> BrokerSkTimeline { BrokerSkTimeline { - timeline: SafekeeperTimelineInfo { + timeline: SafekeeperDiscoveryResponse { safekeeper_id: 0, tenant_timeline_id: None, - term: 0, - last_log_term: 0, - flush_lsn: 0, commit_lsn, - backup_lsn: 0, - remote_consistent_lsn: 0, - peer_horizon_lsn: 0, - local_start_lsn: 0, safekeeper_connstr: safekeeper_connstr.to_owned(), - http_connstr: safekeeper_connstr.to_owned(), availability_zone: None, }, latest_update, @@ -1423,7 +1535,7 @@ mod tests { let harness = TenantHarness::create("switch_to_same_availability_zone")?; let mut state = dummy_state(&harness).await; - state.conf.availability_zone = test_az.clone(); + state.conf.availability_zone.clone_from(&test_az); let current_lsn = Lsn(100_000).align(); let now = Utc::now().naive_utc(); @@ -1456,7 +1568,7 @@ mod tests { // We have another safekeeper with the same commit_lsn, and it have the same availability zone as // the current pageserver. let mut same_az_sk = dummy_broker_sk_timeline(current_lsn.0, "same_az", now); - same_az_sk.timeline.availability_zone = test_az.clone(); + same_az_sk.timeline.availability_zone.clone_from(&test_az); state.wal_stream_candidates = HashMap::from([ ( diff --git a/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs b/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs index 3f3419e886..c6ee6b90c4 100644 --- a/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs +++ b/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs @@ -427,7 +427,7 @@ pub(super) async fn handle_walreceiver_connection( // Send the replication feedback message. 
// Regular standby_status_update fields are put into this message. - let current_timeline_size = if timeline.tenant_shard_id.is_zero() { + let current_timeline_size = if timeline.tenant_shard_id.is_shard_zero() { timeline .get_current_logical_size( crate::tenant::timeline::GetLogicalSizePriority::User, diff --git a/pageserver/src/tenant/vectored_blob_io.rs b/pageserver/src/tenant/vectored_blob_io.rs index 3a6950cf88..91934d5e0e 100644 --- a/pageserver/src/tenant/vectored_blob_io.rs +++ b/pageserver/src/tenant/vectored_blob_io.rs @@ -61,18 +61,18 @@ pub struct VectoredRead { } impl VectoredRead { - pub fn size(&self) -> usize { + pub(crate) fn size(&self) -> usize { (self.end - self.start) as usize } } #[derive(Eq, PartialEq)] -enum VectoredReadExtended { +pub(crate) enum VectoredReadExtended { Yes, No, } -struct VectoredReadBuilder { +pub(crate) struct VectoredReadBuilder { start: u64, end: u64, blobs_at: VecMap, @@ -80,7 +80,17 @@ struct VectoredReadBuilder { } impl VectoredReadBuilder { - fn new(start_offset: u64, end_offset: u64, meta: BlobMeta, max_read_size: usize) -> Self { + /// Start building a new vectored read. + /// + /// Note that by design, this does not check against reading more than `max_read_size` to + /// support reading larger blobs than the configuration value. The builder will be single use + /// however after that. 
+ pub(crate) fn new( + start_offset: u64, + end_offset: u64, + meta: BlobMeta, + max_read_size: usize, + ) -> Self { let mut blobs_at = VecMap::default(); blobs_at .append(start_offset, meta) @@ -97,7 +107,8 @@ impl VectoredReadBuilder { /// Attempt to extend the current read with a new blob if the start /// offset matches with the current end of the vectored read /// and the resuting size is below the max read size - fn extend(&mut self, start: u64, end: u64, meta: BlobMeta) -> VectoredReadExtended { + pub(crate) fn extend(&mut self, start: u64, end: u64, meta: BlobMeta) -> VectoredReadExtended { + tracing::trace!(start, end, "trying to extend"); let size = (end - start) as usize; if self.end == start && self.size() + size <= self.max_read_size { self.end = end; @@ -111,11 +122,11 @@ impl VectoredReadBuilder { VectoredReadExtended::No } - fn size(&self) -> usize { + pub(crate) fn size(&self) -> usize { (self.end - self.start) as usize } - fn build(self) -> VectoredRead { + pub(crate) fn build(self) -> VectoredRead { VectoredRead { start: self.start, end: self.end, diff --git a/pageserver/src/virtual_file.rs b/pageserver/src/virtual_file.rs index 0cf6a0019b..a17488a286 100644 --- a/pageserver/src/virtual_file.rs +++ b/pageserver/src/virtual_file.rs @@ -10,6 +10,7 @@ //! This is similar to PostgreSQL's virtual file descriptor facility in //! src/backend/storage/file/fd.c //! +use crate::context::RequestContext; use crate::metrics::{StorageIoOperation, STORAGE_IO_SIZE, STORAGE_IO_TIME_METRIC}; use crate::page_cache::PageWriteGuard; @@ -32,11 +33,11 @@ pub use io_engine::feature_test as io_engine_feature_test; pub use io_engine::FeatureTestResult as IoEngineFeatureTestResult; mod metadata; mod open_options; +use self::owned_buffers_io::write::OwnedAsyncWriter; pub(crate) use io_engine::IoEngineKind; pub(crate) use metadata::Metadata; pub(crate) use open_options::*; -#[cfg_attr(not(target_os = "linux"), allow(dead_code))] pub(crate) mod owned_buffers_io { //! 
Abstractions for IO with owned buffers. //! @@ -615,6 +616,7 @@ impl VirtualFile { &self, buf: B, mut offset: u64, + ctx: &RequestContext, ) -> (B::Buf, Result<(), Error>) { let buf_len = buf.bytes_init(); if buf_len == 0 { @@ -623,7 +625,7 @@ impl VirtualFile { let mut buf = buf.slice(0..buf_len); while !buf.is_empty() { let res; - (buf, res) = self.write_at(buf, offset).await; + (buf, res) = self.write_at(buf, offset, ctx).await; match res { Ok(0) => { return ( @@ -652,6 +654,7 @@ impl VirtualFile { pub async fn write_all, Buf: IoBuf + Send>( &mut self, buf: B, + ctx: &RequestContext, ) -> (B::Buf, Result) { let nbytes = buf.bytes_init(); if nbytes == 0 { @@ -660,7 +663,7 @@ impl VirtualFile { let mut buf = buf.slice(0..nbytes); while !buf.is_empty() { let res; - (buf, res) = self.write(buf).await; + (buf, res) = self.write(buf, ctx).await; match res { Ok(0) => { return ( @@ -684,9 +687,10 @@ impl VirtualFile { async fn write( &mut self, buf: Slice, + ctx: &RequestContext, ) -> (Slice, Result) { let pos = self.pos; - let (buf, res) = self.write_at(buf, pos).await; + let (buf, res) = self.write_at(buf, pos, ctx).await; let n = match res { Ok(n) => n, Err(e) => return (buf, Err(e)), @@ -724,6 +728,7 @@ impl VirtualFile { &self, buf: Slice, offset: u64, + _ctx: &RequestContext, /* TODO: use for metrics: https://github.com/neondatabase/neon/issues/6107 */ ) -> (Slice, Result) { let file_guard = match self.lock_file().await { Ok(file_guard) => file_guard, @@ -1083,6 +1088,18 @@ impl Drop for VirtualFile { } } +impl OwnedAsyncWriter for VirtualFile { + #[inline(always)] + async fn write_all, Buf: IoBuf + Send>( + &mut self, + buf: B, + ctx: &RequestContext, + ) -> std::io::Result<(usize, B::Buf)> { + let (buf, res) = VirtualFile::write_all(self, buf, ctx).await; + res.map(move |v| (v, buf)) + } +} + impl OpenFiles { fn new(num_slots: usize) -> OpenFiles { let mut slots = Box::new(Vec::with_capacity(num_slots)); @@ -1135,6 +1152,9 @@ fn get_open_files() -> &'static 
OpenFiles { #[cfg(test)] mod tests { + use crate::context::DownloadBehavior; + use crate::task_mgr::TaskKind; + use super::*; use rand::seq::SliceRandom; use rand::thread_rng; @@ -1166,10 +1186,11 @@ mod tests { &self, buf: B, offset: u64, + ctx: &RequestContext, ) -> Result<(), Error> { match self { MaybeVirtualFile::VirtualFile(file) => { - let (_buf, res) = file.write_all_at(buf, offset).await; + let (_buf, res) = file.write_all_at(buf, offset, ctx).await; res } MaybeVirtualFile::File(file) => { @@ -1190,10 +1211,11 @@ mod tests { async fn write_all, Buf: IoBuf + Send>( &mut self, buf: B, + ctx: &RequestContext, ) -> Result<(), Error> { match self { MaybeVirtualFile::VirtualFile(file) => { - let (_buf, res) = file.write_all(buf).await; + let (_buf, res) = file.write_all(buf, ctx).await; res.map(|_| ()) } MaybeVirtualFile::File(file) => { @@ -1264,6 +1286,7 @@ mod tests { OF: Fn(Utf8PathBuf, OpenOptions) -> FT, FT: Future>, { + let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error); let testdir = crate::config::PageServerConf::test_repo_dir(testname); std::fs::create_dir_all(&testdir)?; @@ -1277,7 +1300,7 @@ mod tests { .to_owned(), ) .await?; - file_a.write_all(b"foobar".to_vec()).await?; + file_a.write_all(b"foobar".to_vec(), &ctx).await?; // cannot read from a file opened in write-only mode let _ = file_a.read_string().await.unwrap_err(); @@ -1286,7 +1309,7 @@ mod tests { let mut file_a = openfunc(path_a, OpenOptions::new().read(true).to_owned()).await?; // cannot write to a file opened in read-only mode - let _ = file_a.write_all(b"bar".to_vec()).await.unwrap_err(); + let _ = file_a.write_all(b"bar".to_vec(), &ctx).await.unwrap_err(); // Try simple read assert_eq!("foobar", file_a.read_string().await?); @@ -1328,8 +1351,8 @@ mod tests { .to_owned(), ) .await?; - file_b.write_all_at(b"BAR".to_vec(), 3).await?; - file_b.write_all_at(b"FOO".to_vec(), 0).await?; + file_b.write_all_at(b"BAR".to_vec(), 3, &ctx).await?; + 
file_b.write_all_at(b"FOO".to_vec(), 0, &ctx).await?; assert_eq!(file_b.read_string_at(2, 3).await?, "OBA"); diff --git a/pageserver/src/virtual_file/owned_buffers_io/util/size_tracking_writer.rs b/pageserver/src/virtual_file/owned_buffers_io/util/size_tracking_writer.rs index 7505b7487e..55b1d0b46b 100644 --- a/pageserver/src/virtual_file/owned_buffers_io/util/size_tracking_writer.rs +++ b/pageserver/src/virtual_file/owned_buffers_io/util/size_tracking_writer.rs @@ -1,33 +1,46 @@ -use crate::virtual_file::{owned_buffers_io::write::OwnedAsyncWriter, VirtualFile}; +use crate::{context::RequestContext, virtual_file::owned_buffers_io::write::OwnedAsyncWriter}; use tokio_epoll_uring::{BoundedBuf, IoBuf}; -pub struct Writer { - dst: VirtualFile, +pub struct Writer { + dst: W, bytes_amount: u64, } -impl Writer { - pub fn new(dst: VirtualFile) -> Self { +impl Writer { + pub fn new(dst: W) -> Self { Self { dst, bytes_amount: 0, } } + + pub fn bytes_written(&self) -> u64 { + self.bytes_amount + } + + pub fn as_inner(&self) -> &W { + &self.dst + } + /// Returns the wrapped `VirtualFile` object as well as the number /// of bytes that were written to it through this object. 
- pub fn into_inner(self) -> (u64, VirtualFile) { + #[cfg_attr(target_os = "macos", allow(dead_code))] + pub fn into_inner(self) -> (u64, W) { (self.bytes_amount, self.dst) } } -impl OwnedAsyncWriter for Writer { +impl OwnedAsyncWriter for Writer +where + W: OwnedAsyncWriter, +{ #[inline(always)] async fn write_all, Buf: IoBuf + Send>( &mut self, buf: B, + ctx: &RequestContext, ) -> std::io::Result<(usize, B::Buf)> { - let (buf, res) = self.dst.write_all(buf).await; - let nwritten = res?; + let (nwritten, buf) = self.dst.write_all(buf, ctx).await?; self.bytes_amount += u64::try_from(nwritten).unwrap(); Ok((nwritten, buf)) } diff --git a/pageserver/src/virtual_file/owned_buffers_io/write.rs b/pageserver/src/virtual_file/owned_buffers_io/write.rs index f1812d9b51..885a9221c5 100644 --- a/pageserver/src/virtual_file/owned_buffers_io/write.rs +++ b/pageserver/src/virtual_file/owned_buffers_io/write.rs @@ -1,23 +1,26 @@ use bytes::BytesMut; use tokio_epoll_uring::{BoundedBuf, IoBuf, Slice}; +use crate::context::RequestContext; + /// A trait for doing owned-buffer write IO. /// Think [`tokio::io::AsyncWrite`] but with owned buffers. pub trait OwnedAsyncWriter { async fn write_all, Buf: IoBuf + Send>( &mut self, buf: B, + ctx: &RequestContext, ) -> std::io::Result<(usize, B::Buf)>; } -/// A wrapper aorund an [`OwnedAsyncWriter`] that batches smaller writers -/// into `BUFFER_SIZE`-sized writes. +/// A wrapper aorund an [`OwnedAsyncWriter`] that uses a [`Buffer`] to batch +/// small writes into larger writes of size [`Buffer::cap`]. /// /// # Passthrough Of Large Writers /// -/// Buffered writes larger than the `BUFFER_SIZE` cause the internal -/// buffer to be flushed, even if it is not full yet. Then, the large -/// buffered write is passed through to the unerlying [`OwnedAsyncWriter`]. 
+/// Calls to [`BufferedWriter::write_buffered`] that are larger than [`Buffer::cap`] +/// cause the internal buffer to be flushed prematurely so that the large +/// buffered write is passed through to the underlying [`OwnedAsyncWriter`]. /// /// This pass-through is generally beneficial for throughput, but if /// the storage backend of the [`OwnedAsyncWriter`] is a shared resource, @@ -25,93 +28,194 @@ pub trait OwnedAsyncWriter { /// /// In such cases, a different implementation that always buffers in memory /// may be preferable. -pub struct BufferedWriter { +pub struct BufferedWriter { writer: W, - // invariant: always remains Some(buf) - // with buf.capacity() == BUFFER_SIZE except - // - while IO is ongoing => goes back to Some() once the IO completed successfully - // - after an IO error => stays `None` forever - // In these exceptional cases, it's `None`. - buf: Option, + /// invariant: always remains Some(buf) except + /// - while IO is ongoing => goes back to Some() once the IO completed successfully + /// - after an IO error => stays `None` forever + /// In these exceptional cases, it's `None`. 
+ buf: Option, } -impl BufferedWriter +impl BufferedWriter where + B: Buffer + Send, + Buf: IoBuf + Send, W: OwnedAsyncWriter, { - pub fn new(writer: W) -> Self { + pub fn new(writer: W, buf: B) -> Self { Self { writer, - buf: Some(BytesMut::with_capacity(BUFFER_SIZE)), + buf: Some(buf), } } - pub async fn flush_and_into_inner(mut self) -> std::io::Result { - self.flush().await?; + pub fn as_inner(&self) -> &W { + &self.writer + } + + /// Panics if used after any of the write paths returned an error + pub fn inspect_buffer(&self) -> &B { + self.buf() + } + + #[cfg_attr(target_os = "macos", allow(dead_code))] + pub async fn flush_and_into_inner(mut self, ctx: &RequestContext) -> std::io::Result { + self.flush(ctx).await?; + let Self { buf, writer } = self; assert!(buf.is_some()); Ok(writer) } - pub async fn write_buffered(&mut self, chunk: Slice) -> std::io::Result<()> - where - B: IoBuf + Send, - { + #[inline(always)] + fn buf(&self) -> &B { + self.buf + .as_ref() + .expect("must not use after we returned an error") + } + + #[cfg_attr(target_os = "macos", allow(dead_code))] + pub async fn write_buffered( + &mut self, + chunk: Slice, + ctx: &RequestContext, + ) -> std::io::Result<(usize, S)> { + let chunk_len = chunk.len(); // avoid memcpy for the middle of the chunk - if chunk.len() >= BUFFER_SIZE { - self.flush().await?; + if chunk.len() >= self.buf().cap() { + self.flush(ctx).await?; // do a big write, bypassing `buf` assert_eq!( self.buf .as_ref() .expect("must not use after an error") - .len(), + .pending(), 0 ); - let chunk_len = chunk.len(); - let (nwritten, chunk) = self.writer.write_all(chunk).await?; + let (nwritten, chunk) = self.writer.write_all(chunk, ctx).await?; assert_eq!(nwritten, chunk_len); - drop(chunk); - return Ok(()); + return Ok((nwritten, chunk)); } // in-memory copy the < BUFFER_SIZED tail of the chunk - assert!(chunk.len() < BUFFER_SIZE); - let mut chunk = &chunk[..]; + assert!(chunk.len() < self.buf().cap()); + let mut slice = &chunk[..]; 
+ while !slice.is_empty() { + let buf = self.buf.as_mut().expect("must not use after an error"); + let need = buf.cap() - buf.pending(); + let have = slice.len(); + let n = std::cmp::min(need, have); + buf.extend_from_slice(&slice[..n]); + slice = &slice[n..]; + if buf.pending() >= buf.cap() { + assert_eq!(buf.pending(), buf.cap()); + self.flush(ctx).await?; + } + } + assert!(slice.is_empty(), "by now we should have drained the chunk"); + Ok((chunk_len, chunk.into_inner())) + } + + /// Strictly less performant variant of [`Self::write_buffered`] that allows writing borrowed data. + /// + /// It is less performant because we always have to copy the borrowed data into the internal buffer + /// before we can do the IO. The [`Self::write_buffered`] can avoid this, which is more performant + /// for large writes. + pub async fn write_buffered_borrowed( + &mut self, + mut chunk: &[u8], + ctx: &RequestContext, + ) -> std::io::Result { + let chunk_len = chunk.len(); while !chunk.is_empty() { let buf = self.buf.as_mut().expect("must not use after an error"); - let need = BUFFER_SIZE - buf.len(); + let need = buf.cap() - buf.pending(); let have = chunk.len(); let n = std::cmp::min(need, have); buf.extend_from_slice(&chunk[..n]); chunk = &chunk[n..]; - if buf.len() >= BUFFER_SIZE { - assert_eq!(buf.len(), BUFFER_SIZE); - self.flush().await?; + if buf.pending() >= buf.cap() { + assert_eq!(buf.pending(), buf.cap()); + self.flush(ctx).await?; } } - assert!(chunk.is_empty(), "by now we should have drained the chunk"); - Ok(()) + Ok(chunk_len) } - async fn flush(&mut self) -> std::io::Result<()> { + async fn flush(&mut self, ctx: &RequestContext) -> std::io::Result<()> { let buf = self.buf.take().expect("must not use after an error"); - if buf.is_empty() { + let buf_len = buf.pending(); + if buf_len == 0 { self.buf = Some(buf); - return std::io::Result::Ok(()); + return Ok(()); } - let buf_len = buf.len(); - let (nwritten, mut buf) = self.writer.write_all(buf).await?; + let 
(nwritten, io_buf) = self.writer.write_all(buf.flush(), ctx).await?; assert_eq!(nwritten, buf_len); - buf.clear(); - self.buf = Some(buf); + self.buf = Some(Buffer::reuse_after_flush(io_buf)); Ok(()) } } +/// A [`Buffer`] is used by [`BufferedWriter`] to batch smaller writes into larger ones. +pub trait Buffer { + type IoBuf: IoBuf; + + /// Capacity of the buffer. Must not change over the lifetime `self`.` + fn cap(&self) -> usize; + + /// Add data to the buffer. + /// Panics if there is not enough room to accomodate `other`'s content, i.e., + /// panics if `other.len() > self.cap() - self.pending()`. + fn extend_from_slice(&mut self, other: &[u8]); + + /// Number of bytes in the buffer. + fn pending(&self) -> usize; + + /// Turns `self` into a [`tokio_epoll_uring::Slice`] of the pending data + /// so we can use [`tokio_epoll_uring`] to write it to disk. + fn flush(self) -> Slice; + + /// After the write to disk is done and we have gotten back the slice, + /// [`BufferedWriter`] uses this method to re-use the io buffer. 
+ fn reuse_after_flush(iobuf: Self::IoBuf) -> Self; +} + +impl Buffer for BytesMut { + type IoBuf = BytesMut; + + #[inline(always)] + fn cap(&self) -> usize { + self.capacity() + } + + fn extend_from_slice(&mut self, other: &[u8]) { + BytesMut::extend_from_slice(self, other) + } + + #[inline(always)] + fn pending(&self) -> usize { + self.len() + } + + fn flush(self) -> Slice { + if self.is_empty() { + return self.slice_full(); + } + let len = self.len(); + self.slice(0..len) + } + + fn reuse_after_flush(mut iobuf: BytesMut) -> Self { + iobuf.clear(); + iobuf + } +} + impl OwnedAsyncWriter for Vec { async fn write_all, Buf: IoBuf + Send>( &mut self, buf: B, + _: &RequestContext, ) -> std::io::Result<(usize, B::Buf)> { let nbytes = buf.bytes_init(); if nbytes == 0 { @@ -125,7 +229,11 @@ impl OwnedAsyncWriter for Vec { #[cfg(test)] mod tests { + use bytes::BytesMut; + use super::*; + use crate::context::{DownloadBehavior, RequestContext}; + use crate::task_mgr::TaskKind; #[derive(Default)] struct RecorderWriter { @@ -135,6 +243,7 @@ mod tests { async fn write_all, Buf: IoBuf + Send>( &mut self, buf: B, + _: &RequestContext, ) -> std::io::Result<(usize, B::Buf)> { let nbytes = buf.bytes_init(); if nbytes == 0 { @@ -147,10 +256,14 @@ mod tests { } } + fn test_ctx() -> RequestContext { + RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error) + } + macro_rules! 
write { ($writer:ident, $data:literal) => {{ $writer - .write_buffered(::bytes::Bytes::from_static($data).slice_full()) + .write_buffered(::bytes::Bytes::from_static($data).slice_full(), &test_ctx()) .await?; }}; } @@ -158,13 +271,13 @@ mod tests { #[tokio::test] async fn test_buffered_writes_only() -> std::io::Result<()> { let recorder = RecorderWriter::default(); - let mut writer = BufferedWriter::<2, _>::new(recorder); + let mut writer = BufferedWriter::new(recorder, BytesMut::with_capacity(2)); write!(writer, b"a"); write!(writer, b"b"); write!(writer, b"c"); write!(writer, b"d"); write!(writer, b"e"); - let recorder = writer.flush_and_into_inner().await?; + let recorder = writer.flush_and_into_inner(&test_ctx()).await?; assert_eq!( recorder.writes, vec![Vec::from(b"ab"), Vec::from(b"cd"), Vec::from(b"e")] @@ -175,12 +288,12 @@ mod tests { #[tokio::test] async fn test_passthrough_writes_only() -> std::io::Result<()> { let recorder = RecorderWriter::default(); - let mut writer = BufferedWriter::<2, _>::new(recorder); + let mut writer = BufferedWriter::new(recorder, BytesMut::with_capacity(2)); write!(writer, b"abc"); write!(writer, b"de"); write!(writer, b""); write!(writer, b"fghijk"); - let recorder = writer.flush_and_into_inner().await?; + let recorder = writer.flush_and_into_inner(&test_ctx()).await?; assert_eq!( recorder.writes, vec![Vec::from(b"abc"), Vec::from(b"de"), Vec::from(b"fghijk")] @@ -191,16 +304,45 @@ mod tests { #[tokio::test] async fn test_passthrough_write_with_nonempty_buffer() -> std::io::Result<()> { let recorder = RecorderWriter::default(); - let mut writer = BufferedWriter::<2, _>::new(recorder); + let mut writer = BufferedWriter::new(recorder, BytesMut::with_capacity(2)); write!(writer, b"a"); write!(writer, b"bc"); write!(writer, b"d"); write!(writer, b"e"); - let recorder = writer.flush_and_into_inner().await?; + let recorder = writer.flush_and_into_inner(&test_ctx()).await?; assert_eq!( recorder.writes, vec![Vec::from(b"a"), 
Vec::from(b"bc"), Vec::from(b"de")] ); Ok(()) } + + #[tokio::test] + async fn test_write_all_borrowed_always_goes_through_buffer() -> std::io::Result<()> { + let ctx = test_ctx(); + let ctx = &ctx; + let recorder = RecorderWriter::default(); + let mut writer = BufferedWriter::new(recorder, BytesMut::with_capacity(2)); + + writer.write_buffered_borrowed(b"abc", ctx).await?; + writer.write_buffered_borrowed(b"d", ctx).await?; + writer.write_buffered_borrowed(b"e", ctx).await?; + writer.write_buffered_borrowed(b"fg", ctx).await?; + writer.write_buffered_borrowed(b"hi", ctx).await?; + writer.write_buffered_borrowed(b"j", ctx).await?; + writer.write_buffered_borrowed(b"klmno", ctx).await?; + + let recorder = writer.flush_and_into_inner(ctx).await?; + assert_eq!( + recorder.writes, + { + let expect: &[&[u8]] = &[b"ab", b"cd", b"ef", b"gh", b"ij", b"kl", b"mn", b"o"]; + expect + } + .iter() + .map(|v| v[..].to_vec()) + .collect::>() + ); + Ok(()) + } } diff --git a/pageserver/src/walingest.rs b/pageserver/src/walingest.rs index 9c7e8748d5..79f075b877 100644 --- a/pageserver/src/walingest.rs +++ b/pageserver/src/walingest.rs @@ -403,7 +403,7 @@ impl WalIngest { ); if !key_is_local { - if self.shard.is_zero() { + if self.shard.is_shard_zero() { // Shard 0 tracks relation sizes. Although we will not store this block, we will observe // its blkno in case it implicitly extends a relation. 
self.observe_decoded_block(modification, blk, ctx).await?; @@ -1034,7 +1034,7 @@ impl WalIngest { let nblocks = modification .tline - .get_rel_size(src_rel, Version::Modified(modification), true, ctx) + .get_rel_size(src_rel, Version::Modified(modification), ctx) .await?; let dst_rel = RelTag { spcnode: tablespace_id, @@ -1068,13 +1068,7 @@ impl WalIngest { let content = modification .tline - .get_rel_page_at_lsn( - src_rel, - blknum, - Version::Modified(modification), - true, - ctx, - ) + .get_rel_page_at_lsn(src_rel, blknum, Version::Modified(modification), ctx) .await?; modification.put_rel_page_image(dst_rel, blknum, content)?; num_blocks_copied += 1; @@ -1242,7 +1236,7 @@ impl WalIngest { }; if modification .tline - .get_rel_exists(rel, Version::Modified(modification), true, ctx) + .get_rel_exists(rel, Version::Modified(modification), ctx) .await? { self.put_rel_drop(modification, rel, ctx).await?; @@ -1541,7 +1535,7 @@ impl WalIngest { nblocks } else if !modification .tline - .get_rel_exists(rel, Version::Modified(modification), true, ctx) + .get_rel_exists(rel, Version::Modified(modification), ctx) .await? { // create it with 0 size initially, the logic below will extend it @@ -1553,7 +1547,7 @@ impl WalIngest { } else { modification .tline - .get_rel_size(rel, Version::Modified(modification), true, ctx) + .get_rel_size(rel, Version::Modified(modification), ctx) .await? }; @@ -1650,14 +1644,14 @@ async fn get_relsize( ) -> anyhow::Result { let nblocks = if !modification .tline - .get_rel_exists(rel, Version::Modified(modification), true, ctx) + .get_rel_exists(rel, Version::Modified(modification), ctx) .await? { 0 } else { modification .tline - .get_rel_size(rel, Version::Modified(modification), true, ctx) + .get_rel_size(rel, Version::Modified(modification), ctx) .await? }; Ok(nblocks) @@ -1732,29 +1726,29 @@ mod tests { // The relation was created at LSN 2, not visible at LSN 1 yet. 
assert_eq!( tline - .get_rel_exists(TESTREL_A, Version::Lsn(Lsn(0x10)), false, &ctx) + .get_rel_exists(TESTREL_A, Version::Lsn(Lsn(0x10)), &ctx) .await?, false ); assert!(tline - .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x10)), false, &ctx) + .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x10)), &ctx) .await .is_err()); assert_eq!( tline - .get_rel_exists(TESTREL_A, Version::Lsn(Lsn(0x20)), false, &ctx) + .get_rel_exists(TESTREL_A, Version::Lsn(Lsn(0x20)), &ctx) .await?, true ); assert_eq!( tline - .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x20)), false, &ctx) + .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x20)), &ctx) .await?, 1 ); assert_eq!( tline - .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x50)), false, &ctx) + .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x50)), &ctx) .await?, 3 ); @@ -1762,46 +1756,46 @@ mod tests { // Check page contents at each LSN assert_eq!( tline - .get_rel_page_at_lsn(TESTREL_A, 0, Version::Lsn(Lsn(0x20)), false, &ctx) + .get_rel_page_at_lsn(TESTREL_A, 0, Version::Lsn(Lsn(0x20)), &ctx) .await?, test_img("foo blk 0 at 2") ); assert_eq!( tline - .get_rel_page_at_lsn(TESTREL_A, 0, Version::Lsn(Lsn(0x30)), false, &ctx) + .get_rel_page_at_lsn(TESTREL_A, 0, Version::Lsn(Lsn(0x30)), &ctx) .await?, test_img("foo blk 0 at 3") ); assert_eq!( tline - .get_rel_page_at_lsn(TESTREL_A, 0, Version::Lsn(Lsn(0x40)), false, &ctx) + .get_rel_page_at_lsn(TESTREL_A, 0, Version::Lsn(Lsn(0x40)), &ctx) .await?, test_img("foo blk 0 at 3") ); assert_eq!( tline - .get_rel_page_at_lsn(TESTREL_A, 1, Version::Lsn(Lsn(0x40)), false, &ctx) + .get_rel_page_at_lsn(TESTREL_A, 1, Version::Lsn(Lsn(0x40)), &ctx) .await?, test_img("foo blk 1 at 4") ); assert_eq!( tline - .get_rel_page_at_lsn(TESTREL_A, 0, Version::Lsn(Lsn(0x50)), false, &ctx) + .get_rel_page_at_lsn(TESTREL_A, 0, Version::Lsn(Lsn(0x50)), &ctx) .await?, test_img("foo blk 0 at 3") ); assert_eq!( tline - .get_rel_page_at_lsn(TESTREL_A, 1, Version::Lsn(Lsn(0x50)), false, &ctx) + .get_rel_page_at_lsn(TESTREL_A, 1, 
Version::Lsn(Lsn(0x50)), &ctx) .await?, test_img("foo blk 1 at 4") ); assert_eq!( tline - .get_rel_page_at_lsn(TESTREL_A, 2, Version::Lsn(Lsn(0x50)), false, &ctx) + .get_rel_page_at_lsn(TESTREL_A, 2, Version::Lsn(Lsn(0x50)), &ctx) .await?, test_img("foo blk 2 at 5") ); @@ -1817,19 +1811,19 @@ mod tests { // Check reported size and contents after truncation assert_eq!( tline - .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x60)), false, &ctx) + .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x60)), &ctx) .await?, 2 ); assert_eq!( tline - .get_rel_page_at_lsn(TESTREL_A, 0, Version::Lsn(Lsn(0x60)), false, &ctx) + .get_rel_page_at_lsn(TESTREL_A, 0, Version::Lsn(Lsn(0x60)), &ctx) .await?, test_img("foo blk 0 at 3") ); assert_eq!( tline - .get_rel_page_at_lsn(TESTREL_A, 1, Version::Lsn(Lsn(0x60)), false, &ctx) + .get_rel_page_at_lsn(TESTREL_A, 1, Version::Lsn(Lsn(0x60)), &ctx) .await?, test_img("foo blk 1 at 4") ); @@ -1837,13 +1831,13 @@ mod tests { // should still see the truncated block with older LSN assert_eq!( tline - .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x50)), false, &ctx) + .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x50)), &ctx) .await?, 3 ); assert_eq!( tline - .get_rel_page_at_lsn(TESTREL_A, 2, Version::Lsn(Lsn(0x50)), false, &ctx) + .get_rel_page_at_lsn(TESTREL_A, 2, Version::Lsn(Lsn(0x50)), &ctx) .await?, test_img("foo blk 2 at 5") ); @@ -1856,7 +1850,7 @@ mod tests { m.commit(&ctx).await?; assert_eq!( tline - .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x68)), false, &ctx) + .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x68)), &ctx) .await?, 0 ); @@ -1869,19 +1863,19 @@ mod tests { m.commit(&ctx).await?; assert_eq!( tline - .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x70)), false, &ctx) + .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x70)), &ctx) .await?, 2 ); assert_eq!( tline - .get_rel_page_at_lsn(TESTREL_A, 0, Version::Lsn(Lsn(0x70)), false, &ctx) + .get_rel_page_at_lsn(TESTREL_A, 0, Version::Lsn(Lsn(0x70)), &ctx) .await?, ZERO_PAGE ); assert_eq!( tline - 
.get_rel_page_at_lsn(TESTREL_A, 1, Version::Lsn(Lsn(0x70)), false, &ctx) + .get_rel_page_at_lsn(TESTREL_A, 1, Version::Lsn(Lsn(0x70)), &ctx) .await?, test_img("foo blk 1") ); @@ -1894,21 +1888,21 @@ mod tests { m.commit(&ctx).await?; assert_eq!( tline - .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x80)), false, &ctx) + .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x80)), &ctx) .await?, 1501 ); for blk in 2..1500 { assert_eq!( tline - .get_rel_page_at_lsn(TESTREL_A, blk, Version::Lsn(Lsn(0x80)), false, &ctx) + .get_rel_page_at_lsn(TESTREL_A, blk, Version::Lsn(Lsn(0x80)), &ctx) .await?, ZERO_PAGE ); } assert_eq!( tline - .get_rel_page_at_lsn(TESTREL_A, 1500, Version::Lsn(Lsn(0x80)), false, &ctx) + .get_rel_page_at_lsn(TESTREL_A, 1500, Version::Lsn(Lsn(0x80)), &ctx) .await?, test_img("foo blk 1500") ); @@ -1935,13 +1929,13 @@ mod tests { // Check that rel exists and size is correct assert_eq!( tline - .get_rel_exists(TESTREL_A, Version::Lsn(Lsn(0x20)), false, &ctx) + .get_rel_exists(TESTREL_A, Version::Lsn(Lsn(0x20)), &ctx) .await?, true ); assert_eq!( tline - .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x20)), false, &ctx) + .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x20)), &ctx) .await?, 1 ); @@ -1954,7 +1948,7 @@ mod tests { // Check that rel is not visible anymore assert_eq!( tline - .get_rel_exists(TESTREL_A, Version::Lsn(Lsn(0x30)), false, &ctx) + .get_rel_exists(TESTREL_A, Version::Lsn(Lsn(0x30)), &ctx) .await?, false ); @@ -1972,13 +1966,13 @@ mod tests { // Check that rel exists and size is correct assert_eq!( tline - .get_rel_exists(TESTREL_A, Version::Lsn(Lsn(0x40)), false, &ctx) + .get_rel_exists(TESTREL_A, Version::Lsn(Lsn(0x40)), &ctx) .await?, true ); assert_eq!( tline - .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x40)), false, &ctx) + .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x40)), &ctx) .await?, 1 ); @@ -2011,24 +2005,24 @@ mod tests { // The relation was created at LSN 20, not visible at LSN 1 yet. 
assert_eq!( tline - .get_rel_exists(TESTREL_A, Version::Lsn(Lsn(0x10)), false, &ctx) + .get_rel_exists(TESTREL_A, Version::Lsn(Lsn(0x10)), &ctx) .await?, false ); assert!(tline - .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x10)), false, &ctx) + .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x10)), &ctx) .await .is_err()); assert_eq!( tline - .get_rel_exists(TESTREL_A, Version::Lsn(Lsn(0x20)), false, &ctx) + .get_rel_exists(TESTREL_A, Version::Lsn(Lsn(0x20)), &ctx) .await?, true ); assert_eq!( tline - .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x20)), false, &ctx) + .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x20)), &ctx) .await?, relsize ); @@ -2039,7 +2033,7 @@ mod tests { let data = format!("foo blk {} at {}", blkno, lsn); assert_eq!( tline - .get_rel_page_at_lsn(TESTREL_A, blkno, Version::Lsn(lsn), false, &ctx) + .get_rel_page_at_lsn(TESTREL_A, blkno, Version::Lsn(lsn), &ctx) .await?, test_img(&data) ); @@ -2056,7 +2050,7 @@ mod tests { // Check reported size and contents after truncation assert_eq!( tline - .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x60)), false, &ctx) + .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x60)), &ctx) .await?, 1 ); @@ -2066,7 +2060,7 @@ mod tests { let data = format!("foo blk {} at {}", blkno, lsn); assert_eq!( tline - .get_rel_page_at_lsn(TESTREL_A, blkno, Version::Lsn(Lsn(0x60)), false, &ctx) + .get_rel_page_at_lsn(TESTREL_A, blkno, Version::Lsn(Lsn(0x60)), &ctx) .await?, test_img(&data) ); @@ -2075,7 +2069,7 @@ mod tests { // should still see all blocks with older LSN assert_eq!( tline - .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x50)), false, &ctx) + .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x50)), &ctx) .await?, relsize ); @@ -2084,7 +2078,7 @@ mod tests { let data = format!("foo blk {} at {}", blkno, lsn); assert_eq!( tline - .get_rel_page_at_lsn(TESTREL_A, blkno, Version::Lsn(Lsn(0x50)), false, &ctx) + .get_rel_page_at_lsn(TESTREL_A, blkno, Version::Lsn(Lsn(0x50)), &ctx) .await?, test_img(&data) ); @@ -2104,13 +2098,13 @@ mod tests { 
assert_eq!( tline - .get_rel_exists(TESTREL_A, Version::Lsn(Lsn(0x80)), false, &ctx) + .get_rel_exists(TESTREL_A, Version::Lsn(Lsn(0x80)), &ctx) .await?, true ); assert_eq!( tline - .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x80)), false, &ctx) + .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x80)), &ctx) .await?, relsize ); @@ -2120,7 +2114,7 @@ mod tests { let data = format!("foo blk {} at {}", blkno, lsn); assert_eq!( tline - .get_rel_page_at_lsn(TESTREL_A, blkno, Version::Lsn(Lsn(0x80)), false, &ctx) + .get_rel_page_at_lsn(TESTREL_A, blkno, Version::Lsn(Lsn(0x80)), &ctx) .await?, test_img(&data) ); @@ -2154,7 +2148,7 @@ mod tests { assert_eq!( tline - .get_rel_size(TESTREL_A, Version::Lsn(Lsn(lsn)), false, &ctx) + .get_rel_size(TESTREL_A, Version::Lsn(Lsn(lsn)), &ctx) .await?, RELSEG_SIZE + 1 ); @@ -2168,7 +2162,7 @@ mod tests { m.commit(&ctx).await?; assert_eq!( tline - .get_rel_size(TESTREL_A, Version::Lsn(Lsn(lsn)), false, &ctx) + .get_rel_size(TESTREL_A, Version::Lsn(Lsn(lsn)), &ctx) .await?, RELSEG_SIZE ); @@ -2183,7 +2177,7 @@ mod tests { m.commit(&ctx).await?; assert_eq!( tline - .get_rel_size(TESTREL_A, Version::Lsn(Lsn(lsn)), false, &ctx) + .get_rel_size(TESTREL_A, Version::Lsn(Lsn(lsn)), &ctx) .await?, RELSEG_SIZE - 1 ); @@ -2201,7 +2195,7 @@ mod tests { m.commit(&ctx).await?; assert_eq!( tline - .get_rel_size(TESTREL_A, Version::Lsn(Lsn(lsn)), false, &ctx) + .get_rel_size(TESTREL_A, Version::Lsn(Lsn(lsn)), &ctx) .await?, size as BlockNumber ); diff --git a/pageserver/src/walrecord.rs b/pageserver/src/walrecord.rs index ae2d996879..02f6f49694 100644 --- a/pageserver/src/walrecord.rs +++ b/pageserver/src/walrecord.rs @@ -55,6 +55,7 @@ impl NeonWalRecord { /// Does replaying this WAL record initialize the page from scratch, or does /// it need to be applied over the previous image of the page? 
pub fn will_init(&self) -> bool { + // If you change this function, you'll also need to change ValueBytes::will_init match self { NeonWalRecord::Postgres { will_init, rec: _ } => *will_init, diff --git a/pageserver/src/walredo.rs b/pageserver/src/walredo.rs index ca41a576fd..9776d4ce88 100644 --- a/pageserver/src/walredo.rs +++ b/pageserver/src/walredo.rs @@ -20,6 +20,7 @@ /// Process lifecycle and abstracction for the IPC protocol. mod process; +pub use process::Kind as ProcessKind; /// Code to apply [`NeonWalRecord`]s. pub(crate) mod apply_neon; @@ -34,7 +35,7 @@ use crate::walrecord::NeonWalRecord; use anyhow::Context; use bytes::{Bytes, BytesMut}; use pageserver_api::key::key_to_rel_block; -use pageserver_api::models::WalRedoManagerStatus; +use pageserver_api::models::{WalRedoManagerProcessStatus, WalRedoManagerStatus}; use pageserver_api::shard::TenantShardId; use std::sync::Arc; use std::time::Duration; @@ -54,7 +55,7 @@ pub struct PostgresRedoManager { tenant_shard_id: TenantShardId, conf: &'static PageServerConf, last_redo_at: std::sync::Mutex>, - /// The current [`process::WalRedoProcess`] that is used by new redo requests. + /// The current [`process::Process`] that is used by new redo requests. /// We use [`heavier_once_cell`] for coalescing the spawning, but the redo /// requests don't use the [`heavier_once_cell::Guard`] to keep ahold of the /// their process object; we use [`Arc::clone`] for that. @@ -66,7 +67,7 @@ pub struct PostgresRedoManager { /// still be using the old redo process. But, those other tasks will most likely /// encounter an error as well, and errors are an unexpected condition anyway. /// So, probably we could get rid of the `Arc` in the future. 
- redo_process: heavier_once_cell::OnceCell>, + redo_process: heavier_once_cell::OnceCell>, } /// @@ -139,8 +140,8 @@ impl PostgresRedoManager { } } - pub(crate) fn status(&self) -> Option { - Some(WalRedoManagerStatus { + pub fn status(&self) -> WalRedoManagerStatus { + WalRedoManagerStatus { last_redo_at: { let at = *self.last_redo_at.lock().unwrap(); at.and_then(|at| { @@ -149,8 +150,14 @@ impl PostgresRedoManager { chrono::Utc::now().checked_sub_signed(chrono::Duration::from_std(age).ok()?) }) }, - pid: self.redo_process.get().map(|p| p.id()), - }) + process: self + .redo_process + .get() + .map(|p| WalRedoManagerProcessStatus { + pid: p.id(), + kind: std::borrow::Cow::Borrowed(p.kind().into()), + }), + } } } @@ -208,37 +215,33 @@ impl PostgresRedoManager { const MAX_RETRY_ATTEMPTS: u32 = 1; let mut n_attempts = 0u32; loop { - let proc: Arc = - match self.redo_process.get_or_init_detached().await { - Ok(guard) => Arc::clone(&guard), - Err(permit) => { - // don't hold poison_guard, the launch code can bail - let start = Instant::now(); - let proc = Arc::new( - process::WalRedoProcess::launch( - self.conf, - self.tenant_shard_id, - pg_version, - ) + let proc: Arc = match self.redo_process.get_or_init_detached().await { + Ok(guard) => Arc::clone(&guard), + Err(permit) => { + // don't hold poison_guard, the launch code can bail + let start = Instant::now(); + let proc = Arc::new( + process::Process::launch(self.conf, self.tenant_shard_id, pg_version) .context("launch walredo process")?, - ); - let duration = start.elapsed(); - WAL_REDO_PROCESS_LAUNCH_DURATION_HISTOGRAM.observe(duration.as_secs_f64()); - info!( - duration_ms = duration.as_millis(), - pid = proc.id(), - "launched walredo process" - ); - self.redo_process.set(Arc::clone(&proc), permit); - proc - } - }; + ); + let duration = start.elapsed(); + WAL_REDO_PROCESS_LAUNCH_DURATION_HISTOGRAM.observe(duration.as_secs_f64()); + info!( + duration_ms = duration.as_millis(), + pid = proc.id(), + "launched walredo 
process" + ); + self.redo_process.set(Arc::clone(&proc), permit); + proc + } + }; let started_at = std::time::Instant::now(); // Relational WAL records are applied using wal-redo-postgres let result = proc .apply_wal_records(rel, blknum, &base_img, records, wal_redo_timeout) + .await .context("apply_wal_records"); let duration = started_at.elapsed(); diff --git a/pageserver/src/walredo/process.rs b/pageserver/src/walredo/process.rs index bcbb263663..ad6b4e5fe9 100644 --- a/pageserver/src/walredo/process.rs +++ b/pageserver/src/walredo/process.rs @@ -1,186 +1,67 @@ -use self::no_leak_child::NoLeakChild; -use crate::{ - config::PageServerConf, - metrics::{WalRedoKillCause, WAL_REDO_PROCESS_COUNTERS, WAL_REDO_RECORD_COUNTER}, - walrecord::NeonWalRecord, -}; -use anyhow::Context; +use std::time::Duration; + use bytes::Bytes; -use nix::poll::{PollFd, PollFlags}; use pageserver_api::{reltag::RelTag, shard::TenantShardId}; -use postgres_ffi::BLCKSZ; -use std::os::fd::AsRawFd; -#[cfg(feature = "testing")] -use std::sync::atomic::AtomicUsize; -use std::{ - collections::VecDeque, - io::{Read, Write}, - process::{ChildStdin, ChildStdout, Command, Stdio}, - sync::{Mutex, MutexGuard}, - time::Duration, -}; -use tracing::{debug, error, instrument, Instrument}; -use utils::{lsn::Lsn, nonblock::set_nonblock}; +use utils::lsn::Lsn; + +use crate::{config::PageServerConf, walrecord::NeonWalRecord}; mod no_leak_child; /// The IPC protocol that pageserver and walredo process speak over their shared pipe. mod protocol; -pub struct WalRedoProcess { - #[allow(dead_code)] - conf: &'static PageServerConf, - tenant_shard_id: TenantShardId, - // Some() on construction, only becomes None on Drop. - child: Option, - stdout: Mutex, - stdin: Mutex, - /// Counter to separate same sized walredo inputs failing at the same millisecond. 
- #[cfg(feature = "testing")] - dump_sequence: AtomicUsize, +mod process_impl { + pub(super) mod process_async; + pub(super) mod process_std; } -struct ProcessInput { - stdin: ChildStdin, - n_requests: usize, +#[derive( + Clone, + Copy, + Debug, + PartialEq, + Eq, + strum_macros::EnumString, + strum_macros::Display, + strum_macros::IntoStaticStr, + serde_with::DeserializeFromStr, + serde_with::SerializeDisplay, +)] +#[strum(serialize_all = "kebab-case")] +#[repr(u8)] +pub enum Kind { + Sync, + Async, } -struct ProcessOutput { - stdout: ChildStdout, - pending_responses: VecDeque>, - n_processed_responses: usize, +pub(crate) enum Process { + Sync(process_impl::process_std::WalRedoProcess), + Async(process_impl::process_async::WalRedoProcess), } -impl WalRedoProcess { - // - // Start postgres binary in special WAL redo mode. - // - #[instrument(skip_all,fields(pg_version=pg_version))] - pub(crate) fn launch( +impl Process { + #[inline(always)] + pub fn launch( conf: &'static PageServerConf, tenant_shard_id: TenantShardId, pg_version: u32, ) -> anyhow::Result { - crate::span::debug_assert_current_span_has_tenant_id(); - - let pg_bin_dir_path = conf.pg_bin_dir(pg_version).context("pg_bin_dir")?; // TODO these should be infallible. 
- let pg_lib_dir_path = conf.pg_lib_dir(pg_version).context("pg_lib_dir")?; - - use no_leak_child::NoLeakChildCommandExt; - // Start postgres itself - let child = Command::new(pg_bin_dir_path.join("postgres")) - // the first arg must be --wal-redo so the child process enters into walredo mode - .arg("--wal-redo") - // the child doesn't process this arg, but, having it in the argv helps indentify the - // walredo process for a particular tenant when debugging a pagserver - .args(["--tenant-shard-id", &format!("{tenant_shard_id}")]) - .stdin(Stdio::piped()) - .stderr(Stdio::piped()) - .stdout(Stdio::piped()) - .env_clear() - .env("LD_LIBRARY_PATH", &pg_lib_dir_path) - .env("DYLD_LIBRARY_PATH", &pg_lib_dir_path) - // NB: The redo process is not trusted after we sent it the first - // walredo work. Before that, it is trusted. Specifically, we trust - // it to - // 1. close all file descriptors except stdin, stdout, stderr because - // pageserver might not be 100% diligent in setting FD_CLOEXEC on all - // the files it opens, and - // 2. to use seccomp to sandbox itself before processing the first - // walredo request. - .spawn_no_leak_child(tenant_shard_id) - .context("spawn process")?; - WAL_REDO_PROCESS_COUNTERS.started.inc(); - let mut child = scopeguard::guard(child, |child| { - error!("killing wal-redo-postgres process due to a problem during launch"); - child.kill_and_wait(WalRedoKillCause::Startup); - }); - - let stdin = child.stdin.take().unwrap(); - let stdout = child.stdout.take().unwrap(); - let stderr = child.stderr.take().unwrap(); - let stderr = tokio::process::ChildStderr::from_std(stderr) - .context("convert to tokio::ChildStderr")?; - macro_rules! 
set_nonblock_or_log_err { - ($file:ident) => {{ - let res = set_nonblock($file.as_raw_fd()); - if let Err(e) = &res { - error!(error = %e, file = stringify!($file), pid = child.id(), "set_nonblock failed"); - } - res - }}; - } - set_nonblock_or_log_err!(stdin)?; - set_nonblock_or_log_err!(stdout)?; - - // all fallible operations post-spawn are complete, so get rid of the guard - let child = scopeguard::ScopeGuard::into_inner(child); - - tokio::spawn( - async move { - scopeguard::defer! { - debug!("wal-redo-postgres stderr_logger_task finished"); - crate::metrics::WAL_REDO_PROCESS_COUNTERS.active_stderr_logger_tasks_finished.inc(); - } - debug!("wal-redo-postgres stderr_logger_task started"); - crate::metrics::WAL_REDO_PROCESS_COUNTERS.active_stderr_logger_tasks_started.inc(); - - use tokio::io::AsyncBufReadExt; - let mut stderr_lines = tokio::io::BufReader::new(stderr); - let mut buf = Vec::new(); - let res = loop { - buf.clear(); - // TODO we don't trust the process to cap its stderr length. - // Currently it can do unbounded Vec allocation. 
- match stderr_lines.read_until(b'\n', &mut buf).await { - Ok(0) => break Ok(()), // eof - Ok(num_bytes) => { - let output = String::from_utf8_lossy(&buf[..num_bytes]); - error!(%output, "received output"); - } - Err(e) => { - break Err(e); - } - } - }; - match res { - Ok(()) => (), - Err(e) => { - error!(error=?e, "failed to read from walredo stderr"); - } - } - }.instrument(tracing::info_span!(parent: None, "wal-redo-postgres-stderr", pid = child.id(), tenant_id = %tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug(), %pg_version)) - ); - - Ok(Self { - conf, - tenant_shard_id, - child: Some(child), - stdin: Mutex::new(ProcessInput { - stdin, - n_requests: 0, - }), - stdout: Mutex::new(ProcessOutput { - stdout, - pending_responses: VecDeque::new(), - n_processed_responses: 0, - }), - #[cfg(feature = "testing")] - dump_sequence: AtomicUsize::default(), + Ok(match conf.walredo_process_kind { + Kind::Sync => Self::Sync(process_impl::process_std::WalRedoProcess::launch( + conf, + tenant_shard_id, + pg_version, + )?), + Kind::Async => Self::Async(process_impl::process_async::WalRedoProcess::launch( + conf, + tenant_shard_id, + pg_version, + )?), }) } - pub(crate) fn id(&self) -> u32 { - self.child - .as_ref() - .expect("must not call this during Drop") - .id() - } - - // Apply given WAL records ('records') over an old page image. Returns - // new page image. - // - #[instrument(skip_all, fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug(), pid=%self.id()))] - pub(crate) fn apply_wal_records( + #[inline(always)] + pub(crate) async fn apply_wal_records( &self, rel: RelTag, blknum: u32, @@ -188,221 +69,29 @@ impl WalRedoProcess { records: &[(Lsn, NeonWalRecord)], wal_redo_timeout: Duration, ) -> anyhow::Result { - let tag = protocol::BufferTag { rel, blknum }; - let input = self.stdin.lock().unwrap(); - - // Serialize all the messages to send the WAL redo process first. 
- // - // This could be problematic if there are millions of records to replay, - // but in practice the number of records is usually so small that it doesn't - // matter, and it's better to keep this code simple. - // - // Most requests start with a before-image with BLCKSZ bytes, followed by - // by some other WAL records. Start with a buffer that can hold that - // comfortably. - let mut writebuf: Vec = Vec::with_capacity((BLCKSZ as usize) * 3); - protocol::build_begin_redo_for_block_msg(tag, &mut writebuf); - if let Some(img) = base_img { - protocol::build_push_page_msg(tag, img, &mut writebuf); - } - for (lsn, rec) in records.iter() { - if let NeonWalRecord::Postgres { - will_init: _, - rec: postgres_rec, - } = rec - { - protocol::build_apply_record_msg(*lsn, postgres_rec, &mut writebuf); - } else { - anyhow::bail!("tried to pass neon wal record to postgres WAL redo"); + match self { + Process::Sync(p) => { + p.apply_wal_records(rel, blknum, base_img, records, wal_redo_timeout) + .await } - } - protocol::build_get_page_msg(tag, &mut writebuf); - WAL_REDO_RECORD_COUNTER.inc_by(records.len() as u64); - - let res = self.apply_wal_records0(&writebuf, input, wal_redo_timeout); - - if res.is_err() { - // not all of these can be caused by this particular input, however these are so rare - // in tests so capture all. - self.record_and_log(&writebuf); - } - - res - } - - fn apply_wal_records0( - &self, - writebuf: &[u8], - input: MutexGuard, - wal_redo_timeout: Duration, - ) -> anyhow::Result { - let mut proc = { input }; // TODO: remove this legacy rename, but this keep the patch small. 
- let mut nwrite = 0usize; - - while nwrite < writebuf.len() { - let mut stdin_pollfds = [PollFd::new(&proc.stdin, PollFlags::POLLOUT)]; - let n = loop { - match nix::poll::poll(&mut stdin_pollfds[..], wal_redo_timeout.as_millis() as i32) { - Err(nix::errno::Errno::EINTR) => continue, - res => break res, - } - }?; - - if n == 0 { - anyhow::bail!("WAL redo timed out"); + Process::Async(p) => { + p.apply_wal_records(rel, blknum, base_img, records, wal_redo_timeout) + .await } - - // If 'stdin' is writeable, do write. - let in_revents = stdin_pollfds[0].revents().unwrap(); - if in_revents & (PollFlags::POLLERR | PollFlags::POLLOUT) != PollFlags::empty() { - nwrite += proc.stdin.write(&writebuf[nwrite..])?; - } - if in_revents.contains(PollFlags::POLLHUP) { - // We still have more data to write, but the process closed the pipe. - anyhow::bail!("WAL redo process closed its stdin unexpectedly"); - } - } - let request_no = proc.n_requests; - proc.n_requests += 1; - drop(proc); - - // To improve walredo performance we separate sending requests and receiving - // responses. Them are protected by different mutexes (output and input). - // If thread T1, T2, T3 send requests D1, D2, D3 to walredo process - // then there is not warranty that T1 will first granted output mutex lock. - // To address this issue we maintain number of sent requests, number of processed - // responses and ring buffer with pending responses. After sending response - // (under input mutex), threads remembers request number. Then it releases - // input mutex, locks output mutex and fetch in ring buffer all responses until - // its stored request number. The it takes correspondent element from - // pending responses ring buffer and truncate all empty elements from the front, - // advancing processed responses number. 
- - let mut output = self.stdout.lock().unwrap(); - let n_processed_responses = output.n_processed_responses; - while n_processed_responses + output.pending_responses.len() <= request_no { - // We expect the WAL redo process to respond with an 8k page image. We read it - // into this buffer. - let mut resultbuf = vec![0; BLCKSZ.into()]; - let mut nresult: usize = 0; // # of bytes read into 'resultbuf' so far - while nresult < BLCKSZ.into() { - let mut stdout_pollfds = [PollFd::new(&output.stdout, PollFlags::POLLIN)]; - // We do two things simultaneously: reading response from stdout - // and forward any logging information that the child writes to its stderr to the page server's log. - let n = loop { - match nix::poll::poll( - &mut stdout_pollfds[..], - wal_redo_timeout.as_millis() as i32, - ) { - Err(nix::errno::Errno::EINTR) => continue, - res => break res, - } - }?; - - if n == 0 { - anyhow::bail!("WAL redo timed out"); - } - - // If we have some data in stdout, read it to the result buffer. - let out_revents = stdout_pollfds[0].revents().unwrap(); - if out_revents & (PollFlags::POLLERR | PollFlags::POLLIN) != PollFlags::empty() { - nresult += output.stdout.read(&mut resultbuf[nresult..])?; - } - if out_revents.contains(PollFlags::POLLHUP) { - anyhow::bail!("WAL redo process closed its stdout unexpectedly"); - } - } - output - .pending_responses - .push_back(Some(Bytes::from(resultbuf))); - } - // Replace our request's response with None in `pending_responses`. - // Then make space in the ring buffer by clearing out any seqence of contiguous - // `None`'s from the front of `pending_responses`. 
- // NB: We can't pop_front() because other requests' responses because another - // requester might have grabbed the output mutex before us: - // T1: grab input mutex - // T1: send request_no 23 - // T1: release input mutex - // T2: grab input mutex - // T2: send request_no 24 - // T2: release input mutex - // T2: grab output mutex - // T2: n_processed_responses + output.pending_responses.len() <= request_no - // 23 0 24 - // T2: enters poll loop that reads stdout - // T2: put response for 23 into pending_responses - // T2: put response for 24 into pending_resposnes - // pending_responses now looks like this: Front Some(response_23) Some(response_24) Back - // T2: takes its response_24 - // pending_responses now looks like this: Front Some(response_23) None Back - // T2: does the while loop below - // pending_responses now looks like this: Front Some(response_23) None Back - // T2: releases output mutex - // T1: grabs output mutex - // T1: n_processed_responses + output.pending_responses.len() > request_no - // 23 2 23 - // T1: skips poll loop that reads stdout - // T1: takes its response_23 - // pending_responses now looks like this: Front None None Back - // T2: does the while loop below - // pending_responses now looks like this: Front Back - // n_processed_responses now has value 25 - let res = output.pending_responses[request_no - n_processed_responses] - .take() - .expect("we own this request_no, nobody else is supposed to take it"); - while let Some(front) = output.pending_responses.front() { - if front.is_none() { - output.pending_responses.pop_front(); - output.n_processed_responses += 1; - } else { - break; - } - } - Ok(res) - } - - #[cfg(feature = "testing")] - fn record_and_log(&self, writebuf: &[u8]) { - use std::sync::atomic::Ordering; - - let millis = std::time::SystemTime::now() - .duration_since(std::time::SystemTime::UNIX_EPOCH) - .unwrap() - .as_millis(); - - let seq = self.dump_sequence.fetch_add(1, Ordering::Relaxed); - - // these files will 
be collected to an allure report - let filename = format!("walredo-{millis}-{}-{seq}.walredo", writebuf.len()); - - let path = self.conf.tenant_path(&self.tenant_shard_id).join(&filename); - - let res = std::fs::OpenOptions::new() - .write(true) - .create_new(true) - .read(true) - .open(path) - .and_then(|mut f| f.write_all(writebuf)); - - // trip up allowed_errors - if let Err(e) = res { - tracing::error!(target=%filename, length=writebuf.len(), "failed to write out the walredo errored input: {e}"); - } else { - tracing::error!(filename, "erroring walredo input saved"); } } - #[cfg(not(feature = "testing"))] - fn record_and_log(&self, _: &[u8]) {} -} + pub(crate) fn id(&self) -> u32 { + match self { + Process::Sync(p) => p.id(), + Process::Async(p) => p.id(), + } + } -impl Drop for WalRedoProcess { - fn drop(&mut self) { - self.child - .take() - .expect("we only do this once") - .kill_and_wait(WalRedoKillCause::WalRedoProcessDrop); - // no way to wait for stderr_logger_task from Drop because that is async only + pub(crate) fn kind(&self) -> Kind { + match self { + Process::Sync(_) => Kind::Sync, + Process::Async(_) => Kind::Async, + } } } diff --git a/pageserver/src/walredo/process/process_impl/process_async.rs b/pageserver/src/walredo/process/process_impl/process_async.rs new file mode 100644 index 0000000000..262858b033 --- /dev/null +++ b/pageserver/src/walredo/process/process_impl/process_async.rs @@ -0,0 +1,374 @@ +use self::no_leak_child::NoLeakChild; +use crate::{ + config::PageServerConf, + metrics::{WalRedoKillCause, WAL_REDO_PROCESS_COUNTERS, WAL_REDO_RECORD_COUNTER}, + walrecord::NeonWalRecord, + walredo::process::{no_leak_child, protocol}, +}; +use anyhow::Context; +use bytes::Bytes; +use pageserver_api::{reltag::RelTag, shard::TenantShardId}; +use postgres_ffi::BLCKSZ; +#[cfg(feature = "testing")] +use std::sync::atomic::AtomicUsize; +use std::{ + collections::VecDeque, + process::{Command, Stdio}, + time::Duration, +}; +use tokio::io::{AsyncReadExt, 
AsyncWriteExt}; +use tracing::{debug, error, instrument, Instrument}; +use utils::{lsn::Lsn, poison::Poison}; + +pub struct WalRedoProcess { + #[allow(dead_code)] + conf: &'static PageServerConf, + tenant_shard_id: TenantShardId, + // Some() on construction, only becomes None on Drop. + child: Option, + stdout: tokio::sync::Mutex>, + stdin: tokio::sync::Mutex>, + /// Counter to separate same sized walredo inputs failing at the same millisecond. + #[cfg(feature = "testing")] + dump_sequence: AtomicUsize, +} + +struct ProcessInput { + stdin: tokio::process::ChildStdin, + n_requests: usize, +} + +struct ProcessOutput { + stdout: tokio::process::ChildStdout, + pending_responses: VecDeque>, + n_processed_responses: usize, +} + +impl WalRedoProcess { + // + // Start postgres binary in special WAL redo mode. + // + #[instrument(skip_all,fields(pg_version=pg_version))] + pub(crate) fn launch( + conf: &'static PageServerConf, + tenant_shard_id: TenantShardId, + pg_version: u32, + ) -> anyhow::Result { + crate::span::debug_assert_current_span_has_tenant_id(); + + let pg_bin_dir_path = conf.pg_bin_dir(pg_version).context("pg_bin_dir")?; // TODO these should be infallible. + let pg_lib_dir_path = conf.pg_lib_dir(pg_version).context("pg_lib_dir")?; + + use no_leak_child::NoLeakChildCommandExt; + // Start postgres itself + let child = Command::new(pg_bin_dir_path.join("postgres")) + // the first arg must be --wal-redo so the child process enters into walredo mode + .arg("--wal-redo") + // the child doesn't process this arg, but, having it in the argv helps indentify the + // walredo process for a particular tenant when debugging a pagserver + .args(["--tenant-shard-id", &format!("{tenant_shard_id}")]) + .stdin(Stdio::piped()) + .stderr(Stdio::piped()) + .stdout(Stdio::piped()) + .env_clear() + .env("LD_LIBRARY_PATH", &pg_lib_dir_path) + .env("DYLD_LIBRARY_PATH", &pg_lib_dir_path) + // NB: The redo process is not trusted after we sent it the first + // walredo work. 
Before that, it is trusted. Specifically, we trust + // it to + // 1. close all file descriptors except stdin, stdout, stderr because + // pageserver might not be 100% diligent in setting FD_CLOEXEC on all + // the files it opens, and + // 2. to use seccomp to sandbox itself before processing the first + // walredo request. + .spawn_no_leak_child(tenant_shard_id) + .context("spawn process")?; + WAL_REDO_PROCESS_COUNTERS.started.inc(); + let mut child = scopeguard::guard(child, |child| { + error!("killing wal-redo-postgres process due to a problem during launch"); + child.kill_and_wait(WalRedoKillCause::Startup); + }); + + let stdin = child.stdin.take().unwrap(); + let stdout = child.stdout.take().unwrap(); + let stderr = child.stderr.take().unwrap(); + let stderr = tokio::process::ChildStderr::from_std(stderr) + .context("convert to tokio::ChildStderr")?; + let stdin = + tokio::process::ChildStdin::from_std(stdin).context("convert to tokio::ChildStdin")?; + let stdout = tokio::process::ChildStdout::from_std(stdout) + .context("convert to tokio::ChildStdout")?; + + // all fallible operations post-spawn are complete, so get rid of the guard + let child = scopeguard::ScopeGuard::into_inner(child); + + tokio::spawn( + async move { + scopeguard::defer! { + debug!("wal-redo-postgres stderr_logger_task finished"); + crate::metrics::WAL_REDO_PROCESS_COUNTERS.active_stderr_logger_tasks_finished.inc(); + } + debug!("wal-redo-postgres stderr_logger_task started"); + crate::metrics::WAL_REDO_PROCESS_COUNTERS.active_stderr_logger_tasks_started.inc(); + + use tokio::io::AsyncBufReadExt; + let mut stderr_lines = tokio::io::BufReader::new(stderr); + let mut buf = Vec::new(); + let res = loop { + buf.clear(); + // TODO we don't trust the process to cap its stderr length. + // Currently it can do unbounded Vec allocation. 
+ match stderr_lines.read_until(b'\n', &mut buf).await { + Ok(0) => break Ok(()), // eof + Ok(num_bytes) => { + let output = String::from_utf8_lossy(&buf[..num_bytes]); + error!(%output, "received output"); + } + Err(e) => { + break Err(e); + } + } + }; + match res { + Ok(()) => (), + Err(e) => { + error!(error=?e, "failed to read from walredo stderr"); + } + } + }.instrument(tracing::info_span!(parent: None, "wal-redo-postgres-stderr", pid = child.id(), tenant_id = %tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug(), %pg_version)) + ); + + Ok(Self { + conf, + tenant_shard_id, + child: Some(child), + stdin: tokio::sync::Mutex::new(Poison::new( + "stdin", + ProcessInput { + stdin, + n_requests: 0, + }, + )), + stdout: tokio::sync::Mutex::new(Poison::new( + "stdout", + ProcessOutput { + stdout, + pending_responses: VecDeque::new(), + n_processed_responses: 0, + }, + )), + #[cfg(feature = "testing")] + dump_sequence: AtomicUsize::default(), + }) + } + + pub(crate) fn id(&self) -> u32 { + self.child + .as_ref() + .expect("must not call this during Drop") + .id() + } + + /// Apply given WAL records ('records') over an old page image. Returns + /// new page image. + /// + /// # Cancel-Safety + /// + /// Cancellation safe. + #[instrument(skip_all, fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug(), pid=%self.id()))] + pub(crate) async fn apply_wal_records( + &self, + rel: RelTag, + blknum: u32, + base_img: &Option, + records: &[(Lsn, NeonWalRecord)], + wal_redo_timeout: Duration, + ) -> anyhow::Result { + let tag = protocol::BufferTag { rel, blknum }; + + // Serialize all the messages to send the WAL redo process first. + // + // This could be problematic if there are millions of records to replay, + // but in practice the number of records is usually so small that it doesn't + // matter, and it's better to keep this code simple. 
+ // + // Most requests start with a before-image with BLCKSZ bytes, followed by + // by some other WAL records. Start with a buffer that can hold that + // comfortably. + let mut writebuf: Vec = Vec::with_capacity((BLCKSZ as usize) * 3); + protocol::build_begin_redo_for_block_msg(tag, &mut writebuf); + if let Some(img) = base_img { + protocol::build_push_page_msg(tag, img, &mut writebuf); + } + for (lsn, rec) in records.iter() { + if let NeonWalRecord::Postgres { + will_init: _, + rec: postgres_rec, + } = rec + { + protocol::build_apply_record_msg(*lsn, postgres_rec, &mut writebuf); + } else { + anyhow::bail!("tried to pass neon wal record to postgres WAL redo"); + } + } + protocol::build_get_page_msg(tag, &mut writebuf); + WAL_REDO_RECORD_COUNTER.inc_by(records.len() as u64); + + let Ok(res) = + tokio::time::timeout(wal_redo_timeout, self.apply_wal_records0(&writebuf)).await + else { + anyhow::bail!("WAL redo timed out"); + }; + + if res.is_err() { + // not all of these can be caused by this particular input, however these are so rare + // in tests so capture all. + self.record_and_log(&writebuf); + } + + res + } + + /// # Cancel-Safety + /// + /// When not polled to completion (e.g. because in `tokio::select!` another + /// branch becomes ready before this future), concurrent and subsequent + /// calls may fail due to [`utils::poison::Poison::check_and_arm`] calls. + /// Dispose of this process instance and create a new one. + async fn apply_wal_records0(&self, writebuf: &[u8]) -> anyhow::Result { + let request_no = { + let mut lock_guard = self.stdin.lock().await; + let mut poison_guard = lock_guard.check_and_arm()?; + let input = poison_guard.data_mut(); + input + .stdin + .write_all(writebuf) + .await + .context("write to walredo stdin")?; + let request_no = input.n_requests; + input.n_requests += 1; + poison_guard.disarm(); + request_no + }; + + // To improve walredo performance we separate sending requests and receiving + // responses. 
Them are protected by different mutexes (output and input). + // If thread T1, T2, T3 send requests D1, D2, D3 to walredo process + // then there is not warranty that T1 will first granted output mutex lock. + // To address this issue we maintain number of sent requests, number of processed + // responses and ring buffer with pending responses. After sending response + // (under input mutex), threads remembers request number. Then it releases + // input mutex, locks output mutex and fetch in ring buffer all responses until + // its stored request number. The it takes correspondent element from + // pending responses ring buffer and truncate all empty elements from the front, + // advancing processed responses number. + + let mut lock_guard = self.stdout.lock().await; + let mut poison_guard = lock_guard.check_and_arm()?; + let output = poison_guard.data_mut(); + let n_processed_responses = output.n_processed_responses; + while n_processed_responses + output.pending_responses.len() <= request_no { + // We expect the WAL redo process to respond with an 8k page image. We read it + // into this buffer. + let mut resultbuf = vec![0; BLCKSZ.into()]; + output + .stdout + .read_exact(&mut resultbuf) + .await + .context("read walredo stdout")?; + output + .pending_responses + .push_back(Some(Bytes::from(resultbuf))); + } + // Replace our request's response with None in `pending_responses`. + // Then make space in the ring buffer by clearing out any seqence of contiguous + // `None`'s from the front of `pending_responses`. 
+ // NB: We can't pop_front() because other requests' responses because another + // requester might have grabbed the output mutex before us: + // T1: grab input mutex + // T1: send request_no 23 + // T1: release input mutex + // T2: grab input mutex + // T2: send request_no 24 + // T2: release input mutex + // T2: grab output mutex + // T2: n_processed_responses + output.pending_responses.len() <= request_no + // 23 0 24 + // T2: enters poll loop that reads stdout + // T2: put response for 23 into pending_responses + // T2: put response for 24 into pending_resposnes + // pending_responses now looks like this: Front Some(response_23) Some(response_24) Back + // T2: takes its response_24 + // pending_responses now looks like this: Front Some(response_23) None Back + // T2: does the while loop below + // pending_responses now looks like this: Front Some(response_23) None Back + // T2: releases output mutex + // T1: grabs output mutex + // T1: n_processed_responses + output.pending_responses.len() > request_no + // 23 2 23 + // T1: skips poll loop that reads stdout + // T1: takes its response_23 + // pending_responses now looks like this: Front None None Back + // T2: does the while loop below + // pending_responses now looks like this: Front Back + // n_processed_responses now has value 25 + let res = output.pending_responses[request_no - n_processed_responses] + .take() + .expect("we own this request_no, nobody else is supposed to take it"); + while let Some(front) = output.pending_responses.front() { + if front.is_none() { + output.pending_responses.pop_front(); + output.n_processed_responses += 1; + } else { + break; + } + } + poison_guard.disarm(); + Ok(res) + } + + #[cfg(feature = "testing")] + fn record_and_log(&self, writebuf: &[u8]) { + use std::sync::atomic::Ordering; + + let millis = std::time::SystemTime::now() + .duration_since(std::time::SystemTime::UNIX_EPOCH) + .unwrap() + .as_millis(); + + let seq = self.dump_sequence.fetch_add(1, Ordering::Relaxed); 
+ + // these files will be collected to an allure report + let filename = format!("walredo-{millis}-{}-{seq}.walredo", writebuf.len()); + + let path = self.conf.tenant_path(&self.tenant_shard_id).join(&filename); + + use std::io::Write; + let res = std::fs::OpenOptions::new() + .write(true) + .create_new(true) + .read(true) + .open(path) + .and_then(|mut f| f.write_all(writebuf)); + + // trip up allowed_errors + if let Err(e) = res { + tracing::error!(target=%filename, length=writebuf.len(), "failed to write out the walredo errored input: {e}"); + } else { + tracing::error!(filename, "erroring walredo input saved"); + } + } + + #[cfg(not(feature = "testing"))] + fn record_and_log(&self, _: &[u8]) {} +} + +impl Drop for WalRedoProcess { + fn drop(&mut self) { + self.child + .take() + .expect("we only do this once") + .kill_and_wait(WalRedoKillCause::WalRedoProcessDrop); + // no way to wait for stderr_logger_task from Drop because that is async only + } +} diff --git a/pageserver/src/walredo/process/process_impl/process_std.rs b/pageserver/src/walredo/process/process_impl/process_std.rs new file mode 100644 index 0000000000..e7a6c263c9 --- /dev/null +++ b/pageserver/src/walredo/process/process_impl/process_std.rs @@ -0,0 +1,405 @@ +use self::no_leak_child::NoLeakChild; +use crate::{ + config::PageServerConf, + metrics::{WalRedoKillCause, WAL_REDO_PROCESS_COUNTERS, WAL_REDO_RECORD_COUNTER}, + walrecord::NeonWalRecord, + walredo::process::{no_leak_child, protocol}, +}; +use anyhow::Context; +use bytes::Bytes; +use nix::poll::{PollFd, PollFlags}; +use pageserver_api::{reltag::RelTag, shard::TenantShardId}; +use postgres_ffi::BLCKSZ; +use std::os::fd::AsRawFd; +#[cfg(feature = "testing")] +use std::sync::atomic::AtomicUsize; +use std::{ + collections::VecDeque, + io::{Read, Write}, + process::{ChildStdin, ChildStdout, Command, Stdio}, + sync::{Mutex, MutexGuard}, + time::Duration, +}; +use tracing::{debug, error, instrument, Instrument}; +use utils::{lsn::Lsn, 
nonblock::set_nonblock}; + +pub struct WalRedoProcess { + #[allow(dead_code)] + conf: &'static PageServerConf, + tenant_shard_id: TenantShardId, + // Some() on construction, only becomes None on Drop. + child: Option, + stdout: Mutex, + stdin: Mutex, + /// Counter to separate same sized walredo inputs failing at the same millisecond. + #[cfg(feature = "testing")] + dump_sequence: AtomicUsize, +} + +struct ProcessInput { + stdin: ChildStdin, + n_requests: usize, +} + +struct ProcessOutput { + stdout: ChildStdout, + pending_responses: VecDeque>, + n_processed_responses: usize, +} + +impl WalRedoProcess { + // + // Start postgres binary in special WAL redo mode. + // + #[instrument(skip_all,fields(pg_version=pg_version))] + pub(crate) fn launch( + conf: &'static PageServerConf, + tenant_shard_id: TenantShardId, + pg_version: u32, + ) -> anyhow::Result { + crate::span::debug_assert_current_span_has_tenant_id(); + + let pg_bin_dir_path = conf.pg_bin_dir(pg_version).context("pg_bin_dir")?; // TODO these should be infallible. + let pg_lib_dir_path = conf.pg_lib_dir(pg_version).context("pg_lib_dir")?; + + use no_leak_child::NoLeakChildCommandExt; + // Start postgres itself + let child = Command::new(pg_bin_dir_path.join("postgres")) + // the first arg must be --wal-redo so the child process enters into walredo mode + .arg("--wal-redo") + // the child doesn't process this arg, but, having it in the argv helps indentify the + // walredo process for a particular tenant when debugging a pagserver + .args(["--tenant-shard-id", &format!("{tenant_shard_id}")]) + .stdin(Stdio::piped()) + .stderr(Stdio::piped()) + .stdout(Stdio::piped()) + .env_clear() + .env("LD_LIBRARY_PATH", &pg_lib_dir_path) + .env("DYLD_LIBRARY_PATH", &pg_lib_dir_path) + // NB: The redo process is not trusted after we sent it the first + // walredo work. Before that, it is trusted. Specifically, we trust + // it to + // 1. 
close all file descriptors except stdin, stdout, stderr because + // pageserver might not be 100% diligent in setting FD_CLOEXEC on all + // the files it opens, and + // 2. to use seccomp to sandbox itself before processing the first + // walredo request. + .spawn_no_leak_child(tenant_shard_id) + .context("spawn process")?; + WAL_REDO_PROCESS_COUNTERS.started.inc(); + let mut child = scopeguard::guard(child, |child| { + error!("killing wal-redo-postgres process due to a problem during launch"); + child.kill_and_wait(WalRedoKillCause::Startup); + }); + + let stdin = child.stdin.take().unwrap(); + let stdout = child.stdout.take().unwrap(); + let stderr = child.stderr.take().unwrap(); + let stderr = tokio::process::ChildStderr::from_std(stderr) + .context("convert to tokio::ChildStderr")?; + macro_rules! set_nonblock_or_log_err { + ($file:ident) => {{ + let res = set_nonblock($file.as_raw_fd()); + if let Err(e) = &res { + error!(error = %e, file = stringify!($file), pid = child.id(), "set_nonblock failed"); + } + res + }}; + } + set_nonblock_or_log_err!(stdin)?; + set_nonblock_or_log_err!(stdout)?; + + // all fallible operations post-spawn are complete, so get rid of the guard + let child = scopeguard::ScopeGuard::into_inner(child); + + tokio::spawn( + async move { + scopeguard::defer! { + debug!("wal-redo-postgres stderr_logger_task finished"); + crate::metrics::WAL_REDO_PROCESS_COUNTERS.active_stderr_logger_tasks_finished.inc(); + } + debug!("wal-redo-postgres stderr_logger_task started"); + crate::metrics::WAL_REDO_PROCESS_COUNTERS.active_stderr_logger_tasks_started.inc(); + + use tokio::io::AsyncBufReadExt; + let mut stderr_lines = tokio::io::BufReader::new(stderr); + let mut buf = Vec::new(); + let res = loop { + buf.clear(); + // TODO we don't trust the process to cap its stderr length. + // Currently it can do unbounded Vec allocation. 
+ match stderr_lines.read_until(b'\n', &mut buf).await { + Ok(0) => break Ok(()), // eof + Ok(num_bytes) => { + let output = String::from_utf8_lossy(&buf[..num_bytes]); + error!(%output, "received output"); + } + Err(e) => { + break Err(e); + } + } + }; + match res { + Ok(()) => (), + Err(e) => { + error!(error=?e, "failed to read from walredo stderr"); + } + } + }.instrument(tracing::info_span!(parent: None, "wal-redo-postgres-stderr", pid = child.id(), tenant_id = %tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug(), %pg_version)) + ); + + Ok(Self { + conf, + tenant_shard_id, + child: Some(child), + stdin: Mutex::new(ProcessInput { + stdin, + n_requests: 0, + }), + stdout: Mutex::new(ProcessOutput { + stdout, + pending_responses: VecDeque::new(), + n_processed_responses: 0, + }), + #[cfg(feature = "testing")] + dump_sequence: AtomicUsize::default(), + }) + } + + pub(crate) fn id(&self) -> u32 { + self.child + .as_ref() + .expect("must not call this during Drop") + .id() + } + + // Apply given WAL records ('records') over an old page image. Returns + // new page image. + // + #[instrument(skip_all, fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug(), pid=%self.id()))] + pub(crate) async fn apply_wal_records( + &self, + rel: RelTag, + blknum: u32, + base_img: &Option, + records: &[(Lsn, NeonWalRecord)], + wal_redo_timeout: Duration, + ) -> anyhow::Result { + let tag = protocol::BufferTag { rel, blknum }; + let input = self.stdin.lock().unwrap(); + + // Serialize all the messages to send the WAL redo process first. + // + // This could be problematic if there are millions of records to replay, + // but in practice the number of records is usually so small that it doesn't + // matter, and it's better to keep this code simple. + // + // Most requests start with a before-image with BLCKSZ bytes, followed by + // by some other WAL records. Start with a buffer that can hold that + // comfortably. 
+ let mut writebuf: Vec = Vec::with_capacity((BLCKSZ as usize) * 3); + protocol::build_begin_redo_for_block_msg(tag, &mut writebuf); + if let Some(img) = base_img { + protocol::build_push_page_msg(tag, img, &mut writebuf); + } + for (lsn, rec) in records.iter() { + if let NeonWalRecord::Postgres { + will_init: _, + rec: postgres_rec, + } = rec + { + protocol::build_apply_record_msg(*lsn, postgres_rec, &mut writebuf); + } else { + anyhow::bail!("tried to pass neon wal record to postgres WAL redo"); + } + } + protocol::build_get_page_msg(tag, &mut writebuf); + WAL_REDO_RECORD_COUNTER.inc_by(records.len() as u64); + + let res = self.apply_wal_records0(&writebuf, input, wal_redo_timeout); + + if res.is_err() { + // not all of these can be caused by this particular input, however these are so rare + // in tests so capture all. + self.record_and_log(&writebuf); + } + + res + } + + fn apply_wal_records0( + &self, + writebuf: &[u8], + input: MutexGuard, + wal_redo_timeout: Duration, + ) -> anyhow::Result { + let mut proc = { input }; // TODO: remove this legacy rename, but this keep the patch small. + let mut nwrite = 0usize; + + while nwrite < writebuf.len() { + let mut stdin_pollfds = [PollFd::new(&proc.stdin, PollFlags::POLLOUT)]; + let n = loop { + match nix::poll::poll(&mut stdin_pollfds[..], wal_redo_timeout.as_millis() as i32) { + Err(nix::errno::Errno::EINTR) => continue, + res => break res, + } + }?; + + if n == 0 { + anyhow::bail!("WAL redo timed out"); + } + + // If 'stdin' is writeable, do write. + let in_revents = stdin_pollfds[0].revents().unwrap(); + if in_revents & (PollFlags::POLLERR | PollFlags::POLLOUT) != PollFlags::empty() { + nwrite += proc.stdin.write(&writebuf[nwrite..])?; + } + if in_revents.contains(PollFlags::POLLHUP) { + // We still have more data to write, but the process closed the pipe. 
+ anyhow::bail!("WAL redo process closed its stdin unexpectedly"); + } + } + let request_no = proc.n_requests; + proc.n_requests += 1; + drop(proc); + + // To improve walredo performance we separate sending requests and receiving + // responses. Them are protected by different mutexes (output and input). + // If thread T1, T2, T3 send requests D1, D2, D3 to walredo process + // then there is not warranty that T1 will first granted output mutex lock. + // To address this issue we maintain number of sent requests, number of processed + // responses and ring buffer with pending responses. After sending response + // (under input mutex), threads remembers request number. Then it releases + // input mutex, locks output mutex and fetch in ring buffer all responses until + // its stored request number. The it takes correspondent element from + // pending responses ring buffer and truncate all empty elements from the front, + // advancing processed responses number. + + let mut output = self.stdout.lock().unwrap(); + let n_processed_responses = output.n_processed_responses; + while n_processed_responses + output.pending_responses.len() <= request_no { + // We expect the WAL redo process to respond with an 8k page image. We read it + // into this buffer. + let mut resultbuf = vec![0; BLCKSZ.into()]; + let mut nresult: usize = 0; // # of bytes read into 'resultbuf' so far + while nresult < BLCKSZ.into() { + let mut stdout_pollfds = [PollFd::new(&output.stdout, PollFlags::POLLIN)]; + // We do two things simultaneously: reading response from stdout + // and forward any logging information that the child writes to its stderr to the page server's log. + let n = loop { + match nix::poll::poll( + &mut stdout_pollfds[..], + wal_redo_timeout.as_millis() as i32, + ) { + Err(nix::errno::Errno::EINTR) => continue, + res => break res, + } + }?; + + if n == 0 { + anyhow::bail!("WAL redo timed out"); + } + + // If we have some data in stdout, read it to the result buffer. 
+ let out_revents = stdout_pollfds[0].revents().unwrap(); + if out_revents & (PollFlags::POLLERR | PollFlags::POLLIN) != PollFlags::empty() { + nresult += output.stdout.read(&mut resultbuf[nresult..])?; + } + if out_revents.contains(PollFlags::POLLHUP) { + anyhow::bail!("WAL redo process closed its stdout unexpectedly"); + } + } + output + .pending_responses + .push_back(Some(Bytes::from(resultbuf))); + } + // Replace our request's response with None in `pending_responses`. + // Then make space in the ring buffer by clearing out any seqence of contiguous + // `None`'s from the front of `pending_responses`. + // NB: We can't pop_front() because other requests' responses because another + // requester might have grabbed the output mutex before us: + // T1: grab input mutex + // T1: send request_no 23 + // T1: release input mutex + // T2: grab input mutex + // T2: send request_no 24 + // T2: release input mutex + // T2: grab output mutex + // T2: n_processed_responses + output.pending_responses.len() <= request_no + // 23 0 24 + // T2: enters poll loop that reads stdout + // T2: put response for 23 into pending_responses + // T2: put response for 24 into pending_resposnes + // pending_responses now looks like this: Front Some(response_23) Some(response_24) Back + // T2: takes its response_24 + // pending_responses now looks like this: Front Some(response_23) None Back + // T2: does the while loop below + // pending_responses now looks like this: Front Some(response_23) None Back + // T2: releases output mutex + // T1: grabs output mutex + // T1: n_processed_responses + output.pending_responses.len() > request_no + // 23 2 23 + // T1: skips poll loop that reads stdout + // T1: takes its response_23 + // pending_responses now looks like this: Front None None Back + // T2: does the while loop below + // pending_responses now looks like this: Front Back + // n_processed_responses now has value 25 + let res = output.pending_responses[request_no - n_processed_responses] + 
.take() + .expect("we own this request_no, nobody else is supposed to take it"); + while let Some(front) = output.pending_responses.front() { + if front.is_none() { + output.pending_responses.pop_front(); + output.n_processed_responses += 1; + } else { + break; + } + } + Ok(res) + } + + #[cfg(feature = "testing")] + fn record_and_log(&self, writebuf: &[u8]) { + use std::sync::atomic::Ordering; + + let millis = std::time::SystemTime::now() + .duration_since(std::time::SystemTime::UNIX_EPOCH) + .unwrap() + .as_millis(); + + let seq = self.dump_sequence.fetch_add(1, Ordering::Relaxed); + + // these files will be collected to an allure report + let filename = format!("walredo-{millis}-{}-{seq}.walredo", writebuf.len()); + + let path = self.conf.tenant_path(&self.tenant_shard_id).join(&filename); + + let res = std::fs::OpenOptions::new() + .write(true) + .create_new(true) + .read(true) + .open(path) + .and_then(|mut f| f.write_all(writebuf)); + + // trip up allowed_errors + if let Err(e) = res { + tracing::error!(target=%filename, length=writebuf.len(), "failed to write out the walredo errored input: {e}"); + } else { + tracing::error!(filename, "erroring walredo input saved"); + } + } + + #[cfg(not(feature = "testing"))] + fn record_and_log(&self, _: &[u8]) {} +} + +impl Drop for WalRedoProcess { + fn drop(&mut self) { + self.child + .take() + .expect("we only do this once") + .kill_and_wait(WalRedoKillCause::WalRedoProcessDrop); + // no way to wait for stderr_logger_task from Drop because that is async only + } +} diff --git a/pgxn/neon/libpagestore.c b/pgxn/neon/libpagestore.c index 2276b4e807..b7b1e7ccbf 100644 --- a/pgxn/neon/libpagestore.c +++ b/pgxn/neon/libpagestore.c @@ -49,6 +49,8 @@ char *neon_auth_token; int readahead_buffer_size = 128; int flush_every_n_requests = 8; +int neon_protocol_version = 1; + static int n_reconnect_attempts = 0; static int max_reconnect_attempts = 60; static int stripe_size; @@ -379,7 +381,17 @@ pageserver_connect(shardno_t 
shard_no, int elevel) pfree(msg); return false; } - query = psprintf("pagestream %s %s", neon_tenant, neon_timeline); + switch (neon_protocol_version) + { + case 2: + query = psprintf("pagestream_v2 %s %s", neon_tenant, neon_timeline); + break; + case 1: + query = psprintf("pagestream %s %s", neon_tenant, neon_timeline); + break; + default: + elog(ERROR, "unexpected neon_protocol_version %d", neon_protocol_version); + } ret = PQsendQuery(conn, query); pfree(query); if (ret != 1) @@ -440,7 +452,7 @@ pageserver_connect(shardno_t shard_no, int elevel) return false; } - neon_shard_log(shard_no, LOG, "libpagestore: connected to '%s'", connstr); + neon_shard_log(shard_no, LOG, "libpagestore: connected to '%s' with protocol version %d", connstr, neon_protocol_version); page_servers[shard_no].conn = conn; page_servers[shard_no].wes = wes; @@ -844,6 +856,16 @@ pg_init_libpagestore(void) PGC_USERSET, 0, /* no flags required */ NULL, (GucIntAssignHook) &readahead_buffer_resize, NULL); + DefineCustomIntVariable("neon.protocol_version", + "Version of compute<->page server protocol", + NULL, + &neon_protocol_version, + 1, /* default to old protocol for now */ + 1, /* min */ + 2, /* max */ + PGC_SU_BACKEND, + 0, /* no flags required */ + NULL, NULL, NULL); relsize_hash_init(); diff --git a/pgxn/neon/pagestore_client.h b/pgxn/neon/pagestore_client.h index 44ae766f76..7709ab9d42 100644 --- a/pgxn/neon/pagestore_client.h +++ b/pgxn/neon/pagestore_client.h @@ -69,18 +69,33 @@ typedef enum { SLRU_MULTIXACT_OFFSETS } SlruKind; -/* - * supertype of all the Neon*Request structs below +/*-- + * supertype of all the Neon*Request structs below. * - * If 'latest' is true, we are requesting the latest page version, and 'lsn' - * is just a hint to the server that we know there are no versions of the page - * (or relation size, for exists/nblocks requests) later than the 'lsn'. 
+ * All requests contain two LSNs: + * + * lsn: request page (or relation size, etc) at this LSN + * not_modified_since: Hint that the page hasn't been modified between + * this LSN and the request LSN (`lsn`). + * + * To request the latest version of a page, you can use MAX_LSN as the request + * LSN. + * + * If you don't know any better, you can always set 'not_modified_since' equal + * to 'lsn', but providing a lower value can speed up processing the request + * in the pageserver, as it doesn't need to wait for the WAL to arrive, and it + * can skip traversing through recent layers which we know to not contain any + * versions for the requested page. + * + * These structs describe the V2 of these requests. The old V1 protocol contained + * just one LSN and a boolean 'latest' flag. If the neon_protocol_version GUC is + * set to 1, we will convert these to the V1 requests before sending. */ typedef struct { NeonMessageTag tag; - bool latest; /* if true, request latest page version */ - XLogRecPtr lsn; /* request page version @ this LSN */ + XLogRecPtr lsn; + XLogRecPtr not_modified_since; } NeonRequest; typedef struct @@ -193,6 +208,7 @@ extern int readahead_buffer_size; extern char *neon_timeline; extern char *neon_tenant; extern int32 max_cluster_size; +extern int neon_protocol_version; extern shardno_t get_shard_number(BufferTag* tag); @@ -225,14 +241,14 @@ extern bool neon_prefetch(SMgrRelation reln, ForkNumber forknum, extern void neon_read(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, char *buffer); extern PGDLLEXPORT void neon_read_at_lsn(NRelFileInfo rnode, ForkNumber forkNum, BlockNumber blkno, - XLogRecPtr request_lsn, bool request_latest, char *buffer); + XLogRecPtr request_lsn, XLogRecPtr not_modified_since, char *buffer); extern void neon_write(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, char *buffer, bool skipFsync); #else extern void neon_read(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, void *buffer); 
extern PGDLLEXPORT void neon_read_at_lsn(NRelFileInfo rnode, ForkNumber forkNum, BlockNumber blkno, - XLogRecPtr request_lsn, bool request_latest, void *buffer); + XLogRecPtr request_lsn, XLogRecPtr not_modified_since, void *buffer); extern void neon_write(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, const void *buffer, bool skipFsync); #endif diff --git a/pgxn/neon/pagestore_smgr.c b/pgxn/neon/pagestore_smgr.c index 57a16e00ca..44ecdbd9aa 100644 --- a/pgxn/neon/pagestore_smgr.c +++ b/pgxn/neon/pagestore_smgr.c @@ -168,8 +168,8 @@ typedef enum PrefetchStatus typedef struct PrefetchRequest { BufferTag buftag; /* must be first entry in the struct */ - XLogRecPtr effective_request_lsn; - XLogRecPtr actual_request_lsn; + XLogRecPtr request_lsn; + XLogRecPtr not_modified_since; NeonResponse *response; /* may be null */ PrefetchStatus status; shardno_t shard_no; @@ -269,19 +269,19 @@ static PrefetchState *MyPState; ) \ ) -static XLogRecPtr prefetch_lsn = 0; - static bool compact_prefetch_buffers(void); static void consume_prefetch_responses(void); -static uint64 prefetch_register_buffer(BufferTag tag, bool *force_latest, XLogRecPtr *force_lsn); +static uint64 prefetch_register_buffer(BufferTag tag, XLogRecPtr *force_request_lsn, XLogRecPtr *force_not_modified_since); static bool prefetch_read(PrefetchRequest *slot); -static void prefetch_do_request(PrefetchRequest *slot, bool *force_latest, XLogRecPtr *force_lsn); +static void prefetch_do_request(PrefetchRequest *slot, XLogRecPtr *force_request_lsn, XLogRecPtr *force_not_modified_since); static bool prefetch_wait_for(uint64 ring_index); static void prefetch_cleanup_trailing_unused(void); static inline void prefetch_set_unused(uint64 ring_index); -static XLogRecPtr neon_get_request_lsn(bool *latest, NRelFileInfo rinfo, - ForkNumber forknum, BlockNumber blkno); +static void neon_get_request_lsn(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blkno, + XLogRecPtr *request_lsn, XLogRecPtr 
*not_modified_since); +static bool neon_prefetch_response_usable(XLogRecPtr request_lsn, XLogRecPtr not_modified_since, + PrefetchRequest *slot); static bool compact_prefetch_buffers(void) @@ -338,8 +338,8 @@ compact_prefetch_buffers(void) target_slot->shard_no = source_slot->shard_no; target_slot->status = source_slot->status; target_slot->response = source_slot->response; - target_slot->effective_request_lsn = source_slot->effective_request_lsn; - target_slot->actual_request_lsn = source_slot->actual_request_lsn; + target_slot->request_lsn = source_slot->request_lsn; + target_slot->not_modified_since = source_slot->not_modified_since; target_slot->my_ring_index = empty_ring_index; prfh_delete(MyPState->prf_hash, source_slot); @@ -358,7 +358,8 @@ compact_prefetch_buffers(void) }; source_slot->response = NULL; source_slot->my_ring_index = 0; - source_slot->effective_request_lsn = 0; + source_slot->request_lsn = InvalidXLogRecPtr; + source_slot->not_modified_since = InvalidXLogRecPtr; /* update bookkeeping */ n_moved++; @@ -683,56 +684,39 @@ prefetch_set_unused(uint64 ring_index) compact_prefetch_buffers(); } +/* + * Send one prefetch request to the pageserver. To wait for the response, call + * prefetch_wait_for(). 
+ */ static void -prefetch_do_request(PrefetchRequest *slot, bool *force_latest, XLogRecPtr *force_lsn) +prefetch_do_request(PrefetchRequest *slot, XLogRecPtr *force_request_lsn, XLogRecPtr *force_not_modified_since) { bool found; NeonGetPageRequest request = { .req.tag = T_NeonGetPageRequest, - .req.latest = false, - .req.lsn = 0, + /* lsn and not_modified_since are filled in below */ .rinfo = BufTagGetNRelFileInfo(slot->buftag), .forknum = slot->buftag.forkNum, .blkno = slot->buftag.blockNum, }; - if (force_lsn && force_latest) + Assert(((force_request_lsn != NULL) == (force_not_modified_since != NULL))); + + if (force_request_lsn) { - request.req.lsn = *force_lsn; - request.req.latest = *force_latest; - slot->actual_request_lsn = slot->effective_request_lsn = *force_lsn; + request.req.lsn = *force_request_lsn; + request.req.not_modified_since = *force_not_modified_since; } else { - XLogRecPtr lsn = neon_get_request_lsn( - &request.req.latest, - BufTagGetNRelFileInfo(slot->buftag), - slot->buftag.forkNum, - slot->buftag.blockNum - ); - - /* - * Note: effective_request_lsn is potentially higher than the - * requested LSN, but still correct: - * - * We know there are no changes between the actual requested LSN and - * the value of effective_request_lsn: If there were, the page would - * have been in cache and evicted between those LSN values, which then - * would have had to result in a larger request LSN for this page. - * - * It is possible that a concurrent backend loads the page, modifies - * it and then evicts it again, but the LSN of that eviction cannot be - * smaller than the current WAL insert/redo pointer, which is already - * larger than this prefetch_lsn. So in any case, that would - * invalidate this cache. - * - * The best LSN to use for effective_request_lsn would be - * XLogCtl->Insert.RedoRecPtr, but that's expensive to access. 
- */ - slot->actual_request_lsn = request.req.lsn = lsn; - prefetch_lsn = Max(prefetch_lsn, lsn); - slot->effective_request_lsn = prefetch_lsn; + neon_get_request_lsn(BufTagGetNRelFileInfo(slot->buftag), + slot->buftag.forkNum, + slot->buftag.blockNum, + &request.req.lsn, + &request.req.not_modified_since); } + slot->request_lsn = request.req.lsn; + slot->not_modified_since = request.req.not_modified_since; Assert(slot->response == NULL); Assert(slot->my_ring_index == MyPState->ring_unused); @@ -749,7 +733,6 @@ prefetch_do_request(PrefetchRequest *slot, bool *force_latest, XLogRecPtr *force /* update slot state */ slot->status = PRFS_REQUESTED; - prfh_insert(MyPState->prf_hash, slot, &found); Assert(!found); } @@ -759,22 +742,25 @@ prefetch_do_request(PrefetchRequest *slot, bool *force_latest, XLogRecPtr *force * * Register that we may want the contents of BufferTag in the near future. * - * If force_latest and force_lsn are not NULL, those values are sent to the - * pageserver. If they are NULL, we utilize the lastWrittenLsn -infrastructure - * to fill in these values manually. + * If force_request_lsn and force_not_modified_since are not NULL, those + * values are sent to the pageserver. If they are NULL, we utilize the + * lastWrittenLsn -infrastructure to fill them in. * * NOTE: this function may indirectly update MyPState->pfs_hash; which * invalidates any active pointers into the hash table. 
*/ static uint64 -prefetch_register_buffer(BufferTag tag, bool *force_latest, XLogRecPtr *force_lsn) +prefetch_register_buffer(BufferTag tag, XLogRecPtr *force_request_lsn, + XLogRecPtr *force_not_modified_since) { uint64 ring_index; PrefetchRequest req; PrefetchRequest *slot; PrfHashEntry *entry; + Assert(((force_request_lsn != NULL) == (force_not_modified_since != NULL))); + /* use an intermediate PrefetchRequest struct to ensure correct alignment */ req.buftag = tag; Retry: @@ -792,40 +778,19 @@ Retry: Assert(BUFFERTAGS_EQUAL(slot->buftag, tag)); /* - * If we want a specific lsn, we do not accept requests that were made - * with a potentially different LSN. + * If the caller specified a request LSN to use, only accept prefetch + * responses that satisfy that request. */ - if (force_latest && force_lsn) + if (force_request_lsn) { - /* - * if we want the latest version, any effective_request_lsn < - * request lsn is OK - */ - if (*force_latest) + if (!neon_prefetch_response_usable(*force_request_lsn, + *force_not_modified_since, slot)) { - if (*force_lsn > slot->effective_request_lsn) - { - if (!prefetch_wait_for(ring_index)) - goto Retry; - prefetch_set_unused(ring_index); - entry = NULL; - } - - } - - /* - * if we don't want the latest version, only accept requests with - * the exact same LSN - */ - else - { - if (*force_lsn != slot->effective_request_lsn) - { - if (!prefetch_wait_for(ring_index)) - goto Retry; - prefetch_set_unused(ring_index); - entry = NULL; - } + /* Wait for the old request to finish and discard it */ + if (!prefetch_wait_for(ring_index)) + goto Retry; + prefetch_set_unused(ring_index); + entry = NULL; } } @@ -921,7 +886,7 @@ Retry: slot->shard_no = get_shard_number(&tag); slot->my_ring_index = ring_index; - prefetch_do_request(slot, force_latest, force_lsn); + prefetch_do_request(slot, force_request_lsn, force_not_modified_since); Assert(slot->status == PRFS_REQUESTED); Assert(MyPState->ring_last <= ring_index && ring_index < 
MyPState->ring_unused); @@ -950,7 +915,7 @@ page_server_request(void const *req) BufferTag tag = {0}; shardno_t shard_no; - switch (((NeonRequest *) req)->tag) + switch (messageTag(req)) { case T_NeonExistsRequest: CopyNRelFileInfoToBufTag(tag, ((NeonExistsRequest *) req)->rinfo); @@ -966,11 +931,10 @@ page_server_request(void const *req) tag.blockNum = ((NeonGetPageRequest *) req)->blkno; break; default: - neon_log(ERROR, "Unexpected request tag: %d", ((NeonRequest *) req)->tag); + neon_log(ERROR, "Unexpected request tag: %d", messageTag(req)); } shard_no = get_shard_number(&tag); - /* * Current sharding model assumes that all metadata is present only at shard 0. * We still need to call get_shard_no() to check if shard map is up-to-date. @@ -997,8 +961,52 @@ nm_pack_request(NeonRequest *msg) StringInfoData s; initStringInfo(&s); - pq_sendbyte(&s, msg->tag); + if (neon_protocol_version >= 2) + { + pq_sendbyte(&s, msg->tag); + pq_sendint64(&s, msg->lsn); + pq_sendint64(&s, msg->not_modified_since); + } + else + { + bool latest; + XLogRecPtr lsn; + + /* + * In primary, we always request the latest page version. + */ + if (!RecoveryInProgress()) + { + latest = true; + lsn = msg->not_modified_since; + } + else + { + /* + * In the protocol V1, we cannot represent that we want to read + * page at LSN X, and we know that it hasn't been modified since + * Y. We can either use 'not_modified_lsn' as the request LSN, and + * risk getting an error if that LSN is too old and has already + * fallen out of the pageserver's GC horizon, or we can send + * 'request_lsn', causing the pageserver to possibly wait for the + * recent WAL to arrive unnecessarily. Or something in between. We + * choose to use the old LSN and risk GC errors, because that's + * what we've done historically. 
+ */ + latest = false; + lsn = msg->not_modified_since; + } + + pq_sendbyte(&s, msg->tag); + pq_sendbyte(&s, latest); + pq_sendint64(&s, lsn); + } + + /* + * The rest of the request messages are the same between protocol V1 and + * V2 + */ switch (messageTag(msg)) { /* pagestore_client -> pagestore */ @@ -1006,8 +1014,6 @@ nm_pack_request(NeonRequest *msg) { NeonExistsRequest *msg_req = (NeonExistsRequest *) msg; - pq_sendbyte(&s, msg_req->req.latest); - pq_sendint64(&s, msg_req->req.lsn); pq_sendint32(&s, NInfoGetSpcOid(msg_req->rinfo)); pq_sendint32(&s, NInfoGetDbOid(msg_req->rinfo)); pq_sendint32(&s, NInfoGetRelNumber(msg_req->rinfo)); @@ -1019,8 +1025,6 @@ nm_pack_request(NeonRequest *msg) { NeonNblocksRequest *msg_req = (NeonNblocksRequest *) msg; - pq_sendbyte(&s, msg_req->req.latest); - pq_sendint64(&s, msg_req->req.lsn); pq_sendint32(&s, NInfoGetSpcOid(msg_req->rinfo)); pq_sendint32(&s, NInfoGetDbOid(msg_req->rinfo)); pq_sendint32(&s, NInfoGetRelNumber(msg_req->rinfo)); @@ -1032,8 +1036,6 @@ nm_pack_request(NeonRequest *msg) { NeonDbSizeRequest *msg_req = (NeonDbSizeRequest *) msg; - pq_sendbyte(&s, msg_req->req.latest); - pq_sendint64(&s, msg_req->req.lsn); pq_sendint32(&s, msg_req->dbNode); break; @@ -1042,8 +1044,6 @@ nm_pack_request(NeonRequest *msg) { NeonGetPageRequest *msg_req = (NeonGetPageRequest *) msg; - pq_sendbyte(&s, msg_req->req.latest); - pq_sendint64(&s, msg_req->req.lsn); pq_sendint32(&s, NInfoGetSpcOid(msg_req->rinfo)); pq_sendint32(&s, NInfoGetDbOid(msg_req->rinfo)); pq_sendint32(&s, NInfoGetRelNumber(msg_req->rinfo)); @@ -1057,8 +1057,6 @@ nm_pack_request(NeonRequest *msg) { NeonGetSlruSegmentRequest *msg_req = (NeonGetSlruSegmentRequest *) msg; - pq_sendbyte(&s, msg_req->req.latest); - pq_sendint64(&s, msg_req->req.lsn); pq_sendbyte(&s, msg_req->kind); pq_sendint32(&s, msg_req->segno); @@ -1209,7 +1207,7 @@ nm_to_string(NeonMessage *msg) appendStringInfo(&s, ", \"rinfo\": \"%u/%u/%u\"", RelFileInfoFmt(msg_req->rinfo)); 
appendStringInfo(&s, ", \"forknum\": %d", msg_req->forknum); appendStringInfo(&s, ", \"lsn\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->req.lsn)); - appendStringInfo(&s, ", \"latest\": %d", msg_req->req.latest); + appendStringInfo(&s, ", \"not_modified_since\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->req.not_modified_since)); appendStringInfoChar(&s, '}'); break; } @@ -1222,7 +1220,7 @@ nm_to_string(NeonMessage *msg) appendStringInfo(&s, ", \"rinfo\": \"%u/%u/%u\"", RelFileInfoFmt(msg_req->rinfo)); appendStringInfo(&s, ", \"forknum\": %d", msg_req->forknum); appendStringInfo(&s, ", \"lsn\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->req.lsn)); - appendStringInfo(&s, ", \"latest\": %d", msg_req->req.latest); + appendStringInfo(&s, ", \"not_modified_since\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->req.not_modified_since)); appendStringInfoChar(&s, '}'); break; } @@ -1236,7 +1234,7 @@ nm_to_string(NeonMessage *msg) appendStringInfo(&s, ", \"forknum\": %d", msg_req->forknum); appendStringInfo(&s, ", \"blkno\": %u", msg_req->blkno); appendStringInfo(&s, ", \"lsn\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->req.lsn)); - appendStringInfo(&s, ", \"latest\": %d", msg_req->req.latest); + appendStringInfo(&s, ", \"not_modified_since\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->req.not_modified_since)); appendStringInfoChar(&s, '}'); break; } @@ -1247,7 +1245,7 @@ nm_to_string(NeonMessage *msg) appendStringInfoString(&s, "{\"type\": \"NeonDbSizeRequest\""); appendStringInfo(&s, ", \"dbnode\": \"%u\"", msg_req->dbNode); appendStringInfo(&s, ", \"lsn\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->req.lsn)); - appendStringInfo(&s, ", \"latest\": %d", msg_req->req.latest); + appendStringInfo(&s, ", \"not_modified_since\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->req.not_modified_since)); appendStringInfoChar(&s, '}'); break; } @@ -1259,7 +1257,7 @@ nm_to_string(NeonMessage *msg) appendStringInfo(&s, ", \"kind\": %u", msg_req->kind); appendStringInfo(&s, ", \"segno\": %u", msg_req->segno); appendStringInfo(&s, ", 
\"lsn\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->req.lsn)); - appendStringInfo(&s, ", \"latest\": %d", msg_req->req.latest); + appendStringInfo(&s, ", \"not_modified_since\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->req.not_modified_since)); appendStringInfoChar(&s, '}'); break; } @@ -1531,44 +1529,38 @@ nm_adjust_lsn(XLogRecPtr lsn) /* * Return LSN for requesting pages and number of blocks from page server */ -static XLogRecPtr -neon_get_request_lsn(bool *latest, NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blkno) +static void +neon_get_request_lsn(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blkno, + XLogRecPtr *request_lsn, XLogRecPtr *not_modified_since) { - XLogRecPtr lsn; + XLogRecPtr last_written_lsn; + + last_written_lsn = GetLastWrittenLSN(rinfo, forknum, blkno); + last_written_lsn = nm_adjust_lsn(last_written_lsn); + Assert(last_written_lsn != InvalidXLogRecPtr); if (RecoveryInProgress()) { - /* - * We don't know if WAL has been generated but not yet replayed, so - * we're conservative in our estimates about latest pages. - */ - *latest = false; + /* Request the page at the last replayed LSN. */ + *request_lsn = GetXLogReplayRecPtr(NULL); + *not_modified_since = last_written_lsn; + Assert(last_written_lsn <= *request_lsn); - /* - * Get the last written LSN of this page. - */ - lsn = GetLastWrittenLSN(rinfo, forknum, blkno); - lsn = nm_adjust_lsn(lsn); - - neon_log(DEBUG1, "neon_get_request_lsn GetXLogReplayRecPtr %X/%X request lsn 0 ", - (uint32) ((lsn) >> 32), (uint32) (lsn)); + neon_log(DEBUG1, "neon_get_request_lsn request lsn %X/%X, not_modified_since %X/%X", + LSN_FORMAT_ARGS(*request_lsn), LSN_FORMAT_ARGS(*not_modified_since)); } else { XLogRecPtr flushlsn; /* - * Use the latest LSN that was evicted from the buffer cache. Any - * pages modified by later WAL records must still in the buffer cache, - * so our request cannot concern those. + * Use the latest LSN that was evicted from the buffer cache as the + * 'not_modified_since' hint. 
Any pages modified by later WAL records + * must still in the buffer cache, so our request cannot concern + * those. */ - *latest = true; - lsn = GetLastWrittenLSN(rinfo, forknum, blkno); - Assert(lsn != InvalidXLogRecPtr); neon_log(DEBUG1, "neon_get_request_lsn GetLastWrittenLSN lsn %X/%X ", - (uint32) ((lsn) >> 32), (uint32) (lsn)); - - lsn = nm_adjust_lsn(lsn); + LSN_FORMAT_ARGS(last_written_lsn)); /* * Is it possible that the last-written LSN is ahead of last flush @@ -1583,16 +1575,109 @@ neon_get_request_lsn(bool *latest, NRelFileInfo rinfo, ForkNumber forknum, Block #else flushlsn = GetFlushRecPtr(); #endif - if (lsn > flushlsn) + if (last_written_lsn > flushlsn) { neon_log(DEBUG5, "last-written LSN %X/%X is ahead of last flushed LSN %X/%X", - (uint32) (lsn >> 32), (uint32) lsn, - (uint32) (flushlsn >> 32), (uint32) flushlsn); - XLogFlush(lsn); + LSN_FORMAT_ARGS(last_written_lsn), + LSN_FORMAT_ARGS(flushlsn)); + XLogFlush(last_written_lsn); + flushlsn = last_written_lsn; } + + /* + * Request the latest version of the page. The most up-to-date request + * LSN we could use would be the current insert LSN, but to avoid the + * overhead of looking it up, use 'flushlsn' instead. This relies on + * the assumption that if the page was modified since the last WAL + * flush, it should still be in the buffer cache, and we wouldn't be + * requesting it. + */ + *request_lsn = flushlsn; + *not_modified_since = last_written_lsn; + } +} + +/* + * neon_prefetch_response_usable -- Can a new request be satisfied by old one? + * + * This is used to check if the response to a prefetch request can be used to + * satisfy a page read now. 
+ */ +static bool +neon_prefetch_response_usable(XLogRecPtr request_lsn, XLogRecPtr not_modified_since, + PrefetchRequest *slot) +{ + /* sanity check the LSN's on the old and the new request */ + Assert(request_lsn >= not_modified_since); + Assert(slot->request_lsn >= slot->not_modified_since); + Assert(slot->status != PRFS_UNUSED); + + /* + * The new request's LSN should never be older than the old one. This + * could be an Assert, except that for testing purposes, we do provide an + * interface in neon_test_utils to fetch pages at arbitary LSNs, which + * violates this. + * + * Similarly, the not_modified_since value calculated for a page should + * never move backwards. This assumption is a bit fragile; if we updated + * the last-written cache when we read in a page, for example, then it + * might. But as the code stands, it should not. + * + * (If two backends issue a request at the same time, they might race and + * calculate LSNs "out of order" with each other, but the prefetch queue + * is backend-private at the moment.) + */ + if (request_lsn < slot->request_lsn || not_modified_since < slot->not_modified_since) + { + ereport(LOG, + (errcode(ERRCODE_IO_ERROR), + errmsg(NEON_TAG "request with unexpected LSN after prefetch"), + errdetail("Request %X/%X not_modified_since %X/%X, prefetch %X/%X not_modified_since %X/%X)", + LSN_FORMAT_ARGS(request_lsn), LSN_FORMAT_ARGS(not_modified_since), + LSN_FORMAT_ARGS(slot->request_lsn), LSN_FORMAT_ARGS(slot->not_modified_since)))); + return false; } - return lsn; + /*--- + * Each request to the pageserver carries two LSN values: + * `not_modified_since` and `request_lsn`. The (not_modified_since, + * request_lsn] range of each request is effectively a claim that the page + * has not been modified between those LSNs. If the range of the old + * request in the queue overlaps with the new request, we know that the + * page hasn't been modified in the union of the ranges. 
We can use the + * response to old request to satisfy the new request in that case. For + * example: + * + * 100 500 + * Old request: +--------+ + * + * 400 800 + * New request: +--------+ + * + * The old request claims that the page was not modified between LSNs 100 + * and 500, and the second claims that it was not modified between 400 and + * 800. Together they mean that the page was not modified between 100 and + * 800. Therefore the response to the old request is also valid for the + * new request. + * + * This logic also holds at the boundary case that the old request's LSN + * matches the new request's not_modified_since LSN exactly: + * + * 100 500 + * Old request: +--------+ + * + * 500 900 + * New request: +--------+ + * + * The response to the old request is the page as it was at LSN 500, and + * the page hasn't been changed in the range (500, 900], therefore the + * response is valid also for the new request. + */ + + /* this follows from the checks above */ + Assert(request_lsn >= slot->not_modified_since); + + return not_modified_since <= slot->request_lsn; } /* @@ -1604,8 +1689,8 @@ neon_exists(SMgrRelation reln, ForkNumber forkNum) bool exists; NeonResponse *resp; BlockNumber n_blocks; - bool latest; XLogRecPtr request_lsn; + XLogRecPtr not_modified_since; switch (reln->smgr_relpersistence) { @@ -1660,12 +1745,13 @@ neon_exists(SMgrRelation reln, ForkNumber forkNum) return false; } - request_lsn = neon_get_request_lsn(&latest, InfoFromSMgrRel(reln), forkNum, REL_METADATA_PSEUDO_BLOCKNO); + neon_get_request_lsn(InfoFromSMgrRel(reln), forkNum, REL_METADATA_PSEUDO_BLOCKNO, + &request_lsn, ¬_modified_since); { NeonExistsRequest request = { .req.tag = T_NeonExistsRequest, - .req.latest = latest, .req.lsn = request_lsn, + .req.not_modified_since = not_modified_since, .rinfo = InfoFromSMgrRel(reln), .forknum = forkNum}; @@ -2102,10 +2188,10 @@ neon_writeback(SMgrRelation reln, ForkNumber forknum, void #if PG_MAJORVERSION_NUM < 16 
neon_read_at_lsn(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, - XLogRecPtr request_lsn, bool request_latest, char *buffer) + XLogRecPtr request_lsn, XLogRecPtr not_modified_since, char *buffer) #else neon_read_at_lsn(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, - XLogRecPtr request_lsn, bool request_latest, void *buffer) + XLogRecPtr request_lsn, XLogRecPtr not_modified_since, void *buffer) #endif { NeonResponse *resp; @@ -2148,15 +2234,16 @@ neon_read_at_lsn(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, if (entry != NULL) { slot = entry->slot; - if (slot->effective_request_lsn >= request_lsn) + if (neon_prefetch_response_usable(request_lsn, not_modified_since, slot)) { ring_index = slot->my_ring_index; pgBufferUsage.prefetch.hits += 1; } - else /* the current prefetch LSN is not large - * enough, so drop the prefetch */ + else { /* + * Cannot use this prefetch, discard it + * * We can't drop cache for not-yet-received requested items. It is * unlikely this happens, but it can happen if prefetch distance * is large enough and a backend didn't consume all prefetch @@ -2181,8 +2268,8 @@ neon_read_at_lsn(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, { pgBufferUsage.prefetch.misses += 1; - ring_index = prefetch_register_buffer(buftag, &request_latest, - &request_lsn); + ring_index = prefetch_register_buffer(buftag, &request_lsn, + ¬_modified_since); slot = GetPrfSlot(ring_index); } else @@ -2246,8 +2333,8 @@ neon_read(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, char *buffer neon_read(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, void *buffer) #endif { - bool latest; XLogRecPtr request_lsn; + XLogRecPtr not_modified_since; switch (reln->smgr_relpersistence) { @@ -2272,8 +2359,9 @@ neon_read(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, void *buffer return; } - request_lsn = neon_get_request_lsn(&latest, InfoFromSMgrRel(reln), forkNum, blkno); - 
neon_read_at_lsn(InfoFromSMgrRel(reln), forkNum, blkno, request_lsn, latest, buffer); + neon_get_request_lsn(InfoFromSMgrRel(reln), forkNum, blkno, + &request_lsn, ¬_modified_since); + neon_read_at_lsn(InfoFromSMgrRel(reln), forkNum, blkno, request_lsn, not_modified_since, buffer); #ifdef DEBUG_COMPARE_LOCAL if (forkNum == MAIN_FORKNUM && IS_LOCAL_REL(reln)) @@ -2442,8 +2530,8 @@ neon_nblocks(SMgrRelation reln, ForkNumber forknum) { NeonResponse *resp; BlockNumber n_blocks; - bool latest; XLogRecPtr request_lsn; + XLogRecPtr not_modified_since; switch (reln->smgr_relpersistence) { @@ -2470,12 +2558,13 @@ neon_nblocks(SMgrRelation reln, ForkNumber forknum) return n_blocks; } - request_lsn = neon_get_request_lsn(&latest, InfoFromSMgrRel(reln), forknum, REL_METADATA_PSEUDO_BLOCKNO); + neon_get_request_lsn(InfoFromSMgrRel(reln), forknum, REL_METADATA_PSEUDO_BLOCKNO, + &request_lsn, ¬_modified_since); { NeonNblocksRequest request = { .req.tag = T_NeonNblocksRequest, - .req.latest = latest, .req.lsn = request_lsn, + .req.not_modified_since = not_modified_since, .rinfo = InfoFromSMgrRel(reln), .forknum = forknum, }; @@ -2523,16 +2612,17 @@ neon_dbsize(Oid dbNode) { NeonResponse *resp; int64 db_size; - XLogRecPtr request_lsn; - bool latest; + XLogRecPtr request_lsn, + not_modified_since; NRelFileInfo dummy_node = {0}; - request_lsn = neon_get_request_lsn(&latest, dummy_node, MAIN_FORKNUM, REL_METADATA_PSEUDO_BLOCKNO); + neon_get_request_lsn(dummy_node, MAIN_FORKNUM, REL_METADATA_PSEUDO_BLOCKNO, + &request_lsn, ¬_modified_since); { NeonDbSizeRequest request = { .req.tag = T_NeonDbSizeRequest, - .req.latest = latest, .req.lsn = request_lsn, + .req.not_modified_since = not_modified_since, .dbNode = dbNode, }; @@ -2605,7 +2695,6 @@ neon_truncate(SMgrRelation reln, ForkNumber forknum, BlockNumber nblocks) * the most recently inserted WAL record's LSN. 
*/ lsn = GetXLogInsertRecPtr(); - lsn = nm_adjust_lsn(lsn); /* @@ -2805,14 +2894,33 @@ neon_end_unlogged_build(SMgrRelation reln) static int neon_read_slru_segment(SMgrRelation reln, const char* path, int segno, void* buffer) { - XLogRecPtr request_lsn; - /* - * GetRedoStartLsn() returns LSN of basebackup. - * We need to download SLRU segments only once after node startup, - * then SLRUs are maintained locally. - */ - request_lsn = GetRedoStartLsn(); + XLogRecPtr request_lsn, + not_modified_since; + + if (RecoveryInProgress()) + { + request_lsn = GetXLogReplayRecPtr(NULL); + if (request_lsn == InvalidXLogRecPtr) + { + /* + * This happens in neon startup, we start up without replaying any + * records. + */ + request_lsn = GetRedoStartLsn(); + } + } + else + request_lsn = GetXLogInsertRecPtr(); request_lsn = nm_adjust_lsn(request_lsn); + + /* + * GetRedoStartLsn() returns LSN of basebackup. We know that the SLRU + * segment has not changed since the basebackup, because in order to + * modify it, we would have had to download it already. And once + * downloaded, we never evict SLRU segments from local disk. 
+ */ + not_modified_since = GetRedoStartLsn(); + SlruKind kind; if (STRPREFIX(path, "pg_xact")) @@ -2827,8 +2935,8 @@ neon_read_slru_segment(SMgrRelation reln, const char* path, int segno, void* buf NeonResponse *resp; NeonGetSlruSegmentRequest request = { .req.tag = T_NeonGetSlruSegmentRequest, - .req.latest = false, .req.lsn = request_lsn, + .req.not_modified_since = not_modified_since, .kind = kind, .segno = segno @@ -2956,6 +3064,9 @@ neon_extend_rel_size(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blkno, { BlockNumber relsize; + /* This is only used in WAL replay */ + Assert(RecoveryInProgress()); + /* Extend the relation if we know its size */ if (get_cached_relsize(rinfo, forknum, &relsize)) { @@ -2974,14 +3085,13 @@ neon_extend_rel_size(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blkno, * This length is later reused when we open the smgr to read the * block, which is fine and expected. */ - NeonResponse *response; NeonNblocksResponse *nbresponse; NeonNblocksRequest request = { .req = (NeonRequest) { - .lsn = end_recptr, - .latest = false, .tag = T_NeonNblocksRequest, + .lsn = end_recptr, + .not_modified_since = end_recptr, }, .rinfo = rinfo, .forknum = forknum, diff --git a/pgxn/neon_test_utils/Makefile b/pgxn/neon_test_utils/Makefile index 9c774ec185..1ee87357e5 100644 --- a/pgxn/neon_test_utils/Makefile +++ b/pgxn/neon_test_utils/Makefile @@ -7,7 +7,7 @@ OBJS = \ neontest.o EXTENSION = neon_test_utils -DATA = neon_test_utils--1.0.sql +DATA = neon_test_utils--1.1.sql PGFILEDESC = "neon_test_utils - helpers for neon testing and debugging" PG_CONFIG = pg_config diff --git a/pgxn/neon_test_utils/neon_test_utils--1.0.sql b/pgxn/neon_test_utils/neon_test_utils--1.1.sql similarity index 89% rename from pgxn/neon_test_utils/neon_test_utils--1.0.sql rename to pgxn/neon_test_utils/neon_test_utils--1.1.sql index 23340e352e..534784f319 100644 --- a/pgxn/neon_test_utils/neon_test_utils--1.0.sql +++ b/pgxn/neon_test_utils/neon_test_utils--1.1.sql @@ 
-31,12 +31,12 @@ AS 'MODULE_PATHNAME', 'clear_buffer_cache' LANGUAGE C STRICT PARALLEL UNSAFE; -CREATE FUNCTION get_raw_page_at_lsn(relname text, forkname text, blocknum int8, lsn pg_lsn) +CREATE FUNCTION get_raw_page_at_lsn(relname text, forkname text, blocknum int8, request_lsn pg_lsn, not_modified_since pg_lsn) RETURNS bytea AS 'MODULE_PATHNAME', 'get_raw_page_at_lsn' LANGUAGE C PARALLEL UNSAFE; -CREATE FUNCTION get_raw_page_at_lsn(tbspc oid, db oid, relfilenode oid, forknum int8, blocknum int8, lsn pg_lsn) +CREATE FUNCTION get_raw_page_at_lsn(tbspc oid, db oid, relfilenode oid, forknum int8, blocknum int8, request_lsn pg_lsn, not_modified_since pg_lsn) RETURNS bytea AS 'MODULE_PATHNAME', 'get_raw_page_at_lsn_ex' LANGUAGE C PARALLEL UNSAFE; diff --git a/pgxn/neon_test_utils/neon_test_utils.control b/pgxn/neon_test_utils/neon_test_utils.control index 5219571f11..5f6d640835 100644 --- a/pgxn/neon_test_utils/neon_test_utils.control +++ b/pgxn/neon_test_utils/neon_test_utils.control @@ -1,6 +1,6 @@ # neon_test_utils extension comment = 'helpers for neon testing and debugging' -default_version = '1.0' +default_version = '1.1' module_pathname = '$libdir/neon_test_utils' relocatable = true trusted = true diff --git a/pgxn/neon_test_utils/neontest.c b/pgxn/neon_test_utils/neontest.c index 82ce5be9f6..677006923d 100644 --- a/pgxn/neon_test_utils/neontest.c +++ b/pgxn/neon_test_utils/neontest.c @@ -48,10 +48,10 @@ PG_FUNCTION_INFO_V1(neon_xlogflush); */ #if PG_MAJORVERSION_NUM < 16 typedef void (*neon_read_at_lsn_type) (NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, - XLogRecPtr request_lsn, bool request_latest, char *buffer); + XLogRecPtr request_lsn, XLogRecPtr not_modified_since, char *buffer); #else typedef void (*neon_read_at_lsn_type) (NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, - XLogRecPtr request_lsn, bool request_latest, void *buffer); + XLogRecPtr request_lsn, XLogRecPtr not_modified_since, void *buffer); #endif static 
neon_read_at_lsn_type neon_read_at_lsn_ptr; @@ -299,8 +299,11 @@ get_raw_page_at_lsn(PG_FUNCTION_ARGS) text *forkname; uint32 blkno; - bool request_latest = PG_ARGISNULL(3); - uint64 read_lsn = request_latest ? GetXLogInsertRecPtr() : PG_GETARG_INT64(3); + XLogRecPtr request_lsn; + XLogRecPtr not_modified_since; + + if (PG_NARGS() != 5) + elog(ERROR, "unexpected number of arguments in SQL function signature"); if (PG_ARGISNULL(0) || PG_ARGISNULL(1) || PG_ARGISNULL(2)) PG_RETURN_NULL(); @@ -309,6 +312,9 @@ get_raw_page_at_lsn(PG_FUNCTION_ARGS) forkname = PG_GETARG_TEXT_PP(1); blkno = PG_GETARG_UINT32(2); + request_lsn = PG_ARGISNULL(3) ? GetXLogInsertRecPtr() : PG_GETARG_LSN(3); + not_modified_since = PG_ARGISNULL(4) ? request_lsn : PG_GETARG_LSN(4); + if (!superuser()) ereport(ERROR, (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE), @@ -361,7 +367,7 @@ get_raw_page_at_lsn(PG_FUNCTION_ARGS) SET_VARSIZE(raw_page, BLCKSZ + VARHDRSZ); raw_page_data = VARDATA(raw_page); - neon_read_at_lsn(InfoFromRelation(rel), forknum, blkno, read_lsn, request_latest, raw_page_data); + neon_read_at_lsn(InfoFromRelation(rel), forknum, blkno, request_lsn, not_modified_since, raw_page_data); relation_close(rel, AccessShareLock); @@ -380,6 +386,9 @@ get_raw_page_at_lsn_ex(PG_FUNCTION_ARGS) { char *raw_page_data; + if (PG_NARGS() != 7) + elog(ERROR, "unexpected number of arguments in SQL function signature"); + if (!superuser()) ereport(ERROR, (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE), @@ -403,18 +412,20 @@ get_raw_page_at_lsn_ex(PG_FUNCTION_ARGS) }; ForkNumber forknum = PG_GETARG_UINT32(3); - uint32 blkno = PG_GETARG_UINT32(4); - bool request_latest = PG_ARGISNULL(5); - uint64 read_lsn = request_latest ? GetXLogInsertRecPtr() : PG_GETARG_INT64(5); + XLogRecPtr request_lsn; + XLogRecPtr not_modified_since; /* Initialize buffer to copy to */ bytea *raw_page = (bytea *) palloc(BLCKSZ + VARHDRSZ); + request_lsn = PG_ARGISNULL(5) ? 
GetXLogInsertRecPtr() : PG_GETARG_LSN(5); + not_modified_since = PG_ARGISNULL(6) ? request_lsn : PG_GETARG_LSN(6); + SET_VARSIZE(raw_page, BLCKSZ + VARHDRSZ); raw_page_data = VARDATA(raw_page); - neon_read_at_lsn(rinfo, forknum, blkno, read_lsn, request_latest, raw_page_data); + neon_read_at_lsn(rinfo, forknum, blkno, request_lsn, not_modified_since, raw_page_data); PG_RETURN_BYTEA_P(raw_page); } } diff --git a/poetry.lock b/poetry.lock index b43a3f6e93..c10e1d1e99 100644 --- a/poetry.lock +++ b/poetry.lock @@ -2,87 +2,87 @@ [[package]] name = "aiohttp" -version = "3.9.2" +version = "3.9.4" description = "Async http client/server framework (asyncio)" optional = false python-versions = ">=3.8" files = [ - {file = "aiohttp-3.9.2-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:772fbe371788e61c58d6d3d904268e48a594ba866804d08c995ad71b144f94cb"}, - {file = "aiohttp-3.9.2-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:edd4f1af2253f227ae311ab3d403d0c506c9b4410c7fc8d9573dec6d9740369f"}, - {file = "aiohttp-3.9.2-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:cfee9287778399fdef6f8a11c9e425e1cb13cc9920fd3a3df8f122500978292b"}, - {file = "aiohttp-3.9.2-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3cc158466f6a980a6095ee55174d1de5730ad7dec251be655d9a6a9dd7ea1ff9"}, - {file = "aiohttp-3.9.2-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:54ec82f45d57c9a65a1ead3953b51c704f9587440e6682f689da97f3e8defa35"}, - {file = "aiohttp-3.9.2-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:abeb813a18eb387f0d835ef51f88568540ad0325807a77a6e501fed4610f864e"}, - {file = "aiohttp-3.9.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:cc91d07280d7d169f3a0f9179d8babd0ee05c79d4d891447629ff0d7d8089ec2"}, - {file = "aiohttp-3.9.2-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = 
"sha256:b65e861f4bebfb660f7f0f40fa3eb9f2ab9af10647d05dac824390e7af8f75b7"}, - {file = "aiohttp-3.9.2-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:04fd8ffd2be73d42bcf55fd78cde7958eeee6d4d8f73c3846b7cba491ecdb570"}, - {file = "aiohttp-3.9.2-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:3d8d962b439a859b3ded9a1e111a4615357b01620a546bc601f25b0211f2da81"}, - {file = "aiohttp-3.9.2-cp310-cp310-musllinux_1_1_ppc64le.whl", hash = "sha256:8ceb658afd12b27552597cf9a65d9807d58aef45adbb58616cdd5ad4c258c39e"}, - {file = "aiohttp-3.9.2-cp310-cp310-musllinux_1_1_s390x.whl", hash = "sha256:0e4ee4df741670560b1bc393672035418bf9063718fee05e1796bf867e995fad"}, - {file = "aiohttp-3.9.2-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:2dec87a556f300d3211decf018bfd263424f0690fcca00de94a837949fbcea02"}, - {file = "aiohttp-3.9.2-cp310-cp310-win32.whl", hash = "sha256:3e1a800f988ce7c4917f34096f81585a73dbf65b5c39618b37926b1238cf9bc4"}, - {file = "aiohttp-3.9.2-cp310-cp310-win_amd64.whl", hash = "sha256:ea510718a41b95c236c992b89fdfc3d04cc7ca60281f93aaada497c2b4e05c46"}, - {file = "aiohttp-3.9.2-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:6aaa6f99256dd1b5756a50891a20f0d252bd7bdb0854c5d440edab4495c9f973"}, - {file = "aiohttp-3.9.2-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:a27d8c70ad87bcfce2e97488652075a9bdd5b70093f50b10ae051dfe5e6baf37"}, - {file = "aiohttp-3.9.2-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:54287bcb74d21715ac8382e9de146d9442b5f133d9babb7e5d9e453faadd005e"}, - {file = "aiohttp-3.9.2-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5bb3d05569aa83011fcb346b5266e00b04180105fcacc63743fc2e4a1862a891"}, - {file = "aiohttp-3.9.2-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:c8534e7d69bb8e8d134fe2be9890d1b863518582f30c9874ed7ed12e48abe3c4"}, - {file = "aiohttp-3.9.2-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = 
"sha256:4bd9d5b989d57b41e4ff56ab250c5ddf259f32db17159cce630fd543376bd96b"}, - {file = "aiohttp-3.9.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:fa6904088e6642609981f919ba775838ebf7df7fe64998b1a954fb411ffb4663"}, - {file = "aiohttp-3.9.2-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:bda42eb410be91b349fb4ee3a23a30ee301c391e503996a638d05659d76ea4c2"}, - {file = "aiohttp-3.9.2-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:193cc1ccd69d819562cc7f345c815a6fc51d223b2ef22f23c1a0f67a88de9a72"}, - {file = "aiohttp-3.9.2-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:b9f1cb839b621f84a5b006848e336cf1496688059d2408e617af33e3470ba204"}, - {file = "aiohttp-3.9.2-cp311-cp311-musllinux_1_1_ppc64le.whl", hash = "sha256:d22a0931848b8c7a023c695fa2057c6aaac19085f257d48baa24455e67df97ec"}, - {file = "aiohttp-3.9.2-cp311-cp311-musllinux_1_1_s390x.whl", hash = "sha256:4112d8ba61fbd0abd5d43a9cb312214565b446d926e282a6d7da3f5a5aa71d36"}, - {file = "aiohttp-3.9.2-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:c4ad4241b52bb2eb7a4d2bde060d31c2b255b8c6597dd8deac2f039168d14fd7"}, - {file = "aiohttp-3.9.2-cp311-cp311-win32.whl", hash = "sha256:ee2661a3f5b529f4fc8a8ffee9f736ae054adfb353a0d2f78218be90617194b3"}, - {file = "aiohttp-3.9.2-cp311-cp311-win_amd64.whl", hash = "sha256:4deae2c165a5db1ed97df2868ef31ca3cc999988812e82386d22937d9d6fed52"}, - {file = "aiohttp-3.9.2-cp312-cp312-macosx_10_9_universal2.whl", hash = "sha256:6f4cdba12539215aaecf3c310ce9d067b0081a0795dd8a8805fdb67a65c0572a"}, - {file = "aiohttp-3.9.2-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:84e843b33d5460a5c501c05539809ff3aee07436296ff9fbc4d327e32aa3a326"}, - {file = "aiohttp-3.9.2-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:8008d0f451d66140a5aa1c17e3eedc9d56e14207568cd42072c9d6b92bf19b52"}, - {file = "aiohttp-3.9.2-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = 
"sha256:61c47ab8ef629793c086378b1df93d18438612d3ed60dca76c3422f4fbafa792"}, - {file = "aiohttp-3.9.2-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:bc71f748e12284312f140eaa6599a520389273174b42c345d13c7e07792f4f57"}, - {file = "aiohttp-3.9.2-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:a1c3a4d0ab2f75f22ec80bca62385db2e8810ee12efa8c9e92efea45c1849133"}, - {file = "aiohttp-3.9.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9a87aa0b13bbee025faa59fa58861303c2b064b9855d4c0e45ec70182bbeba1b"}, - {file = "aiohttp-3.9.2-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:e2cc0d04688b9f4a7854c56c18aa7af9e5b0a87a28f934e2e596ba7e14783192"}, - {file = "aiohttp-3.9.2-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:1956e3ac376b1711c1533266dec4efd485f821d84c13ce1217d53e42c9e65f08"}, - {file = "aiohttp-3.9.2-cp312-cp312-musllinux_1_1_i686.whl", hash = "sha256:114da29f39eccd71b93a0fcacff178749a5c3559009b4a4498c2c173a6d74dff"}, - {file = "aiohttp-3.9.2-cp312-cp312-musllinux_1_1_ppc64le.whl", hash = "sha256:3f17999ae3927d8a9a823a1283b201344a0627272f92d4f3e3a4efe276972fe8"}, - {file = "aiohttp-3.9.2-cp312-cp312-musllinux_1_1_s390x.whl", hash = "sha256:f31df6a32217a34ae2f813b152a6f348154f948c83213b690e59d9e84020925c"}, - {file = "aiohttp-3.9.2-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:7a75307ffe31329928a8d47eae0692192327c599113d41b278d4c12b54e1bd11"}, - {file = "aiohttp-3.9.2-cp312-cp312-win32.whl", hash = "sha256:972b63d589ff8f305463593050a31b5ce91638918da38139b9d8deaba9e0fed7"}, - {file = "aiohttp-3.9.2-cp312-cp312-win_amd64.whl", hash = "sha256:200dc0246f0cb5405c80d18ac905c8350179c063ea1587580e3335bfc243ba6a"}, - {file = "aiohttp-3.9.2-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:158564d0d1020e0d3fe919a81d97aadad35171e13e7b425b244ad4337fc6793a"}, - {file = "aiohttp-3.9.2-cp38-cp38-macosx_10_9_x86_64.whl", hash = 
"sha256:da1346cd0ccb395f0ed16b113ebb626fa43b7b07fd7344fce33e7a4f04a8897a"}, - {file = "aiohttp-3.9.2-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:eaa9256de26ea0334ffa25f1913ae15a51e35c529a1ed9af8e6286dd44312554"}, - {file = "aiohttp-3.9.2-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1543e7fb00214fb4ccead42e6a7d86f3bb7c34751ec7c605cca7388e525fd0b4"}, - {file = "aiohttp-3.9.2-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:186e94570433a004e05f31f632726ae0f2c9dee4762a9ce915769ce9c0a23d89"}, - {file = "aiohttp-3.9.2-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:d52d20832ac1560f4510d68e7ba8befbc801a2b77df12bd0cd2bcf3b049e52a4"}, - {file = "aiohttp-3.9.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1c45e4e815ac6af3b72ca2bde9b608d2571737bb1e2d42299fc1ffdf60f6f9a1"}, - {file = "aiohttp-3.9.2-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:aa906b9bdfd4a7972dd0628dbbd6413d2062df5b431194486a78f0d2ae87bd55"}, - {file = "aiohttp-3.9.2-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:68bbee9e17d66f17bb0010aa15a22c6eb28583edcc8b3212e2b8e3f77f3ebe2a"}, - {file = "aiohttp-3.9.2-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:4c189b64bd6d9a403a1a3f86a3ab3acbc3dc41a68f73a268a4f683f89a4dec1f"}, - {file = "aiohttp-3.9.2-cp38-cp38-musllinux_1_1_ppc64le.whl", hash = "sha256:8a7876f794523123bca6d44bfecd89c9fec9ec897a25f3dd202ee7fc5c6525b7"}, - {file = "aiohttp-3.9.2-cp38-cp38-musllinux_1_1_s390x.whl", hash = "sha256:d23fba734e3dd7b1d679b9473129cd52e4ec0e65a4512b488981a56420e708db"}, - {file = "aiohttp-3.9.2-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:b141753be581fab842a25cb319f79536d19c2a51995d7d8b29ee290169868eab"}, - {file = "aiohttp-3.9.2-cp38-cp38-win32.whl", hash = "sha256:103daf41ff3b53ba6fa09ad410793e2e76c9d0269151812e5aba4b9dd674a7e8"}, - {file = "aiohttp-3.9.2-cp38-cp38-win_amd64.whl", hash = 
"sha256:328918a6c2835861ff7afa8c6d2c70c35fdaf996205d5932351bdd952f33fa2f"}, - {file = "aiohttp-3.9.2-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:5264d7327c9464786f74e4ec9342afbbb6ee70dfbb2ec9e3dfce7a54c8043aa3"}, - {file = "aiohttp-3.9.2-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:07205ae0015e05c78b3288c1517afa000823a678a41594b3fdc870878d645305"}, - {file = "aiohttp-3.9.2-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:ae0a1e638cffc3ec4d4784b8b4fd1cf28968febc4bd2718ffa25b99b96a741bd"}, - {file = "aiohttp-3.9.2-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d43302a30ba1166325974858e6ef31727a23bdd12db40e725bec0f759abce505"}, - {file = "aiohttp-3.9.2-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:16a967685907003765855999af11a79b24e70b34dc710f77a38d21cd9fc4f5fe"}, - {file = "aiohttp-3.9.2-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:6fa3ee92cd441d5c2d07ca88d7a9cef50f7ec975f0117cd0c62018022a184308"}, - {file = "aiohttp-3.9.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0b500c5ad9c07639d48615a770f49618130e61be36608fc9bc2d9bae31732b8f"}, - {file = "aiohttp-3.9.2-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:c07327b368745b1ce2393ae9e1aafed7073d9199e1dcba14e035cc646c7941bf"}, - {file = "aiohttp-3.9.2-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:cc7d6502c23a0ec109687bf31909b3fb7b196faf198f8cff68c81b49eb316ea9"}, - {file = "aiohttp-3.9.2-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:07be2be7071723c3509ab5c08108d3a74f2181d4964e869f2504aaab68f8d3e8"}, - {file = "aiohttp-3.9.2-cp39-cp39-musllinux_1_1_ppc64le.whl", hash = "sha256:122468f6fee5fcbe67cb07014a08c195b3d4c41ff71e7b5160a7bcc41d585a5f"}, - {file = "aiohttp-3.9.2-cp39-cp39-musllinux_1_1_s390x.whl", hash = "sha256:00a9abcea793c81e7f8778ca195a1714a64f6d7436c4c0bb168ad2a212627000"}, - {file = 
"aiohttp-3.9.2-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:7a9825fdd64ecac5c670234d80bb52bdcaa4139d1f839165f548208b3779c6c6"}, - {file = "aiohttp-3.9.2-cp39-cp39-win32.whl", hash = "sha256:5422cd9a4a00f24c7244e1b15aa9b87935c85fb6a00c8ac9b2527b38627a9211"}, - {file = "aiohttp-3.9.2-cp39-cp39-win_amd64.whl", hash = "sha256:7d579dcd5d82a86a46f725458418458fa43686f6a7b252f2966d359033ffc8ab"}, - {file = "aiohttp-3.9.2.tar.gz", hash = "sha256:b0ad0a5e86ce73f5368a164c10ada10504bf91869c05ab75d982c6048217fbf7"}, + {file = "aiohttp-3.9.4-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:76d32588ef7e4a3f3adff1956a0ba96faabbdee58f2407c122dd45aa6e34f372"}, + {file = "aiohttp-3.9.4-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:56181093c10dbc6ceb8a29dfeea1e815e1dfdc020169203d87fd8d37616f73f9"}, + {file = "aiohttp-3.9.4-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:c7a5b676d3c65e88b3aca41816bf72831898fcd73f0cbb2680e9d88e819d1e4d"}, + {file = "aiohttp-3.9.4-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d1df528a85fb404899d4207a8d9934cfd6be626e30e5d3a5544a83dbae6d8a7e"}, + {file = "aiohttp-3.9.4-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:f595db1bceabd71c82e92df212dd9525a8a2c6947d39e3c994c4f27d2fe15b11"}, + {file = "aiohttp-3.9.4-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:9c0b09d76e5a4caac3d27752027fbd43dc987b95f3748fad2b924a03fe8632ad"}, + {file = "aiohttp-3.9.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:689eb4356649ec9535b3686200b231876fb4cab4aca54e3bece71d37f50c1d13"}, + {file = "aiohttp-3.9.4-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:a3666cf4182efdb44d73602379a66f5fdfd5da0db5e4520f0ac0dcca644a3497"}, + {file = "aiohttp-3.9.4-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:b65b0f8747b013570eea2f75726046fa54fa8e0c5db60f3b98dd5d161052004a"}, + {file = 
"aiohttp-3.9.4-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:a1885d2470955f70dfdd33a02e1749613c5a9c5ab855f6db38e0b9389453dce7"}, + {file = "aiohttp-3.9.4-cp310-cp310-musllinux_1_1_ppc64le.whl", hash = "sha256:0593822dcdb9483d41f12041ff7c90d4d1033ec0e880bcfaf102919b715f47f1"}, + {file = "aiohttp-3.9.4-cp310-cp310-musllinux_1_1_s390x.whl", hash = "sha256:47f6eb74e1ecb5e19a78f4a4228aa24df7fbab3b62d4a625d3f41194a08bd54f"}, + {file = "aiohttp-3.9.4-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:c8b04a3dbd54de6ccb7604242fe3ad67f2f3ca558f2d33fe19d4b08d90701a89"}, + {file = "aiohttp-3.9.4-cp310-cp310-win32.whl", hash = "sha256:8a78dfb198a328bfb38e4308ca8167028920fb747ddcf086ce706fbdd23b2926"}, + {file = "aiohttp-3.9.4-cp310-cp310-win_amd64.whl", hash = "sha256:e78da6b55275987cbc89141a1d8e75f5070e577c482dd48bd9123a76a96f0bbb"}, + {file = "aiohttp-3.9.4-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:c111b3c69060d2bafc446917534150fd049e7aedd6cbf21ba526a5a97b4402a5"}, + {file = "aiohttp-3.9.4-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:efbdd51872cf170093998c87ccdf3cb5993add3559341a8e5708bcb311934c94"}, + {file = "aiohttp-3.9.4-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:7bfdb41dc6e85d8535b00d73947548a748e9534e8e4fddd2638109ff3fb081df"}, + {file = "aiohttp-3.9.4-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2bd9d334412961125e9f68d5b73c1d0ab9ea3f74a58a475e6b119f5293eee7ba"}, + {file = "aiohttp-3.9.4-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:35d78076736f4a668d57ade00c65d30a8ce28719d8a42471b2a06ccd1a2e3063"}, + {file = "aiohttp-3.9.4-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:824dff4f9f4d0f59d0fa3577932ee9a20e09edec8a2f813e1d6b9f89ced8293f"}, + {file = "aiohttp-3.9.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:52b8b4e06fc15519019e128abedaeb56412b106ab88b3c452188ca47a25c4093"}, + {file = 
"aiohttp-3.9.4-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:eae569fb1e7559d4f3919965617bb39f9e753967fae55ce13454bec2d1c54f09"}, + {file = "aiohttp-3.9.4-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:69b97aa5792428f321f72aeb2f118e56893371f27e0b7d05750bcad06fc42ca1"}, + {file = "aiohttp-3.9.4-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:4d79aad0ad4b980663316f26d9a492e8fab2af77c69c0f33780a56843ad2f89e"}, + {file = "aiohttp-3.9.4-cp311-cp311-musllinux_1_1_ppc64le.whl", hash = "sha256:d6577140cd7db19e430661e4b2653680194ea8c22c994bc65b7a19d8ec834403"}, + {file = "aiohttp-3.9.4-cp311-cp311-musllinux_1_1_s390x.whl", hash = "sha256:9860d455847cd98eb67897f5957b7cd69fbcb436dd3f06099230f16a66e66f79"}, + {file = "aiohttp-3.9.4-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:69ff36d3f8f5652994e08bd22f093e11cfd0444cea310f92e01b45a4e46b624e"}, + {file = "aiohttp-3.9.4-cp311-cp311-win32.whl", hash = "sha256:e27d3b5ed2c2013bce66ad67ee57cbf614288bda8cdf426c8d8fe548316f1b5f"}, + {file = "aiohttp-3.9.4-cp311-cp311-win_amd64.whl", hash = "sha256:d6a67e26daa686a6fbdb600a9af8619c80a332556245fa8e86c747d226ab1a1e"}, + {file = "aiohttp-3.9.4-cp312-cp312-macosx_10_9_universal2.whl", hash = "sha256:c5ff8ff44825736a4065d8544b43b43ee4c6dd1530f3a08e6c0578a813b0aa35"}, + {file = "aiohttp-3.9.4-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:d12a244627eba4e9dc52cbf924edef905ddd6cafc6513849b4876076a6f38b0e"}, + {file = "aiohttp-3.9.4-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:dcad56c8d8348e7e468899d2fb3b309b9bc59d94e6db08710555f7436156097f"}, + {file = "aiohttp-3.9.4-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4f7e69a7fd4b5ce419238388e55abd220336bd32212c673ceabc57ccf3d05b55"}, + {file = "aiohttp-3.9.4-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:c4870cb049f10d7680c239b55428916d84158798eb8f353e74fa2c98980dcc0b"}, + {file = 
"aiohttp-3.9.4-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:3b2feaf1b7031ede1bc0880cec4b0776fd347259a723d625357bb4b82f62687b"}, + {file = "aiohttp-3.9.4-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:939393e8c3f0a5bcd33ef7ace67680c318dc2ae406f15e381c0054dd658397de"}, + {file = "aiohttp-3.9.4-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:7d2334e387b2adcc944680bebcf412743f2caf4eeebd550f67249c1c3696be04"}, + {file = "aiohttp-3.9.4-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:e0198ea897680e480845ec0ffc5a14e8b694e25b3f104f63676d55bf76a82f1a"}, + {file = "aiohttp-3.9.4-cp312-cp312-musllinux_1_1_i686.whl", hash = "sha256:e40d2cd22914d67c84824045861a5bb0fb46586b15dfe4f046c7495bf08306b2"}, + {file = "aiohttp-3.9.4-cp312-cp312-musllinux_1_1_ppc64le.whl", hash = "sha256:aba80e77c227f4234aa34a5ff2b6ff30c5d6a827a91d22ff6b999de9175d71bd"}, + {file = "aiohttp-3.9.4-cp312-cp312-musllinux_1_1_s390x.whl", hash = "sha256:fb68dc73bc8ac322d2e392a59a9e396c4f35cb6fdbdd749e139d1d6c985f2527"}, + {file = "aiohttp-3.9.4-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:f3460a92638dce7e47062cf088d6e7663adb135e936cb117be88d5e6c48c9d53"}, + {file = "aiohttp-3.9.4-cp312-cp312-win32.whl", hash = "sha256:32dc814ddbb254f6170bca198fe307920f6c1308a5492f049f7f63554b88ef36"}, + {file = "aiohttp-3.9.4-cp312-cp312-win_amd64.whl", hash = "sha256:63f41a909d182d2b78fe3abef557fcc14da50c7852f70ae3be60e83ff64edba5"}, + {file = "aiohttp-3.9.4-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:c3770365675f6be220032f6609a8fbad994d6dcf3ef7dbcf295c7ee70884c9af"}, + {file = "aiohttp-3.9.4-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:305edae1dea368ce09bcb858cf5a63a064f3bff4767dec6fa60a0cc0e805a1d3"}, + {file = "aiohttp-3.9.4-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:6f121900131d116e4a93b55ab0d12ad72573f967b100e49086e496a9b24523ea"}, + {file = 
"aiohttp-3.9.4-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b71e614c1ae35c3d62a293b19eface83d5e4d194e3eb2fabb10059d33e6e8cbf"}, + {file = "aiohttp-3.9.4-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:419f009fa4cfde4d16a7fc070d64f36d70a8d35a90d71aa27670bba2be4fd039"}, + {file = "aiohttp-3.9.4-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:7b39476ee69cfe64061fd77a73bf692c40021f8547cda617a3466530ef63f947"}, + {file = "aiohttp-3.9.4-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b33f34c9c7decdb2ab99c74be6443942b730b56d9c5ee48fb7df2c86492f293c"}, + {file = "aiohttp-3.9.4-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:c78700130ce2dcebb1a8103202ae795be2fa8c9351d0dd22338fe3dac74847d9"}, + {file = "aiohttp-3.9.4-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:268ba22d917655d1259af2d5659072b7dc11b4e1dc2cb9662fdd867d75afc6a4"}, + {file = "aiohttp-3.9.4-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:17e7c051f53a0d2ebf33013a9cbf020bb4e098c4bc5bce6f7b0c962108d97eab"}, + {file = "aiohttp-3.9.4-cp38-cp38-musllinux_1_1_ppc64le.whl", hash = "sha256:7be99f4abb008cb38e144f85f515598f4c2c8932bf11b65add0ff59c9c876d99"}, + {file = "aiohttp-3.9.4-cp38-cp38-musllinux_1_1_s390x.whl", hash = "sha256:d58a54d6ff08d2547656356eea8572b224e6f9bbc0cf55fa9966bcaac4ddfb10"}, + {file = "aiohttp-3.9.4-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:7673a76772bda15d0d10d1aa881b7911d0580c980dbd16e59d7ba1422b2d83cd"}, + {file = "aiohttp-3.9.4-cp38-cp38-win32.whl", hash = "sha256:e4370dda04dc8951012f30e1ce7956a0a226ac0714a7b6c389fb2f43f22a250e"}, + {file = "aiohttp-3.9.4-cp38-cp38-win_amd64.whl", hash = "sha256:eb30c4510a691bb87081192a394fb661860e75ca3896c01c6d186febe7c88530"}, + {file = "aiohttp-3.9.4-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:84e90494db7df3be5e056f91412f9fa9e611fbe8ce4aaef70647297f5943b276"}, + 
{file = "aiohttp-3.9.4-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:7d4845f8501ab28ebfdbeab980a50a273b415cf69e96e4e674d43d86a464df9d"}, + {file = "aiohttp-3.9.4-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:69046cd9a2a17245c4ce3c1f1a4ff8c70c7701ef222fce3d1d8435f09042bba1"}, + {file = "aiohttp-3.9.4-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8b73a06bafc8dcc508420db43b4dd5850e41e69de99009d0351c4f3007960019"}, + {file = "aiohttp-3.9.4-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:418bb0038dfafeac923823c2e63226179976c76f981a2aaad0ad5d51f2229bca"}, + {file = "aiohttp-3.9.4-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:71a8f241456b6c2668374d5d28398f8e8cdae4cce568aaea54e0f39359cd928d"}, + {file = "aiohttp-3.9.4-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:935c369bf8acc2dc26f6eeb5222768aa7c62917c3554f7215f2ead7386b33748"}, + {file = "aiohttp-3.9.4-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:74e4e48c8752d14ecfb36d2ebb3d76d614320570e14de0a3aa7a726ff150a03c"}, + {file = "aiohttp-3.9.4-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:916b0417aeddf2c8c61291238ce25286f391a6acb6f28005dd9ce282bd6311b6"}, + {file = "aiohttp-3.9.4-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:9b6787b6d0b3518b2ee4cbeadd24a507756ee703adbac1ab6dc7c4434b8c572a"}, + {file = "aiohttp-3.9.4-cp39-cp39-musllinux_1_1_ppc64le.whl", hash = "sha256:221204dbda5ef350e8db6287937621cf75e85778b296c9c52260b522231940ed"}, + {file = "aiohttp-3.9.4-cp39-cp39-musllinux_1_1_s390x.whl", hash = "sha256:10afd99b8251022ddf81eaed1d90f5a988e349ee7d779eb429fb07b670751e8c"}, + {file = "aiohttp-3.9.4-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:2506d9f7a9b91033201be9ffe7d89c6a54150b0578803cce5cb84a943d075bc3"}, + {file = "aiohttp-3.9.4-cp39-cp39-win32.whl", hash = 
"sha256:e571fdd9efd65e86c6af2f332e0e95dad259bfe6beb5d15b3c3eca3a6eb5d87b"}, + {file = "aiohttp-3.9.4-cp39-cp39-win_amd64.whl", hash = "sha256:7d29dd5319d20aa3b7749719ac9685fbd926f71ac8c77b2477272725f882072d"}, + {file = "aiohttp-3.9.4.tar.gz", hash = "sha256:6ff71ede6d9a5a58cfb7b6fffc83ab5d4a63138276c771ac91ceaaddf5459644"}, ] [package.dependencies] @@ -1191,13 +1191,13 @@ files = [ [[package]] name = "idna" -version = "3.3" +version = "3.7" description = "Internationalized Domain Names in Applications (IDNA)" optional = false python-versions = ">=3.5" files = [ - {file = "idna-3.3-py3-none-any.whl", hash = "sha256:84d9dd047ffa80596e0f246e2eab0b391788b0503584e8945f2368256d2735ff"}, - {file = "idna-3.3.tar.gz", hash = "sha256:9d643ff0a55b762d5cdb124b8eaa99c66322e2157b69160bc32796e824360e6d"}, + {file = "idna-3.7-py3-none-any.whl", hash = "sha256:82fee1fc78add43492d3a1898bfa6d8a904cc97d8427f683ed8e798d07761aa0"}, + {file = "idna-3.7.tar.gz", hash = "sha256:028ff3aadf0609c1fd278d8ea3089299412a7a8b9bd005dd08b9f8285bcb5cfc"}, ] [[package]] @@ -2182,6 +2182,7 @@ files = [ {file = "PyYAML-6.0.1-cp311-cp311-win_amd64.whl", hash = "sha256:bf07ee2fef7014951eeb99f56f39c9bb4af143d8aa3c21b1677805985307da34"}, {file = "PyYAML-6.0.1-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:855fb52b0dc35af121542a76b9a84f8d1cd886ea97c84703eaa6d88e37a2ad28"}, {file = "PyYAML-6.0.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:40df9b996c2b73138957fe23a16a4f0ba614f4c0efce1e9406a184b6d07fa3a9"}, + {file = "PyYAML-6.0.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a08c6f0fe150303c1c6b71ebcd7213c2858041a7e01975da3a99aed1e7a378ef"}, {file = "PyYAML-6.0.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6c22bec3fbe2524cde73d7ada88f6566758a8f7227bfbf93a408a9d86bcc12a0"}, {file = "PyYAML-6.0.1-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:8d4e9c88387b0f5c7d5f281e55304de64cf7f9c0021a3525bd3b1c542da3b0e4"}, {file = 
"PyYAML-6.0.1-cp312-cp312-win32.whl", hash = "sha256:d483d2cdf104e7c9fa60c544d92981f12ad66a457afae824d146093b8c294c54"}, @@ -2652,6 +2653,16 @@ files = [ {file = "wrapt-1.14.1-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:8ad85f7f4e20964db4daadcab70b47ab05c7c1cf2a7c1e51087bfaa83831854c"}, {file = "wrapt-1.14.1-cp310-cp310-win32.whl", hash = "sha256:a9a52172be0b5aae932bef82a79ec0a0ce87288c7d132946d645eba03f0ad8a8"}, {file = "wrapt-1.14.1-cp310-cp310-win_amd64.whl", hash = "sha256:6d323e1554b3d22cfc03cd3243b5bb815a51f5249fdcbb86fda4bf62bab9e164"}, + {file = "wrapt-1.14.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:ecee4132c6cd2ce5308e21672015ddfed1ff975ad0ac8d27168ea82e71413f55"}, + {file = "wrapt-1.14.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:2020f391008ef874c6d9e208b24f28e31bcb85ccff4f335f15a3251d222b92d9"}, + {file = "wrapt-1.14.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2feecf86e1f7a86517cab34ae6c2f081fd2d0dac860cb0c0ded96d799d20b335"}, + {file = "wrapt-1.14.1-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:240b1686f38ae665d1b15475966fe0472f78e71b1b4903c143a842659c8e4cb9"}, + {file = "wrapt-1.14.1-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a9008dad07d71f68487c91e96579c8567c98ca4c3881b9b113bc7b33e9fd78b8"}, + {file = "wrapt-1.14.1-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:6447e9f3ba72f8e2b985a1da758767698efa72723d5b59accefd716e9e8272bf"}, + {file = "wrapt-1.14.1-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:acae32e13a4153809db37405f5eba5bac5fbe2e2ba61ab227926a22901051c0a"}, + {file = "wrapt-1.14.1-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:49ef582b7a1152ae2766557f0550a9fcbf7bbd76f43fbdc94dd3bf07cc7168be"}, + {file = "wrapt-1.14.1-cp311-cp311-win32.whl", hash = "sha256:358fe87cc899c6bb0ddc185bf3dbfa4ba646f05b1b0b9b5a27c2cb92c2cea204"}, + {file 
= "wrapt-1.14.1-cp311-cp311-win_amd64.whl", hash = "sha256:26046cd03936ae745a502abf44dac702a5e6880b2b01c29aea8ddf3353b68224"}, {file = "wrapt-1.14.1-cp35-cp35m-manylinux1_i686.whl", hash = "sha256:43ca3bbbe97af00f49efb06e352eae40434ca9d915906f77def219b88e85d907"}, {file = "wrapt-1.14.1-cp35-cp35m-manylinux1_x86_64.whl", hash = "sha256:6b1a564e6cb69922c7fe3a678b9f9a3c54e72b469875aa8018f18b4d1dd1adf3"}, {file = "wrapt-1.14.1-cp35-cp35m-manylinux2010_i686.whl", hash = "sha256:00b6d4ea20a906c0ca56d84f93065b398ab74b927a7a3dbd470f6fc503f95dc3"}, @@ -2889,4 +2900,4 @@ cffi = ["cffi (>=1.11)"] [metadata] lock-version = "2.0" python-versions = "^3.9" -content-hash = "c478ef3786bd94a0b7887518ceaf444993b77de873074102d018ac5926bc5d34" +content-hash = "b3452b50901123fd5f2c385ce8a0c1c492296393b8a7926a322b6df0ea3ac572" diff --git a/proxy/Cargo.toml b/proxy/Cargo.toml index b327890be2..0e8d03906b 100644 --- a/proxy/Cargo.toml +++ b/proxy/Cargo.toml @@ -12,6 +12,7 @@ testing = [] anyhow.workspace = true async-compression.workspace = true async-trait.workspace = true +atomic-take.workspace = true aws-config.workspace = true aws-sdk-iam.workspace = true aws-sigv4.workspace = true @@ -36,10 +37,14 @@ http.workspace = true humantime.workspace = true hyper-tungstenite.workspace = true hyper.workspace = true +hyper1 = { package = "hyper", version = "1.2", features = ["server"] } +hyper-util = { version = "0.1", features = ["server", "http1", "http2", "tokio"] } +http-body-util = { version = "0.1" } ipnet.workspace = true itertools.workspace = true lasso = { workspace = true, features = ["multi-threaded"] } md5.workspace = true +measured = { workspace = true, features = ["lasso"] } metrics.workspace = true once_cell.workspace = true opentelemetry.workspace = true @@ -54,8 +59,8 @@ prometheus.workspace = true rand.workspace = true regex.workspace = true remote_storage = { version = "0.1", path = "../libs/remote_storage/" } -reqwest = { workspace = true, features = ["json"] } 
-reqwest-middleware.workspace = true +reqwest.workspace = true +reqwest-middleware = { workspace = true, features = ["json"] } reqwest-retry.workspace = true reqwest-tracing.workspace = true routerify.workspace = true @@ -79,6 +84,7 @@ tokio-postgres.workspace = true tokio-rustls.workspace = true tokio-util.workspace = true tokio = { workspace = true, features = ["signal"] } +tower-service.workspace = true tracing-opentelemetry.workspace = true tracing-subscriber.workspace = true tracing-utils.workspace = true diff --git a/proxy/src/auth/backend.rs b/proxy/src/auth/backend.rs index e421798067..3795e3b608 100644 --- a/proxy/src/auth/backend.rs +++ b/proxy/src/auth/backend.rs @@ -2,8 +2,15 @@ mod classic; mod hacks; mod link; +use std::net::IpAddr; +use std::sync::Arc; +use std::time::Duration; + +use ipnet::{Ipv4Net, Ipv6Net}; pub use link::LinkAuthError; +use tokio::io::{AsyncRead, AsyncWrite}; use tokio_postgres::config::AuthKeys; +use tracing::{info, warn}; use crate::auth::credentials::check_peer_addr_is_in_list; use crate::auth::validate_password_and_exchange; @@ -13,9 +20,10 @@ use crate::console::provider::{CachedRoleSecret, ConsoleBackend}; use crate::console::{AuthSecret, NodeInfo}; use crate::context::RequestMonitoring; use crate::intern::EndpointIdInt; -use crate::metrics::{AUTH_RATE_LIMIT_HITS, ENDPOINTS_AUTH_RATE_LIMITED}; +use crate::metrics::Metrics; use crate::proxy::connect_compute::ComputeConnectBackend; use crate::proxy::NeonOptions; +use crate::rate_limiter::{BucketRateLimiter, RateBucketInfo}; use crate::stream::Stream; use crate::{ auth::{self, ComputeUserInfoMaybeEndpoint}, @@ -27,10 +35,7 @@ use crate::{ }, stream, url, }; -use crate::{scram, EndpointCacheKey, EndpointId, RoleName}; -use std::sync::Arc; -use tokio::io::{AsyncRead, AsyncWrite}; -use tracing::{info, warn}; +use crate::{scram, EndpointCacheKey, EndpointId, Normalize, RoleName}; /// Alternative to [`std::borrow::Cow`] but doesn't need `T: ToOwned` as we don't need that 
functionality pub enum MaybeOwned<'a, T> { @@ -176,17 +181,51 @@ impl TryFrom for ComputeUserInfo { } } +#[derive(PartialEq, PartialOrd, Hash, Eq, Ord, Debug, Copy, Clone)] +pub struct MaskedIp(IpAddr); + +impl MaskedIp { + fn new(value: IpAddr, prefix: u8) -> Self { + match value { + IpAddr::V4(v4) => Self(IpAddr::V4( + Ipv4Net::new(v4, prefix).map_or(v4, |x| x.trunc().addr()), + )), + IpAddr::V6(v6) => Self(IpAddr::V6( + Ipv6Net::new(v6, prefix).map_or(v6, |x| x.trunc().addr()), + )), + } + } +} + +// This can't be just per IP because that would limit some PaaS that share IP addresses +pub type AuthRateLimiter = BucketRateLimiter<(EndpointIdInt, MaskedIp)>; + +impl RateBucketInfo { + /// All of these are per endpoint-maskedip pair. + /// Context: 4096 rounds of pbkdf2 take about 1ms of cpu time to execute (1 milli-cpu-second or 1mcpus). + /// + /// First bucket: 1000mcpus total per endpoint-ip pair + /// * 4096000 requests per second with 1 hash rounds. + /// * 1000 requests per second with 4096 hash rounds. + /// * 6.8 requests per second with 600000 hash rounds. + pub const DEFAULT_AUTH_SET: [Self; 3] = [ + Self::new(1000 * 4096, Duration::from_secs(1)), + Self::new(600 * 4096, Duration::from_secs(60)), + Self::new(300 * 4096, Duration::from_secs(600)), + ]; +} + impl AuthenticationConfig { pub fn check_rate_limit( &self, - ctx: &mut RequestMonitoring, + config: &AuthenticationConfig, secret: AuthSecret, endpoint: &EndpointId, is_cleartext: bool, ) -> auth::Result { // we have validated the endpoint exists, so let's intern it. - let endpoint_int = EndpointIdInt::from(endpoint); + let endpoint_int = EndpointIdInt::from(endpoint.normalize()); // only count the full hash count if password hack or websocket flow. 
// in other words, if proxy needs to run the hashing @@ -201,17 +240,25 @@ impl AuthenticationConfig { 1 }; - let limit_not_exceeded = self - .rate_limiter - .check((endpoint_int, ctx.peer_addr), password_weight); + let limit_not_exceeded = self.rate_limiter.check( + ( + endpoint_int, + MaskedIp::new(ctx.peer_addr, config.rate_limit_ip_subnet), + ), + password_weight, + ); if !limit_not_exceeded { warn!( enabled = self.rate_limiter_enabled, "rate limiting authentication" ); - AUTH_RATE_LIMIT_HITS.inc(); - ENDPOINTS_AUTH_RATE_LIMITED.measure(endpoint); + Metrics::get().proxy.requests_auth_rate_limits_total.inc(); + Metrics::get() + .proxy + .endpoints_auth_rate_limits + .get_metric() + .measure(endpoint); if self.rate_limiter_enabled { return Err(auth::AuthError::too_many_connections()); @@ -267,6 +314,7 @@ async fn auth_quirks( let secret = match secret { Some(secret) => config.check_rate_limit( ctx, + config, secret, &info.endpoint, unauthenticated_password.is_some() || allow_cleartext, @@ -469,7 +517,7 @@ impl ComputeConnectBackend for BackendType<'_, ComputeCredentials, &()> { #[cfg(test)] mod tests { - use std::sync::Arc; + use std::{net::IpAddr, sync::Arc, time::Duration}; use bytes::BytesMut; use fallible_iterator::FallibleIterator; @@ -482,7 +530,7 @@ mod tests { use tokio::io::{AsyncRead, AsyncReadExt, AsyncWriteExt}; use crate::{ - auth::{ComputeUserInfoMaybeEndpoint, IpPattern}, + auth::{backend::MaskedIp, ComputeUserInfoMaybeEndpoint, IpPattern}, config::AuthenticationConfig, console::{ self, @@ -491,12 +539,12 @@ mod tests { }, context::RequestMonitoring, proxy::NeonOptions, - rate_limiter::{AuthRateLimiter, RateBucketInfo}, + rate_limiter::RateBucketInfo, scram::ServerSecret, stream::{PqStream, Stream}, }; - use super::auth_quirks; + use super::{auth_quirks, AuthRateLimiter}; struct Auth { ips: Vec, @@ -537,6 +585,7 @@ mod tests { scram_protocol_timeout: std::time::Duration::from_secs(5), rate_limiter_enabled: true, rate_limiter: 
AuthRateLimiter::new(&RateBucketInfo::DEFAULT_AUTH_SET), + rate_limit_ip_subnet: 64, }); async fn read_message(r: &mut (impl AsyncRead + Unpin), b: &mut BytesMut) -> PgMessage { @@ -548,6 +597,51 @@ mod tests { } } + #[test] + fn masked_ip() { + let ip_a = IpAddr::V4([127, 0, 0, 1].into()); + let ip_b = IpAddr::V4([127, 0, 0, 2].into()); + let ip_c = IpAddr::V4([192, 168, 1, 101].into()); + let ip_d = IpAddr::V4([192, 168, 1, 102].into()); + let ip_e = IpAddr::V6("abcd:abcd:abcd:abcd:abcd:abcd:abcd:abcd".parse().unwrap()); + let ip_f = IpAddr::V6("abcd:abcd:abcd:abcd:1234:abcd:abcd:abcd".parse().unwrap()); + + assert_ne!(MaskedIp::new(ip_a, 64), MaskedIp::new(ip_b, 64)); + assert_ne!(MaskedIp::new(ip_a, 32), MaskedIp::new(ip_b, 32)); + assert_eq!(MaskedIp::new(ip_a, 30), MaskedIp::new(ip_b, 30)); + assert_eq!(MaskedIp::new(ip_c, 30), MaskedIp::new(ip_d, 30)); + + assert_ne!(MaskedIp::new(ip_e, 128), MaskedIp::new(ip_f, 128)); + assert_eq!(MaskedIp::new(ip_e, 64), MaskedIp::new(ip_f, 64)); + } + + #[test] + fn test_default_auth_rate_limit_set() { + // these values used to exceed u32::MAX + assert_eq!( + RateBucketInfo::DEFAULT_AUTH_SET, + [ + RateBucketInfo { + interval: Duration::from_secs(1), + max_rpi: 1000 * 4096, + }, + RateBucketInfo { + interval: Duration::from_secs(60), + max_rpi: 600 * 4096 * 60, + }, + RateBucketInfo { + interval: Duration::from_secs(600), + max_rpi: 300 * 4096 * 600, + } + ] + ); + + for x in RateBucketInfo::DEFAULT_AUTH_SET { + let y = x.to_string().parse().unwrap(); + assert_eq!(x, y); + } + } + #[tokio::test] async fn auth_quirks_scram() { let (mut client, server) = tokio::io::duplex(1024); diff --git a/proxy/src/auth/credentials.rs b/proxy/src/auth/credentials.rs index 89773aa1ff..783a1a5a21 100644 --- a/proxy/src/auth/credentials.rs +++ b/proxy/src/auth/credentials.rs @@ -4,7 +4,7 @@ use crate::{ auth::password_hack::parse_endpoint_param, context::RequestMonitoring, error::{ReportableError, UserFacingError}, - 
metrics::NUM_CONNECTION_ACCEPTED_BY_SNI, + metrics::{Metrics, SniKind}, proxy::NeonOptions, serverless::SERVERLESS_DRIVER_SNI, EndpointId, RoleName, @@ -144,21 +144,22 @@ impl ComputeUserInfoMaybeEndpoint { ctx.set_endpoint_id(ep.clone()); } + let metrics = Metrics::get(); info!(%user, "credentials"); if sni.is_some() { info!("Connection with sni"); - NUM_CONNECTION_ACCEPTED_BY_SNI - .with_label_values(&["sni"]) - .inc(); + metrics.proxy.accepted_connections_by_sni.inc(SniKind::Sni); } else if endpoint.is_some() { - NUM_CONNECTION_ACCEPTED_BY_SNI - .with_label_values(&["no_sni"]) - .inc(); + metrics + .proxy + .accepted_connections_by_sni + .inc(SniKind::NoSni); info!("Connection without sni"); } else { - NUM_CONNECTION_ACCEPTED_BY_SNI - .with_label_values(&["password_hack"]) - .inc(); + metrics + .proxy + .accepted_connections_by_sni + .inc(SniKind::PasswordHack); info!("Connection with password hack"); } diff --git a/proxy/src/bin/pg_sni_router.rs b/proxy/src/bin/pg_sni_router.rs index c28814b1c8..fb16b76567 100644 --- a/proxy/src/bin/pg_sni_router.rs +++ b/proxy/src/bin/pg_sni_router.rs @@ -9,15 +9,13 @@ use futures::future::Either; use itertools::Itertools; use proxy::config::TlsServerEndPoint; use proxy::context::RequestMonitoring; -use proxy::proxy::run_until_cancelled; -use proxy::{BranchId, EndpointId, ProjectId}; +use proxy::proxy::{copy_bidirectional_client_compute, run_until_cancelled}; use rustls::pki_types::PrivateKeyDer; use tokio::net::TcpListener; use anyhow::{anyhow, bail, ensure, Context}; use clap::Arg; use futures::TryFutureExt; -use proxy::console::messages::MetricsAuxInfo; use proxy::stream::{PqStream, Stream}; use tokio::io::{AsyncRead, AsyncWrite}; @@ -176,7 +174,12 @@ async fn task_main( .context("failed to set socket option")?; info!(%peer_addr, "serving"); - let ctx = RequestMonitoring::new(session_id, peer_addr.ip(), "sni_router", "sni"); + let ctx = RequestMonitoring::new( + session_id, + peer_addr.ip(), + 
proxy::metrics::Protocol::SniRouter, + "sni", + ); handle_client(ctx, dest_suffix, tls_config, tls_server_end_point, socket).await } .unwrap_or_else(|e| { @@ -199,6 +202,7 @@ async fn task_main( const ERR_INSECURE_CONNECTION: &str = "connection is insecure (try using `sslmode=require`)"; async fn ssl_handshake( + ctx: &mut RequestMonitoring, raw_stream: S, tls_config: Arc, tls_server_end_point: TlsServerEndPoint, @@ -228,7 +232,10 @@ async fn ssl_handshake( } Ok(Stream::Tls { - tls: Box::new(raw.upgrade(tls_config).await?), + tls: Box::new( + raw.upgrade(tls_config, !ctx.has_private_peer_addr()) + .await?, + ), tls_server_end_point, }) } @@ -251,7 +258,7 @@ async fn handle_client( tls_server_end_point: TlsServerEndPoint, stream: impl AsyncRead + AsyncWrite + Unpin, ) -> anyhow::Result<()> { - let tls_stream = ssl_handshake(stream, tls_config, tls_server_end_point).await?; + let mut tls_stream = ssl_handshake(&mut ctx, stream, tls_config, tls_server_end_point).await?; // Cut off first part of the SNI domain // We receive required destination details in the format of @@ -268,18 +275,15 @@ async fn handle_client( info!("destination: {}", destination); - let client = tokio::net::TcpStream::connect(destination).await?; - - let metrics_aux: MetricsAuxInfo = MetricsAuxInfo { - endpoint_id: (&EndpointId::from("")).into(), - project_id: (&ProjectId::from("")).into(), - branch_id: (&BranchId::from("")).into(), - cold_start_info: proxy::console::messages::ColdStartInfo::Unknown, - }; + let mut client = tokio::net::TcpStream::connect(destination).await?; // doesn't yet matter as pg-sni-router doesn't report analytics logs ctx.set_success(); - ctx.log(); + ctx.log_connect(); - proxy::proxy::passthrough::proxy_pass(tls_stream, client, metrics_aux).await + // Starting from here we only proxy the client's traffic. 
+ info!("performing the proxy pass..."); + let _ = copy_bidirectional_client_compute(&mut tls_stream, &mut client).await?; + + Ok(()) } diff --git a/proxy/src/bin/proxy.rs b/proxy/src/bin/proxy.rs index 56a3ef79cd..0956aae6c0 100644 --- a/proxy/src/bin/proxy.rs +++ b/proxy/src/bin/proxy.rs @@ -7,6 +7,7 @@ use aws_config::provider_config::ProviderConfig; use aws_config::web_identity_token::WebIdentityTokenCredentialsProvider; use futures::future::Either; use proxy::auth; +use proxy::auth::backend::AuthRateLimiter; use proxy::auth::backend::MaybeOwned; use proxy::cancellation::CancelMap; use proxy::cancellation::CancellationHandler; @@ -18,11 +19,10 @@ use proxy::config::ProjectInfoCacheOptions; use proxy::console; use proxy::context::parquet::ParquetUploadArgs; use proxy::http; -use proxy::metrics::NUM_CANCELLATION_REQUESTS_SOURCE_FROM_CLIENT; -use proxy::rate_limiter::AuthRateLimiter; +use proxy::http::health_server::AppMetrics; +use proxy::metrics::Metrics; use proxy::rate_limiter::EndpointRateLimiter; use proxy::rate_limiter::RateBucketInfo; -use proxy::rate_limiter::RateLimiterConfig; use proxy::redis::cancellation_publisher::RedisPublisherClient; use proxy::redis::connection_with_credentials_provider::ConnectionWithCredentialsProvider; use proxy::redis::elasticache; @@ -42,6 +42,7 @@ use tokio::task::JoinSet; use tokio_util::sync::CancellationToken; use tracing::info; use tracing::warn; +use tracing::Instrument; use utils::{project_build_tag, project_git_version, sentry_init::init_sentry}; project_git_version!(GIT_VERSION); @@ -117,8 +118,11 @@ struct ProxyCliArgs { #[clap(long, default_value = config::CacheOptions::CACHE_DEFAULT_OPTIONS)] wake_compute_cache: String, /// lock for `wake_compute` api method. example: "shards=32,permits=4,epoch=10m,timeout=1s". (use `permits=0` to disable). 
- #[clap(long, default_value = config::WakeComputeLockOptions::DEFAULT_OPTIONS_WAKE_COMPUTE_LOCK)] + #[clap(long, default_value = config::ConcurrencyLockOptions::DEFAULT_OPTIONS_WAKE_COMPUTE_LOCK)] wake_compute_lock: String, + /// lock for `connect_compute` api method. example: "shards=32,permits=4,epoch=10m,timeout=1s". (use `permits=0` to disable). + #[clap(long, default_value = config::ConcurrencyLockOptions::DEFAULT_OPTIONS_CONNECT_COMPUTE_LOCK)] + connect_compute_lock: String, /// Allow self-signed certificates for compute nodes (for testing) #[clap(long, default_value_t = false, value_parser = clap::builder::BoolishValueParser::new(), action = clap::ArgAction::Set)] allow_self_signed_compute: bool, @@ -131,14 +135,8 @@ struct ProxyCliArgs { #[clap(long, default_value_t = false, value_parser = clap::builder::BoolishValueParser::new(), action = clap::ArgAction::Set)] require_client_ip: bool, /// Disable dynamic rate limiter and store the metrics to ensure its production behaviour. - #[clap(long, default_value_t = false, value_parser = clap::builder::BoolishValueParser::new(), action = clap::ArgAction::Set)] + #[clap(long, default_value_t = true, value_parser = clap::builder::BoolishValueParser::new(), action = clap::ArgAction::Set)] disable_dynamic_rate_limiter: bool, - /// Rate limit algorithm. Makes sense only if `disable_rate_limiter` is `false`. - #[clap(value_enum, long, default_value_t = proxy::rate_limiter::RateLimitAlgorithm::Aimd)] - rate_limit_algorithm: proxy::rate_limiter::RateLimitAlgorithm, - /// Timeout for rate limiter. If it didn't manage to aquire a permit in this time, it will return an error. - #[clap(long, default_value = "15s", value_parser = humantime::parse_duration)] - rate_limiter_timeout: tokio::time::Duration, /// Endpoint rate limiter max number of requests per second. /// /// Provided in the form '@'. @@ -151,14 +149,12 @@ struct ProxyCliArgs { /// Authentication rate limiter max number of hashes per second. 
#[clap(long, default_values_t = RateBucketInfo::DEFAULT_AUTH_SET)] auth_rate_limit: Vec, + /// The IP subnet to use when considering whether two IP addresses are considered the same. + #[clap(long, default_value_t = 64)] + auth_rate_limit_ip_subnet: u8, /// Redis rate limiter max number of requests per second. #[clap(long, default_values_t = RateBucketInfo::DEFAULT_ENDPOINT_SET)] redis_rps_limit: Vec, - /// Initial limit for dynamic rate limiter. Makes sense only if `rate_limit_algorithm` is *not* `None`. - #[clap(long, default_value_t = 100)] - initial_limit: usize, - #[clap(flatten)] - aimd_config: proxy::rate_limiter::AimdConfig, /// cache for `allowed_ips` (use `size=0` to disable) #[clap(long, default_value = config::CacheOptions::CACHE_DEFAULT_OPTIONS)] allowed_ips_cache: String, @@ -189,7 +185,9 @@ struct ProxyCliArgs { /// cache for `project_info` (use `size=0` to disable) #[clap(long, default_value = config::ProjectInfoCacheOptions::CACHE_DEFAULT_OPTIONS)] project_info_cache: String, - + /// cache for all valid endpoints + #[clap(long, default_value = config::EndpointCacheConfig::CACHE_DEFAULT_OPTIONS)] + endpoint_cache_config: String, #[clap(flatten)] parquet_upload: ParquetUploadArgs, @@ -205,6 +203,12 @@ struct ProxyCliArgs { /// Size of each event is no more than 400 bytes, so 2**22 is about 200MB before the compression. 
#[clap(long, default_value = "4194304")] metric_backup_collection_chunk_size: usize, + /// Whether to retry the connection to the compute node + #[clap(long, default_value = config::RetryConfig::CONNECT_TO_COMPUTE_DEFAULT_VALUES)] + connect_to_compute_retry: String, + /// Whether to retry the wake_compute request + #[clap(long, default_value = config::RetryConfig::WAKE_COMPUTE_DEFAULT_VALUES)] + wake_compute_retry: String, } #[derive(clap::Args, Clone, Copy, Debug)] @@ -249,14 +253,18 @@ async fn main() -> anyhow::Result<()> { info!("Version: {GIT_VERSION}"); info!("Build_tag: {BUILD_TAG}"); - ::metrics::set_build_info_metric(GIT_VERSION, BUILD_TAG); + let neon_metrics = ::metrics::NeonMetrics::new(::metrics::BuildInfo { + revision: GIT_VERSION, + build_tag: BUILD_TAG, + }); - match proxy::jemalloc::MetricRecorder::new(prometheus::default_registry()) { - Ok(t) => { - t.start(); + let jemalloc = match proxy::jemalloc::MetricRecorder::new() { + Ok(t) => Some(t), + Err(e) => { + tracing::error!(error = ?e, "could not start jemalloc metrics loop"); + None } - Err(e) => tracing::error!(error = ?e, "could not start jemalloc metrics loop"), - } + }; let args = ProxyCliArgs::parse(); let config = build_config(&args)?; @@ -296,27 +304,27 @@ async fn main() -> anyhow::Result<()> { ), aws_credentials_provider, )); - let redis_notifications_client = - match (args.redis_notifications, (args.redis_host, args.redis_port)) { - (Some(url), _) => { - info!("Starting redis notifications listener ({url})"); - Some(ConnectionWithCredentialsProvider::new_with_static_credentials(url)) - } - (None, (Some(host), Some(port))) => Some( - ConnectionWithCredentialsProvider::new_with_credentials_provider( - host, - port, - elasticache_credentials_provider.clone(), - ), + let regional_redis_client = match (args.redis_host, args.redis_port) { + (Some(host), Some(port)) => Some( + ConnectionWithCredentialsProvider::new_with_credentials_provider( + host, + port, + 
elasticache_credentials_provider.clone(), ), - (None, (None, None)) => { - warn!("Redis is disabled"); - None - } - _ => { - bail!("redis-host and redis-port must be specified together"); - } - }; + ), + (None, None) => { + warn!("Redis events from console are disabled"); + None + } + _ => { + bail!("redis-host and redis-port must be specified together"); + } + }; + let redis_notifications_client = if let Some(url) = args.redis_notifications { + Some(ConnectionWithCredentialsProvider::new_with_static_credentials(url)) + } else { + regional_redis_client.clone() + }; // Check that we can bind to address before further initialization let http_address: SocketAddr = args.http.parse()?; @@ -332,11 +340,9 @@ async fn main() -> anyhow::Result<()> { let proxy_listener = TcpListener::bind(proxy_address).await?; let cancellation_token = CancellationToken::new(); - let endpoint_rate_limiter = Arc::new(EndpointRateLimiter::new(&config.endpoint_rps_limit)); let cancel_map = CancelMap::default(); - // let redis_notifications_client = redis_notifications_client.map(|x| Box::leak(Box::new(x))); - let redis_publisher = match &redis_notifications_client { + let redis_publisher = match ®ional_redis_client { Some(redis_publisher) => Some(Arc::new(Mutex::new(RedisPublisherClient::new( redis_publisher.clone(), args.region.clone(), @@ -349,7 +355,7 @@ async fn main() -> anyhow::Result<()> { >::new( cancel_map.clone(), redis_publisher, - NUM_CANCELLATION_REQUESTS_SOURCE_FROM_CLIENT, + proxy::metrics::CancellationSource::FromClient, )); // client facing tasks. 
these will exit on error or on cancellation @@ -359,7 +365,6 @@ async fn main() -> anyhow::Result<()> { config, proxy_listener, cancellation_token.clone(), - endpoint_rate_limiter.clone(), cancellation_handler.clone(), )); @@ -374,7 +379,6 @@ async fn main() -> anyhow::Result<()> { config, serverless_listener, cancellation_token.clone(), - endpoint_rate_limiter.clone(), cancellation_handler.clone(), )); } @@ -387,7 +391,14 @@ async fn main() -> anyhow::Result<()> { // maintenance tasks. these never return unless there's an error let mut maintenance_tasks = JoinSet::new(); maintenance_tasks.spawn(proxy::handle_signals(cancellation_token.clone())); - maintenance_tasks.spawn(http::health_server::task_main(http_listener)); + maintenance_tasks.spawn(http::health_server::task_main( + http_listener, + AppMetrics { + jemalloc, + neon_metrics, + proxy: proxy::metrics::Metrics::get(), + }, + )); maintenance_tasks.spawn(console::mgmt::task_main(mgmt_listener)); if let Some(metrics_config) = &config.metric_collection { @@ -395,21 +406,43 @@ async fn main() -> anyhow::Result<()> { maintenance_tasks.spawn(usage_metrics::task_main(metrics_config)); client_tasks.spawn(usage_metrics::task_backup( &metrics_config.backup_metric_collection_config, - cancellation_token, + cancellation_token.clone(), )); } if let auth::BackendType::Console(api, _) = &config.auth_backend { if let proxy::console::provider::ConsoleBackend::Console(api) = &**api { - if let Some(redis_notifications_client) = redis_notifications_client { - let cache = api.caches.project_info.clone(); - maintenance_tasks.spawn(notifications::task_main( - redis_notifications_client.clone(), - cache.clone(), - cancel_map.clone(), - args.region.clone(), - )); - maintenance_tasks.spawn(async move { cache.clone().gc_worker().await }); + match (redis_notifications_client, regional_redis_client.clone()) { + (None, None) => {} + (client1, client2) => { + let cache = api.caches.project_info.clone(); + if let Some(client) = client1 { + 
maintenance_tasks.spawn(notifications::task_main( + client, + cache.clone(), + cancel_map.clone(), + args.region.clone(), + )); + } + if let Some(client) = client2 { + maintenance_tasks.spawn(notifications::task_main( + client, + cache.clone(), + cancel_map.clone(), + args.region.clone(), + )); + } + maintenance_tasks.spawn(async move { cache.clone().gc_worker().await }); + } + } + if let Some(regional_redis_client) = regional_redis_client { + let cache = api.caches.endpoints_cache.clone(); + let con = regional_redis_client; + let span = tracing::info_span!("endpoints_cache"); + maintenance_tasks.spawn( + async move { cache.do_read(con, cancellation_token.clone()).await } + .instrument(span), + ); } } } @@ -476,46 +509,54 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> { and metric-collection-interval must be specified" ), }; - let rate_limiter_config = RateLimiterConfig { - disable: args.disable_dynamic_rate_limiter, - algorithm: args.rate_limit_algorithm, - timeout: args.rate_limiter_timeout, - initial_limit: args.initial_limit, - aimd_config: Some(args.aimd_config), - }; + if !args.disable_dynamic_rate_limiter { + bail!("dynamic rate limiter should be disabled"); + } let auth_backend = match &args.auth_backend { AuthBackend::Console => { let wake_compute_cache_config: CacheOptions = args.wake_compute_cache.parse()?; let project_info_cache_config: ProjectInfoCacheOptions = args.project_info_cache.parse()?; + let endpoint_cache_config: config::EndpointCacheConfig = + args.endpoint_cache_config.parse()?; info!("Using NodeInfoCache (wake_compute) with options={wake_compute_cache_config:?}"); info!( "Using AllowedIpsCache (wake_compute) with options={project_info_cache_config:?}" ); + info!("Using EndpointCacheConfig with options={endpoint_cache_config:?}"); let caches = Box::leak(Box::new(console::caches::ApiCaches::new( wake_compute_cache_config, project_info_cache_config, + endpoint_cache_config, ))); - let 
config::WakeComputeLockOptions { + let config::ConcurrencyLockOptions { shards, permits, epoch, timeout, } = args.wake_compute_lock.parse()?; info!(permits, shards, ?epoch, "Using NodeLocks (wake_compute)"); - let locks = Box::leak(Box::new( - console::locks::ApiLocks::new("wake_compute_lock", permits, shards, timeout) - .unwrap(), - )); - tokio::spawn(locks.garbage_collect_worker(epoch)); + let locks = Box::leak(Box::new(console::locks::ApiLocks::new( + "wake_compute_lock", + permits, + shards, + timeout, + epoch, + &Metrics::get().wake_compute_lock, + )?)); + tokio::spawn(locks.garbage_collect_worker()); let url = args.auth_endpoint.parse()?; - let endpoint = http::Endpoint::new(url, http::new_client(rate_limiter_config)); + let endpoint = http::Endpoint::new(url, http::new_client()); - let api = console::provider::neon::Api::new(endpoint, caches, locks); + let mut endpoint_rps_limit = args.endpoint_rps_limit.clone(); + RateBucketInfo::validate(&mut endpoint_rps_limit)?; + let endpoint_rate_limiter = Arc::new(EndpointRateLimiter::new(endpoint_rps_limit)); + let api = + console::provider::neon::Api::new(endpoint, caches, locks, endpoint_rate_limiter); let api = console::provider::ConsoleBackend::Console(api); auth::BackendType::Console(MaybeOwned::Owned(api), ()) } @@ -531,6 +572,23 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> { auth::BackendType::Link(MaybeOwned::Owned(url), ()) } }; + + let config::ConcurrencyLockOptions { + shards, + permits, + epoch, + timeout, + } = args.connect_compute_lock.parse()?; + info!(permits, shards, ?epoch, "Using NodeLocks (connect_compute)"); + let connect_compute_locks = console::locks::ApiLocks::new( + "connect_compute_lock", + permits, + shards, + timeout, + epoch, + &Metrics::get().proxy.connect_compute_lock, + )?; + let http_config = HttpConfig { request_timeout: args.sql_over_http.sql_over_http_timeout, pool_options: GlobalConnPoolOptions { @@ -546,10 +604,9 @@ fn build_config(args: 
&ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> { scram_protocol_timeout: args.scram_protocol_timeout, rate_limiter_enabled: args.auth_rate_limit_enabled, rate_limiter: AuthRateLimiter::new(args.auth_rate_limit.clone()), + rate_limit_ip_subnet: args.auth_rate_limit_ip_subnet, }; - let mut endpoint_rps_limit = args.endpoint_rps_limit.clone(); - RateBucketInfo::validate(&mut endpoint_rps_limit)?; let mut redis_rps_limit = args.redis_rps_limit.clone(); RateBucketInfo::validate(&mut redis_rps_limit)?; @@ -562,13 +619,19 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> { authentication_config, require_client_ip: args.require_client_ip, disable_ip_check_for_http: args.disable_ip_check_for_http, - endpoint_rps_limit, redis_rps_limit, handshake_timeout: args.handshake_timeout, region: args.region.clone(), aws_region: args.aws_region.clone(), + wake_compute_retry_config: config::RetryConfig::parse(&args.wake_compute_retry)?, + connect_compute_locks, + connect_to_compute_retry_config: config::RetryConfig::parse( + &args.connect_to_compute_retry, + )?, })); + tokio::spawn(config.connect_compute_locks.garbage_collect_worker()); + Ok(config) } diff --git a/proxy/src/cache.rs b/proxy/src/cache.rs index fc5f416395..d1d4087241 100644 --- a/proxy/src/cache.rs +++ b/proxy/src/cache.rs @@ -1,4 +1,5 @@ pub mod common; +pub mod endpoints; pub mod project_info; mod timed_lru; diff --git a/proxy/src/cache/endpoints.rs b/proxy/src/cache/endpoints.rs new file mode 100644 index 0000000000..4bc10a6020 --- /dev/null +++ b/proxy/src/cache/endpoints.rs @@ -0,0 +1,247 @@ +use std::{ + convert::Infallible, + sync::{ + atomic::{AtomicBool, Ordering}, + Arc, + }, + time::Duration, +}; + +use dashmap::DashSet; +use redis::{ + streams::{StreamReadOptions, StreamReadReply}, + AsyncCommands, FromRedisValue, Value, +}; +use serde::Deserialize; +use tokio::sync::Mutex; +use tokio_util::sync::CancellationToken; +use tracing::info; + +use crate::{ + 
config::EndpointCacheConfig, + context::RequestMonitoring, + intern::{BranchIdInt, EndpointIdInt, ProjectIdInt}, + metrics::{Metrics, RedisErrors, RedisEventsCount}, + rate_limiter::GlobalRateLimiter, + redis::connection_with_credentials_provider::ConnectionWithCredentialsProvider, + EndpointId, +}; + +#[derive(Deserialize, Debug, Clone)] +pub struct ControlPlaneEventKey { + endpoint_created: Option, + branch_created: Option, + project_created: Option, +} +#[derive(Deserialize, Debug, Clone)] +struct EndpointCreated { + endpoint_id: String, +} +#[derive(Deserialize, Debug, Clone)] +struct BranchCreated { + branch_id: String, +} +#[derive(Deserialize, Debug, Clone)] +struct ProjectCreated { + project_id: String, +} + +pub struct EndpointsCache { + config: EndpointCacheConfig, + endpoints: DashSet, + branches: DashSet, + projects: DashSet, + ready: AtomicBool, + limiter: Arc>, +} + +impl EndpointsCache { + pub fn new(config: EndpointCacheConfig) -> Self { + Self { + limiter: Arc::new(Mutex::new(GlobalRateLimiter::new( + config.limiter_info.clone(), + ))), + config, + endpoints: DashSet::new(), + branches: DashSet::new(), + projects: DashSet::new(), + ready: AtomicBool::new(false), + } + } + pub async fn is_valid(&self, ctx: &mut RequestMonitoring, endpoint: &EndpointId) -> bool { + if !self.ready.load(Ordering::Acquire) { + return true; + } + let rejected = self.should_reject(endpoint); + ctx.set_rejected(rejected); + info!(?rejected, "check endpoint is valid, disabled cache"); + // If cache is disabled, just collect the metrics and return or + // If the limiter allows, we don't need to check the cache. 
+ if self.config.disable_cache || self.limiter.lock().await.check() { + return true; + } + !rejected + } + fn should_reject(&self, endpoint: &EndpointId) -> bool { + if endpoint.is_endpoint() { + !self.endpoints.contains(&EndpointIdInt::from(endpoint)) + } else if endpoint.is_branch() { + !self + .branches + .contains(&BranchIdInt::from(&endpoint.as_branch())) + } else { + !self + .projects + .contains(&ProjectIdInt::from(&endpoint.as_project())) + } + } + fn insert_event(&self, key: ControlPlaneEventKey) { + // Do not do normalization here, we expect the events to be normalized. + if let Some(endpoint_created) = key.endpoint_created { + self.endpoints + .insert(EndpointIdInt::from(&endpoint_created.endpoint_id.into())); + Metrics::get() + .proxy + .redis_events_count + .inc(RedisEventsCount::EndpointCreated); + } + if let Some(branch_created) = key.branch_created { + self.branches + .insert(BranchIdInt::from(&branch_created.branch_id.into())); + Metrics::get() + .proxy + .redis_events_count + .inc(RedisEventsCount::BranchCreated); + } + if let Some(project_created) = key.project_created { + self.projects + .insert(ProjectIdInt::from(&project_created.project_id.into())); + Metrics::get() + .proxy + .redis_events_count + .inc(RedisEventsCount::ProjectCreated); + } + } + pub async fn do_read( + &self, + mut con: ConnectionWithCredentialsProvider, + cancellation_token: CancellationToken, + ) -> anyhow::Result { + let mut last_id = "0-0".to_string(); + loop { + if let Err(e) = con.connect().await { + tracing::error!("error connecting to redis: {:?}", e); + self.ready.store(false, Ordering::Release); + } + if let Err(e) = self.read_from_stream(&mut con, &mut last_id).await { + tracing::error!("error reading from redis: {:?}", e); + self.ready.store(false, Ordering::Release); + } + if cancellation_token.is_cancelled() { + info!("cancellation token is cancelled, exiting"); + tokio::time::sleep(Duration::from_secs(60 * 60 * 24 * 7)).await; + // 1 week. 
+ } + tokio::time::sleep(self.config.retry_interval).await; + } + } + async fn read_from_stream( + &self, + con: &mut ConnectionWithCredentialsProvider, + last_id: &mut String, + ) -> anyhow::Result<()> { + tracing::info!("reading endpoints/branches/projects from redis"); + self.batch_read( + con, + StreamReadOptions::default().count(self.config.initial_batch_size), + last_id, + true, + ) + .await?; + tracing::info!("ready to filter user requests"); + self.ready.store(true, Ordering::Release); + self.batch_read( + con, + StreamReadOptions::default() + .count(self.config.default_batch_size) + .block(self.config.xread_timeout.as_millis() as usize), + last_id, + false, + ) + .await + } + fn parse_key_value(value: &Value) -> anyhow::Result { + let s: String = FromRedisValue::from_redis_value(value)?; + Ok(serde_json::from_str(&s)?) + } + async fn batch_read( + &self, + conn: &mut ConnectionWithCredentialsProvider, + opts: StreamReadOptions, + last_id: &mut String, + return_when_finish: bool, + ) -> anyhow::Result<()> { + let mut total: usize = 0; + loop { + let mut res: StreamReadReply = conn + .xread_options(&[&self.config.stream_name], &[last_id.as_str()], &opts) + .await?; + + if res.keys.is_empty() { + if return_when_finish { + if total != 0 { + break; + } + anyhow::bail!( + "Redis stream {} is empty, cannot be used to filter endpoints", + self.config.stream_name + ); + } + // If we are not returning when finish, we should wait for more data. 
+ continue; + } + if res.keys.len() != 1 { + anyhow::bail!("Cannot read from redis stream {}", self.config.stream_name); + } + + let res = res.keys.pop().expect("Checked length above"); + let len = res.ids.len(); + for x in res.ids { + total += 1; + for (_, v) in x.map { + let key = match Self::parse_key_value(&v) { + Ok(x) => x, + Err(e) => { + Metrics::get().proxy.redis_errors_total.inc(RedisErrors { + channel: &self.config.stream_name, + }); + tracing::error!("error parsing value {v:?}: {e:?}"); + continue; + } + }; + self.insert_event(key); + } + if total.is_power_of_two() { + tracing::debug!("endpoints read {}", total); + } + *last_id = x.id; + } + if return_when_finish && len <= self.config.default_batch_size { + break; + } + } + tracing::info!("read {} endpoints/branches/projects from redis", total); + Ok(()) + } +} + +#[cfg(test)] +mod tests { + use super::ControlPlaneEventKey; + + #[test] + fn test() { + let s = "{\"branch_created\":null,\"endpoint_created\":{\"endpoint_id\":\"ep-rapid-thunder-w0qqw2q9\"},\"project_created\":null,\"type\":\"endpoint_created\"}"; + let _: ControlPlaneEventKey = serde_json::from_str(s).unwrap(); + } +} diff --git a/proxy/src/cache/project_info.rs b/proxy/src/cache/project_info.rs index d8a1d261ce..10cc4ceee1 100644 --- a/proxy/src/cache/project_info.rs +++ b/proxy/src/cache/project_info.rs @@ -5,9 +5,11 @@ use std::{ time::Duration, }; +use async_trait::async_trait; use dashmap::DashMap; use rand::{thread_rng, Rng}; use smol_str::SmolStr; +use tokio::sync::Mutex; use tokio::time::Instant; use tracing::{debug, info}; @@ -21,11 +23,12 @@ use crate::{ use super::{Cache, Cached}; +#[async_trait] pub trait ProjectInfoCache { fn invalidate_allowed_ips_for_project(&self, project_id: ProjectIdInt); fn invalidate_role_secret_for_project(&self, project_id: ProjectIdInt, role_name: RoleNameInt); - fn enable_ttl(&self); - fn disable_ttl(&self); + async fn decrement_active_listeners(&self); + async fn increment_active_listeners(&self); } 
struct Entry { @@ -116,8 +119,10 @@ pub struct ProjectInfoCacheImpl { start_time: Instant, ttl_disabled_since_us: AtomicU64, + active_listeners_lock: Mutex, } +#[async_trait] impl ProjectInfoCache for ProjectInfoCacheImpl { fn invalidate_allowed_ips_for_project(&self, project_id: ProjectIdInt) { info!("invalidating allowed ips for project `{}`", project_id); @@ -148,15 +153,27 @@ impl ProjectInfoCache for ProjectInfoCacheImpl { } } } - fn enable_ttl(&self) { - self.ttl_disabled_since_us - .store(u64::MAX, std::sync::atomic::Ordering::Relaxed); + async fn decrement_active_listeners(&self) { + let mut listeners_guard = self.active_listeners_lock.lock().await; + if *listeners_guard == 0 { + tracing::error!("active_listeners count is already 0, something is broken"); + return; + } + *listeners_guard -= 1; + if *listeners_guard == 0 { + self.ttl_disabled_since_us + .store(u64::MAX, std::sync::atomic::Ordering::SeqCst); + } } - fn disable_ttl(&self) { - let new_ttl = (self.start_time.elapsed() + self.config.ttl).as_micros() as u64; - self.ttl_disabled_since_us - .store(new_ttl, std::sync::atomic::Ordering::Relaxed); + async fn increment_active_listeners(&self) { + let mut listeners_guard = self.active_listeners_lock.lock().await; + *listeners_guard += 1; + if *listeners_guard == 1 { + let new_ttl = (self.start_time.elapsed() + self.config.ttl).as_micros() as u64; + self.ttl_disabled_since_us + .store(new_ttl, std::sync::atomic::Ordering::SeqCst); + } } } @@ -168,6 +185,7 @@ impl ProjectInfoCacheImpl { config, ttl_disabled_since_us: AtomicU64::new(u64::MAX), start_time: Instant::now(), + active_listeners_lock: Mutex::new(0), } } @@ -432,7 +450,7 @@ mod tests { ttl: Duration::from_secs(1), gc_interval: Duration::from_secs(600), })); - cache.clone().disable_ttl(); + cache.clone().increment_active_listeners().await; tokio::time::advance(Duration::from_secs(2)).await; let project_id: ProjectId = "project".into(); @@ -489,7 +507,7 @@ mod tests { } #[tokio::test] - async fn 
test_disable_ttl_invalidate_added_before() { + async fn test_increment_active_listeners_invalidate_added_before() { tokio::time::pause(); let cache = Arc::new(ProjectInfoCacheImpl::new(ProjectInfoCacheOptions { size: 2, @@ -514,7 +532,7 @@ mod tests { (&user1).into(), secret1.clone(), ); - cache.clone().disable_ttl(); + cache.clone().increment_active_listeners().await; tokio::time::advance(Duration::from_millis(100)).await; cache.insert_role_secret( (&project_id).into(), diff --git a/proxy/src/cancellation.rs b/proxy/src/cancellation.rs index 6151513614..34512e9f5b 100644 --- a/proxy/src/cancellation.rs +++ b/proxy/src/cancellation.rs @@ -10,7 +10,7 @@ use uuid::Uuid; use crate::{ error::ReportableError, - metrics::NUM_CANCELLATION_REQUESTS, + metrics::{CancellationRequest, CancellationSource, Metrics}, redis::cancellation_publisher::{ CancellationPublisher, CancellationPublisherMut, RedisPublisherClient, }, @@ -28,7 +28,7 @@ pub struct CancellationHandler

{ client: P, /// This field used for the monitoring purposes. /// Represents the source of the cancellation request. - from: &'static str, + from: CancellationSource, } #[derive(Debug, Error)] @@ -89,9 +89,13 @@ impl CancellationHandler

{ // NB: we should immediately release the lock after cloning the token. let Some(cancel_closure) = self.map.get(&key).and_then(|x| x.clone()) else { tracing::warn!("query cancellation key not found: {key}"); - NUM_CANCELLATION_REQUESTS - .with_label_values(&[self.from, "not_found"]) - .inc(); + Metrics::get() + .proxy + .cancellation_requests_total + .inc(CancellationRequest { + source: self.from, + kind: crate::metrics::CancellationOutcome::NotFound, + }); match self.client.try_publish(key, session_id).await { Ok(()) => {} // do nothing Err(e) => { @@ -103,9 +107,13 @@ impl CancellationHandler

{ } return Ok(()); }; - NUM_CANCELLATION_REQUESTS - .with_label_values(&[self.from, "found"]) - .inc(); + Metrics::get() + .proxy + .cancellation_requests_total + .inc(CancellationRequest { + source: self.from, + kind: crate::metrics::CancellationOutcome::Found, + }); info!("cancelling query per user's request using key {key}"); cancel_closure.try_cancel_query().await } @@ -122,7 +130,7 @@ impl CancellationHandler

{ } impl CancellationHandler<()> { - pub fn new(map: CancelMap, from: &'static str) -> Self { + pub fn new(map: CancelMap, from: CancellationSource) -> Self { Self { map, client: (), @@ -132,7 +140,7 @@ impl CancellationHandler<()> { } impl CancellationHandler>>> { - pub fn new(map: CancelMap, client: Option>>, from: &'static str) -> Self { + pub fn new(map: CancelMap, client: Option>>, from: CancellationSource) -> Self { Self { map, client, from } } } @@ -192,15 +200,13 @@ impl

Drop for Session

{ #[cfg(test)] mod tests { - use crate::metrics::NUM_CANCELLATION_REQUESTS_SOURCE_FROM_REDIS; - use super::*; #[tokio::test] async fn check_session_drop() -> anyhow::Result<()> { let cancellation_handler = Arc::new(CancellationHandler::<()>::new( CancelMap::default(), - NUM_CANCELLATION_REQUESTS_SOURCE_FROM_REDIS, + CancellationSource::FromRedis, )); let session = cancellation_handler.clone().get_session(); @@ -214,7 +220,7 @@ mod tests { #[tokio::test] async fn cancel_session_noop_regression() { - let handler = CancellationHandler::<()>::new(Default::default(), "local"); + let handler = CancellationHandler::<()>::new(Default::default(), CancellationSource::Local); handler .cancel_session( CancelKeyData { diff --git a/proxy/src/compute.rs b/proxy/src/compute.rs index ee33b97fbd..23266ac4ef 100644 --- a/proxy/src/compute.rs +++ b/proxy/src/compute.rs @@ -4,12 +4,12 @@ use crate::{ console::{errors::WakeComputeError, messages::MetricsAuxInfo}, context::RequestMonitoring, error::{ReportableError, UserFacingError}, - metrics::NUM_DB_CONNECTIONS_GAUGE, + metrics::{Metrics, NumDbConnectionsGuard}, proxy::neon_option, + Host, }; use futures::{FutureExt, TryFutureExt}; use itertools::Itertools; -use metrics::IntCounterPairGuard; use pq_proto::StartupMessageParams; use std::{io, net::SocketAddr, time::Duration}; use thiserror::Error; @@ -102,6 +102,16 @@ impl ConnCfg { } } + pub fn get_host(&self) -> Result { + match self.0.get_hosts() { + [tokio_postgres::config::Host::Tcp(s)] => Ok(s.into()), + // we should not have multiple address or unix addresses. + _ => Err(WakeComputeError::BadComputeAddress( + "invalid compute address".into(), + )), + } + } + /// Apply startup message params to the connection config. pub fn set_startup_params(&mut self, params: &StartupMessageParams) { // Only set `user` if it's not present in the config. @@ -249,7 +259,7 @@ pub struct PostgresConnection { /// Labels for proxy's metrics. 
pub aux: MetricsAuxInfo, - _guage: IntCounterPairGuard, + _guage: NumDbConnectionsGuard<'static>, } impl ConnCfg { @@ -261,7 +271,9 @@ impl ConnCfg { aux: MetricsAuxInfo, timeout: Duration, ) -> Result { + let pause = ctx.latency_timer.pause(crate::metrics::Waiting::Compute); let (socket_addr, stream, host) = self.connect_raw(timeout).await?; + drop(pause); let tls_connector = native_tls::TlsConnector::builder() .danger_accept_invalid_certs(allow_self_signed_compute) @@ -271,7 +283,9 @@ impl ConnCfg { let tls = MakeTlsConnect::::make_tls_connect(&mut mk_tls, host)?; // connect_raw() will not use TLS if sslmode is "disable" + let pause = ctx.latency_timer.pause(crate::metrics::Waiting::Compute); let (client, connection) = self.0.connect_raw(stream, tls).await?; + drop(pause); tracing::Span::current().record("pid", &tracing::field::display(client.get_process_id())); let stream = connection.stream.into_inner(); @@ -295,9 +309,7 @@ impl ConnCfg { params, cancel_closure, aux, - _guage: NUM_DB_CONNECTIONS_GAUGE - .with_label_values(&[ctx.protocol]) - .guard(), + _guage: Metrics::get().proxy.db_connections.guard(ctx.protocol), }; Ok(connection) diff --git a/proxy/src/config.rs b/proxy/src/config.rs index fc490c7348..0c8e284d0b 100644 --- a/proxy/src/config.rs +++ b/proxy/src/config.rs @@ -1,7 +1,9 @@ use crate::{ - auth, - rate_limiter::{AuthRateLimiter, RateBucketInfo}, + auth::{self, backend::AuthRateLimiter}, + console::locks::ApiLocks, + rate_limiter::RateBucketInfo, serverless::GlobalConnPoolOptions, + Host, }; use anyhow::{bail, ensure, Context, Ok}; use itertools::Itertools; @@ -29,11 +31,13 @@ pub struct ProxyConfig { pub authentication_config: AuthenticationConfig, pub require_client_ip: bool, pub disable_ip_check_for_http: bool, - pub endpoint_rps_limit: Vec, pub redis_rps_limit: Vec, pub region: String, pub handshake_timeout: Duration, pub aws_region: String, + pub wake_compute_retry_config: RetryConfig, + pub connect_compute_locks: ApiLocks, + pub 
connect_to_compute_retry_config: RetryConfig, } #[derive(Debug)] @@ -58,6 +62,7 @@ pub struct AuthenticationConfig { pub scram_protocol_timeout: tokio::time::Duration, pub rate_limiter_enabled: bool, pub rate_limiter: AuthRateLimiter, + pub rate_limit_ip_subnet: u8, } impl TlsConfig { @@ -313,6 +318,80 @@ impl CertResolver { } } +#[derive(Debug)] +pub struct EndpointCacheConfig { + /// Batch size to receive all endpoints on the startup. + pub initial_batch_size: usize, + /// Batch size to receive endpoints. + pub default_batch_size: usize, + /// Timeouts for the stream read operation. + pub xread_timeout: Duration, + /// Stream name to read from. + pub stream_name: String, + /// Limiter info (to distinguish when to enable cache). + pub limiter_info: Vec, + /// Disable cache. + /// If true, cache is ignored, but reports all statistics. + pub disable_cache: bool, + /// Retry interval for the stream read operation. + pub retry_interval: Duration, +} + +impl EndpointCacheConfig { + /// Default options for [`crate::console::provider::NodeInfoCache`]. + /// Notice that by default the limiter is empty, which means that cache is disabled. + pub const CACHE_DEFAULT_OPTIONS: &'static str = + "initial_batch_size=1000,default_batch_size=10,xread_timeout=5m,stream_name=controlPlane,disable_cache=true,limiter_info=1000@1s,retry_interval=1s"; + + /// Parse cache options passed via cmdline. + /// Example: [`Self::CACHE_DEFAULT_OPTIONS`]. 
+ fn parse(options: &str) -> anyhow::Result { + let mut initial_batch_size = None; + let mut default_batch_size = None; + let mut xread_timeout = None; + let mut stream_name = None; + let mut limiter_info = vec![]; + let mut disable_cache = false; + let mut retry_interval = None; + + for option in options.split(',') { + let (key, value) = option + .split_once('=') + .with_context(|| format!("bad key-value pair: {option}"))?; + + match key { + "initial_batch_size" => initial_batch_size = Some(value.parse()?), + "default_batch_size" => default_batch_size = Some(value.parse()?), + "xread_timeout" => xread_timeout = Some(humantime::parse_duration(value)?), + "stream_name" => stream_name = Some(value.to_string()), + "limiter_info" => limiter_info.push(RateBucketInfo::from_str(value)?), + "disable_cache" => disable_cache = value.parse()?, + "retry_interval" => retry_interval = Some(humantime::parse_duration(value)?), + unknown => bail!("unknown key: {unknown}"), + } + } + RateBucketInfo::validate(&mut limiter_info)?; + + Ok(Self { + initial_batch_size: initial_batch_size.context("missing `initial_batch_size`")?, + default_batch_size: default_batch_size.context("missing `default_batch_size`")?, + xread_timeout: xread_timeout.context("missing `xread_timeout`")?, + stream_name: stream_name.context("missing `stream_name`")?, + disable_cache, + limiter_info, + retry_interval: retry_interval.context("missing `retry_interval`")?, + }) + } +} + +impl FromStr for EndpointCacheConfig { + type Err = anyhow::Error; + + fn from_str(options: &str) -> Result { + let error = || format!("failed to parse endpoint cache options '{options}'"); + Self::parse(options).with_context(error) + } +} #[derive(Debug)] pub struct MetricBackupCollectionConfig { pub interval: Duration, @@ -443,8 +522,61 @@ impl FromStr for ProjectInfoCacheOptions { } } +/// This is a config for connect to compute and wake compute. 
+#[derive(Clone, Copy, Debug)] +pub struct RetryConfig { + /// Number of times we should retry. + pub max_retries: u32, + /// Retry duration is base_delay * backoff_factor ^ n, where n starts at 0 + pub base_delay: tokio::time::Duration, + /// Exponential base for retry wait duration + pub backoff_factor: f64, +} + +impl RetryConfig { + /// Default options for RetryConfig. + + /// Total delay for 8 retries with 100ms base delay and 1.6 backoff factor is about 7s. + pub const CONNECT_TO_COMPUTE_DEFAULT_VALUES: &'static str = + "num_retries=8,base_retry_wait_duration=100ms,retry_wait_exponent_base=1.6"; + /// Total delay for 8 retries with 100ms base delay and 1.6 backoff factor is about 7s. + /// Cplane has timeout of 60s on each request. 8m7s in total. + pub const WAKE_COMPUTE_DEFAULT_VALUES: &'static str = + "num_retries=8,base_retry_wait_duration=100ms,retry_wait_exponent_base=1.6"; + + /// Parse retry options passed via cmdline. + /// Example: [`Self::CONNECT_TO_COMPUTE_DEFAULT_VALUES`]. + pub fn parse(options: &str) -> anyhow::Result { + let mut num_retries = None; + let mut base_retry_wait_duration = None; + let mut retry_wait_exponent_base = None; + + for option in options.split(',') { + let (key, value) = option + .split_once('=') + .with_context(|| format!("bad key-value pair: {option}"))?; + + match key { + "num_retries" => num_retries = Some(value.parse()?), + "base_retry_wait_duration" => { + base_retry_wait_duration = Some(humantime::parse_duration(value)?) + } + "retry_wait_exponent_base" => retry_wait_exponent_base = Some(value.parse()?), + unknown => bail!("unknown key: {unknown}"), + } + } + + Ok(Self { + max_retries: num_retries.context("missing `num_retries`")?, + base_delay: base_retry_wait_duration.context("missing `base_retry_wait_duration`")?, + backoff_factor: retry_wait_exponent_base + .context("missing `retry_wait_exponent_base`")?, + }) + } +} + /// Helper for cmdline cache options parsing. 
-pub struct WakeComputeLockOptions { +pub struct ConcurrencyLockOptions { /// The number of shards the lock map should have pub shards: usize, /// The number of allowed concurrent requests for each endpoitn @@ -455,9 +587,12 @@ pub struct WakeComputeLockOptions { pub timeout: Duration, } -impl WakeComputeLockOptions { +impl ConcurrencyLockOptions { /// Default options for [`crate::console::provider::ApiLocks`]. pub const DEFAULT_OPTIONS_WAKE_COMPUTE_LOCK: &'static str = "permits=0"; + /// Default options for [`crate::console::provider::ApiLocks`]. + pub const DEFAULT_OPTIONS_CONNECT_COMPUTE_LOCK: &'static str = + "shards=64,permits=50,epoch=10m,timeout=500ms"; // pub const DEFAULT_OPTIONS_WAKE_COMPUTE_LOCK: &'static str = "shards=32,permits=4,epoch=10m,timeout=1s"; @@ -507,7 +642,7 @@ impl WakeComputeLockOptions { } } -impl FromStr for WakeComputeLockOptions { +impl FromStr for ConcurrencyLockOptions { type Err = anyhow::Error; fn from_str(options: &str) -> Result { @@ -543,7 +678,7 @@ mod tests { #[test] fn test_parse_lock_options() -> anyhow::Result<()> { - let WakeComputeLockOptions { + let ConcurrencyLockOptions { epoch, permits, shards, @@ -554,7 +689,7 @@ mod tests { assert_eq!(shards, 32); assert_eq!(permits, 4); - let WakeComputeLockOptions { + let ConcurrencyLockOptions { epoch, permits, shards, @@ -565,7 +700,7 @@ mod tests { assert_eq!(shards, 16); assert_eq!(permits, 8); - let WakeComputeLockOptions { + let ConcurrencyLockOptions { epoch, permits, shards, diff --git a/proxy/src/console/messages.rs b/proxy/src/console/messages.rs index 45161f5ac8..9869b95768 100644 --- a/proxy/src/console/messages.rs +++ b/proxy/src/console/messages.rs @@ -1,3 +1,4 @@ +use measured::FixedCardinalityLabel; use serde::{Deserialize, Serialize}; use std::fmt; @@ -102,7 +103,7 @@ pub struct MetricsAuxInfo { pub cold_start_info: ColdStartInfo, } -#[derive(Debug, Default, Serialize, Deserialize, Clone, Copy)] +#[derive(Debug, Default, Serialize, Deserialize, Clone, Copy, 
FixedCardinalityLabel)] #[serde(rename_all = "snake_case")] pub enum ColdStartInfo { #[default] @@ -110,9 +111,11 @@ pub enum ColdStartInfo { /// Compute was already running Warm, #[serde(rename = "pool_hit")] + #[label(rename = "pool_hit")] /// Compute was not running but there was an available VM VmPoolHit, #[serde(rename = "pool_miss")] + #[label(rename = "pool_miss")] /// Compute was not running and there were no VMs available VmPoolMiss, diff --git a/proxy/src/console/provider.rs b/proxy/src/console/provider.rs index f7d621fb12..dfda29e0b1 100644 --- a/proxy/src/console/provider.rs +++ b/proxy/src/console/provider.rs @@ -8,15 +8,16 @@ use crate::{ backend::{ComputeCredentialKeys, ComputeUserInfo}, IpPattern, }, - cache::{project_info::ProjectInfoCacheImpl, Cached, TimedLru}, + cache::{endpoints::EndpointsCache, project_info::ProjectInfoCacheImpl, Cached, TimedLru}, compute, - config::{CacheOptions, ProjectInfoCacheOptions}, + config::{CacheOptions, EndpointCacheConfig, ProjectInfoCacheOptions}, context::RequestMonitoring, intern::ProjectIdInt, + metrics::ApiLockMetrics, scram, EndpointCacheKey, }; use dashmap::DashMap; -use std::{sync::Arc, time::Duration}; +use std::{hash::Hash, sync::Arc, time::Duration}; use tokio::sync::{OwnedSemaphorePermit, Semaphore}; use tokio::time::Instant; use tracing::info; @@ -207,6 +208,9 @@ pub mod errors { #[error(transparent)] ApiError(ApiError), + #[error("Too many connections attempts")] + TooManyConnections, + #[error("Timeout waiting to acquire wake compute lock")] TimeoutError, } @@ -239,6 +243,8 @@ pub mod errors { // However, API might return a meaningful error. 
ApiError(e) => e.to_string_client(), + TooManyConnections => self.to_string(), + TimeoutError => "timeout while acquiring the compute resource lock".to_owned(), } } @@ -249,6 +255,7 @@ pub mod errors { match self { WakeComputeError::BadComputeAddress(_) => crate::error::ErrorKind::ControlPlane, WakeComputeError::ApiError(e) => e.get_error_kind(), + WakeComputeError::TooManyConnections => crate::error::ErrorKind::RateLimit, WakeComputeError::TimeoutError => crate::error::ErrorKind::ServiceRateLimit, } } @@ -416,12 +423,15 @@ pub struct ApiCaches { pub node_info: NodeInfoCache, /// Cache which stores project_id -> endpoint_ids mapping. pub project_info: Arc, + /// List of all valid endpoints. + pub endpoints_cache: Arc, } impl ApiCaches { pub fn new( wake_compute_cache_config: CacheOptions, project_info_cache_config: ProjectInfoCacheOptions, + endpoint_cache_config: EndpointCacheConfig, ) -> Self { Self { node_info: NodeInfoCache::new( @@ -431,83 +441,41 @@ impl ApiCaches { true, ), project_info: Arc::new(ProjectInfoCacheImpl::new(project_info_cache_config)), + endpoints_cache: Arc::new(EndpointsCache::new(endpoint_cache_config)), } } } /// Various caches for [`console`](super). 
-pub struct ApiLocks { +pub struct ApiLocks { name: &'static str, - node_locks: DashMap>, + node_locks: DashMap>, permits: usize, timeout: Duration, - registered: prometheus::IntCounter, - unregistered: prometheus::IntCounter, - reclamation_lag: prometheus::Histogram, - lock_acquire_lag: prometheus::Histogram, + epoch: std::time::Duration, + metrics: &'static ApiLockMetrics, } -impl ApiLocks { +impl ApiLocks { pub fn new( name: &'static str, permits: usize, shards: usize, timeout: Duration, + epoch: std::time::Duration, + metrics: &'static ApiLockMetrics, ) -> prometheus::Result { - let registered = prometheus::IntCounter::with_opts( - prometheus::Opts::new( - "semaphores_registered", - "Number of semaphores registered in this api lock", - ) - .namespace(name), - )?; - prometheus::register(Box::new(registered.clone()))?; - let unregistered = prometheus::IntCounter::with_opts( - prometheus::Opts::new( - "semaphores_unregistered", - "Number of semaphores unregistered in this api lock", - ) - .namespace(name), - )?; - prometheus::register(Box::new(unregistered.clone()))?; - let reclamation_lag = prometheus::Histogram::with_opts( - prometheus::HistogramOpts::new( - "reclamation_lag_seconds", - "Time it takes to reclaim unused semaphores in the api lock", - ) - .namespace(name) - // 1us -> 65ms - // benchmarks on my mac indicate it's usually in the range of 256us and 512us - .buckets(prometheus::exponential_buckets(1e-6, 2.0, 16)?), - )?; - prometheus::register(Box::new(reclamation_lag.clone()))?; - let lock_acquire_lag = prometheus::Histogram::with_opts( - prometheus::HistogramOpts::new( - "semaphore_acquire_seconds", - "Time it takes to reclaim unused semaphores in the api lock", - ) - .namespace(name) - // 0.1ms -> 6s - .buckets(prometheus::exponential_buckets(1e-4, 2.0, 16)?), - )?; - prometheus::register(Box::new(lock_acquire_lag.clone()))?; - Ok(Self { name, node_locks: DashMap::with_shard_amount(shards), permits, timeout, - lock_acquire_lag, - registered, - 
unregistered, - reclamation_lag, + epoch, + metrics, }) } - pub async fn get_wake_compute_permit( - &self, - key: &EndpointCacheKey, - ) -> Result { + pub async fn get_permit(&self, key: &K) -> Result { if self.permits == 0 { return Ok(WakeComputePermit { permit: None }); } @@ -520,7 +488,7 @@ impl ApiLocks { self.node_locks .entry(key.clone()) .or_insert_with(|| { - self.registered.inc(); + self.metrics.semaphores_registered.inc(); Arc::new(Semaphore::new(self.permits)) }) .clone() @@ -528,20 +496,21 @@ impl ApiLocks { }; let permit = tokio::time::timeout_at(now + self.timeout, semaphore.acquire_owned()).await; - self.lock_acquire_lag - .observe((Instant::now() - now).as_secs_f64()); + self.metrics + .semaphore_acquire_seconds + .observe(now.elapsed().as_secs_f64()); Ok(WakeComputePermit { permit: Some(permit??), }) } - pub async fn garbage_collect_worker(&self, epoch: std::time::Duration) { + pub async fn garbage_collect_worker(&self) { if self.permits == 0 { return; } - - let mut interval = tokio::time::interval(epoch / (self.node_locks.shards().len()) as u32); + let mut interval = + tokio::time::interval(self.epoch / (self.node_locks.shards().len()) as u32); loop { for (i, shard) in self.node_locks.shards().iter().enumerate() { interval.tick().await; @@ -554,13 +523,13 @@ impl ApiLocks { "performing epoch reclamation on api lock" ); let mut lock = shard.write(); - let timer = self.reclamation_lag.start_timer(); + let timer = self.metrics.reclamation_lag_seconds.start_timer(); let count = lock .extract_if(|_, semaphore| Arc::strong_count(semaphore.get_mut()) == 1) .count(); drop(lock); - self.unregistered.inc_by(count as u64); - timer.observe_duration() + self.metrics.semaphores_unregistered.inc_by(count as u64); + timer.observe(); } } } diff --git a/proxy/src/console/provider/neon.rs b/proxy/src/console/provider/neon.rs index 1a3e2ca795..ec66641d01 100644 --- a/proxy/src/console/provider/neon.rs +++ b/proxy/src/console/provider/neon.rs @@ -7,13 +7,15 @@ use 
super::{ NodeInfo, }; use crate::{ - auth::backend::ComputeUserInfo, compute, console::messages::ColdStartInfo, http, scram, -}; -use crate::{ - cache::Cached, - context::RequestMonitoring, - metrics::{ALLOWED_IPS_BY_CACHE_OUTCOME, ALLOWED_IPS_NUMBER}, + auth::backend::ComputeUserInfo, + compute, + console::messages::ColdStartInfo, + http, + metrics::{CacheOutcome, Metrics}, + rate_limiter::EndpointRateLimiter, + scram, EndpointCacheKey, Normalize, }; +use crate::{cache::Cached, context::RequestMonitoring}; use futures::TryFutureExt; use std::sync::Arc; use tokio::time::Instant; @@ -23,7 +25,8 @@ use tracing::{error, info, info_span, warn, Instrument}; pub struct Api { endpoint: http::Endpoint, pub caches: &'static ApiCaches, - locks: &'static ApiLocks, + pub locks: &'static ApiLocks, + pub endpoint_rate_limiter: Arc, jwt: String, } @@ -32,7 +35,8 @@ impl Api { pub fn new( endpoint: http::Endpoint, caches: &'static ApiCaches, - locks: &'static ApiLocks, + locks: &'static ApiLocks, + endpoint_rate_limiter: Arc, ) -> Self { let jwt: String = match std::env::var("NEON_PROXY_TO_CONTROLPLANE_TOKEN") { Ok(v) => v, @@ -42,6 +46,7 @@ impl Api { endpoint, caches, locks, + endpoint_rate_limiter, jwt, } } @@ -55,6 +60,15 @@ impl Api { ctx: &mut RequestMonitoring, user_info: &ComputeUserInfo, ) -> Result { + if !self + .caches + .endpoints_cache + .is_valid(ctx, &user_info.endpoint.normalize()) + .await + { + info!("endpoint is not valid, skipping the request"); + return Ok(AuthInfo::default()); + } let request_id = ctx.session_id.to_string(); let application_name = ctx.console_application_name(); async { @@ -81,7 +95,9 @@ impl Api { Ok(body) => body, // Error 404 is special: it's ok not to have a secret. 
Err(e) => match e.http_status_code() { - Some(http::StatusCode::NOT_FOUND) => return Ok(AuthInfo::default()), + Some(http::StatusCode::NOT_FOUND) => { + return Ok(AuthInfo::default()); + } _otherwise => return Err(e.into()), }, }; @@ -95,7 +111,10 @@ impl Api { Some(secret) }; let allowed_ips = body.allowed_ips.unwrap_or_default(); - ALLOWED_IPS_NUMBER.observe(allowed_ips.len() as f64); + Metrics::get() + .proxy + .allowed_ips_number + .observe(allowed_ips.len() as f64); Ok(AuthInfo { secret, allowed_ips, @@ -174,23 +193,27 @@ impl super::Api for Api { ctx: &mut RequestMonitoring, user_info: &ComputeUserInfo, ) -> Result { - let ep = &user_info.endpoint; + let normalized_ep = &user_info.endpoint.normalize(); let user = &user_info.user; - if let Some(role_secret) = self.caches.project_info.get_role_secret(ep, user) { + if let Some(role_secret) = self + .caches + .project_info + .get_role_secret(normalized_ep, user) + { return Ok(role_secret); } let auth_info = self.do_get_auth_info(ctx, user_info).await?; if let Some(project_id) = auth_info.project_id { - let ep_int = ep.into(); + let normalized_ep_int = normalized_ep.into(); self.caches.project_info.insert_role_secret( project_id, - ep_int, + normalized_ep_int, user.into(), auth_info.secret.clone(), ); self.caches.project_info.insert_allowed_ips( project_id, - ep_int, + normalized_ep_int, Arc::new(auth_info.allowed_ips), ); ctx.set_project_id(project_id); @@ -204,30 +227,34 @@ impl super::Api for Api { ctx: &mut RequestMonitoring, user_info: &ComputeUserInfo, ) -> Result<(CachedAllowedIps, Option), GetAuthInfoError> { - let ep = &user_info.endpoint; - if let Some(allowed_ips) = self.caches.project_info.get_allowed_ips(ep) { - ALLOWED_IPS_BY_CACHE_OUTCOME - .with_label_values(&["hit"]) - .inc(); + let normalized_ep = &user_info.endpoint.normalize(); + if let Some(allowed_ips) = self.caches.project_info.get_allowed_ips(normalized_ep) { + Metrics::get() + .proxy + .allowed_ips_cache_misses + .inc(CacheOutcome::Hit); 
return Ok((allowed_ips, None)); } - ALLOWED_IPS_BY_CACHE_OUTCOME - .with_label_values(&["miss"]) - .inc(); + Metrics::get() + .proxy + .allowed_ips_cache_misses + .inc(CacheOutcome::Miss); let auth_info = self.do_get_auth_info(ctx, user_info).await?; let allowed_ips = Arc::new(auth_info.allowed_ips); let user = &user_info.user; if let Some(project_id) = auth_info.project_id { - let ep_int = ep.into(); + let normalized_ep_int = normalized_ep.into(); self.caches.project_info.insert_role_secret( project_id, - ep_int, + normalized_ep_int, user.into(), auth_info.secret.clone(), ); - self.caches - .project_info - .insert_allowed_ips(project_id, ep_int, allowed_ips.clone()); + self.caches.project_info.insert_allowed_ips( + project_id, + normalized_ep_int, + allowed_ips.clone(), + ); ctx.set_project_id(project_id); } Ok(( @@ -254,7 +281,15 @@ impl super::Api for Api { return Ok(cached); } - let permit = self.locks.get_wake_compute_permit(&key).await?; + // check rate limit + if !self + .endpoint_rate_limiter + .check(user_info.endpoint.normalize().into(), 1) + { + return Err(WakeComputeError::TooManyConnections); + } + + let permit = self.locks.get_permit(&key).await?; // after getting back a permit - it's possible the cache was filled // double check diff --git a/proxy/src/context.rs b/proxy/src/context.rs index fec95f4722..dfd3ef108e 100644 --- a/proxy/src/context.rs +++ b/proxy/src/context.rs @@ -5,14 +5,14 @@ use once_cell::sync::OnceCell; use smol_str::SmolStr; use std::net::IpAddr; use tokio::sync::mpsc; -use tracing::{field::display, info_span, Span}; +use tracing::{field::display, info, info_span, Span}; use uuid::Uuid; use crate::{ console::messages::{ColdStartInfo, MetricsAuxInfo}, error::ErrorKind, intern::{BranchIdInt, ProjectIdInt}, - metrics::{LatencyTimer, ENDPOINT_ERRORS_BY_KIND, ERROR_BY_KIND}, + metrics::{ConnectOutcome, InvalidEndpointsGroup, LatencyTimer, Metrics, Protocol}, DbName, EndpointId, RoleName, }; @@ -20,7 +20,8 @@ use 
self::parquet::RequestData; pub mod parquet; -static LOG_CHAN: OnceCell> = OnceCell::new(); +pub static LOG_CHAN: OnceCell> = OnceCell::new(); +pub static LOG_CHAN_DISCONNECT: OnceCell> = OnceCell::new(); /// Context data for a single request to connect to a database. /// @@ -29,7 +30,7 @@ static LOG_CHAN: OnceCell> = OnceCell::ne pub struct RequestMonitoring { pub peer_addr: IpAddr, pub session_id: Uuid, - pub protocol: &'static str, + pub protocol: Protocol, first_packet: chrono::DateTime, region: &'static str, pub span: Span, @@ -49,7 +50,12 @@ pub struct RequestMonitoring { // extra // This sender is here to keep the request monitoring channel open while requests are taking place. sender: Option>, + // This sender is only used to log the length of session in case of success. + disconnect_sender: Option>, pub latency_timer: LatencyTimer, + // Whether proxy decided that it's not a valid endpoint end rejected it before going to cplane. + rejected: Option, + disconnect_timestamp: Option>, } #[derive(Clone, Debug)] @@ -65,7 +71,7 @@ impl RequestMonitoring { pub fn new( session_id: Uuid, peer_addr: IpAddr, - protocol: &'static str, + protocol: Protocol, region: &'static str, ) -> Self { let span = info_span!( @@ -74,6 +80,7 @@ impl RequestMonitoring { ?session_id, %peer_addr, ep = tracing::field::Empty, + role = tracing::field::Empty, ); Self { @@ -93,16 +100,19 @@ impl RequestMonitoring { error_kind: None, auth_method: None, success: false, + rejected: None, cold_start_info: ColdStartInfo::Unknown, sender: LOG_CHAN.get().and_then(|tx| tx.upgrade()), + disconnect_sender: LOG_CHAN_DISCONNECT.get().and_then(|tx| tx.upgrade()), latency_timer: LatencyTimer::new(protocol), + disconnect_timestamp: None, } } #[cfg(test)] pub fn test() -> Self { - RequestMonitoring::new(Uuid::now_v7(), [127, 0, 0, 1].into(), "test", "test") + RequestMonitoring::new(Uuid::now_v7(), [127, 0, 0, 1].into(), Protocol::Tcp, "test") } pub fn console_application_name(&self) -> String { @@ -113,6 
+123,10 @@ impl RequestMonitoring { ) } + pub fn set_rejected(&mut self, rejected: bool) { + self.rejected = Some(rejected); + } + pub fn set_cold_start_info(&mut self, info: ColdStartInfo) { self.cold_start_info = info; self.latency_timer.cold_start_info(info); @@ -134,9 +148,9 @@ impl RequestMonitoring { pub fn set_endpoint_id(&mut self, endpoint_id: EndpointId) { if self.endpoint_id.is_none() { self.span.record("ep", display(&endpoint_id)); - crate::metrics::CONNECTING_ENDPOINTS - .with_label_values(&[self.protocol]) - .measure(&endpoint_id); + let metric = &Metrics::get().proxy.connecting_endpoints; + let label = metric.with_labels(self.protocol); + metric.get_metric(label).measure(&endpoint_id); self.endpoint_id = Some(endpoint_id); } } @@ -150,6 +164,7 @@ impl RequestMonitoring { } pub fn set_user(&mut self, user: RoleName) { + self.span.record("role", display(&user)); self.user = Some(user); } @@ -157,14 +172,22 @@ impl RequestMonitoring { self.auth_method = Some(auth_method); } + pub fn has_private_peer_addr(&self) -> bool { + match self.peer_addr { + IpAddr::V4(ip) => ip.is_private(), + _ => false, + } + } + pub fn set_error_kind(&mut self, kind: ErrorKind) { - ERROR_BY_KIND - .with_label_values(&[kind.to_metric_label()]) - .inc(); + // Do not record errors from the private address to metrics. 
+ if !self.has_private_peer_addr() { + Metrics::get().proxy.errors_total.inc(kind); + } if let Some(ep) = &self.endpoint_id { - ENDPOINT_ERRORS_BY_KIND - .with_label_values(&[kind.to_metric_label()]) - .measure(ep); + let metric = &Metrics::get().proxy.endpoints_affected_by_errors; + let label = metric.with_labels(kind); + metric.get_metric(label).measure(ep); } self.error_kind = Some(kind); } @@ -173,13 +196,55 @@ impl RequestMonitoring { self.success = true; } - pub fn log(self) {} -} - -impl Drop for RequestMonitoring { - fn drop(&mut self) { + pub fn log_connect(&mut self) { + let outcome = if self.success { + ConnectOutcome::Success + } else { + ConnectOutcome::Failed + }; + if let Some(rejected) = self.rejected { + let ep = self + .endpoint_id + .as_ref() + .map(|x| x.as_str()) + .unwrap_or_default(); + // This makes sense only if cache is disabled + info!( + ?outcome, + ?rejected, + ?ep, + "check endpoint is valid with outcome" + ); + Metrics::get() + .proxy + .invalid_endpoints_total + .inc(InvalidEndpointsGroup { + protocol: self.protocol, + rejected: rejected.into(), + outcome, + }); + } if let Some(tx) = self.sender.take() { let _: Result<(), _> = tx.send(RequestData::from(&*self)); } } + + fn log_disconnect(&mut self) { + // If we are here, it's guaranteed that the user successfully connected to the endpoint. + // Here we log the length of the session. 
+ self.disconnect_timestamp = Some(Utc::now()); + if let Some(tx) = self.disconnect_sender.take() { + let _: Result<(), _> = tx.send(RequestData::from(&*self)); + } + } +} + +impl Drop for RequestMonitoring { + fn drop(&mut self) { + if self.sender.is_some() { + self.log_connect(); + } else { + self.log_disconnect(); + } + } } diff --git a/proxy/src/context/parquet.rs b/proxy/src/context/parquet.rs index eb77409429..8104fe6087 100644 --- a/proxy/src/context/parquet.rs +++ b/proxy/src/context/parquet.rs @@ -19,7 +19,10 @@ use tokio_util::sync::CancellationToken; use tracing::{debug, info, Span}; use utils::backoff; -use crate::config::{remote_storage_from_toml, OptRemoteStorageConfig}; +use crate::{ + config::{remote_storage_from_toml, OptRemoteStorageConfig}, + context::LOG_CHAN_DISCONNECT, +}; use super::{RequestMonitoring, LOG_CHAN}; @@ -31,6 +34,9 @@ pub struct ParquetUploadArgs { #[clap(long, default_value = "{}", value_parser = remote_storage_from_toml)] parquet_upload_remote_storage: OptRemoteStorageConfig, + #[clap(long, default_value = "{}", value_parser = remote_storage_from_toml)] + parquet_upload_disconnect_events_remote_storage: OptRemoteStorageConfig, + /// How many rows to include in a row group #[clap(long, default_value_t = 8192)] parquet_upload_row_group_size: usize, @@ -91,6 +97,8 @@ pub struct RequestData { /// Tracks time from session start (HTTP request/libpq TCP handshake) /// Through to success/failure duration_us: u64, + /// If the session was successful after the disconnect, will be created one more event with filled `disconnect_timestamp`. 
+ disconnect_timestamp: Option, } impl From<&RequestMonitoring> for RequestData { @@ -111,7 +119,7 @@ impl From<&RequestMonitoring> for RequestData { super::AuthMethod::ScramSha256Plus => "scram_sha_256_plus", super::AuthMethod::Cleartext => "cleartext", }), - protocol: value.protocol, + protocol: value.protocol.as_str(), region: value.region, error: value.error_kind.as_ref().map(|e| e.to_metric_label()), success: value.success, @@ -120,6 +128,7 @@ impl From<&RequestMonitoring> for RequestData { .elapsed() .unwrap_or_default() .as_micros() as u64, // 584 millenia... good enough + disconnect_timestamp: value.disconnect_timestamp.map(|x| x.naive_utc()), } } } @@ -141,8 +150,9 @@ pub async fn worker( LOG_CHAN.set(tx.downgrade()).unwrap(); // setup row stream that will close on cancellation + let cancellation_token2 = cancellation_token.clone(); tokio::spawn(async move { - cancellation_token.cancelled().await; + cancellation_token2.cancelled().await; // dropping this sender will cause the channel to close only once // all the remaining inflight requests have been completed. drop(tx); @@ -167,9 +177,38 @@ pub async fn worker( test_remote_failures: 0, }; - worker_inner(storage, rx, parquet_config).await + // TODO(anna): consider moving this to a separate function. + if let Some(disconnect_events_storage_config) = + config.parquet_upload_disconnect_events_remote_storage + { + let (tx_disconnect, mut rx_disconnect) = mpsc::unbounded_channel(); + LOG_CHAN_DISCONNECT.set(tx_disconnect.downgrade()).unwrap(); + + // setup row stream that will close on cancellation + tokio::spawn(async move { + cancellation_token.cancelled().await; + // dropping this sender will cause the channel to close only once + // all the remaining inflight requests have been completed. 
+ drop(tx_disconnect); + }); + let rx_disconnect = futures::stream::poll_fn(move |cx| rx_disconnect.poll_recv(cx)); + let rx_disconnect = rx_disconnect.map(RequestData::from); + + let storage_disconnect = + GenericRemoteStorage::from_config(&disconnect_events_storage_config) + .context("remote storage for disconnect events init")?; + let parquet_config_disconnect = parquet_config.clone(); + tokio::try_join!( + worker_inner(storage, rx, parquet_config), + worker_inner(storage_disconnect, rx_disconnect, parquet_config_disconnect) + ) + .map(|_| ()) + } else { + worker_inner(storage, rx, parquet_config).await + } } +#[derive(Clone, Debug)] struct ParquetConfig { propeties: WriterPropertiesPtr, rows_per_group: usize, @@ -413,6 +452,7 @@ mod tests { ) .unwrap(), max_keys_per_list_response: DEFAULT_MAX_KEYS_PER_LIST_RESPONSE, + upload_storage_class: None, }), timeout: RemoteStorageConfig::DEFAULT_TIMEOUT, }) @@ -451,6 +491,7 @@ mod tests { success: rng.gen(), cold_start_info: "no", duration_us: rng.gen_range(0..30_000_000), + disconnect_timestamp: None, } } @@ -519,15 +560,15 @@ mod tests { assert_eq!( file_stats, [ - (1314385, 3, 6000), - (1314378, 3, 6000), - (1314438, 3, 6000), - (1314395, 3, 6000), - (1314525, 3, 6000), - (1314367, 3, 6000), - (1314159, 3, 6000), - (1314395, 3, 6000), - (438352, 1, 2000) + (1315008, 3, 6000), + (1315001, 3, 6000), + (1315061, 3, 6000), + (1315018, 3, 6000), + (1315148, 3, 6000), + (1314990, 3, 6000), + (1314782, 3, 6000), + (1315018, 3, 6000), + (438575, 1, 2000) ] ); @@ -557,11 +598,11 @@ mod tests { assert_eq!( file_stats, [ - (1220633, 5, 10000), - (1226783, 5, 10000), - (1228577, 5, 10000), - (1227939, 5, 10000), - (1219217, 5, 10000) + (1221738, 5, 10000), + (1227888, 5, 10000), + (1229682, 5, 10000), + (1229044, 5, 10000), + (1220322, 5, 10000) ] ); @@ -593,11 +634,11 @@ mod tests { assert_eq!( file_stats, [ - (1206280, 5, 10000), - (1206011, 5, 10000), - (1206304, 5, 10000), - (1206292, 5, 10000), - (1206547, 5, 10000) + 
(1207385, 5, 10000), + (1207116, 5, 10000), + (1207409, 5, 10000), + (1207397, 5, 10000), + (1207652, 5, 10000) ] ); @@ -622,15 +663,15 @@ mod tests { assert_eq!( file_stats, [ - (1314385, 3, 6000), - (1314378, 3, 6000), - (1314438, 3, 6000), - (1314395, 3, 6000), - (1314525, 3, 6000), - (1314367, 3, 6000), - (1314159, 3, 6000), - (1314395, 3, 6000), - (438352, 1, 2000) + (1315008, 3, 6000), + (1315001, 3, 6000), + (1315061, 3, 6000), + (1315018, 3, 6000), + (1315148, 3, 6000), + (1314990, 3, 6000), + (1314782, 3, 6000), + (1315018, 3, 6000), + (438575, 1, 2000) ] ); @@ -667,7 +708,7 @@ mod tests { // files are smaller than the size threshold, but they took too long to fill so were flushed early assert_eq!( file_stats, - [(658823, 2, 3001), (658537, 2, 3000), (658333, 2, 2999)] + [(659240, 2, 3001), (658954, 2, 3000), (658750, 2, 2999)] ); tmpdir.close().unwrap(); diff --git a/proxy/src/error.rs b/proxy/src/error.rs index 4614f3913d..fdfe50a494 100644 --- a/proxy/src/error.rs +++ b/proxy/src/error.rs @@ -1,5 +1,7 @@ use std::{error::Error as StdError, fmt, io}; +use measured::FixedCardinalityLabel; + /// Upcast (almost) any error into an opaque [`io::Error`]. pub fn io_error(e: impl Into>) -> io::Error { io::Error::new(io::ErrorKind::Other, e) @@ -29,24 +31,29 @@ pub trait UserFacingError: ReportableError { } } -#[derive(Copy, Clone, Debug, Eq, PartialEq)] +#[derive(Copy, Clone, Debug, Eq, PartialEq, FixedCardinalityLabel)] +#[label(singleton = "type")] pub enum ErrorKind { /// Wrong password, unknown endpoint, protocol violation, etc... User, /// Network error between user and proxy. 
Not necessarily user error + #[label(rename = "clientdisconnect")] ClientDisconnect, /// Proxy self-imposed user rate limits + #[label(rename = "ratelimit")] RateLimit, /// Proxy self-imposed service-wise rate limits + #[label(rename = "serviceratelimit")] ServiceRateLimit, /// internal errors Service, /// Error communicating with control plane + #[label(rename = "controlplane")] ControlPlane, /// Postgres error diff --git a/proxy/src/http.rs b/proxy/src/http.rs index 59e1492ed4..fc7400869f 100644 --- a/proxy/src/http.rs +++ b/proxy/src/http.rs @@ -4,7 +4,7 @@ pub mod health_server; -use std::{sync::Arc, time::Duration}; +use std::{str::FromStr, sync::Arc, time::Duration}; use futures::FutureExt; pub use reqwest::{Request, Response, StatusCode}; @@ -13,13 +13,16 @@ pub use reqwest_retry::{policies::ExponentialBackoff, RetryTransientMiddleware}; use tokio::time::Instant; use tracing::trace; -use crate::{metrics::CONSOLE_REQUEST_LATENCY, rate_limiter, url::ApiUrl}; +use crate::{ + metrics::{ConsoleRequest, Metrics}, + url::ApiUrl, +}; use reqwest_middleware::RequestBuilder; /// This is the preferred way to create new http clients, /// because it takes care of observability (OpenTelemetry). /// We deliberately don't want to replace this with a public static. -pub fn new_client(rate_limiter_config: rate_limiter::RateLimiterConfig) -> ClientWithMiddleware { +pub fn new_client() -> ClientWithMiddleware { let client = reqwest::ClientBuilder::new() .dns_resolver(Arc::new(GaiResolver::default())) .connection_verbose(true) @@ -28,7 +31,6 @@ pub fn new_client(rate_limiter_config: rate_limiter::RateLimiterConfig) -> Clien reqwest_middleware::ClientBuilder::new(client) .with(reqwest_tracing::TracingMiddleware::default()) - .with(rate_limiter::Limiter::new(rate_limiter_config)) .build() } @@ -90,22 +92,23 @@ impl Endpoint { /// Execute a [request](reqwest::Request). 
pub async fn execute(&self, request: Request) -> Result { - let path = request.url().path().to_string(); - let start = Instant::now(); - let res = self.client.execute(request).await; - CONSOLE_REQUEST_LATENCY - .with_label_values(&[&path]) - .observe(start.elapsed().as_secs_f64()); - res + let _timer = Metrics::get() + .proxy + .console_request_latency + .start_timer(ConsoleRequest { + request: request.url().path(), + }); + + self.client.execute(request).await } } -/// https://docs.rs/reqwest/0.11.18/src/reqwest/dns/gai.rs.html -use hyper::{ - client::connect::dns::{GaiResolver as HyperGaiResolver, Name}, - service::Service, +use hyper_util::client::legacy::connect::dns::{ + GaiResolver as HyperGaiResolver, Name as HyperName, }; -use reqwest::dns::{Addrs, Resolve, Resolving}; +use reqwest::dns::{Addrs, Name, Resolve, Resolving}; +/// https://docs.rs/reqwest/0.11.18/src/reqwest/dns/gai.rs.html +use tower_service::Service; #[derive(Debug)] pub struct GaiResolver(HyperGaiResolver); @@ -118,11 +121,12 @@ impl Default for GaiResolver { impl Resolve for GaiResolver { fn resolve(&self, name: Name) -> Resolving { let this = &mut self.0.clone(); + let hyper_name = HyperName::from_str(name.as_str()).expect("name should be valid"); let start = Instant::now(); Box::pin( - Service::::call(this, name.clone()).map(move |result| { + Service::::call(this, hyper_name).map(move |result| { let resolve_duration = start.elapsed(); - trace!(duration = ?resolve_duration, addr = %name, "resolve host complete"); + trace!(duration = ?resolve_duration, addr = %name.as_str(), "resolve host complete"); result .map(|addrs| -> Addrs { Box::new(addrs) }) .map_err(|err| -> Box { Box::new(err) }) diff --git a/proxy/src/http/health_server.rs b/proxy/src/http/health_server.rs index cbb17ebcb7..cae9eb5b97 100644 --- a/proxy/src/http/health_server.rs +++ b/proxy/src/http/health_server.rs @@ -1,30 +1,49 @@ use anyhow::{anyhow, bail}; -use hyper::{Body, Request, Response, StatusCode}; -use 
std::{convert::Infallible, net::TcpListener}; -use tracing::info; +use hyper::{header::CONTENT_TYPE, Body, Request, Response, StatusCode}; +use measured::{text::BufferedTextEncoder, MetricGroup}; +use metrics::NeonMetrics; +use std::{ + convert::Infallible, + net::TcpListener, + sync::{Arc, Mutex}, +}; +use tracing::{info, info_span}; use utils::http::{ - endpoint::{self, prometheus_metrics_handler, request_span}, + endpoint::{self, request_span}, error::ApiError, json::json_response, RouterBuilder, RouterService, }; +use crate::jemalloc; + async fn status_handler(_: Request) -> Result, ApiError> { json_response(StatusCode::OK, "") } -fn make_router() -> RouterBuilder { +fn make_router(metrics: AppMetrics) -> RouterBuilder { + let state = Arc::new(Mutex::new(PrometheusHandler { + encoder: BufferedTextEncoder::new(), + metrics, + })); + endpoint::make_router() - .get("/metrics", |r| request_span(r, prometheus_metrics_handler)) + .get("/metrics", move |r| { + let state = state.clone(); + request_span(r, move |b| prometheus_metrics_handler(b, state)) + }) .get("/v1/status", status_handler) } -pub async fn task_main(http_listener: TcpListener) -> anyhow::Result { +pub async fn task_main( + http_listener: TcpListener, + metrics: AppMetrics, +) -> anyhow::Result { scopeguard::defer! { info!("http has shut down"); } - let service = || RouterService::new(make_router().build()?); + let service = || RouterService::new(make_router(metrics).build()?); hyper::Server::from_tcp(http_listener)? .serve(service().map_err(|e| anyhow!(e))?) 
@@ -32,3 +51,57 @@ pub async fn task_main(http_listener: TcpListener) -> anyhow::Result bail!("hyper server without shutdown handling cannot shutdown successfully"); } + +struct PrometheusHandler { + encoder: BufferedTextEncoder, + metrics: AppMetrics, +} + +#[derive(MetricGroup)] +pub struct AppMetrics { + #[metric(namespace = "jemalloc")] + pub jemalloc: Option, + #[metric(flatten)] + pub neon_metrics: NeonMetrics, + #[metric(flatten)] + pub proxy: &'static crate::metrics::Metrics, +} + +async fn prometheus_metrics_handler( + _req: Request, + state: Arc>, +) -> Result, ApiError> { + let started_at = std::time::Instant::now(); + + let span = info_span!("blocking"); + let body = tokio::task::spawn_blocking(move || { + let _span = span.entered(); + + let mut state = state.lock().unwrap(); + let PrometheusHandler { encoder, metrics } = &mut *state; + + metrics + .collect_group_into(&mut *encoder) + .unwrap_or_else(|infallible| match infallible {}); + + let body = encoder.finish(); + + tracing::info!( + bytes = body.len(), + elapsed_ms = started_at.elapsed().as_millis(), + "responded /metrics" + ); + + body + }) + .await + .unwrap(); + + let response = Response::builder() + .status(200) + .header(CONTENT_TYPE, "text/plain; version=0.0.4") + .body(Body::from(body)) + .unwrap(); + + Ok(response) +} diff --git a/proxy/src/intern.rs b/proxy/src/intern.rs index a6519bdff9..e38135dd22 100644 --- a/proxy/src/intern.rs +++ b/proxy/src/intern.rs @@ -160,6 +160,11 @@ impl From<&EndpointId> for EndpointIdInt { EndpointIdTag::get_interner().get_or_intern(value) } } +impl From for EndpointIdInt { + fn from(value: EndpointId) -> Self { + EndpointIdTag::get_interner().get_or_intern(&value) + } +} #[derive(Copy, Clone, Debug, PartialEq, Eq, Hash)] pub struct BranchIdTag; @@ -175,6 +180,11 @@ impl From<&BranchId> for BranchIdInt { BranchIdTag::get_interner().get_or_intern(value) } } +impl From for BranchIdInt { + fn from(value: BranchId) -> Self { + 
BranchIdTag::get_interner().get_or_intern(&value) + } +} #[derive(Copy, Clone, Debug, PartialEq, Eq, Hash)] pub struct ProjectIdTag; @@ -190,6 +200,11 @@ impl From<&ProjectId> for ProjectIdInt { ProjectIdTag::get_interner().get_or_intern(value) } } +impl From for ProjectIdInt { + fn from(value: ProjectId) -> Self { + ProjectIdTag::get_interner().get_or_intern(&value) + } +} #[cfg(test)] mod tests { diff --git a/proxy/src/jemalloc.rs b/proxy/src/jemalloc.rs index ed20798d56..3243e6a140 100644 --- a/proxy/src/jemalloc.rs +++ b/proxy/src/jemalloc.rs @@ -1,27 +1,45 @@ -use std::time::Duration; +use std::marker::PhantomData; -use metrics::IntGauge; -use prometheus::{register_int_gauge_with_registry, Registry}; +use measured::{ + label::NoLabels, + metric::{ + gauge::GaugeState, group::Encoding, group::MetricValue, name::MetricNameEncoder, + MetricEncoding, MetricFamilyEncoding, MetricType, + }, + text::TextEncoder, + LabelGroup, MetricGroup, +}; use tikv_jemalloc_ctl::{config, epoch, epoch_mib, stats, version}; pub struct MetricRecorder { epoch: epoch_mib, - active: stats::active_mib, - active_gauge: IntGauge, - allocated: stats::allocated_mib, - allocated_gauge: IntGauge, - mapped: stats::mapped_mib, - mapped_gauge: IntGauge, - metadata: stats::metadata_mib, - metadata_gauge: IntGauge, - resident: stats::resident_mib, - resident_gauge: IntGauge, - retained: stats::retained_mib, - retained_gauge: IntGauge, + inner: Metrics, +} + +#[derive(MetricGroup)] +struct Metrics { + active_bytes: JemallocGaugeFamily, + allocated_bytes: JemallocGaugeFamily, + mapped_bytes: JemallocGaugeFamily, + metadata_bytes: JemallocGaugeFamily, + resident_bytes: JemallocGaugeFamily, + retained_bytes: JemallocGaugeFamily, +} + +impl MetricGroup for MetricRecorder +where + Metrics: MetricGroup, +{ + fn collect_group_into(&self, enc: &mut Enc) -> Result<(), Enc::Err> { + if self.epoch.advance().is_ok() { + self.inner.collect_group_into(enc)?; + } + Ok(()) + } } impl MetricRecorder { - pub fn 
new(registry: &Registry) -> Result { + pub fn new() -> Result { tracing::info!( config = config::malloc_conf::read()?, version = version::read()?, @@ -30,71 +48,69 @@ impl MetricRecorder { Ok(Self { epoch: epoch::mib()?, - active: stats::active::mib()?, - active_gauge: register_int_gauge_with_registry!( - "jemalloc_active_bytes", - "Total number of bytes in active pages allocated by the process", - registry - )?, - allocated: stats::allocated::mib()?, - allocated_gauge: register_int_gauge_with_registry!( - "jemalloc_allocated_bytes", - "Total number of bytes allocated by the process", - registry - )?, - mapped: stats::mapped::mib()?, - mapped_gauge: register_int_gauge_with_registry!( - "jemalloc_mapped_bytes", - "Total number of bytes in active extents mapped by the allocator", - registry - )?, - metadata: stats::metadata::mib()?, - metadata_gauge: register_int_gauge_with_registry!( - "jemalloc_metadata_bytes", - "Total number of bytes dedicated to jemalloc metadata", - registry - )?, - resident: stats::resident::mib()?, - resident_gauge: register_int_gauge_with_registry!( - "jemalloc_resident_bytes", - "Total number of bytes in physically resident data pages mapped by the allocator", - registry - )?, - retained: stats::retained::mib()?, - retained_gauge: register_int_gauge_with_registry!( - "jemalloc_retained_bytes", - "Total number of bytes in virtual memory mappings that were retained rather than being returned to the operating system", - registry - )?, - }) - } - - fn _poll(&self) -> Result<(), anyhow::Error> { - self.epoch.advance()?; - self.active_gauge.set(self.active.read()? as i64); - self.allocated_gauge.set(self.allocated.read()? as i64); - self.mapped_gauge.set(self.mapped.read()? as i64); - self.metadata_gauge.set(self.metadata.read()? as i64); - self.resident_gauge.set(self.resident.read()? as i64); - self.retained_gauge.set(self.retained.read()? 
as i64); - Ok(()) - } - - #[inline] - pub fn poll(&self) { - if let Err(error) = self._poll() { - tracing::warn!(%error, "Failed to poll jemalloc stats"); - } - } - - pub fn start(self) -> tokio::task::JoinHandle<()> { - tokio::task::spawn(async move { - let mut interval = tokio::time::interval(Duration::from_secs(15)); - interval.set_missed_tick_behavior(tokio::time::MissedTickBehavior::Skip); - loop { - self.poll(); - interval.tick().await; - } + inner: Metrics { + active_bytes: JemallocGaugeFamily(stats::active::mib()?), + allocated_bytes: JemallocGaugeFamily(stats::allocated::mib()?), + mapped_bytes: JemallocGaugeFamily(stats::mapped::mib()?), + metadata_bytes: JemallocGaugeFamily(stats::metadata::mib()?), + resident_bytes: JemallocGaugeFamily(stats::resident::mib()?), + retained_bytes: JemallocGaugeFamily(stats::retained::mib()?), + }, }) } } + +struct JemallocGauge(PhantomData); + +impl Default for JemallocGauge { + fn default() -> Self { + JemallocGauge(PhantomData) + } +} +impl MetricType for JemallocGauge { + type Metadata = T; +} + +struct JemallocGaugeFamily(T); +impl MetricFamilyEncoding for JemallocGaugeFamily +where + JemallocGauge: MetricEncoding, +{ + fn collect_family_into(&self, name: impl MetricNameEncoder, enc: &mut T) -> Result<(), T::Err> { + JemallocGauge::write_type(&name, enc)?; + JemallocGauge(PhantomData).collect_into(&self.0, NoLabels, name, enc) + } +} + +macro_rules! 
jemalloc_gauge { + ($stat:ident, $mib:ident) => { + impl MetricEncoding> for JemallocGauge { + fn write_type( + name: impl MetricNameEncoder, + enc: &mut TextEncoder, + ) -> Result<(), std::io::Error> { + GaugeState::write_type(name, enc) + } + + fn collect_into( + &self, + mib: &stats::$mib, + labels: impl LabelGroup, + name: impl MetricNameEncoder, + enc: &mut TextEncoder, + ) -> Result<(), std::io::Error> { + if let Ok(v) = mib.read() { + enc.write_metric_value(name, labels, MetricValue::Int(v as i64))?; + } + Ok(()) + } + } + }; +} + +jemalloc_gauge!(active, active_mib); +jemalloc_gauge!(allocated, allocated_mib); +jemalloc_gauge!(mapped, mapped_mib); +jemalloc_gauge!(metadata, metadata_mib); +jemalloc_gauge!(resident, resident_mib); +jemalloc_gauge!(retained, retained_mib); diff --git a/proxy/src/lib.rs b/proxy/src/lib.rs index da7c7f3ed2..35c1616481 100644 --- a/proxy/src/lib.rs +++ b/proxy/src/lib.rs @@ -127,6 +127,24 @@ macro_rules! smol_str_wrapper { }; } +const POOLER_SUFFIX: &str = "-pooler"; + +pub trait Normalize { + fn normalize(&self) -> Self; +} + +impl + From> Normalize for S { + fn normalize(&self) -> Self { + if self.as_ref().ends_with(POOLER_SUFFIX) { + let mut s = self.as_ref().to_string(); + s.truncate(s.len() - POOLER_SUFFIX.len()); + s.into() + } else { + self.clone() + } + } +} + // 90% of role name strings are 20 characters or less. smol_str_wrapper!(RoleName); // 50% of endpoint strings are 23 characters or less. @@ -140,3 +158,25 @@ smol_str_wrapper!(ProjectId); smol_str_wrapper!(EndpointCacheKey); smol_str_wrapper!(DbName); + +// postgres hostname, will likely be a port:ip addr +smol_str_wrapper!(Host); + +// Endpoints are a bit tricky. Rare they might be branches or projects. 
+impl EndpointId { + pub fn is_endpoint(&self) -> bool { + self.0.starts_with("ep-") + } + pub fn is_branch(&self) -> bool { + self.0.starts_with("br-") + } + pub fn is_project(&self) -> bool { + !self.is_endpoint() && !self.is_branch() + } + pub fn as_branch(&self) -> BranchId { + BranchId(self.0.clone()) + } + pub fn as_project(&self) -> ProjectId { + ProjectId(self.0.clone()) + } +} diff --git a/proxy/src/metrics.rs b/proxy/src/metrics.rs index 59ee899c08..1590316925 100644 --- a/proxy/src/metrics.rs +++ b/proxy/src/metrics.rs @@ -1,181 +1,372 @@ -use ::metrics::{ - exponential_buckets, register_histogram, register_histogram_vec, register_hll_vec, - register_int_counter_pair_vec, register_int_counter_vec, register_int_gauge, - register_int_gauge_vec, Histogram, HistogramVec, HyperLogLogVec, IntCounterPairVec, - IntCounterVec, IntGauge, IntGaugeVec, -}; -use metrics::{ - register_hll, register_int_counter, register_int_counter_pair, HyperLogLog, IntCounter, - IntCounterPair, -}; +use std::sync::OnceLock; + +use lasso::ThreadedRodeo; +use measured::{ + label::StaticLabelSet, + metric::{histogram::Thresholds, name::MetricName}, + Counter, CounterVec, FixedCardinalityLabel, Gauge, Histogram, HistogramVec, LabelGroup, + MetricGroup, +}; +use metrics::{CounterPairAssoc, CounterPairVec, HyperLogLog, HyperLogLogVec}; -use once_cell::sync::Lazy; use tokio::time::{self, Instant}; use crate::console::messages::ColdStartInfo; -pub static NUM_DB_CONNECTIONS_GAUGE: Lazy = Lazy::new(|| { - register_int_counter_pair_vec!( - "proxy_opened_db_connections_total", - "Number of opened connections to a database.", - "proxy_closed_db_connections_total", - "Number of closed connections to a database.", - &["protocol"], - ) - .unwrap() -}); +#[derive(MetricGroup)] +pub struct Metrics { + #[metric(namespace = "proxy")] + pub proxy: ProxyMetrics, -pub static NUM_CLIENT_CONNECTION_GAUGE: Lazy = Lazy::new(|| { - register_int_counter_pair_vec!( - "proxy_opened_client_connections_total", - 
"Number of opened connections from a client.", - "proxy_closed_client_connections_total", - "Number of closed connections from a client.", - &["protocol"], - ) - .unwrap() -}); + #[metric(namespace = "wake_compute_lock")] + pub wake_compute_lock: ApiLockMetrics, +} -pub static NUM_CONNECTION_REQUESTS_GAUGE: Lazy = Lazy::new(|| { - register_int_counter_pair_vec!( - "proxy_accepted_connections_total", - "Number of client connections accepted.", - "proxy_closed_connections_total", - "Number of client connections closed.", - &["protocol"], - ) - .unwrap() -}); +impl Metrics { + pub fn get() -> &'static Self { + static SELF: OnceLock = OnceLock::new(); + SELF.get_or_init(|| Metrics { + proxy: ProxyMetrics::default(), + wake_compute_lock: ApiLockMetrics::new(), + }) + } +} -pub static COMPUTE_CONNECTION_LATENCY: Lazy = Lazy::new(|| { - register_histogram_vec!( - "proxy_compute_connection_latency_seconds", - "Time it took for proxy to establish a connection to the compute endpoint", - // http/ws/tcp, true/false, true/false, success/failure, client/client_and_cplane - // 3 * 6 * 2 * 2 = 72 counters - &["protocol", "cold_start_info", "outcome", "excluded"], - // largest bucket = 2^16 * 0.5ms = 32s - exponential_buckets(0.0005, 2.0, 16).unwrap(), - ) - .unwrap() -}); +#[derive(MetricGroup)] +#[metric(new())] +pub struct ProxyMetrics { + #[metric(flatten)] + pub db_connections: CounterPairVec, + #[metric(flatten)] + pub client_connections: CounterPairVec, + #[metric(flatten)] + pub connection_requests: CounterPairVec, + #[metric(flatten)] + pub http_endpoint_pools: HttpEndpointPools, -pub static CONSOLE_REQUEST_LATENCY: Lazy = Lazy::new(|| { - register_histogram_vec!( - "proxy_console_request_latency", - "Time it took for proxy to establish a connection to the compute endpoint", - // proxy_wake_compute/proxy_get_role_info - &["request"], + /// Time it took for proxy to establish a connection to the compute endpoint. 
+ // largest bucket = 2^16 * 0.5ms = 32s + #[metric(metadata = Thresholds::exponential_buckets(0.0005, 2.0))] + pub compute_connection_latency_seconds: HistogramVec, + + /// Time it took for proxy to receive a response from control plane. + #[metric( // largest bucket = 2^16 * 0.2ms = 13s - exponential_buckets(0.0002, 2.0, 16).unwrap(), - ) - .unwrap() -}); + metadata = Thresholds::exponential_buckets(0.0002, 2.0), + )] + pub console_request_latency: HistogramVec, -pub static ALLOWED_IPS_BY_CACHE_OUTCOME: Lazy = Lazy::new(|| { - register_int_counter_vec!( - "proxy_allowed_ips_cache_misses", - "Number of cache hits/misses for allowed ips", - // hit/miss - &["outcome"], - ) - .unwrap() -}); + /// Time it takes to acquire a token to call console plane. + // largest bucket = 3^16 * 0.05ms = 2.15s + #[metric(metadata = Thresholds::exponential_buckets(0.00005, 3.0))] + pub control_plane_token_acquire_seconds: Histogram<16>, -pub static RATE_LIMITER_ACQUIRE_LATENCY: Lazy = Lazy::new(|| { - register_histogram!( - "proxy_control_plane_token_acquire_seconds", - "Time it took for proxy to establish a connection to the compute endpoint", - // largest bucket = 3^16 * 0.05ms = 2.15s - exponential_buckets(0.00005, 3.0, 16).unwrap(), - ) - .unwrap() -}); + /// Size of the HTTP request body lengths. + // smallest bucket = 16 bytes + // largest bucket = 4^12 * 16 bytes = 256MB + #[metric(metadata = Thresholds::exponential_buckets(16.0, 4.0))] + pub http_conn_content_length_bytes: HistogramVec, 12>, -pub static RATE_LIMITER_LIMIT: Lazy = Lazy::new(|| { - register_int_gauge_vec!( - "semaphore_control_plane_limit", - "Current limit of the semaphore control plane", - &["limit"], // 2 counters - ) - .unwrap() -}); + /// Time it takes to reclaim unused connection pools. 
+ #[metric(metadata = Thresholds::exponential_buckets(1e-6, 2.0))] + pub http_pool_reclaimation_lag_seconds: Histogram<16>, -pub static NUM_CONNECTION_ACCEPTED_BY_SNI: Lazy = Lazy::new(|| { - register_int_counter_vec!( - "proxy_accepted_connections_by_sni", - "Number of connections (per sni).", - &["kind"], - ) - .unwrap() -}); + /// Number of opened connections to a database. + pub http_pool_opened_connections: Gauge, -pub static ALLOWED_IPS_NUMBER: Lazy = Lazy::new(|| { - register_histogram!( - "proxy_allowed_ips_number", - "Number of allowed ips", - vec![0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 10.0, 20.0, 50.0, 100.0], - ) - .unwrap() -}); + /// Number of cache hits/misses for allowed ips. + pub allowed_ips_cache_misses: CounterVec>, -pub static HTTP_CONTENT_LENGTH: Lazy = Lazy::new(|| { - register_histogram_vec!( - "proxy_http_conn_content_length_bytes", - "Number of bytes the HTTP response content consumes", - // request/response - &["direction"], - // smallest bucket = 16 bytes - // largest bucket = 4^12 * 16 bytes = 256MB - exponential_buckets(16.0, 4.0, 12).unwrap() - ) - .unwrap() -}); + /// Number of allowed ips + #[metric(metadata = Thresholds::with_buckets([0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 10.0, 20.0, 50.0, 100.0]))] + pub allowed_ips_number: Histogram<10>, -pub static GC_LATENCY: Lazy = Lazy::new(|| { - register_histogram!( - "proxy_http_pool_reclaimation_lag_seconds", - "Time it takes to reclaim unused connection pools", - // 1us -> 65ms - exponential_buckets(1e-6, 2.0, 16).unwrap(), - ) - .unwrap() -}); + /// Number of connections (per sni). + pub accepted_connections_by_sni: CounterVec>, -pub static ENDPOINT_POOLS: Lazy = Lazy::new(|| { - register_int_counter_pair!( - "proxy_http_pool_endpoints_registered_total", - "Number of endpoints we have registered pools for", - "proxy_http_pool_endpoints_unregistered_total", - "Number of endpoints we have unregistered pools for", - ) - .unwrap() -}); + /// Number of connection failures (per kind). 
+ pub connection_failures_total: CounterVec>, -pub static NUM_OPEN_CLIENTS_IN_HTTP_POOL: Lazy = Lazy::new(|| { - register_int_gauge!( - "proxy_http_pool_opened_connections", - "Number of opened connections to a database.", - ) - .unwrap() -}); + /// Number of wake-up failures (per kind). + pub connection_failures_breakdown: CounterVec, -pub static NUM_CANCELLATION_REQUESTS: Lazy = Lazy::new(|| { - register_int_counter_vec!( - "proxy_cancellation_requests_total", - "Number of cancellation requests (per found/not_found).", - &["source", "kind"], - ) - .unwrap() -}); + /// Number of bytes sent/received between all clients and backends. + pub io_bytes: CounterVec>, -pub const NUM_CANCELLATION_REQUESTS_SOURCE_FROM_CLIENT: &str = "from_client"; -pub const NUM_CANCELLATION_REQUESTS_SOURCE_FROM_REDIS: &str = "from_redis"; + /// Number of errors by a given classification. + pub errors_total: CounterVec>, + + /// Number of cancellation requests (per found/not_found). + pub cancellation_requests_total: CounterVec, + + /// Number of errors by a given classification + pub redis_errors_total: CounterVec, + + /// Number of TLS handshake failures + pub tls_handshake_failures: Counter, + + /// Number of connection requests affected by authentication rate limits + pub requests_auth_rate_limits_total: Counter, + + /// HLL approximate cardinality of endpoints that are connecting + pub connecting_endpoints: HyperLogLogVec, 32>, + + /// Number of endpoints affected by errors of a given classification + pub endpoints_affected_by_errors: HyperLogLogVec, 32>, + + /// Number of endpoints affected by authentication rate limits + pub endpoints_auth_rate_limits: HyperLogLog<32>, + + /// Number of invalid endpoints (per protocol, per rejected). + pub invalid_endpoints_total: CounterVec, + + /// Number of retries (per outcome, per retry_type). 
+ #[metric(metadata = Thresholds::with_buckets([0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0]))] + pub retries_metric: HistogramVec, + + /// Number of events consumed from redis (per event type). + pub redis_events_count: CounterVec>, + + #[metric(namespace = "connect_compute_lock")] + pub connect_compute_lock: ApiLockMetrics, +} + +#[derive(MetricGroup)] +#[metric(new())] +pub struct ApiLockMetrics { + /// Number of semaphores registered in this api lock + pub semaphores_registered: Counter, + /// Number of semaphores unregistered in this api lock + pub semaphores_unregistered: Counter, + /// Time it takes to reclaim unused semaphores in the api lock + #[metric(metadata = Thresholds::exponential_buckets(1e-6, 2.0))] + pub reclamation_lag_seconds: Histogram<16>, + /// Time it takes to acquire a semaphore lock + #[metric(metadata = Thresholds::exponential_buckets(1e-4, 2.0))] + pub semaphore_acquire_seconds: Histogram<16>, +} + +impl Default for ProxyMetrics { + fn default() -> Self { + Self::new() + } +} + +impl Default for ApiLockMetrics { + fn default() -> Self { + Self::new() + } +} + +#[derive(FixedCardinalityLabel, Copy, Clone)] +#[label(singleton = "direction")] +pub enum HttpDirection { + Request, + Response, +} + +#[derive(FixedCardinalityLabel, Copy, Clone)] +#[label(singleton = "direction")] +pub enum Direction { + Tx, + Rx, +} + +#[derive(FixedCardinalityLabel, Clone, Copy, Debug)] +#[label(singleton = "protocol")] +pub enum Protocol { + Http, + Ws, + Tcp, + SniRouter, +} + +impl Protocol { + pub fn as_str(&self) -> &'static str { + match self { + Protocol::Http => "http", + Protocol::Ws => "ws", + Protocol::Tcp => "tcp", + Protocol::SniRouter => "sni_router", + } + } +} + +impl std::fmt::Display for Protocol { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.write_str(self.as_str()) + } +} + +#[derive(FixedCardinalityLabel, Copy, Clone)] +pub enum Bool { + True, + False, +} + +#[derive(FixedCardinalityLabel, Copy, Clone)] 
+#[label(singleton = "outcome")] +pub enum Outcome { + Success, + Failed, +} + +#[derive(FixedCardinalityLabel, Copy, Clone)] +#[label(singleton = "outcome")] +pub enum CacheOutcome { + Hit, + Miss, +} + +#[derive(LabelGroup)] +#[label(set = ConsoleRequestSet)] +pub struct ConsoleRequest<'a> { + #[label(dynamic_with = ThreadedRodeo, default)] + pub request: &'a str, +} + +#[derive(MetricGroup, Default)] +pub struct HttpEndpointPools { + /// Number of endpoints we have registered pools for + pub http_pool_endpoints_registered_total: Counter, + /// Number of endpoints we have unregistered pools for + pub http_pool_endpoints_unregistered_total: Counter, +} + +pub struct HttpEndpointPoolsGuard<'a> { + dec: &'a Counter, +} + +impl Drop for HttpEndpointPoolsGuard<'_> { + fn drop(&mut self) { + self.dec.inc(); + } +} + +impl HttpEndpointPools { + pub fn guard(&self) -> HttpEndpointPoolsGuard { + self.http_pool_endpoints_registered_total.inc(); + HttpEndpointPoolsGuard { + dec: &self.http_pool_endpoints_unregistered_total, + } + } +} +pub struct NumDbConnectionsGauge; +impl CounterPairAssoc for NumDbConnectionsGauge { + const INC_NAME: &'static MetricName = MetricName::from_str("opened_db_connections_total"); + const DEC_NAME: &'static MetricName = MetricName::from_str("closed_db_connections_total"); + const INC_HELP: &'static str = "Number of opened connections to a database."; + const DEC_HELP: &'static str = "Number of closed connections to a database."; + type LabelGroupSet = StaticLabelSet; +} +pub type NumDbConnectionsGuard<'a> = metrics::MeasuredCounterPairGuard<'a, NumDbConnectionsGauge>; + +pub struct NumClientConnectionsGauge; +impl CounterPairAssoc for NumClientConnectionsGauge { + const INC_NAME: &'static MetricName = MetricName::from_str("opened_client_connections_total"); + const DEC_NAME: &'static MetricName = MetricName::from_str("closed_client_connections_total"); + const INC_HELP: &'static str = "Number of opened connections from a client."; + const 
DEC_HELP: &'static str = "Number of closed connections from a client."; + type LabelGroupSet = StaticLabelSet; +} +pub type NumClientConnectionsGuard<'a> = + metrics::MeasuredCounterPairGuard<'a, NumClientConnectionsGauge>; + +pub struct NumConnectionRequestsGauge; +impl CounterPairAssoc for NumConnectionRequestsGauge { + const INC_NAME: &'static MetricName = MetricName::from_str("accepted_connections_total"); + const DEC_NAME: &'static MetricName = MetricName::from_str("closed_connections_total"); + const INC_HELP: &'static str = "Number of client connections accepted."; + const DEC_HELP: &'static str = "Number of client connections closed."; + type LabelGroupSet = StaticLabelSet; +} +pub type NumConnectionRequestsGuard<'a> = + metrics::MeasuredCounterPairGuard<'a, NumConnectionRequestsGauge>; + +#[derive(LabelGroup)] +#[label(set = ComputeConnectionLatencySet)] +pub struct ComputeConnectionLatencyGroup { + protocol: Protocol, + cold_start_info: ColdStartInfo, + outcome: ConnectOutcome, + excluded: LatencyExclusions, +} + +#[derive(FixedCardinalityLabel, Copy, Clone)] +pub enum LatencyExclusions { + Client, + ClientAndCplane, + ClientCplaneCompute, + ClientCplaneComputeRetry, +} + +#[derive(FixedCardinalityLabel, Copy, Clone)] +#[label(singleton = "kind")] +pub enum SniKind { + Sni, + NoSni, + PasswordHack, +} + +#[derive(FixedCardinalityLabel, Copy, Clone)] +#[label(singleton = "kind")] +pub enum ConnectionFailureKind { + ComputeCached, + ComputeUncached, +} + +#[derive(FixedCardinalityLabel, Copy, Clone)] +#[label(singleton = "kind")] +pub enum WakeupFailureKind { + BadComputeAddress, + ApiTransportError, + QuotaExceeded, + ApiConsoleLocked, + ApiConsoleBadRequest, + ApiConsoleOtherServerError, + ApiConsoleOtherError, + TimeoutError, +} + +#[derive(LabelGroup)] +#[label(set = ConnectionFailuresBreakdownSet)] +pub struct ConnectionFailuresBreakdownGroup { + pub kind: WakeupFailureKind, + pub retry: Bool, +} + +#[derive(LabelGroup, Copy, Clone)] +#[label(set = 
RedisErrorsSet)] +pub struct RedisErrors<'a> { + #[label(dynamic_with = ThreadedRodeo, default)] + pub channel: &'a str, +} + +#[derive(FixedCardinalityLabel, Copy, Clone)] +pub enum CancellationSource { + FromClient, + FromRedis, + Local, +} + +#[derive(FixedCardinalityLabel, Copy, Clone)] +pub enum CancellationOutcome { + NotFound, + Found, +} + +#[derive(LabelGroup)] +#[label(set = CancellationRequestSet)] +pub struct CancellationRequest { + pub source: CancellationSource, + pub kind: CancellationOutcome, +} pub enum Waiting { Cplane, Client, Compute, + RetryTimeout, } #[derive(Default)] @@ -183,20 +374,7 @@ struct Accumulated { cplane: time::Duration, client: time::Duration, compute: time::Duration, -} - -enum Outcome { - Success, - Failed, -} - -impl Outcome { - fn as_str(&self) -> &'static str { - match self { - Outcome::Success => "success", - Outcome::Failed => "failed", - } - } + retry: time::Duration, } pub struct LatencyTimer { @@ -207,9 +385,9 @@ pub struct LatencyTimer { // accumulated time on the stopwatch accumulated: Accumulated, // label data - protocol: &'static str, + protocol: Protocol, cold_start_info: ColdStartInfo, - outcome: Outcome, + outcome: ConnectOutcome, } pub struct LatencyTimerPause<'a> { @@ -219,7 +397,7 @@ pub struct LatencyTimerPause<'a> { } impl LatencyTimer { - pub fn new(protocol: &'static str) -> Self { + pub fn new(protocol: Protocol) -> Self { Self { start: time::Instant::now(), stop: None, @@ -227,7 +405,7 @@ impl LatencyTimer { protocol, cold_start_info: ColdStartInfo::Unknown, // assume failed unless otherwise specified - outcome: Outcome::Failed, + outcome: ConnectOutcome::Failed, } } @@ -248,7 +426,7 @@ impl LatencyTimer { self.stop = Some(time::Instant::now()); // success - self.outcome = Outcome::Success; + self.outcome = ConnectOutcome::Success; } } @@ -259,132 +437,119 @@ impl Drop for LatencyTimerPause<'_> { Waiting::Cplane => self.timer.accumulated.cplane += dur, Waiting::Client => self.timer.accumulated.client += 
dur, Waiting::Compute => self.timer.accumulated.compute += dur, + Waiting::RetryTimeout => self.timer.accumulated.retry += dur, } } } +#[derive(FixedCardinalityLabel, Clone, Copy, Debug)] +pub enum ConnectOutcome { + Success, + Failed, +} + impl Drop for LatencyTimer { fn drop(&mut self) { let duration = self .stop .unwrap_or_else(time::Instant::now) .duration_since(self.start); - // Excluding cplane communication from the accumulated time. - COMPUTE_CONNECTION_LATENCY - .with_label_values(&[ - self.protocol, - self.cold_start_info.as_str(), - self.outcome.as_str(), - "client", - ]) - .observe((duration.saturating_sub(self.accumulated.client)).as_secs_f64()); + + let metric = &Metrics::get().proxy.compute_connection_latency_seconds; + + // Excluding client communication from the accumulated time. + metric.observe( + ComputeConnectionLatencyGroup { + protocol: self.protocol, + cold_start_info: self.cold_start_info, + outcome: self.outcome, + excluded: LatencyExclusions::Client, + }, + duration + .saturating_sub(self.accumulated.client) + .as_secs_f64(), + ); + // Exclude client and cplane communication from the accumulated time. let accumulated_total = self.accumulated.client + self.accumulated.cplane; - COMPUTE_CONNECTION_LATENCY - .with_label_values(&[ - self.protocol, - self.cold_start_info.as_str(), - self.outcome.as_str(), - "client_and_cplane", - ]) - .observe((duration.saturating_sub(accumulated_total)).as_secs_f64()); + metric.observe( + ComputeConnectionLatencyGroup { + protocol: self.protocol, + cold_start_info: self.cold_start_info, + outcome: self.outcome, + excluded: LatencyExclusions::ClientAndCplane, + }, + duration.saturating_sub(accumulated_total).as_secs_f64(), + ); + + // Exclude client cplane, compue communication from the accumulated time. 
+ let accumulated_total = + self.accumulated.client + self.accumulated.cplane + self.accumulated.compute; + metric.observe( + ComputeConnectionLatencyGroup { + protocol: self.protocol, + cold_start_info: self.cold_start_info, + outcome: self.outcome, + excluded: LatencyExclusions::ClientCplaneCompute, + }, + duration.saturating_sub(accumulated_total).as_secs_f64(), + ); + + // Exclude client cplane, compue, retry communication from the accumulated time. + let accumulated_total = self.accumulated.client + + self.accumulated.cplane + + self.accumulated.compute + + self.accumulated.retry; + metric.observe( + ComputeConnectionLatencyGroup { + protocol: self.protocol, + cold_start_info: self.cold_start_info, + outcome: self.outcome, + excluded: LatencyExclusions::ClientCplaneComputeRetry, + }, + duration.saturating_sub(accumulated_total).as_secs_f64(), + ); } } -pub static NUM_CONNECTION_FAILURES: Lazy = Lazy::new(|| { - register_int_counter_vec!( - "proxy_connection_failures_total", - "Number of connection failures (per kind).", - &["kind"], - ) - .unwrap() -}); - -pub static NUM_WAKEUP_FAILURES: Lazy = Lazy::new(|| { - register_int_counter_vec!( - "proxy_connection_failures_breakdown", - "Number of wake-up failures (per kind).", - &["retry", "kind"], - ) - .unwrap() -}); - -pub static NUM_BYTES_PROXIED_COUNTER: Lazy = Lazy::new(|| { - register_int_counter_vec!( - "proxy_io_bytes", - "Number of bytes sent/received between all clients and backends.", - &["direction"], - ) - .unwrap() -}); - -pub const fn bool_to_str(x: bool) -> &'static str { - if x { - "true" - } else { - "false" +impl From for Bool { + fn from(value: bool) -> Self { + if value { + Bool::True + } else { + Bool::False + } } } -pub static CONNECTING_ENDPOINTS: Lazy> = Lazy::new(|| { - register_hll_vec!( - 32, - "proxy_connecting_endpoints", - "HLL approximate cardinality of endpoints that are connecting", - &["protocol"], - ) - .unwrap() -}); +#[derive(LabelGroup)] +#[label(set = InvalidEndpointsSet)] 
+pub struct InvalidEndpointsGroup { + pub protocol: Protocol, + pub rejected: Bool, + pub outcome: ConnectOutcome, +} -pub static ERROR_BY_KIND: Lazy = Lazy::new(|| { - register_int_counter_vec!( - "proxy_errors_total", - "Number of errors by a given classification", - &["type"], - ) - .unwrap() -}); +#[derive(LabelGroup)] +#[label(set = RetriesMetricSet)] +pub struct RetriesMetricGroup { + pub outcome: ConnectOutcome, + pub retry_type: RetryType, +} -pub static ENDPOINT_ERRORS_BY_KIND: Lazy> = Lazy::new(|| { - register_hll_vec!( - 32, - "proxy_endpoints_affected_by_errors", - "Number of endpoints affected by errors of a given classification", - &["type"], - ) - .unwrap() -}); +#[derive(FixedCardinalityLabel, Clone, Copy, Debug)] +pub enum RetryType { + WakeCompute, + ConnectToCompute, +} -pub static REDIS_BROKEN_MESSAGES: Lazy = Lazy::new(|| { - register_int_counter_vec!( - "proxy_redis_errors_total", - "Number of errors by a given classification", - &["channel"], - ) - .unwrap() -}); - -pub static TLS_HANDSHAKE_FAILURES: Lazy = Lazy::new(|| { - register_int_counter!( - "proxy_tls_handshake_failures", - "Number of TLS handshake failures", - ) - .unwrap() -}); - -pub static ENDPOINTS_AUTH_RATE_LIMITED: Lazy> = Lazy::new(|| { - register_hll!( - 32, - "proxy_endpoints_auth_rate_limits", - "Number of endpoints affected by authentication rate limits", - ) - .unwrap() -}); - -pub static AUTH_RATE_LIMIT_HITS: Lazy = Lazy::new(|| { - register_int_counter!( - "proxy_requests_auth_rate_limits_total", - "Number of connection requests affected by authentication rate limits", - ) - .unwrap() -}); +#[derive(FixedCardinalityLabel, Clone, Copy, Debug)] +#[label(singleton = "event")] +pub enum RedisEventsCount { + EndpointCreated, + BranchCreated, + ProjectCreated, + CancelSession, + PasswordUpdate, + AllowedIpsUpdate, +} diff --git a/proxy/src/protocol2.rs b/proxy/src/protocol2.rs index 700c8c8681..1dd4563514 100644 --- a/proxy/src/protocol2.rs +++ b/proxy/src/protocol2.rs @@ 
-1,48 +1,26 @@ //! Proxy Protocol V2 implementation use std::{ - future::{poll_fn, Future}, io, net::SocketAddr, - pin::{pin, Pin}, - sync::Mutex, - task::{ready, Context, Poll}, + pin::Pin, + task::{Context, Poll}, }; -use bytes::{Buf, BytesMut}; -use hyper::server::accept::Accept; -use hyper::server::conn::{AddrIncoming, AddrStream}; -use metrics::IntCounterPairGuard; +use bytes::BytesMut; use pin_project_lite::pin_project; use tokio::io::{AsyncRead, AsyncReadExt, AsyncWrite, ReadBuf}; -use uuid::Uuid; - -use crate::metrics::NUM_CLIENT_CONNECTION_GAUGE; - -pub struct ProxyProtocolAccept { - pub incoming: AddrIncoming, - pub protocol: &'static str, -} pin_project! { - pub struct WithClientIp { + /// A chained [`AsyncRead`] with [`AsyncWrite`] passthrough + pub struct ChainRW { #[pin] pub inner: T, buf: BytesMut, - tlv_bytes: u16, - state: ProxyParse, } } -#[derive(Clone, PartialEq, Debug)] -enum ProxyParse { - NotStarted, - - Finished(SocketAddr), - None, -} - -impl AsyncWrite for WithClientIp { +impl AsyncWrite for ChainRW { #[inline] fn poll_write( self: Pin<&mut Self>, @@ -77,364 +55,174 @@ impl AsyncWrite for WithClientIp { } } -impl WithClientIp { - pub fn new(inner: T) -> Self { - WithClientIp { - inner, - buf: BytesMut::with_capacity(128), - tlv_bytes: 0, - state: ProxyParse::NotStarted, - } - } - - pub fn client_addr(&self) -> Option { - match self.state { - ProxyParse::Finished(socket) => Some(socket), - _ => None, - } - } -} - -impl WithClientIp { - pub async fn wait_for_addr(&mut self) -> io::Result> { - match self.state { - ProxyParse::NotStarted => { - let mut pin = Pin::new(&mut *self); - let addr = poll_fn(|cx| pin.as_mut().poll_client_ip(cx)).await?; - match addr { - Some(addr) => self.state = ProxyParse::Finished(addr), - None => self.state = ProxyParse::None, - } - Ok(addr) - } - ProxyParse::Finished(addr) => Ok(Some(addr)), - ProxyParse::None => Ok(None), - } - } -} - /// Proxy Protocol Version 2 Header const HEADER: [u8; 12] = [ 0x0D, 0x0A, 
0x0D, 0x0A, 0x00, 0x0D, 0x0A, 0x51, 0x55, 0x49, 0x54, 0x0A, ]; -impl WithClientIp { - /// implementation of - /// Version 2 (Binary Format) - fn poll_client_ip( - mut self: Pin<&mut Self>, - cx: &mut Context<'_>, - ) -> Poll>> { - // The binary header format starts with a constant 12 bytes block containing the protocol signature : - // \x0D \x0A \x0D \x0A \x00 \x0D \x0A \x51 \x55 \x49 \x54 \x0A - while self.buf.len() < 16 { - let mut this = self.as_mut().project(); - let bytes_read = pin!(this.inner.read_buf(this.buf)).poll(cx)?; +pub async fn read_proxy_protocol( + mut read: T, +) -> std::io::Result<(ChainRW, Option)> { + let mut buf = BytesMut::with_capacity(128); + while buf.len() < 16 { + let bytes_read = read.read_buf(&mut buf).await?; - // exit for bad header - let len = usize::min(self.buf.len(), HEADER.len()); - if self.buf[..len] != HEADER[..len] { - return Poll::Ready(Ok(None)); - } - - // if no more bytes available then exit - if ready!(bytes_read) == 0 { - return Poll::Ready(Ok(None)); - }; + // exit for bad header + let len = usize::min(buf.len(), HEADER.len()); + if buf[..len] != HEADER[..len] { + return Ok((ChainRW { inner: read, buf }, None)); } - // The next byte (the 13th one) is the protocol version and command. - // The highest four bits contains the version. As of this specification, it must - // always be sent as \x2 and the receiver must only accept this value. - let vc = self.buf[12]; - let version = vc >> 4; - let command = vc & 0b1111; - if version != 2 { - return Poll::Ready(Err(io::Error::new( + // if no more bytes available then exit + if bytes_read == 0 { + return Ok((ChainRW { inner: read, buf }, None)); + }; + } + + let header = buf.split_to(16); + + // The next byte (the 13th one) is the protocol version and command. + // The highest four bits contains the version. As of this specification, it must + // always be sent as \x2 and the receiver must only accept this value. 
+ let vc = header[12]; + let version = vc >> 4; + let command = vc & 0b1111; + if version != 2 { + return Err(io::Error::new( + io::ErrorKind::Other, + "invalid proxy protocol version. expected version 2", + )); + } + match command { + // the connection was established on purpose by the proxy + // without being relayed. The connection endpoints are the sender and the + // receiver. Such connections exist when the proxy sends health-checks to the + // server. The receiver must accept this connection as valid and must use the + // real connection endpoints and discard the protocol block including the + // family which is ignored. + 0 => {} + // the connection was established on behalf of another node, + // and reflects the original connection endpoints. The receiver must then use + // the information provided in the protocol block to get original the address. + 1 => {} + // other values are unassigned and must not be emitted by senders. Receivers + // must drop connections presenting unexpected values here. + _ => { + return Err(io::Error::new( io::ErrorKind::Other, - "invalid proxy protocol version. expected version 2", - ))); + "invalid proxy protocol command. expected local (0) or proxy (1)", + )) } - match command { - // the connection was established on purpose by the proxy - // without being relayed. The connection endpoints are the sender and the - // receiver. Such connections exist when the proxy sends health-checks to the - // server. The receiver must accept this connection as valid and must use the - // real connection endpoints and discard the protocol block including the - // family which is ignored. - 0 => {} - // the connection was established on behalf of another node, - // and reflects the original connection endpoints. The receiver must then use - // the information provided in the protocol block to get original the address. - 1 => {} - // other values are unassigned and must not be emitted by senders. 
Receivers - // must drop connections presenting unexpected values here. - _ => { - return Poll::Ready(Err(io::Error::new( - io::ErrorKind::Other, - "invalid proxy protocol command. expected local (0) or proxy (1)", - ))) - } - }; + }; - // The 14th byte contains the transport protocol and address family. The highest 4 - // bits contain the address family, the lowest 4 bits contain the protocol. - let ft = self.buf[13]; - let address_length = match ft { - // - \x11 : TCP over IPv4 : the forwarded connection uses TCP over the AF_INET - // protocol family. Address length is 2*4 + 2*2 = 12 bytes. - // - \x12 : UDP over IPv4 : the forwarded connection uses UDP over the AF_INET - // protocol family. Address length is 2*4 + 2*2 = 12 bytes. - 0x11 | 0x12 => 12, - // - \x21 : TCP over IPv6 : the forwarded connection uses TCP over the AF_INET6 - // protocol family. Address length is 2*16 + 2*2 = 36 bytes. - // - \x22 : UDP over IPv6 : the forwarded connection uses UDP over the AF_INET6 - // protocol family. Address length is 2*16 + 2*2 = 36 bytes. - 0x21 | 0x22 => 36, - // unspecified or unix stream. ignore the addresses - _ => 0, - }; + // The 14th byte contains the transport protocol and address family. The highest 4 + // bits contain the address family, the lowest 4 bits contain the protocol. + let ft = header[13]; + let address_length = match ft { + // - \x11 : TCP over IPv4 : the forwarded connection uses TCP over the AF_INET + // protocol family. Address length is 2*4 + 2*2 = 12 bytes. + // - \x12 : UDP over IPv4 : the forwarded connection uses UDP over the AF_INET + // protocol family. Address length is 2*4 + 2*2 = 12 bytes. + 0x11 | 0x12 => 12, + // - \x21 : TCP over IPv6 : the forwarded connection uses TCP over the AF_INET6 + // protocol family. Address length is 2*16 + 2*2 = 36 bytes. + // - \x22 : UDP over IPv6 : the forwarded connection uses UDP over the AF_INET6 + // protocol family. Address length is 2*16 + 2*2 = 36 bytes. 
+ 0x21 | 0x22 => 36, + // unspecified or unix stream. ignore the addresses + _ => 0, + }; - // The 15th and 16th bytes is the address length in bytes in network endian order. - // It is used so that the receiver knows how many address bytes to skip even when - // it does not implement the presented protocol. Thus the length of the protocol - // header in bytes is always exactly 16 + this value. When a sender presents a - // LOCAL connection, it should not present any address so it sets this field to - // zero. Receivers MUST always consider this field to skip the appropriate number - // of bytes and must not assume zero is presented for LOCAL connections. When a - // receiver accepts an incoming connection showing an UNSPEC address family or - // protocol, it may or may not decide to log the address information if present. - let remaining_length = u16::from_be_bytes(self.buf[14..16].try_into().unwrap()); - if remaining_length < address_length { - return Poll::Ready(Err(io::Error::new( - io::ErrorKind::Other, - "invalid proxy protocol length. not enough to fit requested IP addresses", - ))); + // The 15th and 16th bytes is the address length in bytes in network endian order. + // It is used so that the receiver knows how many address bytes to skip even when + // it does not implement the presented protocol. Thus the length of the protocol + // header in bytes is always exactly 16 + this value. When a sender presents a + // LOCAL connection, it should not present any address so it sets this field to + // zero. Receivers MUST always consider this field to skip the appropriate number + // of bytes and must not assume zero is presented for LOCAL connections. When a + // receiver accepts an incoming connection showing an UNSPEC address family or + // protocol, it may or may not decide to log the address information if present. 
+ let remaining_length = u16::from_be_bytes(header[14..16].try_into().unwrap()); + if remaining_length < address_length { + return Err(io::Error::new( + io::ErrorKind::Other, + "invalid proxy protocol length. not enough to fit requested IP addresses", + )); + } + drop(header); + + while buf.len() < remaining_length as usize { + if read.read_buf(&mut buf).await? == 0 { + return Err(io::Error::new( + io::ErrorKind::UnexpectedEof, + "stream closed while waiting for proxy protocol addresses", + )); } - - while self.buf.len() < 16 + address_length as usize { - let mut this = self.as_mut().project(); - if ready!(pin!(this.inner.read_buf(this.buf)).poll(cx)?) == 0 { - return Poll::Ready(Err(io::Error::new( - io::ErrorKind::UnexpectedEof, - "stream closed while waiting for proxy protocol addresses", - ))); - } - } - - let this = self.as_mut().project(); - - // we are sure this is a proxy protocol v2 entry and we have read all the bytes we need - // discard the header we have parsed - this.buf.advance(16); - - // Starting from the 17th byte, addresses are presented in network byte order. 
- // The address order is always the same : - // - source layer 3 address in network byte order - // - destination layer 3 address in network byte order - // - source layer 4 address if any, in network byte order (port) - // - destination layer 4 address if any, in network byte order (port) - let addresses = this.buf.split_to(address_length as usize); - let socket = match address_length { - 12 => { - let src_addr: [u8; 4] = addresses[0..4].try_into().unwrap(); - let src_port = u16::from_be_bytes(addresses[8..10].try_into().unwrap()); - Some(SocketAddr::from((src_addr, src_port))) - } - 36 => { - let src_addr: [u8; 16] = addresses[0..16].try_into().unwrap(); - let src_port = u16::from_be_bytes(addresses[32..34].try_into().unwrap()); - Some(SocketAddr::from((src_addr, src_port))) - } - _ => None, - }; - - *this.tlv_bytes = remaining_length - address_length; - self.as_mut().skip_tlv_inner(); - - Poll::Ready(Ok(socket)) } - #[cold] - fn read_ip(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll> { - let ip = ready!(self.as_mut().poll_client_ip(cx)?); - match ip { - Some(x) => *self.as_mut().project().state = ProxyParse::Finished(x), - None => *self.as_mut().project().state = ProxyParse::None, + // Starting from the 17th byte, addresses are presented in network byte order. 
+ // The address order is always the same : + // - source layer 3 address in network byte order + // - destination layer 3 address in network byte order + // - source layer 4 address if any, in network byte order (port) + // - destination layer 4 address if any, in network byte order (port) + let addresses = buf.split_to(remaining_length as usize); + let socket = match address_length { + 12 => { + let src_addr: [u8; 4] = addresses[0..4].try_into().unwrap(); + let src_port = u16::from_be_bytes(addresses[8..10].try_into().unwrap()); + Some(SocketAddr::from((src_addr, src_port))) } - Poll::Ready(Ok(())) - } + 36 => { + let src_addr: [u8; 16] = addresses[0..16].try_into().unwrap(); + let src_port = u16::from_be_bytes(addresses[32..34].try_into().unwrap()); + Some(SocketAddr::from((src_addr, src_port))) + } + _ => None, + }; - #[cold] - fn skip_tlv(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll> { - let mut this = self.as_mut().project(); - // we know that this.buf is empty - debug_assert_eq!(this.buf.len(), 0); - - this.buf.reserve((*this.tlv_bytes).clamp(0, 1024) as usize); - ready!(pin!(this.inner.read_buf(this.buf)).poll(cx)?); - self.skip_tlv_inner(); - - Poll::Ready(Ok(())) - } - - fn skip_tlv_inner(self: Pin<&mut Self>) { - let tlv_bytes_read = match u16::try_from(self.buf.len()) { - // we read more than u16::MAX therefore we must have read the full tlv_bytes - Err(_) => self.tlv_bytes, - // we might not have read the full tlv bytes yet - Ok(n) => u16::min(n, self.tlv_bytes), - }; - let this = self.project(); - *this.tlv_bytes -= tlv_bytes_read; - this.buf.advance(tlv_bytes_read as usize); - } + Ok((ChainRW { inner: read, buf }, socket)) } -impl AsyncRead for WithClientIp { +impl AsyncRead for ChainRW { #[inline] fn poll_read( - mut self: Pin<&mut Self>, + self: Pin<&mut Self>, cx: &mut Context<'_>, buf: &mut ReadBuf<'_>, ) -> Poll> { - // I'm assuming these 3 comparisons will be easy to branch predict. 
- // especially with the cold attributes - // which should make this read wrapper almost invisible - - if let ProxyParse::NotStarted = self.state { - ready!(self.as_mut().read_ip(cx)?); - } - - while self.tlv_bytes > 0 { - ready!(self.as_mut().skip_tlv(cx)?) - } - - let this = self.project(); - if this.buf.is_empty() { - this.inner.poll_read(cx, buf) + if self.buf.is_empty() { + self.project().inner.poll_read(cx, buf) } else { - // we know that tlv_bytes is 0 - debug_assert_eq!(*this.tlv_bytes, 0); - - let write = usize::min(this.buf.len(), buf.remaining()); - let slice = this.buf.split_to(write).freeze(); - buf.put_slice(&slice); - - // reset the allocation so it can be freed - if this.buf.is_empty() { - *this.buf = BytesMut::new(); - } - - Poll::Ready(Ok(())) + self.read_from_buf(buf) } } } -impl Accept for ProxyProtocolAccept { - type Conn = WithConnectionGuard>; +impl ChainRW { + #[cold] + fn read_from_buf(self: Pin<&mut Self>, buf: &mut ReadBuf<'_>) -> Poll> { + debug_assert!(!self.buf.is_empty()); + let this = self.project(); - type Error = io::Error; + let write = usize::min(this.buf.len(), buf.remaining()); + let slice = this.buf.split_to(write).freeze(); + buf.put_slice(&slice); - fn poll_accept( - mut self: Pin<&mut Self>, - cx: &mut Context<'_>, - ) -> Poll>> { - let conn = ready!(Pin::new(&mut self.incoming).poll_accept(cx)?); - - let conn_id = uuid::Uuid::new_v4(); - let span = tracing::info_span!("http_conn", ?conn_id); - { - let _enter = span.enter(); - tracing::info!("accepted new TCP connection"); + // reset the allocation so it can be freed + if this.buf.is_empty() { + *this.buf = BytesMut::new(); } - let Some(conn) = conn else { - return Poll::Ready(None); - }; - - Poll::Ready(Some(Ok(WithConnectionGuard { - inner: WithClientIp::new(conn), - connection_id: Uuid::new_v4(), - gauge: Mutex::new(Some( - NUM_CLIENT_CONNECTION_GAUGE - .with_label_values(&[self.protocol]) - .guard(), - )), - span, - }))) - } -} - -pin_project! 
{ - pub struct WithConnectionGuard { - #[pin] - pub inner: T, - pub connection_id: Uuid, - pub gauge: Mutex>, - pub span: tracing::Span, - } - - impl PinnedDrop for WithConnectionGuard { - fn drop(this: Pin<&mut Self>) { - let _enter = this.span.enter(); - tracing::info!("HTTP connection closed") - } - } -} - -impl AsyncWrite for WithConnectionGuard { - #[inline] - fn poll_write( - self: Pin<&mut Self>, - cx: &mut Context<'_>, - buf: &[u8], - ) -> Poll> { - self.project().inner.poll_write(cx, buf) - } - - #[inline] - fn poll_flush(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll> { - self.project().inner.poll_flush(cx) - } - - #[inline] - fn poll_shutdown(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll> { - self.project().inner.poll_shutdown(cx) - } - - #[inline] - fn poll_write_vectored( - self: Pin<&mut Self>, - cx: &mut Context<'_>, - bufs: &[io::IoSlice<'_>], - ) -> Poll> { - self.project().inner.poll_write_vectored(cx, bufs) - } - - #[inline] - fn is_write_vectored(&self) -> bool { - self.inner.is_write_vectored() - } -} - -impl AsyncRead for WithConnectionGuard { - fn poll_read( - self: Pin<&mut Self>, - cx: &mut Context<'_>, - buf: &mut ReadBuf<'_>, - ) -> Poll> { - self.project().inner.poll_read(cx, buf) + Poll::Ready(Ok(())) } } #[cfg(test)] mod tests { - use std::pin::pin; - use tokio::io::AsyncReadExt; - use crate::protocol2::{ProxyParse, WithClientIp}; + use crate::protocol2::read_proxy_protocol; #[tokio::test] async fn test_ipv4() { @@ -456,16 +244,15 @@ mod tests { let extra_data = [0x55; 256]; - let mut read = pin!(WithClientIp::new(header.chain(extra_data.as_slice()))); + let (mut read, addr) = read_proxy_protocol(header.chain(extra_data.as_slice())) + .await + .unwrap(); let mut bytes = vec![]; read.read_to_end(&mut bytes).await.unwrap(); assert_eq!(bytes, extra_data); - assert_eq!( - read.state, - ProxyParse::Finished(([127, 0, 0, 1], 65535).into()) - ); + assert_eq!(addr, Some(([127, 0, 0, 1], 65535).into())); } #[tokio::test] @@ -488,17 
+275,17 @@ mod tests { let extra_data = [0x55; 256]; - let mut read = pin!(WithClientIp::new(header.chain(extra_data.as_slice()))); + let (mut read, addr) = read_proxy_protocol(header.chain(extra_data.as_slice())) + .await + .unwrap(); let mut bytes = vec![]; read.read_to_end(&mut bytes).await.unwrap(); assert_eq!(bytes, extra_data); assert_eq!( - read.state, - ProxyParse::Finished( - ([15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0], 257).into() - ) + addr, + Some(([15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0], 257).into()) ); } @@ -506,24 +293,24 @@ mod tests { async fn test_invalid() { let data = [0x55; 256]; - let mut read = pin!(WithClientIp::new(data.as_slice())); + let (mut read, addr) = read_proxy_protocol(data.as_slice()).await.unwrap(); let mut bytes = vec![]; read.read_to_end(&mut bytes).await.unwrap(); assert_eq!(bytes, data); - assert_eq!(read.state, ProxyParse::None); + assert_eq!(addr, None); } #[tokio::test] async fn test_short() { let data = [0x55; 10]; - let mut read = pin!(WithClientIp::new(data.as_slice())); + let (mut read, addr) = read_proxy_protocol(data.as_slice()).await.unwrap(); let mut bytes = vec![]; read.read_to_end(&mut bytes).await.unwrap(); assert_eq!(bytes, data); - assert_eq!(read.state, ProxyParse::None); + assert_eq!(addr, None); } #[tokio::test] @@ -549,15 +336,14 @@ mod tests { let extra_data = [0xaa; 256]; - let mut read = pin!(WithClientIp::new(header.chain(extra_data.as_slice()))); + let (mut read, addr) = read_proxy_protocol(header.chain(extra_data.as_slice())) + .await + .unwrap(); let mut bytes = vec![]; read.read_to_end(&mut bytes).await.unwrap(); assert_eq!(bytes, extra_data); - assert_eq!( - read.state, - ProxyParse::Finished(([55, 56, 57, 58], 65535).into()) - ); + assert_eq!(addr, Some(([55, 56, 57, 58], 65535).into())); } } diff --git a/proxy/src/proxy.rs b/proxy/src/proxy.rs index 6051c0a812..e4e095d77d 100644 --- a/proxy/src/proxy.rs +++ b/proxy/src/proxy.rs @@ -7,6 +7,7 @@ pub mod handshake; pub 
mod passthrough; pub mod retry; pub mod wake_compute; +pub use copy_bidirectional::copy_bidirectional_client_compute; use crate::{ auth, @@ -15,16 +16,14 @@ use crate::{ config::{ProxyConfig, TlsConfig}, context::RequestMonitoring, error::ReportableError, - metrics::{NUM_CLIENT_CONNECTION_GAUGE, NUM_CONNECTION_REQUESTS_GAUGE}, - protocol2::WithClientIp, + metrics::{Metrics, NumClientConnectionsGuard}, + protocol2::read_proxy_protocol, proxy::handshake::{handshake, HandshakeData}, - rate_limiter::EndpointRateLimiter, stream::{PqStream, Stream}, EndpointCacheKey, }; use futures::TryFutureExt; use itertools::Itertools; -use metrics::IntCounterPairGuard; use once_cell::sync::OnceCell; use pq_proto::{BeMessage as Be, StartupMessageParams}; use regex::Regex; @@ -61,7 +60,6 @@ pub async fn task_main( config: &'static ProxyConfig, listener: tokio::net::TcpListener, cancellation_token: CancellationToken, - endpoint_rate_limiter: Arc, cancellation_handler: Arc, ) -> anyhow::Result<()> { scopeguard::defer! 
{ @@ -79,31 +77,29 @@ pub async fn task_main( { let (socket, peer_addr) = accept_result?; - let conn_gauge = NUM_CLIENT_CONNECTION_GAUGE - .with_label_values(&["tcp"]) - .guard(); + let conn_gauge = Metrics::get() + .proxy + .client_connections + .guard(crate::metrics::Protocol::Tcp); let session_id = uuid::Uuid::new_v4(); let cancellation_handler = Arc::clone(&cancellation_handler); - let endpoint_rate_limiter = endpoint_rate_limiter.clone(); tracing::info!(protocol = "tcp", %session_id, "accepted new TCP connection"); connections.spawn(async move { - let mut socket = WithClientIp::new(socket); - let mut peer_addr = peer_addr.ip(); - match socket.wait_for_addr().await { - Ok(Some(addr)) => peer_addr = addr.ip(), + let (socket, peer_addr) = match read_proxy_protocol(socket).await{ + Ok((socket, Some(addr))) => (socket, addr.ip()), Err(e) => { error!("per-client task finished with an error: {e:#}"); return; } - Ok(None) if config.require_client_ip => { + Ok((_socket, None)) if config.require_client_ip => { error!("missing required client IP"); return; } - Ok(None) => {} - } + Ok((socket, None)) => (socket, peer_addr.ip()) + }; match socket.inner.set_nodelay(true) { Ok(()) => {}, @@ -113,7 +109,12 @@ pub async fn task_main( }, }; - let mut ctx = RequestMonitoring::new(session_id, peer_addr, "tcp", &config.region); + let mut ctx = RequestMonitoring::new( + session_id, + peer_addr, + crate::metrics::Protocol::Tcp, + &config.region, + ); let span = ctx.span.clone(); let res = handle_client( @@ -122,7 +123,6 @@ pub async fn task_main( cancellation_handler, socket, ClientMode::Tcp, - endpoint_rate_limiter, conn_gauge, ) .instrument(span.clone()) @@ -132,16 +132,14 @@ pub async fn task_main( Err(e) => { // todo: log and push to ctx the error kind ctx.set_error_kind(e.get_error_kind()); - ctx.log(); error!(parent: &span, "per-client task finished with an error: {e:#}"); } Ok(None) => { ctx.set_success(); - ctx.log(); } Ok(Some(p)) => { ctx.set_success(); - ctx.log(); + 
ctx.log_connect(); match p.proxy_pass().instrument(span.clone()).await { Ok(()) => {} Err(e) => { @@ -236,20 +234,23 @@ pub async fn handle_client( cancellation_handler: Arc, stream: S, mode: ClientMode, - endpoint_rate_limiter: Arc, - conn_gauge: IntCounterPairGuard, + conn_gauge: NumClientConnectionsGuard<'static>, ) -> Result>, ClientRequestError> { - info!("handling interactive connection from client"); + info!( + protocol = %ctx.protocol, + "handling interactive connection from client" + ); + let metrics = &Metrics::get().proxy; let proto = ctx.protocol; - let _request_gauge = NUM_CONNECTION_REQUESTS_GAUGE - .with_label_values(&[proto]) - .guard(); + // let _client_gauge = metrics.client_connections.guard(proto); + let _request_gauge = metrics.connection_requests.guard(proto); let tls = config.tls_config.as_ref(); + let record_handshake_error = !ctx.has_private_peer_addr(); let pause = ctx.latency_timer.pause(crate::metrics::Waiting::Client); - let do_handshake = handshake(stream, mode.handshake_tls(tls)); + let do_handshake = handshake(stream, mode.handshake_tls(tls), record_handshake_error); let (mut stream, params) = match tokio::time::timeout(config.handshake_timeout, do_handshake).await?? 
{ HandshakeData::Startup(stream, params) => (stream, params), @@ -278,15 +279,6 @@ pub async fn handle_client( Err(e) => stream.throw_error(e).await?, }; - // check rate limit - if let Some(ep) = user_info.get_endpoint() { - if !endpoint_rate_limiter.check(ep, 1) { - return stream - .throw_error(auth::AuthError::too_many_connections()) - .await?; - } - } - let user = user_info.get_user().to_owned(); let user_info = match user_info .authenticate( @@ -309,9 +301,14 @@ pub async fn handle_client( let mut node = connect_to_compute( ctx, - &TcpMechanism { params: ¶ms }, + &TcpMechanism { + params: ¶ms, + locks: &config.connect_compute_locks, + }, &user_info, mode.allow_self_signed_compute(config), + config.wake_compute_retry_config, + config.connect_to_compute_retry_config, ) .or_else(|e| stream.throw_error(e)) .await?; diff --git a/proxy/src/proxy/connect_compute.rs b/proxy/src/proxy/connect_compute.rs index 4c0d68ce0b..c8528d0296 100644 --- a/proxy/src/proxy/connect_compute.rs +++ b/proxy/src/proxy/connect_compute.rs @@ -1,14 +1,16 @@ use crate::{ auth::backend::ComputeCredentialKeys, compute::{self, PostgresConnection}, - console::{self, errors::WakeComputeError, CachedNodeInfo, NodeInfo}, + config::RetryConfig, + console::{self, errors::WakeComputeError, locks::ApiLocks, CachedNodeInfo, NodeInfo}, context::RequestMonitoring, error::ReportableError, - metrics::NUM_CONNECTION_FAILURES, + metrics::{ConnectOutcome, ConnectionFailureKind, Metrics, RetriesMetricGroup, RetryType}, proxy::{ retry::{retry_after, ShouldRetry}, wake_compute::wake_compute, }, + Host, }; use async_trait::async_trait; use pq_proto::StartupMessageParams; @@ -27,10 +29,10 @@ pub fn invalidate_cache(node_info: console::CachedNodeInfo) -> NodeInfo { warn!("invalidating stalled compute node info cache entry"); } let label = match is_cached { - true => "compute_cached", - false => "compute_uncached", + true => ConnectionFailureKind::ComputeCached, + false => ConnectionFailureKind::ComputeUncached, }; - 
NUM_CONNECTION_FAILURES.with_label_values(&[label]).inc(); + Metrics::get().proxy.connection_failures_total.inc(label); node_info.invalidate() } @@ -63,6 +65,9 @@ pub trait ComputeConnectBackend { pub struct TcpMechanism<'a> { /// KV-dictionary with PostgreSQL connection params. pub params: &'a StartupMessageParams, + + /// connect_to_compute concurrency lock + pub locks: &'static ApiLocks, } #[async_trait] @@ -78,6 +83,8 @@ impl ConnectMechanism for TcpMechanism<'_> { node_info: &console::CachedNodeInfo, timeout: time::Duration, ) -> Result { + let host = node_info.config.get_host()?; + let _permit = self.locks.get_permit(&host).await?; node_info.connect(ctx, timeout).await } @@ -93,19 +100,23 @@ pub async fn connect_to_compute( mechanism: &M, user_info: &B, allow_self_signed_compute: bool, + wake_compute_retry_config: RetryConfig, + connect_to_compute_retry_config: RetryConfig, ) -> Result where M::ConnectError: ShouldRetry + std::fmt::Debug, M::Error: From, { let mut num_retries = 0; - let mut node_info = wake_compute(&mut num_retries, ctx, user_info).await?; + let mut node_info = + wake_compute(&mut num_retries, ctx, user_info, wake_compute_retry_config).await?; if let Some(keys) = user_info.get_keys() { node_info.set_keys(keys); } node_info.allow_self_signed_compute = allow_self_signed_compute; // let mut node_info = credentials.get_node_info(ctx, user_info).await?; mechanism.update_connect_config(&mut node_info.config); + let retry_type = RetryType::ConnectToCompute; // try once let err = match mechanism @@ -114,6 +125,13 @@ where { Ok(res) => { ctx.latency_timer.success(); + Metrics::get().proxy.retries_metric.observe( + RetriesMetricGroup { + outcome: ConnectOutcome::Success, + retry_type, + }, + num_retries.into(), + ); return Ok(res); } Err(e) => e, @@ -121,10 +139,17 @@ where error!(error = ?err, "could not connect to compute node"); - let node_info = if !node_info.cached() { + let node_info = if !node_info.cached() || 
!err.should_retry_database_address() { // If we just recieved this from cplane and dodn't get it from cache, we shouldn't retry. // Do not need to retrieve a new node_info, just return the old one. - if !err.should_retry(num_retries) { + if !err.should_retry(num_retries, connect_to_compute_retry_config) { + Metrics::get().proxy.retries_metric.observe( + RetriesMetricGroup { + outcome: ConnectOutcome::Failed, + retry_type, + }, + num_retries.into(), + ); return Err(err.into()); } node_info @@ -132,7 +157,8 @@ where // if we failed to connect, it's likely that the compute node was suspended, wake a new compute node info!("compute node's state has likely changed; requesting a wake-up"); let old_node_info = invalidate_cache(node_info); - let mut node_info = wake_compute(&mut num_retries, ctx, user_info).await?; + let mut node_info = + wake_compute(&mut num_retries, ctx, user_info, wake_compute_retry_config).await?; node_info.reuse_settings(old_node_info); mechanism.update_connect_config(&mut node_info.config); @@ -151,21 +177,40 @@ where { Ok(res) => { ctx.latency_timer.success(); + Metrics::get().proxy.retries_metric.observe( + RetriesMetricGroup { + outcome: ConnectOutcome::Success, + retry_type, + }, + num_retries.into(), + ); + info!(?num_retries, "connected to compute node after"); return Ok(res); } Err(e) => { - let retriable = e.should_retry(num_retries); + let retriable = e.should_retry(num_retries, connect_to_compute_retry_config); if !retriable { error!(error = ?e, num_retries, retriable, "couldn't connect to compute node"); + Metrics::get().proxy.retries_metric.observe( + RetriesMetricGroup { + outcome: ConnectOutcome::Failed, + retry_type, + }, + num_retries.into(), + ); return Err(e.into()); } warn!(error = ?e, num_retries, retriable, "couldn't connect to compute node"); } } - let wait_duration = retry_after(num_retries); + let wait_duration = retry_after(num_retries, connect_to_compute_retry_config); num_retries += 1; + let pause = ctx + .latency_timer + 
.pause(crate::metrics::Waiting::RetryTimeout); time::sleep(wait_duration).await; + drop(pause); } } diff --git a/proxy/src/proxy/copy_bidirectional.rs b/proxy/src/proxy/copy_bidirectional.rs index 684be74f9a..4b09ebd8dc 100644 --- a/proxy/src/proxy/copy_bidirectional.rs +++ b/proxy/src/proxy/copy_bidirectional.rs @@ -41,7 +41,7 @@ where } #[tracing::instrument(skip_all)] -pub(super) async fn copy_bidirectional_client_compute( +pub async fn copy_bidirectional_client_compute( client: &mut Client, compute: &mut Compute, ) -> Result<(u64, u64), std::io::Error> diff --git a/proxy/src/proxy/handshake.rs b/proxy/src/proxy/handshake.rs index 4665e07d23..dd935cc245 100644 --- a/proxy/src/proxy/handshake.rs +++ b/proxy/src/proxy/handshake.rs @@ -63,6 +63,7 @@ pub enum HandshakeData { pub async fn handshake( stream: S, mut tls: Option<&TlsConfig>, + record_handshake_error: bool, ) -> Result, HandshakeError> { // Client may try upgrading to each protocol only once let (mut tried_ssl, mut tried_gss) = (false, false); @@ -95,7 +96,9 @@ pub async fn handshake( if !read_buf.is_empty() { return Err(HandshakeError::EarlyData); } - let tls_stream = raw.upgrade(tls.to_server_config()).await?; + let tls_stream = raw + .upgrade(tls.to_server_config(), record_handshake_error) + .await?; let (_, tls_server_end_point) = tls .cert_resolver diff --git a/proxy/src/proxy/passthrough.rs b/proxy/src/proxy/passthrough.rs index c81a1a8292..62de79946f 100644 --- a/proxy/src/proxy/passthrough.rs +++ b/proxy/src/proxy/passthrough.rs @@ -2,11 +2,10 @@ use crate::{ cancellation, compute::PostgresConnection, console::messages::MetricsAuxInfo, - metrics::NUM_BYTES_PROXIED_COUNTER, + metrics::{Direction, Metrics, NumClientConnectionsGuard, NumConnectionRequestsGuard}, stream::Stream, usage_metrics::{Ids, MetricCounterRecorder, USAGE_METRICS}, }; -use metrics::IntCounterPairGuard; use tokio::io::{AsyncRead, AsyncWrite}; use tracing::info; use utils::measured_stream::MeasuredStream; @@ -23,24 +22,25 @@ pub 
async fn proxy_pass( branch_id: aux.branch_id, }); - let m_sent = NUM_BYTES_PROXIED_COUNTER.with_label_values(&["tx"]); + let metrics = &Metrics::get().proxy.io_bytes; + let m_sent = metrics.with_labels(Direction::Tx); let mut client = MeasuredStream::new( client, |_| {}, |cnt| { // Number of bytes we sent to the client (outbound). - m_sent.inc_by(cnt as u64); + metrics.get_metric(m_sent).inc_by(cnt as u64); usage.record_egress(cnt as u64); }, ); - let m_recv = NUM_BYTES_PROXIED_COUNTER.with_label_values(&["rx"]); + let m_recv = metrics.with_labels(Direction::Rx); let mut compute = MeasuredStream::new( compute, |_| {}, |cnt| { // Number of bytes the client sent to the compute node (inbound). - m_recv.inc_by(cnt as u64); + metrics.get_metric(m_recv).inc_by(cnt as u64); }, ); @@ -60,8 +60,8 @@ pub struct ProxyPassthrough { pub compute: PostgresConnection, pub aux: MetricsAuxInfo, - pub req: IntCounterPairGuard, - pub conn: IntCounterPairGuard, + pub req: NumConnectionRequestsGuard<'static>, + pub conn: NumClientConnectionsGuard<'static>, pub cancel: cancellation::Session

, } diff --git a/proxy/src/proxy/retry.rs b/proxy/src/proxy/retry.rs index a85ed380b0..36a05ba190 100644 --- a/proxy/src/proxy/retry.rs +++ b/proxy/src/proxy/retry.rs @@ -1,21 +1,18 @@ -use crate::compute; +use crate::{compute, config::RetryConfig}; use std::{error::Error, io}; use tokio::time; -/// Number of times we should retry the `/proxy_wake_compute` http request. -/// Retry duration is BASE_RETRY_WAIT_DURATION * RETRY_WAIT_EXPONENT_BASE ^ n, where n starts at 0 -pub const NUM_RETRIES_CONNECT: u32 = 16; -const BASE_RETRY_WAIT_DURATION: time::Duration = time::Duration::from_millis(25); -const RETRY_WAIT_EXPONENT_BASE: f64 = std::f64::consts::SQRT_2; - pub trait ShouldRetry { fn could_retry(&self) -> bool; - fn should_retry(&self, num_retries: u32) -> bool { + fn should_retry(&self, num_retries: u32, config: RetryConfig) -> bool { match self { - _ if num_retries >= NUM_RETRIES_CONNECT => false, + _ if num_retries >= config.max_retries => false, err => err.could_retry(), } } + fn should_retry_database_address(&self) -> bool { + true + } } impl ShouldRetry for io::Error { @@ -39,6 +36,21 @@ impl ShouldRetry for tokio_postgres::error::DbError { | &SqlState::SQLCLIENT_UNABLE_TO_ESTABLISH_SQLCONNECTION, ) } + fn should_retry_database_address(&self) -> bool { + use tokio_postgres::error::SqlState; + // Here are errors that happens after the user successfully authenticated to the database. + // TODO: there are pgbouncer errors that should be retried, but they are not listed here. 
+ !matches!( + self.code(), + &SqlState::TOO_MANY_CONNECTIONS + | &SqlState::OUT_OF_MEMORY + | &SqlState::SYNTAX_ERROR + | &SqlState::T_R_SERIALIZATION_FAILURE + | &SqlState::INVALID_CATALOG_NAME + | &SqlState::INVALID_SCHEMA_NAME + | &SqlState::INVALID_PARAMETER_VALUE + ) + } } impl ShouldRetry for tokio_postgres::Error { @@ -51,6 +63,15 @@ impl ShouldRetry for tokio_postgres::Error { false } } + fn should_retry_database_address(&self) -> bool { + if let Some(io_err) = self.source().and_then(|x| x.downcast_ref()) { + io::Error::should_retry_database_address(io_err) + } else if let Some(db_err) = self.source().and_then(|x| x.downcast_ref()) { + tokio_postgres::error::DbError::should_retry_database_address(db_err) + } else { + true + } + } } impl ShouldRetry for compute::ConnectionError { @@ -61,8 +82,17 @@ impl ShouldRetry for compute::ConnectionError { _ => false, } } + fn should_retry_database_address(&self) -> bool { + match self { + compute::ConnectionError::Postgres(err) => err.should_retry_database_address(), + compute::ConnectionError::CouldNotConnect(err) => err.should_retry_database_address(), + _ => true, + } + } } -pub fn retry_after(num_retries: u32) -> time::Duration { - BASE_RETRY_WAIT_DURATION.mul_f64(RETRY_WAIT_EXPONENT_BASE.powi((num_retries as i32) - 1)) +pub fn retry_after(num_retries: u32, config: RetryConfig) -> time::Duration { + config + .base_delay + .mul_f64(config.backoff_factor.powi((num_retries as i32) - 1)) } diff --git a/proxy/src/proxy/tests.rs b/proxy/src/proxy/tests.rs index 71d85e106d..ad48af0093 100644 --- a/proxy/src/proxy/tests.rs +++ b/proxy/src/proxy/tests.rs @@ -10,13 +10,13 @@ use super::*; use crate::auth::backend::{ ComputeCredentialKeys, ComputeCredentials, ComputeUserInfo, MaybeOwned, TestBackend, }; -use crate::config::CertResolver; +use crate::config::{CertResolver, RetryConfig}; use crate::console::caches::NodeInfoCache; use crate::console::messages::MetricsAuxInfo; use crate::console::provider::{CachedAllowedIps, 
CachedRoleSecret, ConsoleBackend}; use crate::console::{self, CachedNodeInfo, NodeInfo}; use crate::error::ErrorKind; -use crate::proxy::retry::{retry_after, NUM_RETRIES_CONNECT}; +use crate::proxy::retry::retry_after; use crate::{http, sasl, scram, BranchId, EndpointId, ProjectId}; use anyhow::{bail, Context}; use async_trait::async_trait; @@ -174,8 +174,8 @@ async fn dummy_proxy( tls: Option, auth: impl TestAuth + Send, ) -> anyhow::Result<()> { - let client = WithClientIp::new(client); - let mut stream = match handshake(client, tls.as_ref()).await? { + let (client, _) = read_proxy_protocol(client).await?; + let mut stream = match handshake(client, tls.as_ref(), false).await? { HandshakeData::Startup(stream, _) => stream, HandshakeData::Cancel(_) => bail!("cancellation not supported"), }; @@ -361,11 +361,15 @@ async fn scram_auth_mock() -> anyhow::Result<()> { #[test] fn connect_compute_total_wait() { let mut total_wait = tokio::time::Duration::ZERO; - for num_retries in 1..NUM_RETRIES_CONNECT { - total_wait += retry_after(num_retries); + let config = RetryConfig { + base_delay: Duration::from_secs(1), + max_retries: 5, + backoff_factor: 2.0, + }; + for num_retries in 1..config.max_retries { + total_wait += retry_after(num_retries, config); } - assert!(total_wait < tokio::time::Duration::from_secs(12)); - assert!(total_wait > tokio::time::Duration::from_secs(10)); + assert!(f64::abs(total_wait.as_secs_f64() - 15.0) < 0.1); } #[derive(Clone, Copy, Debug)] @@ -549,7 +553,12 @@ async fn connect_to_compute_success() { let mut ctx = RequestMonitoring::test(); let mechanism = TestConnectMechanism::new(vec![Wake, Connect]); let user_info = helper_create_connect_info(&mechanism); - connect_to_compute(&mut ctx, &mechanism, &user_info, false) + let config = RetryConfig { + base_delay: Duration::from_secs(1), + max_retries: 5, + backoff_factor: 2.0, + }; + connect_to_compute(&mut ctx, &mechanism, &user_info, false, config, config) .await .unwrap(); mechanism.verify(); @@ 
-562,7 +571,12 @@ async fn connect_to_compute_retry() { let mut ctx = RequestMonitoring::test(); let mechanism = TestConnectMechanism::new(vec![Wake, Retry, Wake, Connect]); let user_info = helper_create_connect_info(&mechanism); - connect_to_compute(&mut ctx, &mechanism, &user_info, false) + let config = RetryConfig { + base_delay: Duration::from_secs(1), + max_retries: 5, + backoff_factor: 2.0, + }; + connect_to_compute(&mut ctx, &mechanism, &user_info, false, config, config) .await .unwrap(); mechanism.verify(); @@ -576,7 +590,12 @@ async fn connect_to_compute_non_retry_1() { let mut ctx = RequestMonitoring::test(); let mechanism = TestConnectMechanism::new(vec![Wake, Retry, Wake, Fail]); let user_info = helper_create_connect_info(&mechanism); - connect_to_compute(&mut ctx, &mechanism, &user_info, false) + let config = RetryConfig { + base_delay: Duration::from_secs(1), + max_retries: 5, + backoff_factor: 2.0, + }; + connect_to_compute(&mut ctx, &mechanism, &user_info, false, config, config) .await .unwrap_err(); mechanism.verify(); @@ -590,7 +609,12 @@ async fn connect_to_compute_non_retry_2() { let mut ctx = RequestMonitoring::test(); let mechanism = TestConnectMechanism::new(vec![Wake, Fail, Wake, Connect]); let user_info = helper_create_connect_info(&mechanism); - connect_to_compute(&mut ctx, &mechanism, &user_info, false) + let config = RetryConfig { + base_delay: Duration::from_secs(1), + max_retries: 5, + backoff_factor: 2.0, + }; + connect_to_compute(&mut ctx, &mechanism, &user_info, false, config, config) .await .unwrap(); mechanism.verify(); @@ -600,17 +624,32 @@ async fn connect_to_compute_non_retry_2() { #[tokio::test] async fn connect_to_compute_non_retry_3() { let _ = env_logger::try_init(); - assert_eq!(NUM_RETRIES_CONNECT, 16); + tokio::time::pause(); use ConnectAction::*; let mut ctx = RequestMonitoring::test(); - let mechanism = TestConnectMechanism::new(vec![ - Wake, Retry, Wake, Retry, Retry, Retry, Retry, Retry, Retry, Retry, Retry, Retry, 
Retry, - Retry, Retry, Retry, Retry, Retry, /* the 17th time */ Retry, - ]); + let mechanism = + TestConnectMechanism::new(vec![Wake, Retry, Wake, Retry, Retry, Retry, Retry, Retry]); let user_info = helper_create_connect_info(&mechanism); - connect_to_compute(&mut ctx, &mechanism, &user_info, false) - .await - .unwrap_err(); + let wake_compute_retry_config = RetryConfig { + base_delay: Duration::from_secs(1), + max_retries: 1, + backoff_factor: 2.0, + }; + let connect_to_compute_retry_config = RetryConfig { + base_delay: Duration::from_secs(1), + max_retries: 5, + backoff_factor: 2.0, + }; + connect_to_compute( + &mut ctx, + &mechanism, + &user_info, + false, + wake_compute_retry_config, + connect_to_compute_retry_config, + ) + .await + .unwrap_err(); mechanism.verify(); } @@ -622,7 +661,12 @@ async fn wake_retry() { let mut ctx = RequestMonitoring::test(); let mechanism = TestConnectMechanism::new(vec![WakeRetry, Wake, Connect]); let user_info = helper_create_connect_info(&mechanism); - connect_to_compute(&mut ctx, &mechanism, &user_info, false) + let config = RetryConfig { + base_delay: Duration::from_secs(1), + max_retries: 5, + backoff_factor: 2.0, + }; + connect_to_compute(&mut ctx, &mechanism, &user_info, false, config, config) .await .unwrap(); mechanism.verify(); @@ -636,7 +680,12 @@ async fn wake_non_retry() { let mut ctx = RequestMonitoring::test(); let mechanism = TestConnectMechanism::new(vec![WakeRetry, WakeFail]); let user_info = helper_create_connect_info(&mechanism); - connect_to_compute(&mut ctx, &mechanism, &user_info, false) + let config = RetryConfig { + base_delay: Duration::from_secs(1), + max_retries: 5, + backoff_factor: 2.0, + }; + connect_to_compute(&mut ctx, &mechanism, &user_info, false, config, config) .await .unwrap_err(); mechanism.verify(); diff --git a/proxy/src/proxy/tests/mitm.rs b/proxy/src/proxy/tests/mitm.rs index 3b760e5dab..cbfc9f1358 100644 --- a/proxy/src/proxy/tests/mitm.rs +++ b/proxy/src/proxy/tests/mitm.rs @@ -34,7 
+34,10 @@ async fn proxy_mitm( tokio::spawn(async move { // begin handshake with end_server let end_server = connect_tls(server2, client_config2.make_tls_connect().unwrap()).await; - let (end_client, startup) = match handshake(client1, Some(&server_config1)).await.unwrap() { + let (end_client, startup) = match handshake(client1, Some(&server_config1), false) + .await + .unwrap() + { HandshakeData::Startup(stream, params) => (stream, params), HandshakeData::Cancel(_) => panic!("cancellation not supported"), }; diff --git a/proxy/src/proxy/wake_compute.rs b/proxy/src/proxy/wake_compute.rs index bfe4b7ec3a..3d9e94dd72 100644 --- a/proxy/src/proxy/wake_compute.rs +++ b/proxy/src/proxy/wake_compute.rs @@ -1,10 +1,14 @@ +use crate::config::RetryConfig; use crate::console::{errors::WakeComputeError, provider::CachedNodeInfo}; use crate::context::RequestMonitoring; -use crate::metrics::{bool_to_str, NUM_WAKEUP_FAILURES}; +use crate::metrics::{ + ConnectOutcome, ConnectionFailuresBreakdownGroup, Metrics, RetriesMetricGroup, RetryType, + WakeupFailureKind, +}; use crate::proxy::retry::retry_after; -use hyper::StatusCode; +use hyper1::StatusCode; use std::ops::ControlFlow; -use tracing::{error, warn}; +use tracing::{error, info, warn}; use super::connect_compute::ComputeConnectBackend; use super::retry::ShouldRetry; @@ -13,25 +17,48 @@ pub async fn wake_compute( num_retries: &mut u32, ctx: &mut RequestMonitoring, api: &B, + config: RetryConfig, ) -> Result { + let retry_type = RetryType::WakeCompute; loop { let wake_res = api.wake_compute(ctx).await; - match handle_try_wake(wake_res, *num_retries) { + match handle_try_wake(wake_res, *num_retries, config) { Err(e) => { error!(error = ?e, num_retries, retriable = false, "couldn't wake compute node"); report_error(&e, false); + Metrics::get().proxy.retries_metric.observe( + RetriesMetricGroup { + outcome: ConnectOutcome::Failed, + retry_type, + }, + (*num_retries).into(), + ); return Err(e); } Ok(ControlFlow::Continue(e)) => { 
warn!(error = ?e, num_retries, retriable = true, "couldn't wake compute node"); report_error(&e, true); } - Ok(ControlFlow::Break(n)) => return Ok(n), + Ok(ControlFlow::Break(n)) => { + Metrics::get().proxy.retries_metric.observe( + RetriesMetricGroup { + outcome: ConnectOutcome::Success, + retry_type, + }, + (*num_retries).into(), + ); + info!(?num_retries, "compute node woken up after"); + return Ok(n); + } } - let wait_duration = retry_after(*num_retries); + let wait_duration = retry_after(*num_retries, config); *num_retries += 1; + let pause = ctx + .latency_timer + .pause(crate::metrics::Waiting::RetryTimeout); tokio::time::sleep(wait_duration).await; + drop(pause); } } @@ -42,10 +69,11 @@ pub async fn wake_compute( pub fn handle_try_wake( result: Result, num_retries: u32, + config: RetryConfig, ) -> Result, WakeComputeError> { match result { Err(err) => match &err { - WakeComputeError::ApiError(api) if api.should_retry(num_retries) => { + WakeComputeError::ApiError(api) if api.should_retry(num_retries, config) => { Ok(ControlFlow::Continue(err)) } _ => Err(err), @@ -57,39 +85,47 @@ pub fn handle_try_wake( fn report_error(e: &WakeComputeError, retry: bool) { use crate::console::errors::ApiError; - let retry = bool_to_str(retry); let kind = match e { - WakeComputeError::BadComputeAddress(_) => "bad_compute_address", - WakeComputeError::ApiError(ApiError::Transport(_)) => "api_transport_error", + WakeComputeError::BadComputeAddress(_) => WakeupFailureKind::BadComputeAddress, + WakeComputeError::ApiError(ApiError::Transport(_)) => WakeupFailureKind::ApiTransportError, WakeComputeError::ApiError(ApiError::Console { status: StatusCode::LOCKED, ref text, }) if text.contains("written data quota exceeded") || text.contains("the limit for current plan reached") => { - "quota_exceeded" + WakeupFailureKind::QuotaExceeded } WakeComputeError::ApiError(ApiError::Console { status: StatusCode::UNPROCESSABLE_ENTITY, ref text, }) if text.contains("compute time quota of 
non-primary branches is exceeded") => { - "quota_exceeded" + WakeupFailureKind::QuotaExceeded } WakeComputeError::ApiError(ApiError::Console { status: StatusCode::LOCKED, .. - }) => "api_console_locked", + }) => WakeupFailureKind::ApiConsoleLocked, WakeComputeError::ApiError(ApiError::Console { status: StatusCode::BAD_REQUEST, .. - }) => "api_console_bad_request", + }) => WakeupFailureKind::ApiConsoleBadRequest, WakeComputeError::ApiError(ApiError::Console { status, .. }) if status.is_server_error() => { - "api_console_other_server_error" + WakeupFailureKind::ApiConsoleOtherServerError } - WakeComputeError::ApiError(ApiError::Console { .. }) => "api_console_other_error", - WakeComputeError::TimeoutError => "timeout_error", + WakeComputeError::ApiError(ApiError::Console { .. }) => { + WakeupFailureKind::ApiConsoleOtherError + } + WakeComputeError::TooManyConnections => WakeupFailureKind::ApiConsoleLocked, + WakeComputeError::TimeoutError => WakeupFailureKind::TimeoutError, }; - NUM_WAKEUP_FAILURES.with_label_values(&[retry, kind]).inc(); + Metrics::get() + .proxy + .connection_failures_breakdown + .inc(ConnectionFailuresBreakdownGroup { + kind, + retry: retry.into(), + }); } diff --git a/proxy/src/rate_limiter.rs b/proxy/src/rate_limiter.rs index 13dffffca0..c542267547 100644 --- a/proxy/src/rate_limiter.rs +++ b/proxy/src/rate_limiter.rs @@ -1,7 +1,2 @@ -mod aimd; -mod limit_algorithm; mod limiter; -pub use aimd::Aimd; -pub use limit_algorithm::{AimdConfig, Fixed, RateLimitAlgorithm, RateLimiterConfig}; -pub use limiter::Limiter; -pub use limiter::{AuthRateLimiter, EndpointRateLimiter, RateBucketInfo, RedisRateLimiter}; +pub use limiter::{BucketRateLimiter, EndpointRateLimiter, GlobalRateLimiter, RateBucketInfo}; diff --git a/proxy/src/rate_limiter/aimd.rs b/proxy/src/rate_limiter/aimd.rs deleted file mode 100644 index 2c14a54a6c..0000000000 --- a/proxy/src/rate_limiter/aimd.rs +++ /dev/null @@ -1,166 +0,0 @@ -use std::usize; - -use async_trait::async_trait; - -use 
super::limit_algorithm::{AimdConfig, LimitAlgorithm, Sample}; - -use super::limiter::Outcome; - -/// Loss-based congestion avoidance. -/// -/// Additive-increase, multiplicative decrease. -/// -/// Adds available currency when: -/// 1. no load-based errors are observed, and -/// 2. the utilisation of the current limit is high. -/// -/// Reduces available concurrency by a factor when load-based errors are detected. -pub struct Aimd { - min_limit: usize, - max_limit: usize, - decrease_factor: f32, - increase_by: usize, - min_utilisation_threshold: f32, -} - -impl Aimd { - pub fn new(config: AimdConfig) -> Self { - Self { - min_limit: config.aimd_min_limit, - max_limit: config.aimd_max_limit, - decrease_factor: config.aimd_decrease_factor, - increase_by: config.aimd_increase_by, - min_utilisation_threshold: config.aimd_min_utilisation_threshold, - } - } -} - -#[async_trait] -impl LimitAlgorithm for Aimd { - async fn update(&mut self, old_limit: usize, sample: Sample) -> usize { - use Outcome::*; - match sample.outcome { - Success => { - let utilisation = sample.in_flight as f32 / old_limit as f32; - - if utilisation > self.min_utilisation_threshold { - let limit = old_limit + self.increase_by; - limit.clamp(self.min_limit, self.max_limit) - } else { - old_limit - } - } - Overload => { - let limit = old_limit as f32 * self.decrease_factor; - - // Floor instead of round, so the limit reduces even with small numbers. - // E.g. 
round(2 * 0.9) = 2, but floor(2 * 0.9) = 1 - let limit = limit.floor() as usize; - - limit.clamp(self.min_limit, self.max_limit) - } - } - } -} - -#[cfg(test)] -mod tests { - use std::sync::Arc; - - use tokio::sync::Notify; - - use super::*; - - use crate::rate_limiter::{Limiter, RateLimiterConfig}; - - #[tokio::test] - async fn should_decrease_limit_on_overload() { - let config = RateLimiterConfig { - initial_limit: 10, - aimd_config: Some(AimdConfig { - aimd_decrease_factor: 0.5, - ..Default::default() - }), - disable: false, - ..Default::default() - }; - - let release_notifier = Arc::new(Notify::new()); - - let limiter = Limiter::new(config).with_release_notifier(release_notifier.clone()); - - let token = limiter.try_acquire().unwrap(); - limiter.release(token, Some(Outcome::Overload)).await; - release_notifier.notified().await; - assert_eq!(limiter.state().limit(), 5, "overload: decrease"); - } - - #[tokio::test] - async fn should_increase_limit_on_success_when_using_gt_util_threshold() { - let config = RateLimiterConfig { - initial_limit: 4, - aimd_config: Some(AimdConfig { - aimd_decrease_factor: 0.5, - aimd_min_utilisation_threshold: 0.5, - aimd_increase_by: 1, - ..Default::default() - }), - disable: false, - ..Default::default() - }; - - let limiter = Limiter::new(config); - - let token = limiter.try_acquire().unwrap(); - let _token = limiter.try_acquire().unwrap(); - let _token = limiter.try_acquire().unwrap(); - - limiter.release(token, Some(Outcome::Success)).await; - assert_eq!(limiter.state().limit(), 5, "success: increase"); - } - - #[tokio::test] - async fn should_not_change_limit_on_success_when_using_lt_util_threshold() { - let config = RateLimiterConfig { - initial_limit: 4, - aimd_config: Some(AimdConfig { - aimd_decrease_factor: 0.5, - aimd_min_utilisation_threshold: 0.5, - ..Default::default() - }), - disable: false, - ..Default::default() - }; - - let limiter = Limiter::new(config); - - let token = limiter.try_acquire().unwrap(); - - 
limiter.release(token, Some(Outcome::Success)).await; - assert_eq!( - limiter.state().limit(), - 4, - "success: ignore when < half limit" - ); - } - - #[tokio::test] - async fn should_not_change_limit_when_no_outcome() { - let config = RateLimiterConfig { - initial_limit: 10, - aimd_config: Some(AimdConfig { - aimd_decrease_factor: 0.5, - aimd_min_utilisation_threshold: 0.5, - ..Default::default() - }), - disable: false, - ..Default::default() - }; - - let limiter = Limiter::new(config); - - let token = limiter.try_acquire().unwrap(); - limiter.release(token, None).await; - assert_eq!(limiter.state().limit(), 10, "ignore"); - } -} diff --git a/proxy/src/rate_limiter/limit_algorithm.rs b/proxy/src/rate_limiter/limit_algorithm.rs deleted file mode 100644 index 5cd2d5ebb7..0000000000 --- a/proxy/src/rate_limiter/limit_algorithm.rs +++ /dev/null @@ -1,98 +0,0 @@ -//! Algorithms for controlling concurrency limits. -use async_trait::async_trait; -use std::time::Duration; - -use super::{limiter::Outcome, Aimd}; - -/// An algorithm for controlling a concurrency limit. -#[async_trait] -pub trait LimitAlgorithm: Send + Sync + 'static { - /// Update the concurrency limit in response to a new job completion. - async fn update(&mut self, old_limit: usize, sample: Sample) -> usize; -} - -/// The result of a job (or jobs), including the [Outcome] (loss) and latency (delay). -#[derive(Debug, Clone, PartialEq, Eq)] -pub struct Sample { - pub(crate) latency: Duration, - /// Jobs in flight when the sample was taken. 
- pub(crate) in_flight: usize, - pub(crate) outcome: Outcome, -} - -#[derive(Clone, Copy, Debug, Default, clap::ValueEnum)] -pub enum RateLimitAlgorithm { - Fixed, - #[default] - Aimd, -} - -pub struct Fixed; - -#[async_trait] -impl LimitAlgorithm for Fixed { - async fn update(&mut self, old_limit: usize, _sample: Sample) -> usize { - old_limit - } -} - -#[derive(Clone, Copy, Debug)] -pub struct RateLimiterConfig { - pub disable: bool, - pub algorithm: RateLimitAlgorithm, - pub timeout: Duration, - pub initial_limit: usize, - pub aimd_config: Option, -} - -impl RateLimiterConfig { - pub fn create_rate_limit_algorithm(self) -> Box { - match self.algorithm { - RateLimitAlgorithm::Fixed => Box::new(Fixed), - RateLimitAlgorithm::Aimd => Box::new(Aimd::new(self.aimd_config.unwrap())), // For aimd algorithm config is mandatory. - } - } -} - -impl Default for RateLimiterConfig { - fn default() -> Self { - Self { - disable: true, - algorithm: RateLimitAlgorithm::Aimd, - timeout: Duration::from_secs(1), - initial_limit: 100, - aimd_config: Some(AimdConfig::default()), - } - } -} - -#[derive(clap::Parser, Clone, Copy, Debug)] -pub struct AimdConfig { - /// Minimum limit for AIMD algorithm. Makes sense only if `rate_limit_algorithm` is `Aimd`. - #[clap(long, default_value_t = 1)] - pub aimd_min_limit: usize, - /// Maximum limit for AIMD algorithm. Makes sense only if `rate_limit_algorithm` is `Aimd`. - #[clap(long, default_value_t = 1500)] - pub aimd_max_limit: usize, - /// Increase AIMD increase by value in case of success. Makes sense only if `rate_limit_algorithm` is `Aimd`. - #[clap(long, default_value_t = 10)] - pub aimd_increase_by: usize, - /// Decrease AIMD decrease by value in case of timout/429. Makes sense only if `rate_limit_algorithm` is `Aimd`. - #[clap(long, default_value_t = 0.9)] - pub aimd_decrease_factor: f32, - /// A threshold below which the limit won't be increased. Makes sense only if `rate_limit_algorithm` is `Aimd`. 
- #[clap(long, default_value_t = 0.8)] - pub aimd_min_utilisation_threshold: f32, -} - -impl Default for AimdConfig { - fn default() -> Self { - Self { - aimd_min_limit: 1, - aimd_max_limit: 1500, - aimd_increase_by: 10, - aimd_decrease_factor: 0.9, - aimd_min_utilisation_threshold: 0.8, - } - } -} diff --git a/proxy/src/rate_limiter/limiter.rs b/proxy/src/rate_limiter/limiter.rs index f590896dd9..5ba2c36436 100644 --- a/proxy/src/rate_limiter/limiter.rs +++ b/proxy/src/rate_limiter/limiter.rs @@ -2,10 +2,9 @@ use std::{ borrow::Cow, collections::hash_map::RandomState, hash::{BuildHasher, Hash}, - net::IpAddr, sync::{ atomic::{AtomicUsize, Ordering}, - Arc, Mutex, + Mutex, }, }; @@ -13,24 +12,18 @@ use anyhow::bail; use dashmap::DashMap; use itertools::Itertools; use rand::{rngs::StdRng, Rng, SeedableRng}; -use tokio::sync::{Mutex as AsyncMutex, Semaphore, SemaphorePermit}; -use tokio::time::{timeout, Duration, Instant}; +use tokio::time::{Duration, Instant}; use tracing::info; -use crate::{intern::EndpointIdInt, EndpointId}; +use crate::intern::EndpointIdInt; -use super::{ - limit_algorithm::{LimitAlgorithm, Sample}, - RateLimiterConfig, -}; - -pub struct RedisRateLimiter { +pub struct GlobalRateLimiter { data: Vec, - info: &'static [RateBucketInfo], + info: Vec, } -impl RedisRateLimiter { - pub fn new(info: &'static [RateBucketInfo]) -> Self { +impl GlobalRateLimiter { + pub fn new(info: Vec) -> Self { Self { data: vec![ RateBucket { @@ -50,7 +43,7 @@ impl RedisRateLimiter { let should_allow_request = self .data .iter_mut() - .zip(self.info) + .zip(&self.info) .all(|(bucket, info)| bucket.should_allow_request(info, now, 1)); if should_allow_request { @@ -68,15 +61,7 @@ impl RedisRateLimiter { // Purposefully ignore user name and database name as clients can reconnect // with different names, so we'll end up sending some http requests to // the control plane. 
-// -// We also may save quite a lot of CPU (I think) by bailing out right after we -// saw SNI, before doing TLS handshake. User-side error messages in that case -// does not look very nice (`SSL SYSCALL error: Undefined error: 0`), so for now -// I went with a more expensive way that yields user-friendlier error messages. -pub type EndpointRateLimiter = BucketRateLimiter; - -// This can't be just per IP because that would limit some PaaS that share IP addresses -pub type AuthRateLimiter = BucketRateLimiter<(EndpointIdInt, IpAddr), StdRng, RandomState>; +pub type EndpointRateLimiter = BucketRateLimiter; pub struct BucketRateLimiter { map: DashMap, Hasher>, @@ -149,19 +134,6 @@ impl RateBucketInfo { Self::new(100, Duration::from_secs(600)), ]; - /// All of these are per endpoint-ip pair. - /// Context: 4096 rounds of pbkdf2 take about 1ms of cpu time to execute (1 milli-cpu-second or 1mcpus). - /// - /// First bucket: 300mcpus total per endpoint-ip pair - /// * 1228800 requests per second with 1 hash rounds. (endpoint rate limiter will catch this first) - /// * 300 requests per second with 4096 hash rounds. - /// * 2 requests per second with 600000 hash rounds. - pub const DEFAULT_AUTH_SET: [Self; 3] = [ - Self::new(300 * 4096, Duration::from_secs(1)), - Self::new(200 * 4096, Duration::from_secs(60)), - Self::new(100 * 4096, Duration::from_secs(600)), - ]; - pub fn validate(info: &mut [Self]) -> anyhow::Result<()> { info.sort_unstable_by_key(|info| info.interval); let invalid = info @@ -259,419 +231,16 @@ impl BucketRateLimiter { } } -/// Limits the number of concurrent jobs. -/// -/// Concurrency is limited through the use of [Token]s. Acquire a token to run a job, and release the -/// token once the job is finished. -/// -/// The limit will be automatically adjusted based on observed latency (delay) and/or failures -/// caused by overload (loss). 
-pub struct Limiter { - limit_algo: AsyncMutex>, - semaphore: std::sync::Arc, - config: RateLimiterConfig, - - // ONLY WRITE WHEN LIMIT_ALGO IS LOCKED - limits: AtomicUsize, - - // ONLY USE ATOMIC ADD/SUB - in_flight: Arc, - - #[cfg(test)] - notifier: Option>, -} - -/// A concurrency token, required to run a job. -/// -/// Release the token back to the [Limiter] after the job is complete. -#[derive(Debug)] -pub struct Token<'t> { - permit: Option>, - start: Instant, - in_flight: Arc, -} - -/// A snapshot of the state of the [Limiter]. -/// -/// Not guaranteed to be consistent under high concurrency. -#[derive(Debug, Clone, Copy)] -pub struct LimiterState { - limit: usize, - in_flight: usize, -} - -/// Whether a job succeeded or failed as a result of congestion/overload. -/// -/// Errors not considered to be caused by overload should be ignored. -#[derive(Debug, Clone, Copy, PartialEq, Eq)] -pub enum Outcome { - /// The job succeeded, or failed in a way unrelated to overload. - Success, - /// The job failed because of overload, e.g. it timed out or an explicit backpressure signal - /// was observed. - Overload, -} - -impl Outcome { - fn from_reqwest_error(error: &reqwest_middleware::Error) -> Self { - match error { - reqwest_middleware::Error::Middleware(_) => Outcome::Success, - reqwest_middleware::Error::Reqwest(e) => { - if let Some(status) = e.status() { - if status.is_server_error() - || reqwest::StatusCode::TOO_MANY_REQUESTS.as_u16() == status - { - Outcome::Overload - } else { - Outcome::Success - } - } else { - Outcome::Success - } - } - } - } - fn from_reqwest_response(response: &reqwest::Response) -> Self { - if response.status().is_server_error() - || response.status() == reqwest::StatusCode::TOO_MANY_REQUESTS - { - Outcome::Overload - } else { - Outcome::Success - } - } -} - -impl Limiter { - /// Create a limiter with a given limit control algorithm. 
- pub fn new(config: RateLimiterConfig) -> Self { - assert!(config.initial_limit > 0); - Self { - limit_algo: AsyncMutex::new(config.create_rate_limit_algorithm()), - semaphore: Arc::new(Semaphore::new(config.initial_limit)), - config, - limits: AtomicUsize::new(config.initial_limit), - in_flight: Arc::new(AtomicUsize::new(0)), - #[cfg(test)] - notifier: None, - } - } - // pub fn new(limit_algorithm: T, timeout: Duration, initial_limit: usize) -> Self { - // assert!(initial_limit > 0); - - // Self { - // limit_algo: AsyncMutex::new(limit_algorithm), - // semaphore: Arc::new(Semaphore::new(initial_limit)), - // timeout, - // limits: AtomicUsize::new(initial_limit), - // in_flight: Arc::new(AtomicUsize::new(0)), - // #[cfg(test)] - // notifier: None, - // } - // } - - /// In some cases [Token]s are acquired asynchronously when updating the limit. - #[cfg(test)] - pub fn with_release_notifier(mut self, n: std::sync::Arc) -> Self { - self.notifier = Some(n); - self - } - - /// Try to immediately acquire a concurrency [Token]. - /// - /// Returns `None` if there are none available. - pub fn try_acquire(&self) -> Option { - let result = if self.config.disable { - // If the rate limiter is disabled, we can always acquire a token. - Some(Token::new(None, self.in_flight.clone())) - } else { - self.semaphore - .try_acquire() - .map(|permit| Token::new(Some(permit), self.in_flight.clone())) - .ok() - }; - if result.is_some() { - self.in_flight.fetch_add(1, Ordering::AcqRel); - } - result - } - - /// Try to acquire a concurrency [Token], waiting for `duration` if there are none available. - /// - /// Returns `None` if there are none available after `duration`. - pub async fn acquire_timeout(&self, duration: Duration) -> Option> { - info!("acquiring token: {:?}", self.semaphore.available_permits()); - let result = if self.config.disable { - // If the rate limiter is disabled, we can always acquire a token. 
- Some(Token::new(None, self.in_flight.clone())) - } else { - match timeout(duration, self.semaphore.acquire()).await { - Ok(maybe_permit) => maybe_permit - .map(|permit| Token::new(Some(permit), self.in_flight.clone())) - .ok(), - Err(_) => None, - } - }; - if result.is_some() { - self.in_flight.fetch_add(1, Ordering::AcqRel); - } - result - } - - /// Return the concurrency [Token], along with the outcome of the job. - /// - /// The [Outcome] of the job, and the time taken to perform it, may be used - /// to update the concurrency limit. - /// - /// Set the outcome to `None` to ignore the job. - pub async fn release(&self, mut token: Token<'_>, outcome: Option) { - tracing::info!("outcome is {:?}", outcome); - let in_flight = self.in_flight.load(Ordering::Acquire); - let old_limit = self.limits.load(Ordering::Acquire); - let available = if self.config.disable { - 0 // This is not used in the algorithm and can be anything. If the config disable it makes sense to set it to 0. - } else { - self.semaphore.available_permits() - }; - let total = in_flight + available; - - let mut algo = self.limit_algo.lock().await; - - let new_limit = if let Some(outcome) = outcome { - let sample = Sample { - latency: token.start.elapsed(), - in_flight, - outcome, - }; - algo.update(old_limit, sample).await - } else { - old_limit - }; - tracing::info!("new limit is {}", new_limit); - let actual_limit = if new_limit < total { - token.forget(); - total.saturating_sub(1) - } else { - if !self.config.disable { - self.semaphore.add_permits(new_limit.saturating_sub(total)); - } - new_limit - }; - crate::metrics::RATE_LIMITER_LIMIT - .with_label_values(&["expected"]) - .set(new_limit as i64); - crate::metrics::RATE_LIMITER_LIMIT - .with_label_values(&["actual"]) - .set(actual_limit as i64); - self.limits.store(new_limit, Ordering::Release); - #[cfg(test)] - if let Some(n) = &self.notifier { - n.notify_one(); - } - } - - /// The current state of the limiter. 
- pub fn state(&self) -> LimiterState { - let limit = self.limits.load(Ordering::Relaxed); - let in_flight = self.in_flight.load(Ordering::Relaxed); - LimiterState { limit, in_flight } - } -} - -impl<'t> Token<'t> { - fn new(permit: Option>, in_flight: Arc) -> Self { - Self { - permit, - start: Instant::now(), - in_flight, - } - } - - pub fn forget(&mut self) { - if let Some(permit) = self.permit.take() { - permit.forget(); - } - } -} - -impl Drop for Token<'_> { - fn drop(&mut self) { - self.in_flight.fetch_sub(1, Ordering::AcqRel); - } -} - -impl LimiterState { - /// The current concurrency limit. - pub fn limit(&self) -> usize { - self.limit - } - /// The number of jobs in flight. - pub fn in_flight(&self) -> usize { - self.in_flight - } -} - -#[async_trait::async_trait] -impl reqwest_middleware::Middleware for Limiter { - async fn handle( - &self, - req: reqwest::Request, - extensions: &mut task_local_extensions::Extensions, - next: reqwest_middleware::Next<'_>, - ) -> reqwest_middleware::Result { - let start = Instant::now(); - let token = self - .acquire_timeout(self.config.timeout) - .await - .ok_or_else(|| { - reqwest_middleware::Error::Middleware( - // TODO: Should we map it into user facing errors? 
- crate::console::errors::ApiError::Console { - status: crate::http::StatusCode::TOO_MANY_REQUESTS, - text: "Too many requests".into(), - } - .into(), - ) - })?; - info!(duration = ?start.elapsed(), "waiting for token to connect to the control plane"); - crate::metrics::RATE_LIMITER_ACQUIRE_LATENCY.observe(start.elapsed().as_secs_f64()); - match next.run(req, extensions).await { - Ok(response) => { - self.release(token, Some(Outcome::from_reqwest_response(&response))) - .await; - Ok(response) - } - Err(e) => { - self.release(token, Some(Outcome::from_reqwest_error(&e))) - .await; - Err(e) - } - } - } -} - #[cfg(test)] mod tests { - use std::{hash::BuildHasherDefault, pin::pin, task::Context, time::Duration}; + use std::{hash::BuildHasherDefault, time::Duration}; - use futures::{task::noop_waker_ref, Future}; use rand::SeedableRng; use rustc_hash::FxHasher; use tokio::time; - use super::{BucketRateLimiter, EndpointRateLimiter, Limiter, Outcome}; - use crate::{ - rate_limiter::{RateBucketInfo, RateLimitAlgorithm}, - EndpointId, - }; - - #[tokio::test] - async fn it_works() { - let config = super::RateLimiterConfig { - algorithm: RateLimitAlgorithm::Fixed, - timeout: Duration::from_secs(1), - initial_limit: 10, - disable: false, - ..Default::default() - }; - let limiter = Limiter::new(config); - - let token = limiter.try_acquire().unwrap(); - - limiter.release(token, Some(Outcome::Success)).await; - - assert_eq!(limiter.state().limit(), 10); - } - - #[tokio::test] - async fn is_fair() { - let config = super::RateLimiterConfig { - algorithm: RateLimitAlgorithm::Fixed, - timeout: Duration::from_secs(1), - initial_limit: 1, - disable: false, - ..Default::default() - }; - let limiter = Limiter::new(config); - - // === TOKEN 1 === - let token1 = limiter.try_acquire().unwrap(); - - let mut token2_fut = pin!(limiter.acquire_timeout(Duration::from_secs(1))); - assert!( - token2_fut - .as_mut() - .poll(&mut Context::from_waker(noop_waker_ref())) - .is_pending(), - "token is 
acquired by token1" - ); - - let mut token3_fut = pin!(limiter.acquire_timeout(Duration::from_secs(1))); - assert!( - token3_fut - .as_mut() - .poll(&mut Context::from_waker(noop_waker_ref())) - .is_pending(), - "token is acquired by token1" - ); - - limiter.release(token1, Some(Outcome::Success)).await; - // === END TOKEN 1 === - - // === TOKEN 2 === - assert!( - limiter.try_acquire().is_none(), - "token is acquired by token2" - ); - - assert!( - token3_fut - .as_mut() - .poll(&mut Context::from_waker(noop_waker_ref())) - .is_pending(), - "token is acquired by token2" - ); - - let token2 = token2_fut.await.unwrap(); - - limiter.release(token2, Some(Outcome::Success)).await; - // === END TOKEN 2 === - - // === TOKEN 3 === - assert!( - limiter.try_acquire().is_none(), - "token is acquired by token3" - ); - - let token3 = token3_fut.await.unwrap(); - limiter.release(token3, Some(Outcome::Success)).await; - // === END TOKEN 3 === - - // === TOKEN 4 === - let token4 = limiter.try_acquire().unwrap(); - limiter.release(token4, Some(Outcome::Success)).await; - } - - #[tokio::test] - async fn disable() { - let config = super::RateLimiterConfig { - algorithm: RateLimitAlgorithm::Fixed, - timeout: Duration::from_secs(1), - initial_limit: 1, - disable: true, - ..Default::default() - }; - let limiter = Limiter::new(config); - - // === TOKEN 1 === - let token1 = limiter.try_acquire().unwrap(); - let token2 = limiter.try_acquire().unwrap(); - let state = limiter.state(); - assert_eq!(state.limit(), 1); - assert_eq!(state.in_flight(), 2); // For disabled limiter, it's expected. 
- limiter.release(token1, None).await; - limiter.release(token2, None).await; - } + use super::{BucketRateLimiter, EndpointRateLimiter}; + use crate::{intern::EndpointIdInt, rate_limiter::RateBucketInfo, EndpointId}; #[test] fn rate_bucket_rpi() { @@ -721,39 +290,40 @@ mod tests { let limiter = EndpointRateLimiter::new(rates); let endpoint = EndpointId::from("ep-my-endpoint-1234"); + let endpoint = EndpointIdInt::from(endpoint); time::pause(); for _ in 0..100 { - assert!(limiter.check(endpoint.clone(), 1)); + assert!(limiter.check(endpoint, 1)); } // more connections fail - assert!(!limiter.check(endpoint.clone(), 1)); + assert!(!limiter.check(endpoint, 1)); // fail even after 500ms as it's in the same bucket time::advance(time::Duration::from_millis(500)).await; - assert!(!limiter.check(endpoint.clone(), 1)); + assert!(!limiter.check(endpoint, 1)); // after a full 1s, 100 requests are allowed again time::advance(time::Duration::from_millis(500)).await; for _ in 1..6 { for _ in 0..50 { - assert!(limiter.check(endpoint.clone(), 2)); + assert!(limiter.check(endpoint, 2)); } time::advance(time::Duration::from_millis(1000)).await; } // more connections after 600 will exceed the 20rps@30s limit - assert!(!limiter.check(endpoint.clone(), 1)); + assert!(!limiter.check(endpoint, 1)); // will still fail before the 30 second limit time::advance(time::Duration::from_millis(30_000 - 6_000 - 1)).await; - assert!(!limiter.check(endpoint.clone(), 1)); + assert!(!limiter.check(endpoint, 1)); // after the full 30 seconds, 100 requests are allowed again time::advance(time::Duration::from_millis(1)).await; for _ in 0..100 { - assert!(limiter.check(endpoint.clone(), 1)); + assert!(limiter.check(endpoint, 1)); } } @@ -773,31 +343,4 @@ mod tests { } assert!(limiter.map.len() < 150_000); } - - #[test] - fn test_default_auth_set() { - // these values used to exceed u32::MAX - assert_eq!( - RateBucketInfo::DEFAULT_AUTH_SET, - [ - RateBucketInfo { - interval: Duration::from_secs(1), - 
max_rpi: 300 * 4096, - }, - RateBucketInfo { - interval: Duration::from_secs(60), - max_rpi: 200 * 4096 * 60, - }, - RateBucketInfo { - interval: Duration::from_secs(600), - max_rpi: 100 * 4096 * 600, - } - ] - ); - - for x in RateBucketInfo::DEFAULT_AUTH_SET { - let y = x.to_string().parse().unwrap(); - assert_eq!(x, y); - } - } } diff --git a/proxy/src/redis/cancellation_publisher.rs b/proxy/src/redis/cancellation_publisher.rs index 422789813c..7baf104374 100644 --- a/proxy/src/redis/cancellation_publisher.rs +++ b/proxy/src/redis/cancellation_publisher.rs @@ -5,7 +5,7 @@ use redis::AsyncCommands; use tokio::sync::Mutex; use uuid::Uuid; -use crate::rate_limiter::{RateBucketInfo, RedisRateLimiter}; +use crate::rate_limiter::{GlobalRateLimiter, RateBucketInfo}; use super::{ connection_with_credentials_provider::ConnectionWithCredentialsProvider, @@ -80,7 +80,7 @@ impl CancellationPublisher for Arc> { pub struct RedisPublisherClient { client: ConnectionWithCredentialsProvider, region_id: String, - limiter: RedisRateLimiter, + limiter: GlobalRateLimiter, } impl RedisPublisherClient { @@ -92,7 +92,7 @@ impl RedisPublisherClient { Ok(Self { client, region_id, - limiter: RedisRateLimiter::new(info), + limiter: GlobalRateLimiter::new(info.into()), }) } diff --git a/proxy/src/redis/connection_with_credentials_provider.rs b/proxy/src/redis/connection_with_credentials_provider.rs index d183abb53a..3a90d911c2 100644 --- a/proxy/src/redis/connection_with_credentials_provider.rs +++ b/proxy/src/redis/connection_with_credentials_provider.rs @@ -77,10 +77,14 @@ impl ConnectionWithCredentialsProvider { } } + async fn ping(con: &mut MultiplexedConnection) -> RedisResult<()> { + redis::cmd("PING").query_async(con).await + } + pub async fn connect(&mut self) -> anyhow::Result<()> { let _guard = self.mutex.lock().await; if let Some(con) = self.con.as_mut() { - match redis::cmd("PING").query_async(con).await { + match Self::ping(con).await { Ok(()) => { return Ok(()); } @@ -96,7 
+100,7 @@ impl ConnectionWithCredentialsProvider { if let Some(f) = self.refresh_token_task.take() { f.abort() } - let con = self + let mut con = self .get_client() .await? .get_multiplexed_tokio_connection() @@ -109,6 +113,14 @@ impl ConnectionWithCredentialsProvider { }); self.refresh_token_task = Some(f); } + match Self::ping(&mut con).await { + Ok(()) => { + info!("Connection succesfully established"); + } + Err(e) => { + error!("Connection is broken. Error during PING: {e:?}"); + } + } self.con = Some(con); Ok(()) } diff --git a/proxy/src/redis/notifications.rs b/proxy/src/redis/notifications.rs index 8b7e3e3419..87d723d17e 100644 --- a/proxy/src/redis/notifications.rs +++ b/proxy/src/redis/notifications.rs @@ -4,6 +4,7 @@ use futures::StreamExt; use pq_proto::CancelKeyData; use redis::aio::PubSub; use serde::{Deserialize, Serialize}; +use tokio_util::sync::CancellationToken; use uuid::Uuid; use super::connection_with_credentials_provider::ConnectionWithCredentialsProvider; @@ -11,7 +12,7 @@ use crate::{ cache::project_info::ProjectInfoCache, cancellation::{CancelMap, CancellationHandler}, intern::{ProjectIdInt, RoleNameInt}, - metrics::{NUM_CANCELLATION_REQUESTS_SOURCE_FROM_REDIS, REDIS_BROKEN_MESSAGES}, + metrics::{Metrics, RedisErrors, RedisEventsCount}, }; const CPLANE_CHANNEL_NAME: &str = "neondb-proxy-ws-updates"; @@ -77,6 +78,16 @@ struct MessageHandler { region_id: String, } +impl Clone for MessageHandler { + fn clone(&self) -> Self { + Self { + cache: self.cache.clone(), + cancellation_handler: self.cancellation_handler.clone(), + region_id: self.region_id.clone(), + } + } +} + impl MessageHandler { pub fn new( cache: Arc, @@ -89,11 +100,11 @@ impl MessageHandler { region_id, } } - pub fn disable_ttl(&self) { - self.cache.disable_ttl(); + pub async fn increment_active_listeners(&self) { + self.cache.increment_active_listeners().await; } - pub fn enable_ttl(&self) { - self.cache.enable_ttl(); + pub async fn decrement_active_listeners(&self) { + 
self.cache.decrement_active_listeners().await; } #[tracing::instrument(skip(self, msg), fields(session_id = tracing::field::Empty))] async fn handle_message(&self, msg: redis::Msg) -> anyhow::Result<()> { @@ -104,9 +115,9 @@ impl MessageHandler { let msg: Notification = match serde_json::from_str(&payload) { Ok(msg) => msg, Err(e) => { - REDIS_BROKEN_MESSAGES - .with_label_values(&[msg.get_channel_name()]) - .inc(); + Metrics::get().proxy.redis_errors_total.inc(RedisErrors { + channel: msg.get_channel_name(), + }); tracing::error!("broken message: {e}"); return Ok(()); } @@ -118,6 +129,10 @@ impl MessageHandler { "session_id", &tracing::field::display(cancel_session.session_id), ); + Metrics::get() + .proxy + .redis_events_count + .inc(RedisEventsCount::CancelSession); if let Some(cancel_region) = cancel_session.region_id { // If the message is not for this region, ignore it. if cancel_region != self.region_id { @@ -138,6 +153,17 @@ impl MessageHandler { } _ => { invalidate_cache(self.cache.clone(), msg.clone()); + if matches!(msg, AllowedIpsUpdate { .. }) { + Metrics::get() + .proxy + .redis_events_count + .inc(RedisEventsCount::AllowedIpsUpdate); + } else if matches!(msg, PasswordUpdate { .. }) { + Metrics::get() + .proxy + .redis_events_count + .inc(RedisEventsCount::PasswordUpdate); + } // It might happen that the invalid entry is on the way to be cached. // To make sure that the entry is invalidated, let's repeat the invalidation in INVALIDATION_LAG seconds. // TODO: include the version (or the timestamp) in the message and invalidate only if the entry is cached before the message. @@ -167,37 +193,24 @@ fn invalidate_cache(cache: Arc, msg: Notification) { } } -/// Handle console's invalidation messages. 
-#[tracing::instrument(name = "console_notifications", skip_all)] -pub async fn task_main( +async fn handle_messages( + handler: MessageHandler, redis: ConnectionWithCredentialsProvider, - cache: Arc, - cancel_map: CancelMap, - region_id: String, -) -> anyhow::Result -where - C: ProjectInfoCache + Send + Sync + 'static, -{ - cache.enable_ttl(); - let handler = MessageHandler::new( - cache, - Arc::new(CancellationHandler::<()>::new( - cancel_map, - NUM_CANCELLATION_REQUESTS_SOURCE_FROM_REDIS, - )), - region_id, - ); - + cancellation_token: CancellationToken, +) -> anyhow::Result<()> { loop { + if cancellation_token.is_cancelled() { + return Ok(()); + } let mut conn = match try_connect(&redis).await { Ok(conn) => { - handler.disable_ttl(); + handler.increment_active_listeners().await; conn } Err(e) => { tracing::error!( - "failed to connect to redis: {e}, will try to reconnect in {RECONNECT_TIMEOUT:#?}" - ); + "failed to connect to redis: {e}, will try to reconnect in {RECONNECT_TIMEOUT:#?}" + ); tokio::time::sleep(RECONNECT_TIMEOUT).await; continue; } @@ -211,8 +224,47 @@ where break; } } + if cancellation_token.is_cancelled() { + handler.decrement_active_listeners().await; + return Ok(()); + } } - handler.enable_ttl(); + handler.decrement_active_listeners().await; + } +} + +/// Handle console's invalidation messages. +#[tracing::instrument(name = "redis_notifications", skip_all)] +pub async fn task_main( + redis: ConnectionWithCredentialsProvider, + cache: Arc, + cancel_map: CancelMap, + region_id: String, +) -> anyhow::Result +where + C: ProjectInfoCache + Send + Sync + 'static, +{ + let cancellation_handler = Arc::new(CancellationHandler::<()>::new( + cancel_map, + crate::metrics::CancellationSource::FromRedis, + )); + let handler = MessageHandler::new(cache, cancellation_handler, region_id); + // 6h - 1m. + // There will be 1 minute overlap between two tasks. But at least we can be sure that no message is lost. 
+ let mut interval = tokio::time::interval(std::time::Duration::from_secs(6 * 60 * 60 - 60)); + loop { + let cancellation_token = CancellationToken::new(); + interval.tick().await; + + tokio::spawn(handle_messages( + handler.clone(), + redis.clone(), + cancellation_token.clone(), + )); + tokio::spawn(async move { + tokio::time::sleep(std::time::Duration::from_secs(6 * 60 * 60)).await; // 6h. + cancellation_token.cancel(); + }); } } diff --git a/proxy/src/serverless.rs b/proxy/src/serverless.rs index a2010fd613..1a0d1f7b0e 100644 --- a/proxy/src/serverless.rs +++ b/proxy/src/serverless.rs @@ -4,42 +4,47 @@ mod backend; mod conn_pool; +mod http_util; mod json; mod sql_over_http; -pub mod tls_listener; mod websocket; +use atomic_take::AtomicTake; +use bytes::Bytes; pub use conn_pool::GlobalConnPoolOptions; -use anyhow::bail; -use hyper::StatusCode; -use metrics::IntCounterPairGuard; +use anyhow::Context; +use futures::future::{select, Either}; +use futures::TryFutureExt; +use http::{Method, Response, StatusCode}; +use http_body_util::Full; +use hyper1::body::Incoming; +use hyper_util::rt::TokioExecutor; +use hyper_util::server::conn::auto::Builder; use rand::rngs::StdRng; use rand::SeedableRng; pub use reqwest_middleware::{ClientWithMiddleware, Error}; pub use reqwest_retry::{policies::ExponentialBackoff, RetryTransientMiddleware}; +use tokio::time::timeout; +use tokio_rustls::TlsAcceptor; use tokio_util::task::TaskTracker; -use tracing::instrument::Instrumented; use crate::cancellation::CancellationHandlerMain; use crate::config::ProxyConfig; use crate::context::RequestMonitoring; -use crate::protocol2::{ProxyProtocolAccept, WithClientIp, WithConnectionGuard}; -use crate::rate_limiter::EndpointRateLimiter; +use crate::metrics::Metrics; +use crate::protocol2::read_proxy_protocol; +use crate::proxy::run_until_cancelled; use crate::serverless::backend::PoolingBackend; -use hyper::{ - server::conn::{AddrIncoming, AddrStream}, - Body, Method, Request, Response, -}; +use 
crate::serverless::http_util::{api_error_into_response, json_response}; -use std::net::IpAddr; +use std::net::{IpAddr, SocketAddr}; +use std::pin::pin; use std::sync::Arc; -use std::task::Poll; -use tls_listener::TlsListener; -use tokio::net::TcpListener; -use tokio_util::sync::{CancellationToken, DropGuard}; +use tokio::net::{TcpListener, TcpStream}; +use tokio_util::sync::CancellationToken; use tracing::{error, info, warn, Instrument}; -use utils::http::{error::ApiError, json::json_response}; +use utils::http::error::ApiError; pub const SERVERLESS_DRIVER_SNI: &str = "api"; @@ -47,7 +52,6 @@ pub async fn task_main( config: &'static ProxyConfig, ws_listener: TcpListener, cancellation_token: CancellationToken, - endpoint_rate_limiter: Arc, cancellation_handler: Arc, ) -> anyhow::Result<()> { scopeguard::defer! { @@ -91,161 +95,178 @@ pub async fn task_main( tls_server_config.alpn_protocols = vec![b"h2".to_vec(), b"http/1.1".to_vec()]; let tls_acceptor: tokio_rustls::TlsAcceptor = Arc::new(tls_server_config).into(); - let mut addr_incoming = AddrIncoming::from_listener(ws_listener)?; - let _ = addr_incoming.set_nodelay(true); - let addr_incoming = ProxyProtocolAccept { - incoming: addr_incoming, - protocol: "http", - }; + let connections = tokio_util::task::task_tracker::TaskTracker::new(); + connections.close(); // allows `connections.wait to complete` - let ws_connections = tokio_util::task::task_tracker::TaskTracker::new(); - ws_connections.close(); // allows `ws_connections.wait to complete` + let server = Builder::new(hyper_util::rt::TokioExecutor::new()); - let tls_listener = TlsListener::new(tls_acceptor, addr_incoming, config.handshake_timeout); + while let Some(res) = run_until_cancelled(ws_listener.accept(), &cancellation_token).await { + let (conn, peer_addr) = res.context("could not accept TCP stream")?; + if let Err(e) = conn.set_nodelay(true) { + tracing::error!("could not set nodelay: {e}"); + continue; + } + let conn_id = uuid::Uuid::new_v4(); + let 
http_conn_span = tracing::info_span!("http_conn", ?conn_id); - let make_svc = hyper::service::make_service_fn( - |stream: &tokio_rustls::server::TlsStream< - WithConnectionGuard>, - >| { - let (conn, _) = stream.get_ref(); + connections.spawn( + connection_handler( + config, + backend.clone(), + connections.clone(), + cancellation_handler.clone(), + cancellation_token.clone(), + server.clone(), + tls_acceptor.clone(), + conn, + peer_addr, + ) + .instrument(http_conn_span), + ); + } - // this is jank. should dissapear with hyper 1.0 migration. - let gauge = conn - .gauge - .lock() - .expect("lock should not be poisoned") - .take() - .expect("gauge should be set on connection start"); - - // Cancel all current inflight HTTP requests if the HTTP connection is closed. - let http_cancellation_token = CancellationToken::new(); - let cancel_connection = http_cancellation_token.clone().drop_guard(); - - let span = conn.span.clone(); - let client_addr = conn.inner.client_addr(); - let remote_addr = conn.inner.inner.remote_addr(); - let backend = backend.clone(); - let ws_connections = ws_connections.clone(); - let endpoint_rate_limiter = endpoint_rate_limiter.clone(); - let cancellation_handler = cancellation_handler.clone(); - async move { - let peer_addr = match client_addr { - Some(addr) => addr, - None if config.require_client_ip => bail!("missing required client ip"), - None => remote_addr, - }; - Ok(MetricService::new( - hyper::service::service_fn(move |req: Request| { - let backend = backend.clone(); - let ws_connections2 = ws_connections.clone(); - let endpoint_rate_limiter = endpoint_rate_limiter.clone(); - let cancellation_handler = cancellation_handler.clone(); - let http_cancellation_token = http_cancellation_token.child_token(); - - // `request_handler` is not cancel safe. It expects to be cancelled only at specific times. - // By spawning the future, we ensure it never gets cancelled until it decides to. 
- ws_connections.spawn( - async move { - // Cancel the current inflight HTTP request if the requets stream is closed. - // This is slightly different to `_cancel_connection` in that - // h2 can cancel individual requests with a `RST_STREAM`. - let _cancel_session = http_cancellation_token.clone().drop_guard(); - - let res = request_handler( - req, - config, - backend, - ws_connections2, - cancellation_handler, - peer_addr.ip(), - endpoint_rate_limiter, - http_cancellation_token, - ) - .await - .map_or_else(|e| e.into_response(), |r| r); - - _cancel_session.disarm(); - - res - } - .in_current_span(), - ) - }), - gauge, - cancel_connection, - span, - )) - } - }, - ); - - hyper::Server::builder(tls_listener) - .serve(make_svc) - .with_graceful_shutdown(cancellation_token.cancelled()) - .await?; - - // await websocket connections - ws_connections.wait().await; + connections.wait().await; Ok(()) } -struct MetricService { - inner: S, - _gauge: IntCounterPairGuard, - _cancel: DropGuard, - span: tracing::Span, -} +/// Handles the TCP lifecycle. +/// +/// 1. Parses PROXY protocol V2 +/// 2. Handles TLS handshake +/// 3. Handles HTTP connection +/// 1. With graceful shutdowns +/// 2. With graceful request cancellation with connection failure +/// 3. With websocket upgrade support. 
+#[allow(clippy::too_many_arguments)] +async fn connection_handler( + config: &'static ProxyConfig, + backend: Arc, + connections: TaskTracker, + cancellation_handler: Arc, + cancellation_token: CancellationToken, + server: Builder, + tls_acceptor: TlsAcceptor, + conn: TcpStream, + peer_addr: SocketAddr, +) { + let session_id = uuid::Uuid::new_v4(); -impl MetricService { - fn new( - inner: S, - _gauge: IntCounterPairGuard, - _cancel: DropGuard, - span: tracing::Span, - ) -> MetricService { - MetricService { - inner, - _gauge, - _cancel, - span, + let _gauge = Metrics::get() + .proxy + .client_connections + .guard(crate::metrics::Protocol::Http); + + // handle PROXY protocol + let (conn, peer) = match read_proxy_protocol(conn).await { + Ok(c) => c, + Err(e) => { + tracing::error!(?session_id, %peer_addr, "failed to accept TCP connection: invalid PROXY protocol V2 header: {e:#}"); + return; } - } -} + }; -impl hyper::service::Service> for MetricService -where - S: hyper::service::Service>, -{ - type Response = S::Response; - type Error = S::Error; - type Future = Instrumented; + let peer_addr = peer.unwrap_or(peer_addr).ip(); + let has_private_peer_addr = match peer_addr { + IpAddr::V4(ip) => ip.is_private(), + _ => false, + }; + info!(?session_id, %peer_addr, "accepted new TCP connection"); - fn poll_ready(&mut self, cx: &mut std::task::Context<'_>) -> Poll> { - self.inner.poll_ready(cx) - } + // try upgrade to TLS, but with a timeout. 
+ let conn = match timeout(config.handshake_timeout, tls_acceptor.accept(conn)).await { + Ok(Ok(conn)) => { + info!(?session_id, %peer_addr, "accepted new TLS connection"); + conn + } + // The handshake failed + Ok(Err(e)) => { + if !has_private_peer_addr { + Metrics::get().proxy.tls_handshake_failures.inc(); + } + warn!(?session_id, %peer_addr, "failed to accept TLS connection: {e:?}"); + return; + } + // The handshake timed out + Err(e) => { + if !has_private_peer_addr { + Metrics::get().proxy.tls_handshake_failures.inc(); + } + warn!(?session_id, %peer_addr, "failed to accept TLS connection: {e:?}"); + return; + } + }; - fn call(&mut self, req: Request) -> Self::Future { - self.span - .in_scope(|| self.inner.call(req)) - .instrument(self.span.clone()) + let session_id = AtomicTake::new(session_id); + + // Cancel all current inflight HTTP requests if the HTTP connection is closed. + let http_cancellation_token = CancellationToken::new(); + let _cancel_connection = http_cancellation_token.clone().drop_guard(); + + let conn = server.serve_connection_with_upgrades( + hyper_util::rt::TokioIo::new(conn), + hyper1::service::service_fn(move |req: hyper1::Request| { + // First HTTP request shares the same session ID + let session_id = session_id.take().unwrap_or_else(uuid::Uuid::new_v4); + + // Cancel the current inflight HTTP request if the requets stream is closed. + // This is slightly different to `_cancel_connection` in that + // h2 can cancel individual requests with a `RST_STREAM`. + let http_request_token = http_cancellation_token.child_token(); + let cancel_request = http_request_token.clone().drop_guard(); + + // `request_handler` is not cancel safe. It expects to be cancelled only at specific times. + // By spawning the future, we ensure it never gets cancelled until it decides to. 
+ let handler = connections.spawn( + request_handler( + req, + config, + backend.clone(), + connections.clone(), + cancellation_handler.clone(), + session_id, + peer_addr, + http_request_token, + ) + .in_current_span() + .map_ok_or_else(api_error_into_response, |r| r), + ); + + async move { + let res = handler.await; + cancel_request.disarm(); + res + } + }), + ); + + // On cancellation, trigger the HTTP connection handler to shut down. + let res = match select(pin!(cancellation_token.cancelled()), pin!(conn)).await { + Either::Left((_cancelled, mut conn)) => { + conn.as_mut().graceful_shutdown(); + conn.await + } + Either::Right((res, _)) => res, + }; + + match res { + Ok(()) => tracing::info!(%peer_addr, "HTTP connection closed"), + Err(e) => tracing::warn!(%peer_addr, "HTTP connection error {e}"), } } #[allow(clippy::too_many_arguments)] async fn request_handler( - mut request: Request, + mut request: hyper1::Request, config: &'static ProxyConfig, backend: Arc, ws_connections: TaskTracker, cancellation_handler: Arc, + session_id: uuid::Uuid, peer_addr: IpAddr, - endpoint_rate_limiter: Arc, // used to cancel in-flight HTTP requests. not used to cancel websockets http_cancellation_token: CancellationToken, -) -> Result, ApiError> { - let session_id = uuid::Uuid::new_v4(); - +) -> Result>, ApiError> { let host = request .headers() .get("host") @@ -255,7 +276,13 @@ async fn request_handler( // Check if the request is a websocket upgrade request. 
if hyper_tungstenite::is_upgrade_request(&request) { - let ctx = RequestMonitoring::new(session_id, peer_addr, "ws", &config.region); + let ctx = RequestMonitoring::new( + session_id, + peer_addr, + crate::metrics::Protocol::Ws, + &config.region, + ); + let span = ctx.span.clone(); info!(parent: &span, "performing websocket upgrade"); @@ -264,15 +291,9 @@ async fn request_handler( ws_connections.spawn( async move { - if let Err(e) = websocket::serve_websocket( - config, - ctx, - websocket, - cancellation_handler, - host, - endpoint_rate_limiter, - ) - .await + if let Err(e) = + websocket::serve_websocket(config, ctx, websocket, cancellation_handler, host) + .await { error!("error in websocket connection: {e:#}"); } @@ -282,14 +303,19 @@ async fn request_handler( // Return the response so the spawned future can continue. Ok(response) - } else if request.uri().path() == "/sql" && request.method() == Method::POST { - let ctx = RequestMonitoring::new(session_id, peer_addr, "http", &config.region); + } else if request.uri().path() == "/sql" && *request.method() == Method::POST { + let ctx = RequestMonitoring::new( + session_id, + peer_addr, + crate::metrics::Protocol::Http, + &config.region, + ); let span = ctx.span.clone(); sql_over_http::handle(config, ctx, request, backend, http_cancellation_token) .instrument(span) .await - } else if request.uri().path() == "/sql" && request.method() == Method::OPTIONS { + } else if request.uri().path() == "/sql" && *request.method() == Method::OPTIONS { Response::builder() .header("Allow", "OPTIONS, POST") .header("Access-Control-Allow-Origin", "*") @@ -299,7 +325,7 @@ async fn request_handler( ) .header("Access-Control-Max-Age", "86400" /* 24 hours */) .status(StatusCode::OK) // 204 is also valid, but see: https://developer.mozilla.org/en-US/docs/Web/HTTP/Methods/OPTIONS#status_code - .body(Body::empty()) + .body(Full::new(Bytes::new())) .map_err(|e| ApiError::InternalServerError(e.into())) } else { 
json_response(StatusCode::BAD_REQUEST, "query is not supported") diff --git a/proxy/src/serverless/backend.rs b/proxy/src/serverless/backend.rs index 8aa5ad4e8a..963913a260 100644 --- a/proxy/src/serverless/backend.rs +++ b/proxy/src/serverless/backend.rs @@ -6,14 +6,16 @@ use tracing::{field::display, info}; use crate::{ auth::{backend::ComputeCredentials, check_peer_addr_is_in_list, AuthError}, compute, - config::ProxyConfig, + config::{AuthenticationConfig, ProxyConfig}, console::{ errors::{GetAuthInfoError, WakeComputeError}, + locks::ApiLocks, CachedNodeInfo, }, context::RequestMonitoring, error::{ErrorKind, ReportableError, UserFacingError}, - proxy::connect_compute::ConnectMechanism, + proxy::{connect_compute::ConnectMechanism, retry::ShouldRetry}, + Host, }; use super::conn_pool::{poll_client, Client, ConnInfo, GlobalConnPool}; @@ -27,6 +29,7 @@ impl PoolingBackend { pub async fn authenticate( &self, ctx: &mut RequestMonitoring, + config: &AuthenticationConfig, conn_info: &ConnInfo, ) -> Result { let user_info = conn_info.user_info.clone(); @@ -43,6 +46,7 @@ impl PoolingBackend { let secret = match cached_secret.value.clone() { Some(secret) => self.config.authentication_config.check_rate_limit( ctx, + config, secret, &user_info.endpoint, true, @@ -103,9 +107,12 @@ impl PoolingBackend { conn_id, conn_info, pool: self.pool.clone(), + locks: &self.config.connect_compute_locks, }, &backend, false, // do not allow self signed compute for http flow + self.config.wake_compute_retry_config, + self.config.connect_to_compute_retry_config, ) .await } @@ -150,16 +157,31 @@ impl UserFacingError for HttpConnError { } } +impl ShouldRetry for HttpConnError { + fn could_retry(&self) -> bool { + match self { + HttpConnError::ConnectionError(e) => e.could_retry(), + HttpConnError::ConnectionClosedAbruptly(_) => false, + HttpConnError::GetAuthInfo(_) => false, + HttpConnError::AuthError(_) => false, + HttpConnError::WakeCompute(_) => false, + } + } +} + struct TokioMechanism { 
pool: Arc>, conn_info: ConnInfo, conn_id: uuid::Uuid, + + /// connect_to_compute concurrency lock + locks: &'static ApiLocks, } #[async_trait] impl ConnectMechanism for TokioMechanism { type Connection = Client; - type ConnectError = tokio_postgres::Error; + type ConnectError = HttpConnError; type Error = HttpConnError; async fn connect_once( @@ -168,6 +190,9 @@ impl ConnectMechanism for TokioMechanism { node_info: &CachedNodeInfo, timeout: Duration, ) -> Result { + let host = node_info.config.get_host()?; + let permit = self.locks.get_permit(&host).await?; + let mut config = (*node_info.config).clone(); let config = config .user(&self.conn_info.user_info.user) @@ -175,7 +200,10 @@ impl ConnectMechanism for TokioMechanism { .dbname(&self.conn_info.dbname) .connect_timeout(timeout); + let pause = ctx.latency_timer.pause(crate::metrics::Waiting::Compute); let (client, connection) = config.connect(tokio_postgres::NoTls).await?; + drop(pause); + drop(permit); tracing::Span::current().record("pid", &tracing::field::display(client.get_process_id())); Ok(poll_client( diff --git a/proxy/src/serverless/conn_pool.rs b/proxy/src/serverless/conn_pool.rs index 35311facb8..798e488509 100644 --- a/proxy/src/serverless/conn_pool.rs +++ b/proxy/src/serverless/conn_pool.rs @@ -1,6 +1,5 @@ use dashmap::DashMap; use futures::{future::poll_fn, Future}; -use metrics::IntCounterPairGuard; use parking_lot::RwLock; use rand::Rng; use smallvec::SmallVec; @@ -16,13 +15,13 @@ use std::{ use tokio::time::Instant; use tokio_postgres::tls::NoTlsStream; use tokio_postgres::{AsyncMessage, ReadyForQueryStatus, Socket}; +use tokio_util::sync::CancellationToken; use crate::console::messages::{ColdStartInfo, MetricsAuxInfo}; -use crate::metrics::{ENDPOINT_POOLS, GC_LATENCY, NUM_OPEN_CLIENTS_IN_HTTP_POOL}; +use crate::metrics::{HttpEndpointPoolsGuard, Metrics}; use crate::usage_metrics::{Ids, MetricCounter, USAGE_METRICS}; use crate::{ - auth::backend::ComputeUserInfo, context::RequestMonitoring, 
metrics::NUM_DB_CONNECTIONS_GAUGE, - DbName, EndpointCacheKey, RoleName, + auth::backend::ComputeUserInfo, context::RequestMonitoring, DbName, EndpointCacheKey, RoleName, }; use tracing::{debug, error, warn, Span}; @@ -78,7 +77,7 @@ pub struct EndpointConnPool { pools: HashMap<(DbName, RoleName), DbUserConnPool>, total_conns: usize, max_conns: usize, - _guard: IntCounterPairGuard, + _guard: HttpEndpointPoolsGuard<'static>, global_connections_count: Arc, global_pool_size_max_conns: usize, } @@ -110,7 +109,11 @@ impl EndpointConnPool { let removed = old_len - new_len; if removed > 0 { global_connections_count.fetch_sub(removed, atomic::Ordering::Relaxed); - NUM_OPEN_CLIENTS_IN_HTTP_POOL.sub(removed as i64); + Metrics::get() + .proxy + .http_pool_opened_connections + .get_metric() + .dec_by(removed as i64); } *total_conns -= removed; removed > 0 @@ -156,7 +159,11 @@ impl EndpointConnPool { pool.total_conns += 1; pool.global_connections_count .fetch_add(1, atomic::Ordering::Relaxed); - NUM_OPEN_CLIENTS_IN_HTTP_POOL.inc(); + Metrics::get() + .proxy + .http_pool_opened_connections + .get_metric() + .inc(); } pool.total_conns @@ -176,7 +183,11 @@ impl Drop for EndpointConnPool { if self.total_conns > 0 { self.global_connections_count .fetch_sub(self.total_conns, atomic::Ordering::Relaxed); - NUM_OPEN_CLIENTS_IN_HTTP_POOL.sub(self.total_conns as i64); + Metrics::get() + .proxy + .http_pool_opened_connections + .get_metric() + .dec_by(self.total_conns as i64); } } } @@ -215,7 +226,11 @@ impl DbUserConnPool { removed += 1; } global_connections_count.fetch_sub(removed, atomic::Ordering::Relaxed); - NUM_OPEN_CLIENTS_IN_HTTP_POOL.sub(removed as i64); + Metrics::get() + .proxy + .http_pool_opened_connections + .get_metric() + .dec_by(removed as i64); conn } } @@ -303,7 +318,10 @@ impl GlobalConnPool { // acquire a random shard lock let mut shard = self.global_pool.shards()[shard].write(); - let timer = GC_LATENCY.start_timer(); + let timer = Metrics::get() + .proxy + 
.http_pool_reclaimation_lag_seconds + .start_timer(); let current_len = shard.len(); let mut clients_removed = 0; shard.retain(|endpoint, x| { @@ -331,7 +349,7 @@ impl GlobalConnPool { let new_len = shard.len(); drop(shard); - timer.observe_duration(); + timer.observe(); // Do logging outside of the lock. if clients_removed > 0 { @@ -339,7 +357,11 @@ impl GlobalConnPool { .global_connections_count .fetch_sub(clients_removed, atomic::Ordering::Relaxed) - clients_removed; - NUM_OPEN_CLIENTS_IN_HTTP_POOL.sub(clients_removed as i64); + Metrics::get() + .proxy + .http_pool_opened_connections + .get_metric() + .dec_by(clients_removed as i64); info!("pool: performed global pool gc. removed {clients_removed} clients, total number of clients in pool is {size}"); } let removed = current_len - new_len; @@ -410,7 +432,7 @@ impl GlobalConnPool { pools: HashMap::new(), total_conns: 0, max_conns: self.config.pool_options.max_conns_per_endpoint, - _guard: ENDPOINT_POOLS.guard(), + _guard: Metrics::get().proxy.http_endpoint_pools.guard(), global_connections_count: self.global_connections_count.clone(), global_pool_size_max_conns: self.config.pool_options.max_total_conns, })); @@ -450,9 +472,7 @@ pub fn poll_client( conn_id: uuid::Uuid, aux: MetricsAuxInfo, ) -> Client { - let conn_gauge = NUM_DB_CONNECTIONS_GAUGE - .with_label_values(&[ctx.protocol]) - .guard(); + let conn_gauge = Metrics::get().proxy.db_connections.guard(ctx.protocol); let mut session_id = ctx.session_id; let (tx, mut rx) = tokio::sync::watch::channel(session_id); @@ -469,15 +489,32 @@ pub fn poll_client( let db_user = conn_info.db_and_user(); let idle = global_pool.get_idle_timeout(); + let cancel = CancellationToken::new(); + let cancelled = cancel.clone().cancelled_owned(); + tokio::spawn( async move { let _conn_gauge = conn_gauge; let mut idle_timeout = pin!(tokio::time::sleep(idle)); + let mut cancelled = pin!(cancelled); + poll_fn(move |cx| { - if matches!(rx.has_changed(), Ok(true)) { - session_id = 
*rx.borrow_and_update(); - info!(%session_id, "changed session"); - idle_timeout.as_mut().reset(Instant::now() + idle); + if cancelled.as_mut().poll(cx).is_ready() { + info!("connection dropped"); + return Poll::Ready(()) + } + + match rx.has_changed() { + Ok(true) => { + session_id = *rx.borrow_and_update(); + info!(%session_id, "changed session"); + idle_timeout.as_mut().reset(Instant::now() + idle); + } + Err(_) => { + info!("connection dropped"); + return Poll::Ready(()) + } + _ => {} } // 5 minute idle connection timeout @@ -532,6 +569,7 @@ pub fn poll_client( let inner = ClientInner { inner: client, session: tx, + cancel, aux, conn_id, }; @@ -541,10 +579,18 @@ pub fn poll_client( struct ClientInner { inner: C, session: tokio::sync::watch::Sender, + cancel: CancellationToken, aux: MetricsAuxInfo, conn_id: uuid::Uuid, } +impl Drop for ClientInner { + fn drop(&mut self) { + // on client drop, tell the conn to shut down + self.cancel.cancel(); + } +} + pub trait ClientInnerExt: Sync + Send + 'static { fn is_closed(&self) -> bool; fn get_process_id(&self) -> i32; @@ -697,6 +743,7 @@ mod tests { ClientInner { inner: client, session: tokio::sync::watch::Sender::new(uuid::Uuid::new_v4()), + cancel: CancellationToken::new(), aux: MetricsAuxInfo { endpoint_id: (&EndpointId::from("endpoint")).into(), project_id: (&ProjectId::from("project")).into(), diff --git a/proxy/src/serverless/http_util.rs b/proxy/src/serverless/http_util.rs new file mode 100644 index 0000000000..ab9127b13e --- /dev/null +++ b/proxy/src/serverless/http_util.rs @@ -0,0 +1,92 @@ +//! Things stolen from `libs/utils/src/http` to add hyper 1.0 compatibility +//! Will merge back in at some point in the future. 
+ +use bytes::Bytes; + +use anyhow::Context; +use http::{Response, StatusCode}; +use http_body_util::Full; + +use serde::Serialize; +use utils::http::error::ApiError; + +/// Like [`ApiError::into_response`] +pub fn api_error_into_response(this: ApiError) -> Response> { + match this { + ApiError::BadRequest(err) => HttpErrorBody::response_from_msg_and_status( + format!("{err:#?}"), // use debug printing so that we give the cause + StatusCode::BAD_REQUEST, + ), + ApiError::Forbidden(_) => { + HttpErrorBody::response_from_msg_and_status(this.to_string(), StatusCode::FORBIDDEN) + } + ApiError::Unauthorized(_) => { + HttpErrorBody::response_from_msg_and_status(this.to_string(), StatusCode::UNAUTHORIZED) + } + ApiError::NotFound(_) => { + HttpErrorBody::response_from_msg_and_status(this.to_string(), StatusCode::NOT_FOUND) + } + ApiError::Conflict(_) => { + HttpErrorBody::response_from_msg_and_status(this.to_string(), StatusCode::CONFLICT) + } + ApiError::PreconditionFailed(_) => HttpErrorBody::response_from_msg_and_status( + this.to_string(), + StatusCode::PRECONDITION_FAILED, + ), + ApiError::ShuttingDown => HttpErrorBody::response_from_msg_and_status( + "Shutting down".to_string(), + StatusCode::SERVICE_UNAVAILABLE, + ), + ApiError::ResourceUnavailable(err) => HttpErrorBody::response_from_msg_and_status( + err.to_string(), + StatusCode::SERVICE_UNAVAILABLE, + ), + ApiError::Timeout(err) => HttpErrorBody::response_from_msg_and_status( + err.to_string(), + StatusCode::REQUEST_TIMEOUT, + ), + ApiError::InternalServerError(err) => HttpErrorBody::response_from_msg_and_status( + err.to_string(), + StatusCode::INTERNAL_SERVER_ERROR, + ), + } +} + +/// Same as [`utils::http::error::HttpErrorBody`] +#[derive(Serialize)] +struct HttpErrorBody { + pub msg: String, +} + +impl HttpErrorBody { + /// Same as [`utils::http::error::HttpErrorBody::response_from_msg_and_status`] + fn response_from_msg_and_status(msg: String, status: StatusCode) -> Response> { + HttpErrorBody { msg 
}.to_response(status) + } + + /// Same as [`utils::http::error::HttpErrorBody::to_response`] + fn to_response(&self, status: StatusCode) -> Response> { + Response::builder() + .status(status) + .header(http::header::CONTENT_TYPE, "application/json") + // we do not have nested maps with non string keys so serialization shouldn't fail + .body(Full::new(Bytes::from(serde_json::to_string(self).unwrap()))) + .unwrap() + } +} + +/// Same as [`utils::http::json::json_response`] +pub fn json_response( + status: StatusCode, + data: T, +) -> Result>, ApiError> { + let json = serde_json::to_string(&data) + .context("Failed to serialize JSON response") + .map_err(ApiError::InternalServerError)?; + let response = Response::builder() + .status(status) + .header(http::header::CONTENT_TYPE, "application/json") + .body(Full::new(Bytes::from(json))) + .map_err(|e| ApiError::InternalServerError(e.into()))?; + Ok(response) +} diff --git a/proxy/src/serverless/sql_over_http.rs b/proxy/src/serverless/sql_over_http.rs index 00dffd5784..e856053a7e 100644 --- a/proxy/src/serverless/sql_over_http.rs +++ b/proxy/src/serverless/sql_over_http.rs @@ -1,18 +1,22 @@ use std::pin::pin; use std::sync::Arc; +use bytes::Bytes; use futures::future::select; use futures::future::try_join; use futures::future::Either; use futures::StreamExt; use futures::TryFutureExt; -use hyper::body::HttpBody; -use hyper::header; -use hyper::http::HeaderName; -use hyper::http::HeaderValue; -use hyper::Response; -use hyper::StatusCode; -use hyper::{Body, HeaderMap, Request}; +use http_body_util::BodyExt; +use http_body_util::Full; +use hyper1::body::Body; +use hyper1::body::Incoming; +use hyper1::header; +use hyper1::http::HeaderName; +use hyper1::http::HeaderValue; +use hyper1::Response; +use hyper1::StatusCode; +use hyper1::{HeaderMap, Request}; use serde_json::json; use serde_json::Value; use tokio::time; @@ -29,7 +33,6 @@ use tracing::error; use tracing::info; use url::Url; use utils::http::error::ApiError; -use 
utils::http::json::json_response; use crate::auth::backend::ComputeUserInfo; use crate::auth::endpoint_sni; @@ -40,8 +43,8 @@ use crate::context::RequestMonitoring; use crate::error::ErrorKind; use crate::error::ReportableError; use crate::error::UserFacingError; -use crate::metrics::HTTP_CONTENT_LENGTH; -use crate::metrics::NUM_CONNECTION_REQUESTS_GAUGE; +use crate::metrics::HttpDirection; +use crate::metrics::Metrics; use crate::proxy::run_until_cancelled; use crate::proxy::NeonOptions; use crate::serverless::backend::HttpConnError; @@ -52,6 +55,7 @@ use crate::RoleName; use super::backend::PoolingBackend; use super::conn_pool::Client; use super::conn_pool::ConnInfo; +use super::http_util::json_response; use super::json::json_to_pg_text; use super::json::pg_text_row_to_json; use super::json::JsonConversionError; @@ -218,10 +222,10 @@ fn get_conn_info( pub async fn handle( config: &'static ProxyConfig, mut ctx: RequestMonitoring, - request: Request, + request: Request, backend: Arc, cancel: CancellationToken, -) -> Result, ApiError> { +) -> Result>, ApiError> { let result = handle_inner(cancel, config, &mut ctx, request, backend).await; let mut response = match result { @@ -332,10 +336,9 @@ pub async fn handle( } }; - response.headers_mut().insert( - "Access-Control-Allow-Origin", - hyper::http::HeaderValue::from_static("*"), - ); + response + .headers_mut() + .insert("Access-Control-Allow-Origin", HeaderValue::from_static("*")); Ok(response) } @@ -396,7 +399,7 @@ impl UserFacingError for SqlOverHttpError { #[derive(Debug, thiserror::Error)] pub enum ReadPayloadError { #[error("could not read the HTTP request body: {0}")] - Read(#[from] hyper::Error), + Read(#[from] hyper1::Error), #[error("could not parse the HTTP request body: {0}")] Parse(#[from] serde_json::Error), } @@ -437,7 +440,7 @@ struct HttpHeaders { } impl HttpHeaders { - fn try_parse(headers: &hyper::http::HeaderMap) -> Result { + fn try_parse(headers: &hyper1::http::HeaderMap) -> Result { // 
Determine the output options. Default behaviour is 'false'. Anything that is not // strictly 'true' assumed to be false. let raw_output = headers.get(&RAW_TEXT_OUTPUT) == Some(&HEADER_VALUE_TRUE); @@ -488,13 +491,14 @@ async fn handle_inner( cancel: CancellationToken, config: &'static ProxyConfig, ctx: &mut RequestMonitoring, - request: Request, + request: Request, backend: Arc, -) -> Result, SqlOverHttpError> { - let _request_gauge = NUM_CONNECTION_REQUESTS_GAUGE - .with_label_values(&[ctx.protocol]) - .guard(); - info!("handling interactive connection from client"); +) -> Result>, SqlOverHttpError> { + let _requeset_gauge = Metrics::get().proxy.connection_requests.guard(ctx.protocol); + info!( + protocol = %ctx.protocol, + "handling interactive connection from client" + ); // // Determine the destination and connection params @@ -517,9 +521,10 @@ async fn handle_inner( None => MAX_REQUEST_SIZE + 1, }; info!(request_content_length, "request size in bytes"); - HTTP_CONTENT_LENGTH - .with_label_values(&["request"]) - .observe(request_content_length as f64); + Metrics::get() + .proxy + .http_conn_content_length_bytes + .observe(HttpDirection::Request, request_content_length as f64); // we don't have a streaming request support yet so this is to prevent OOM // from a malicious user sending an extremely large request body @@ -528,7 +533,7 @@ async fn handle_inner( } let fetch_and_process_request = async { - let body = hyper::body::to_bytes(request.into_body()).await?; + let body = request.into_body().collect().await?.to_bytes(); info!(length = body.len(), "request payload read"); let payload: Payload = serde_json::from_slice(&body)?; Ok::(payload) // Adjust error type accordingly @@ -536,7 +541,9 @@ async fn handle_inner( .map_err(SqlOverHttpError::from); let authenticate_and_connect = async { - let keys = backend.authenticate(ctx, &conn_info).await?; + let keys = backend + .authenticate(ctx, &config.authentication_config, &conn_info) + .await?; let client = backend 
.connect_to_compute(ctx, conn_info, keys, !allow_pool) .await?; @@ -596,7 +603,7 @@ async fn handle_inner( let body = serde_json::to_string(&result).expect("json serialization should not fail"); let len = body.len(); let response = response - .body(Body::from(body)) + .body(Full::new(Bytes::from(body))) // only fails if invalid status code or invalid header/values are given. // these are not user configurable so it cannot fail dynamically .expect("building response payload should not fail"); @@ -604,9 +611,10 @@ async fn handle_inner( // count the egress bytes - we miss the TLS and header overhead but oh well... // moving this later in the stack is going to be a lot of effort and ehhhh metrics.record_egress(len as u64); - HTTP_CONTENT_LENGTH - .with_label_values(&["response"]) - .observe(len as f64); + Metrics::get() + .proxy + .http_conn_content_length_bytes + .observe(HttpDirection::Response, len as f64); Ok(response) } @@ -639,6 +647,7 @@ impl QueryData { } // The query was cancelled. Either::Right((_cancelled, query)) => { + tracing::info!("cancelling query"); if let Err(err) = cancel_token.cancel_query(NoTls).await { tracing::error!(?err, "could not cancel query"); } diff --git a/proxy/src/serverless/tls_listener.rs b/proxy/src/serverless/tls_listener.rs deleted file mode 100644 index 33f194dd59..0000000000 --- a/proxy/src/serverless/tls_listener.rs +++ /dev/null @@ -1,123 +0,0 @@ -use std::{ - convert::Infallible, - pin::Pin, - task::{Context, Poll}, - time::Duration, -}; - -use hyper::server::{accept::Accept, conn::AddrStream}; -use pin_project_lite::pin_project; -use tokio::{ - io::{AsyncRead, AsyncWrite}, - task::JoinSet, - time::timeout, -}; -use tokio_rustls::{server::TlsStream, TlsAcceptor}; -use tracing::{info, warn, Instrument}; - -use crate::{ - metrics::TLS_HANDSHAKE_FAILURES, - protocol2::{WithClientIp, WithConnectionGuard}, -}; - -pin_project! 
{ - /// Wraps a `Stream` of connections (such as a TCP listener) so that each connection is itself - /// encrypted using TLS. - pub(crate) struct TlsListener { - #[pin] - listener: A, - tls: TlsAcceptor, - waiting: JoinSet>>, - timeout: Duration, - } -} - -impl TlsListener { - /// Create a `TlsListener` with default options. - pub(crate) fn new(tls: TlsAcceptor, listener: A, timeout: Duration) -> Self { - TlsListener { - listener, - tls, - waiting: JoinSet::new(), - timeout, - } - } -} - -impl Accept for TlsListener -where - A: Accept>>, - A::Error: std::error::Error, - A::Conn: AsyncRead + AsyncWrite + Unpin + Send + 'static, -{ - type Conn = TlsStream; - - type Error = Infallible; - - fn poll_accept( - self: Pin<&mut Self>, - cx: &mut Context<'_>, - ) -> Poll>> { - let mut this = self.project(); - - loop { - match this.listener.as_mut().poll_accept(cx) { - Poll::Pending => break, - Poll::Ready(Some(Ok(mut conn))) => { - let t = *this.timeout; - let tls = this.tls.clone(); - let span = conn.span.clone(); - this.waiting.spawn(async move { - let peer_addr = match conn.inner.wait_for_addr().await { - Ok(Some(addr)) => addr, - Err(e) => { - tracing::error!("failed to accept TCP connection: invalid PROXY protocol V2 header: {e:#}"); - return None; - } - Ok(None) => conn.inner.inner.remote_addr() - }; - - let accept = tls.accept(conn); - match timeout(t, accept).await { - Ok(Ok(conn)) => { - info!(%peer_addr, "accepted new TLS connection"); - Some(conn) - }, - // The handshake failed, try getting another connection from the queue - Ok(Err(e)) => { - TLS_HANDSHAKE_FAILURES.inc(); - warn!(%peer_addr, "failed to accept TLS connection: {e:?}"); - None - } - // The handshake timed out, try getting another connection from the queue - Err(_) => { - TLS_HANDSHAKE_FAILURES.inc(); - warn!(%peer_addr, "failed to accept TLS connection: timeout"); - None - } - } - }.instrument(span)); - } - Poll::Ready(Some(Err(e))) => { - tracing::error!("error accepting TCP connection: {e}"); - 
continue; - } - Poll::Ready(None) => return Poll::Ready(None), - } - } - - loop { - return match this.waiting.poll_join_next(cx) { - Poll::Ready(Some(Ok(Some(conn)))) => Poll::Ready(Some(Ok(conn))), - // The handshake failed to complete, try getting another connection from the queue - Poll::Ready(Some(Ok(None))) => continue, - // The handshake panicked or was cancelled. ignore and get another connection - Poll::Ready(Some(Err(e))) => { - tracing::warn!("handshake aborted: {e}"); - continue; - } - _ => Poll::Pending, - }; - } - } -} diff --git a/proxy/src/serverless/websocket.rs b/proxy/src/serverless/websocket.rs index ada6c974f4..b6cd85af73 100644 --- a/proxy/src/serverless/websocket.rs +++ b/proxy/src/serverless/websocket.rs @@ -3,9 +3,8 @@ use crate::{ config::ProxyConfig, context::RequestMonitoring, error::{io_error, ReportableError}, - metrics::NUM_CLIENT_CONNECTION_GAUGE, + metrics::Metrics, proxy::{handle_client, ClientMode}, - rate_limiter::EndpointRateLimiter, }; use bytes::{Buf, Bytes}; use futures::{Sink, Stream}; @@ -136,12 +135,12 @@ pub async fn serve_websocket( websocket: HyperWebsocket, cancellation_handler: Arc, hostname: Option, - endpoint_rate_limiter: Arc, ) -> anyhow::Result<()> { let websocket = websocket.await?; - let conn_gauge = NUM_CLIENT_CONNECTION_GAUGE - .with_label_values(&["ws"]) - .guard(); + let conn_gauge = Metrics::get() + .proxy + .client_connections + .guard(crate::metrics::Protocol::Ws); let res = handle_client( config, @@ -149,7 +148,6 @@ pub async fn serve_websocket( cancellation_handler, WebSocketRw::new(websocket), ClientMode::Websockets { hostname }, - endpoint_rate_limiter, conn_gauge, ) .await; @@ -158,17 +156,15 @@ pub async fn serve_websocket( Err(e) => { // todo: log and push to ctx the error kind ctx.set_error_kind(e.get_error_kind()); - ctx.log(); Err(e.into()) } Ok(None) => { ctx.set_success(); - ctx.log(); Ok(()) } Ok(Some(p)) => { ctx.set_success(); - ctx.log(); + ctx.log_connect(); p.proxy_pass().await } } diff 
--git a/proxy/src/stream.rs b/proxy/src/stream.rs index b6b7a85659..690e92ffb1 100644 --- a/proxy/src/stream.rs +++ b/proxy/src/stream.rs @@ -1,6 +1,6 @@ use crate::config::TlsServerEndPoint; use crate::error::{ErrorKind, ReportableError, UserFacingError}; -use crate::metrics::TLS_HANDSHAKE_FAILURES; +use crate::metrics::Metrics; use bytes::BytesMut; use pq_proto::framed::{ConnectionError, Framed}; @@ -223,12 +223,20 @@ pub enum StreamUpgradeError { impl Stream { /// If possible, upgrade raw stream into a secure TLS-based stream. - pub async fn upgrade(self, cfg: Arc) -> Result, StreamUpgradeError> { + pub async fn upgrade( + self, + cfg: Arc, + record_handshake_error: bool, + ) -> Result, StreamUpgradeError> { match self { Stream::Raw { raw } => Ok(tokio_rustls::TlsAcceptor::from(cfg) .accept(raw) .await - .inspect_err(|_| TLS_HANDSHAKE_FAILURES.inc())?), + .inspect_err(|_| { + if record_handshake_error { + Metrics::get().proxy.tls_handshake_failures.inc() + } + })?), Stream::Tls { .. } => Err(StreamUpgradeError::AlreadyTls), } } diff --git a/proxy/src/usage_metrics.rs b/proxy/src/usage_metrics.rs index 5ffbf95c07..56ed2145dc 100644 --- a/proxy/src/usage_metrics.rs +++ b/proxy/src/usage_metrics.rs @@ -495,7 +495,7 @@ mod tests { use url::Url; use super::*; - use crate::{http, rate_limiter::RateLimiterConfig, BranchId, EndpointId}; + use crate::{http, BranchId, EndpointId}; #[tokio::test] async fn metrics() { @@ -525,7 +525,7 @@ mod tests { tokio::spawn(server); let metrics = Metrics::default(); - let client = http::new_client(RateLimiterConfig::default()); + let client = http::new_client(); let endpoint = Url::parse(&format!("http://{addr}")).unwrap(); let now = Utc::now(); diff --git a/pyproject.toml b/pyproject.toml index e41aa2cda8..11f410ef43 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -33,7 +33,7 @@ psutil = "^5.9.4" types-psutil = "^5.9.5.12" types-toml = "^0.10.8.6" pytest-httpserver = "^1.0.8" -aiohttp = "3.9.2" +aiohttp = "3.9.4" 
pytest-rerunfailures = "^13.0" types-pytest-lazy-fixture = "^0.6.3.3" pytest-split = "^0.8.1" diff --git a/rust-toolchain.toml b/rust-toolchain.toml index 50a5a4185b..214de0a77d 100644 --- a/rust-toolchain.toml +++ b/rust-toolchain.toml @@ -1,5 +1,5 @@ [toolchain] -channel = "1.77.0" +channel = "1.78.0" profile = "default" # The default profile includes rustc, rust-std, cargo, rust-docs, rustfmt and clippy. # https://rust-lang.github.io/rustup/concepts/profiles.html diff --git a/s3_scrubber/Cargo.toml b/s3_scrubber/Cargo.toml index 4d136472e0..dd5d453a2b 100644 --- a/s3_scrubber/Cargo.toml +++ b/s3_scrubber/Cargo.toml @@ -22,9 +22,15 @@ serde_with.workspace = true workspace_hack.workspace = true utils.workspace = true async-stream.workspace = true +native-tls.workspace = true +postgres-native-tls.workspace = true +postgres_ffi.workspace = true tokio-stream.workspace = true +tokio-postgres.workspace = true +tokio-util = { workspace = true } futures-util.workspace = true itertools.workspace = true +camino.workspace = true tokio = { workspace = true, features = ["macros", "rt-multi-thread"] } chrono = { workspace = true, default-features = false, features = ["clock", "serde"] } diff --git a/s3_scrubber/README.md b/s3_scrubber/README.md index 2f21b9f191..c1deab8852 100644 --- a/s3_scrubber/README.md +++ b/s3_scrubber/README.md @@ -67,10 +67,12 @@ the purge command will log all the keys that it would have deleted. #### `scan-metadata` -Walk objects in a pageserver S3 bucket, and report statistics on the contents. +Walk objects in a pageserver or safekeeper S3 bucket, and report statistics on the contents and checking consistency. +Errors are logged to stderr and summary to stdout. 
+For pageserver: ``` -env SSO_ACCOUNT_ID=123456 REGION=eu-west-1 BUCKET=my-dev-bucket CLOUD_ADMIN_API_TOKEN=${NEON_CLOUD_ADMIN_API_STAGING_KEY} CLOUD_ADMIN_API_URL=[url] cargo run --release -- scan-metadata +env SSO_ACCOUNT_ID=123456 REGION=eu-west-1 BUCKET=my-dev-bucket CLOUD_ADMIN_API_TOKEN=${NEON_CLOUD_ADMIN_API_STAGING_KEY} CLOUD_ADMIN_API_URL=[url] cargo run --release -- scan-metadata --node-kind pageserver Timelines: 31106 With errors: 3 @@ -82,6 +84,10 @@ Layer size bytes: min 24576, 1% 36879, 10% 36879, 50% 61471, 90% 44695551, 99% 2 Timeline layer count: min 1, 1% 3, 10% 6, 50% 16, 90% 25, 99% 39, max 1053 ``` +For safekeepers, dump_db_connstr and dump_db_table must be +specified; they should point to table with debug dump which will be used +to list timelines and find their backup and start LSNs. + ## Cleaning up running pageservers If S3 state is altered first manually, pageserver in-memory state will contain wrong data about S3 state, and tenants/timelines may get recreated on S3 (due to any layer upload due to compaction, pageserver restart, etc.). So before proceeding, for tenants/timelines which are already deleted in the console, we must remove these from pageservers. 
diff --git a/s3_scrubber/src/cloud_admin_api.rs b/s3_scrubber/src/cloud_admin_api.rs index 45cac23690..70b108cf23 100644 --- a/s3_scrubber/src/cloud_admin_api.rs +++ b/s3_scrubber/src/cloud_admin_api.rs @@ -1,11 +1,13 @@ -use std::time::Duration; - use chrono::{DateTime, Utc}; +use futures::Future; use hex::FromHex; + use reqwest::{header, Client, StatusCode, Url}; use serde::Deserialize; use tokio::sync::Semaphore; +use tokio_util::sync::CancellationToken; +use utils::backoff; use utils::id::{TenantId, TimelineId}; use utils::lsn::Lsn; @@ -137,7 +139,7 @@ pub struct ProjectData { pub region_id: String, pub platform_id: String, pub user_id: String, - pub pageserver_id: u64, + pub pageserver_id: Option, #[serde(deserialize_with = "from_nullable_id")] pub tenant: TenantId, pub safekeepers: Vec, @@ -155,7 +157,7 @@ pub struct ProjectData { pub maintenance_set: Option, } -#[derive(Debug, serde::Deserialize)] +#[derive(Debug, Clone, serde::Deserialize)] pub struct BranchData { pub id: BranchId, pub created_at: DateTime, @@ -210,30 +212,39 @@ impl CloudAdminApiClient { .await .expect("Semaphore is not closed"); - let response = self - .http_client - .get(self.append_url("/projects")) - .query(&[ - ("tenant_id", tenant_id.to_string()), - ("show_deleted", "true".to_string()), - ]) - .header(header::ACCEPT, "application/json") - .bearer_auth(&self.token) - .send() - .await - .map_err(|e| { - Error::new( - "Find project for tenant".to_string(), - ErrorKind::RequestSend(e), - ) - })?; + let response = CloudAdminApiClient::with_retries( + || async { + let response = self + .http_client + .get(self.append_url("/projects")) + .query(&[ + ("tenant_id", tenant_id.to_string()), + ("show_deleted", "true".to_string()), + ]) + .header(header::ACCEPT, "application/json") + .bearer_auth(&self.token) + .send() + .await + .map_err(|e| { + Error::new( + "Find project for tenant".to_string(), + ErrorKind::RequestSend(e), + ) + })?; + + let response: AdminApiResponse> = + 
response.json().await.map_err(|e| { + Error::new( + "Find project for tenant".to_string(), + ErrorKind::BodyRead(e), + ) + })?; + Ok(response) + }, + "find_tenant_project", + ) + .await?; - let response: AdminApiResponse> = response.json().await.map_err(|e| { - Error::new( - "Find project for tenant".to_string(), - ErrorKind::BodyRead(e), - ) - })?; match response.data.len() { 0 => Ok(None), 1 => Ok(Some( @@ -261,42 +272,34 @@ impl CloudAdminApiClient { const PAGINATION_LIMIT: usize = 512; let mut result: Vec = Vec::with_capacity(PAGINATION_LIMIT); loop { - let response = self - .http_client - .get(self.append_url("/projects")) - .query(&[ - ("show_deleted", "false".to_string()), - ("limit", format!("{PAGINATION_LIMIT}")), - ("offset", format!("{pagination_offset}")), - ]) - .header(header::ACCEPT, "application/json") - .bearer_auth(&self.token) - .send() - .await - .map_err(|e| { - Error::new( - "List active projects".to_string(), - ErrorKind::RequestSend(e), - ) - })?; + let response_bytes = CloudAdminApiClient::with_retries( + || async { + let response = self + .http_client + .get(self.append_url("/projects")) + .query(&[ + ("show_deleted", "false".to_string()), + ("limit", format!("{PAGINATION_LIMIT}")), + ("offset", format!("{pagination_offset}")), + ]) + .header(header::ACCEPT, "application/json") + .bearer_auth(&self.token) + .send() + .await + .map_err(|e| { + Error::new( + "List active projects".to_string(), + ErrorKind::RequestSend(e), + ) + })?; - match response.status() { - StatusCode::OK => {} - StatusCode::SERVICE_UNAVAILABLE | StatusCode::TOO_MANY_REQUESTS => { - tokio::time::sleep(Duration::from_millis(500)).await; - continue; - } - _status => { - return Err(Error::new( - "List active projects".to_string(), - ErrorKind::ResponseStatus(response.status()), - )) - } - } - - let response_bytes = response.bytes().await.map_err(|e| { - Error::new("List active projects".to_string(), ErrorKind::BodyRead(e)) - })?; + response.bytes().await.map_err(|e| { + 
Error::new("List active projects".to_string(), ErrorKind::BodyRead(e)) + }) + }, + "list_projects", + ) + .await?; let decode_result = serde_json::from_slice::>>(&response_bytes); @@ -327,6 +330,7 @@ impl CloudAdminApiClient { pub async fn find_timeline_branch( &self, + tenant_id: TenantId, timeline_id: TimelineId, ) -> Result, Error> { let _permit = self @@ -335,43 +339,61 @@ impl CloudAdminApiClient { .await .expect("Semaphore is not closed"); - let response = self - .http_client - .get(self.append_url("/branches")) - .query(&[ - ("timeline_id", timeline_id.to_string()), - ("show_deleted", "true".to_string()), - ]) - .header(header::ACCEPT, "application/json") - .bearer_auth(&self.token) - .send() - .await - .map_err(|e| { - Error::new( - "Find branch for timeline".to_string(), - ErrorKind::RequestSend(e), - ) - })?; + let response = CloudAdminApiClient::with_retries( + || async { + let response = self + .http_client + .get(self.append_url("/branches")) + .query(&[ + ("timeline_id", timeline_id.to_string()), + ("show_deleted", "true".to_string()), + ]) + .header(header::ACCEPT, "application/json") + .bearer_auth(&self.token) + .send() + .await + .map_err(|e| { + Error::new( + "Find branch for timeline".to_string(), + ErrorKind::RequestSend(e), + ) + })?; - let response: AdminApiResponse> = response.json().await.map_err(|e| { - Error::new( - "Find branch for timeline".to_string(), - ErrorKind::BodyRead(e), - ) - })?; - match response.data.len() { - 0 => Ok(None), - 1 => Ok(Some( - response - .data - .into_iter() - .next() - .expect("Should have exactly one element"), - )), - too_many => Err(Error::new( - format!("Find branch for timeline returned {too_many} branches instead of 0 or 1"), + let response: AdminApiResponse> = + response.json().await.map_err(|e| { + Error::new( + "Find branch for timeline".to_string(), + ErrorKind::BodyRead(e), + ) + })?; + Ok(response) + }, + "find_timeline_branch", + ) + .await?; + + let mut branches: Vec = 
response.data.into_iter().collect(); + // Normally timeline_id is unique. However, we do have at least one case + // of the same timeline_id in two different projects, apparently after + // manual recovery. So always recheck project_id (discovered through + // tenant_id). + let project_data = match self.find_tenant_project(tenant_id).await? { + Some(pd) => pd, + None => return Ok(None), + }; + branches.retain(|b| b.project_id == project_data.id); + if branches.len() < 2 { + Ok(branches.first().cloned()) + } else { + Err(Error::new( + format!( + "Find branch for timeline {}/{} returned {} branches instead of 0 or 1", + tenant_id, + timeline_id, + branches.len() + ), ErrorKind::UnexpectedState, - )), + )) } } @@ -532,4 +554,15 @@ impl CloudAdminApiClient { .parse() .unwrap_or_else(|e| panic!("Could not append {subpath} to base url: {e}")) } + + async fn with_retries(op: O, description: &str) -> Result + where + O: FnMut() -> F, + F: Future>, + { + let cancel = CancellationToken::new(); // not really used + backoff::retry(op, |_| false, 1, 20, description, &cancel) + .await + .expect("cancellations are disabled") + } } diff --git a/s3_scrubber/src/garbage.rs b/s3_scrubber/src/garbage.rs index 7a08dffc66..ce0ff10ec6 100644 --- a/s3_scrubber/src/garbage.rs +++ b/s3_scrubber/src/garbage.rs @@ -60,6 +60,7 @@ pub struct GarbageList { /// see garbage, we saw some active tenants too. This protects against classes of bugs /// in the scrubber that might otherwise generate a "deleted all" result. active_tenant_count: usize, + active_timeline_count: usize, } impl GarbageList { @@ -67,6 +68,7 @@ impl GarbageList { Self { items: Vec::new(), active_tenant_count: 0, + active_timeline_count: 0, node_kind, bucket_config, } @@ -119,7 +121,10 @@ pub async fn find_garbage( const S3_CONCURRENCY: usize = 32; // How many concurrent API requests to make to the console API. 
-const CONSOLE_CONCURRENCY: usize = 128; +// +// Be careful increasing this; roughly we shouldn't have more than ~100 rps. It +// would be better to implement real rsp limiter. +const CONSOLE_CONCURRENCY: usize = 16; struct ConsoleCache { /// Set of tenants found in the control plane API @@ -221,6 +226,7 @@ async fn find_garbage_inner( } else { tracing::debug!("Tenant {tenant_shard_id} is active"); active_tenants.push(tenant_shard_id); + garbage.active_tenant_count = active_tenants.len(); } counter += 1; @@ -261,7 +267,7 @@ async fn find_garbage_inner( let api_client = cloud_admin_api_client.clone(); async move { api_client - .find_timeline_branch(ttid.timeline_id) + .find_timeline_branch(ttid.tenant_shard_id.tenant_id, ttid.timeline_id) .await .map_err(|e| anyhow::anyhow!(e)) .map(|r| (ttid, r)) @@ -271,15 +277,29 @@ async fn find_garbage_inner( std::pin::pin!(timelines_checked.try_buffer_unordered(CONSOLE_CONCURRENCY)); // Update the GarbageList with any timelines which appear not to exist. + let mut active_timelines: Vec = vec![]; while let Some(result) = timelines_checked.next().await { let (ttid, console_result) = result?; if garbage.maybe_append(GarbageEntity::Timeline(ttid), console_result) { tracing::debug!("Timeline {ttid} is garbage"); } else { tracing::debug!("Timeline {ttid} is active"); + active_timelines.push(ttid); + garbage.active_timeline_count = active_timelines.len(); } } + let num_garbage_timelines = garbage + .items + .iter() + .filter(|g| matches!(g.entity, GarbageEntity::Timeline(_))) + .count(); + tracing::info!( + "Found {}/{} garbage timelines in active tenants", + num_garbage_timelines, + active_timelines.len(), + ); + Ok(garbage) } @@ -344,16 +364,22 @@ pub async fn get_timeline_objects( const MAX_KEYS_PER_DELETE: usize = 1000; /// Drain a buffer of keys into DeleteObjects requests +/// +/// If `drain` is true, drains keys completely; otherwise stops when < +/// MAX_KEYS_PER_DELETE keys are left. 
+/// `num_deleted` returns number of deleted keys. async fn do_delete( s3_client: &Arc, bucket_name: &str, keys: &mut Vec, dry_run: bool, drain: bool, + progress_tracker: &mut DeletionProgressTracker, ) -> anyhow::Result<()> { while (!keys.is_empty() && drain) || (keys.len() >= MAX_KEYS_PER_DELETE) { let request_keys = keys.split_off(keys.len() - (std::cmp::min(MAX_KEYS_PER_DELETE, keys.len()))); + let num_deleted = request_keys.len(); if dry_run { tracing::info!("Dry-run deletion of objects: "); for k in request_keys { @@ -368,12 +394,30 @@ async fn do_delete( .send() .await .context("DeleteObjects request")?; + progress_tracker.register(num_deleted); } } Ok(()) } +/// Simple tracker reporting each 10k deleted keys. +#[derive(Default)] +struct DeletionProgressTracker { + num_deleted: usize, + last_reported_num_deleted: usize, +} + +impl DeletionProgressTracker { + fn register(&mut self, n: usize) { + self.num_deleted += n; + if self.num_deleted - self.last_reported_num_deleted > 10000 { + tracing::info!("progress: deleted {} keys", self.num_deleted); + self.last_reported_num_deleted = self.num_deleted; + } + } +} + pub async fn purge_garbage( input_path: String, mode: PurgeMode, @@ -394,6 +438,14 @@ pub async fn purge_garbage( if garbage_list.active_tenant_count == 0 { anyhow::bail!("Refusing to purge a garbage list that reports 0 active tenants"); } + if garbage_list + .items + .iter() + .any(|g| matches!(g.entity, GarbageEntity::Timeline(_))) + && garbage_list.active_timeline_count == 0 + { + anyhow::bail!("Refusing to purge a garbage list containing garbage timelines that reports 0 active timelines"); + } let filtered_items = garbage_list .items @@ -429,6 +481,7 @@ pub async fn purge_garbage( std::pin::pin!(get_objects_results.try_buffer_unordered(S3_CONCURRENCY)); let mut objects_to_delete = Vec::new(); + let mut progress_tracker = DeletionProgressTracker::default(); while let Some(result) = get_objects_results.next().await { let mut object_list = result?; 
objects_to_delete.append(&mut object_list); @@ -439,6 +492,7 @@ pub async fn purge_garbage( &mut objects_to_delete, dry_run, false, + &mut progress_tracker, ) .await?; } @@ -450,10 +504,11 @@ pub async fn purge_garbage( &mut objects_to_delete, dry_run, true, + &mut progress_tracker, ) .await?; - tracing::info!("Fell through"); + tracing::info!("{} keys deleted in total", progress_tracker.num_deleted); Ok(()) } diff --git a/s3_scrubber/src/lib.rs b/s3_scrubber/src/lib.rs index d2842877d0..e976e66748 100644 --- a/s3_scrubber/src/lib.rs +++ b/s3_scrubber/src/lib.rs @@ -4,7 +4,9 @@ pub mod checks; pub mod cloud_admin_api; pub mod garbage; pub mod metadata_stream; -pub mod scan_metadata; +pub mod scan_pageserver_metadata; +pub mod scan_safekeeper_metadata; +pub mod tenant_snapshot; use std::env; use std::fmt::Display; @@ -23,17 +25,18 @@ use aws_sdk_s3::config::{AsyncSleep, Region, SharedAsyncSleep}; use aws_sdk_s3::{Client, Config}; use aws_smithy_async::rt::sleep::TokioSleep; +use camino::{Utf8Path, Utf8PathBuf}; use clap::ValueEnum; use pageserver::tenant::TENANTS_SEGMENT_NAME; use pageserver_api::shard::TenantShardId; use reqwest::Url; use serde::{Deserialize, Serialize}; -use std::io::IsTerminal; use tokio::io::AsyncReadExt; use tracing::error; use tracing_appender::non_blocking::WorkerGuard; use tracing_subscriber::{fmt, prelude::*, EnvFilter}; -use utils::id::TimelineId; +use utils::fs_ext; +use utils::id::{TenantId, TimelineId}; const MAX_RETRIES: usize = 20; const CLOUD_ADMIN_API_TOKEN_ENV_VAR: &str = "CLOUD_ADMIN_API_TOKEN"; @@ -139,12 +142,34 @@ impl RootTarget { pub fn tenants_root(&self) -> S3Target { match self { Self::Pageserver(root) => root.with_sub_segment(TENANTS_SEGMENT_NAME), - Self::Safekeeper(root) => root.with_sub_segment("wal"), + Self::Safekeeper(root) => root.clone(), } } pub fn tenant_root(&self, tenant_id: &TenantShardId) -> S3Target { - self.tenants_root().with_sub_segment(&tenant_id.to_string()) + match self { + Self::Pageserver(_) => 
self.tenants_root().with_sub_segment(&tenant_id.to_string()), + Self::Safekeeper(_) => self + .tenants_root() + .with_sub_segment(&tenant_id.tenant_id.to_string()), + } + } + + pub(crate) fn tenant_shards_prefix(&self, tenant_id: &TenantId) -> S3Target { + // Only pageserver remote storage contains tenant-shards + assert!(matches!(self, Self::Pageserver(_))); + let Self::Pageserver(root) = self else { + panic!(); + }; + + S3Target { + bucket_name: root.bucket_name.clone(), + prefix_in_bucket: format!( + "{}/{TENANTS_SEGMENT_NAME}/{tenant_id}", + root.prefix_in_bucket + ), + delimiter: root.delimiter.clone(), + } } pub fn timelines_root(&self, tenant_id: &TenantShardId) -> S3Target { @@ -240,7 +265,6 @@ pub fn init_logging(file_name: &str) -> WorkerGuard { .with_ansi(false) .with_writer(file_writer); let stderr_logs = fmt::Layer::new() - .with_ansi(std::io::stderr().is_terminal()) .with_target(false) .with_writer(std::io::stderr); tracing_subscriber::registry() @@ -319,9 +343,7 @@ fn init_remote( }), NodeKind::Safekeeper => RootTarget::Safekeeper(S3Target { bucket_name: bucket_config.bucket, - prefix_in_bucket: bucket_config - .prefix_in_bucket - .unwrap_or("safekeeper/v1".to_string()), + prefix_in_bucket: bucket_config.prefix_in_bucket.unwrap_or("wal/".to_string()), delimiter, }), }; @@ -346,7 +368,10 @@ async fn list_objects_with_retries( { Ok(response) => return Ok(response), Err(e) => { - error!("list_objects_v2 query failed: {e}"); + error!( + "list_objects_v2 query failed: {e}, bucket_name={}, prefix={}, delimiter={}", + s3_target.bucket_name, s3_target.prefix_in_bucket, s3_target.delimiter + ); tokio::time::sleep(Duration::from_secs(1)).await; } } @@ -396,3 +421,50 @@ async fn download_object_with_retries( anyhow::bail!("Failed to download objects with key {key} {MAX_RETRIES} times") } + +async fn download_object_to_file( + s3_client: &Client, + bucket_name: &str, + key: &str, + version_id: Option<&str>, + local_path: &Utf8Path, +) -> anyhow::Result<()> { + 
let tmp_path = Utf8PathBuf::from(format!("{local_path}.tmp")); + for _ in 0..MAX_RETRIES { + tokio::fs::remove_file(&tmp_path) + .await + .or_else(fs_ext::ignore_not_found)?; + + let mut file = tokio::fs::File::create(&tmp_path) + .await + .context("Opening output file")?; + + let request = s3_client.get_object().bucket(bucket_name).key(key); + + let request = match version_id { + Some(version_id) => request.version_id(version_id), + None => request, + }; + + let response_stream = match request.send().await { + Ok(response) => response, + Err(e) => { + error!( + "Failed to download object for key {key} version {}: {e:#}", + version_id.unwrap_or("") + ); + tokio::time::sleep(Duration::from_secs(1)).await; + continue; + } + }; + + let mut read_stream = response_stream.body.into_async_read(); + + tokio::io::copy(&mut read_stream, &mut file).await?; + + tokio::fs::rename(&tmp_path, local_path).await?; + return Ok(()); + } + + anyhow::bail!("Failed to download objects with key {key} {MAX_RETRIES} times") +} diff --git a/s3_scrubber/src/main.rs b/s3_scrubber/src/main.rs index 957213856b..e49c280b99 100644 --- a/s3_scrubber/src/main.rs +++ b/s3_scrubber/src/main.rs @@ -1,9 +1,16 @@ +use anyhow::bail; +use camino::Utf8PathBuf; use pageserver_api::shard::TenantShardId; use s3_scrubber::garbage::{find_garbage, purge_garbage, PurgeMode}; -use s3_scrubber::scan_metadata::scan_metadata; -use s3_scrubber::{init_logging, BucketConfig, ConsoleConfig, NodeKind, TraversingDepth}; +use s3_scrubber::scan_pageserver_metadata::scan_metadata; +use s3_scrubber::tenant_snapshot::SnapshotDownloader; +use s3_scrubber::{ + init_logging, scan_safekeeper_metadata::scan_safekeeper_metadata, BucketConfig, ConsoleConfig, + NodeKind, TraversingDepth, +}; use clap::{Parser, Subcommand}; +use utils::id::TenantId; #[derive(Parser)] #[command(author, version, about, long_about = None)] @@ -32,11 +39,28 @@ enum Command { #[arg(short, long, default_value_t = PurgeMode::DeletedOnly)] mode: PurgeMode, }, + 
#[command(verbatim_doc_comment)] ScanMetadata { + #[arg(short, long)] + node_kind: NodeKind, #[arg(short, long, default_value_t = false)] json: bool, #[arg(long = "tenant-id", num_args = 0..)] tenant_ids: Vec, + #[arg(long, default_value = None)] + /// For safekeeper node_kind only, points to db with debug dump + dump_db_connstr: Option, + /// For safekeeper node_kind only, table in the db with debug dump + #[arg(long, default_value = None)] + dump_db_table: Option, + }, + TenantSnapshot { + #[arg(long = "tenant-id")] + tenant_id: TenantId, + #[arg(long = "concurrency", short = 'j', default_value_t = 8)] + concurrency: usize, + #[arg(short, long)] + output_path: Utf8PathBuf, }, } @@ -50,6 +74,7 @@ async fn main() -> anyhow::Result<()> { Command::ScanMetadata { .. } => "scan", Command::FindGarbage { .. } => "find-garbage", Command::PurgeGarbage { .. } => "purge-garbage", + Command::TenantSnapshot { .. } => "tenant-snapshot", }; let _guard = init_logging(&format!( "{}_{}_{}_{}.log", @@ -60,33 +85,75 @@ async fn main() -> anyhow::Result<()> { )); match cli.command { - Command::ScanMetadata { json, tenant_ids } => { - match scan_metadata(bucket_config.clone(), tenant_ids).await { - Err(e) => { - tracing::error!("Failed: {e}"); - Err(e) + Command::ScanMetadata { + json, + tenant_ids, + node_kind, + dump_db_connstr, + dump_db_table, + } => { + if let NodeKind::Safekeeper = node_kind { + let dump_db_connstr = + dump_db_connstr.ok_or(anyhow::anyhow!("dump_db_connstr not specified"))?; + let dump_db_table = + dump_db_table.ok_or(anyhow::anyhow!("dump_db_table not specified"))?; + + let summary = scan_safekeeper_metadata( + bucket_config.clone(), + tenant_ids.iter().map(|tshid| tshid.tenant_id).collect(), + dump_db_connstr, + dump_db_table, + ) + .await?; + if json { + println!("{}", serde_json::to_string(&summary).unwrap()) + } else { + println!("{}", summary.summary_string()); } - Ok(summary) => { - if json { - println!("{}", serde_json::to_string(&summary).unwrap()) - } 
else { - println!("{}", summary.summary_string()); + if summary.is_fatal() { + bail!("Fatal scrub errors detected"); + } + if summary.is_empty() { + // Strictly speaking an empty bucket is a valid bucket, but if someone ran the + // scrubber they were likely expecting to scan something, and if we see no timelines + // at all then it's likely due to some configuration issues like a bad prefix + bail!( + "No timelines found in bucket {} prefix {}", + bucket_config.bucket, + bucket_config + .prefix_in_bucket + .unwrap_or("".to_string()) + ); + } + Ok(()) + } else { + match scan_metadata(bucket_config.clone(), tenant_ids).await { + Err(e) => { + tracing::error!("Failed: {e}"); + Err(e) } - if summary.is_fatal() { - Err(anyhow::anyhow!("Fatal scrub errors detected")) - } else if summary.is_empty() { - // Strictly speaking an empty bucket is a valid bucket, but if someone ran the - // scrubber they were likely expecting to scan something, and if we see no timelines - // at all then it's likely due to some configuration issues like a bad prefix - Err(anyhow::anyhow!( - "No timelines found in bucket {} prefix {}", - bucket_config.bucket, - bucket_config - .prefix_in_bucket - .unwrap_or("".to_string()) - )) - } else { - Ok(()) + Ok(summary) => { + if json { + println!("{}", serde_json::to_string(&summary).unwrap()) + } else { + println!("{}", summary.summary_string()); + } + if summary.is_fatal() { + Err(anyhow::anyhow!("Fatal scrub errors detected")) + } else if summary.is_empty() { + // Strictly speaking an empty bucket is a valid bucket, but if someone ran the + // scrubber they were likely expecting to scan something, and if we see no timelines + // at all then it's likely due to some configuration issues like a bad prefix + Err(anyhow::anyhow!( + "No timelines found in bucket {} prefix {}", + bucket_config.bucket, + bucket_config + .prefix_in_bucket + .unwrap_or("".to_string()) + )) + } else { + Ok(()) + } } } } @@ -102,5 +169,14 @@ async fn main() -> 
anyhow::Result<()> { Command::PurgeGarbage { input_path, mode } => { purge_garbage(input_path, mode, !cli.delete).await } + Command::TenantSnapshot { + tenant_id, + output_path, + concurrency, + } => { + let downloader = + SnapshotDownloader::new(bucket_config, tenant_id, output_path, concurrency)?; + downloader.download().await + } } } diff --git a/s3_scrubber/src/metadata_stream.rs b/s3_scrubber/src/metadata_stream.rs index 073f37f319..c05874f556 100644 --- a/s3_scrubber/src/metadata_stream.rs +++ b/s3_scrubber/src/metadata_stream.rs @@ -5,7 +5,7 @@ use tokio_stream::Stream; use crate::{list_objects_with_retries, RootTarget, S3Target, TenantShardTimelineId}; use pageserver_api::shard::TenantShardId; -use utils::id::TimelineId; +use utils::id::{TenantId, TimelineId}; /// Given an S3 bucket, output a stream of TenantIds discovered via ListObjectsv2 pub fn stream_tenants<'a>( @@ -45,6 +45,62 @@ pub fn stream_tenants<'a>( } } +pub async fn stream_tenant_shards<'a>( + s3_client: &'a Client, + target: &'a RootTarget, + tenant_id: TenantId, +) -> anyhow::Result> + 'a> { + let mut tenant_shard_ids: Vec> = Vec::new(); + let mut continuation_token = None; + let shards_target = target.tenant_shards_prefix(&tenant_id); + + loop { + tracing::info!("Listing in {}", shards_target.prefix_in_bucket); + let fetch_response = + list_objects_with_retries(s3_client, &shards_target, continuation_token.clone()).await; + let fetch_response = match fetch_response { + Err(e) => { + tenant_shard_ids.push(Err(e)); + break; + } + Ok(r) => r, + }; + + let new_entry_ids = fetch_response + .common_prefixes() + .iter() + .filter_map(|prefix| prefix.prefix()) + .filter_map(|prefix| -> Option<&str> { + prefix + .strip_prefix(&target.tenants_root().prefix_in_bucket)? 
+ .strip_suffix('/') + }) + .map(|entry_id_str| { + let first_part = entry_id_str.split('/').next().unwrap(); + + first_part + .parse::() + .with_context(|| format!("Incorrect entry id str: {first_part}")) + }); + + for i in new_entry_ids { + tenant_shard_ids.push(i); + } + + match fetch_response.next_continuation_token { + Some(new_token) => continuation_token = Some(new_token), + None => break, + } + } + + Ok(stream! { + for i in tenant_shard_ids { + let id = i?; + yield Ok(id); + } + }) +} + /// Given a TenantShardId, output a stream of the timelines within that tenant, discovered /// using ListObjectsv2. The listing is done before the stream is built, so that this /// function can be used to generate concurrency on a stream using buffer_unordered. @@ -58,7 +114,7 @@ pub async fn stream_tenant_timelines<'a>( let timelines_target = target.timelines_root(&tenant); loop { - tracing::info!("Listing in {}", tenant); + tracing::debug!("Listing in {}", tenant); let fetch_response = list_objects_with_retries(s3_client, &timelines_target, continuation_token.clone()) .await; @@ -95,7 +151,7 @@ pub async fn stream_tenant_timelines<'a>( } } - tracing::info!("Yielding for {}", tenant); + tracing::debug!("Yielding for {}", tenant); Ok(stream! 
{ for i in timeline_ids { let id = i?; diff --git a/s3_scrubber/src/scan_metadata.rs b/s3_scrubber/src/scan_pageserver_metadata.rs similarity index 100% rename from s3_scrubber/src/scan_metadata.rs rename to s3_scrubber/src/scan_pageserver_metadata.rs diff --git a/s3_scrubber/src/scan_safekeeper_metadata.rs b/s3_scrubber/src/scan_safekeeper_metadata.rs new file mode 100644 index 0000000000..73dd49ceb5 --- /dev/null +++ b/s3_scrubber/src/scan_safekeeper_metadata.rs @@ -0,0 +1,236 @@ +use std::{collections::HashSet, str::FromStr}; + +use aws_sdk_s3::Client; +use futures::stream::{StreamExt, TryStreamExt}; +use pageserver_api::shard::TenantShardId; +use postgres_ffi::{XLogFileName, PG_TLI}; +use serde::Serialize; +use tokio_postgres::types::PgLsn; +use tracing::{error, info, trace}; +use utils::{ + id::{TenantId, TenantTimelineId, TimelineId}, + lsn::Lsn, +}; + +use crate::{ + cloud_admin_api::CloudAdminApiClient, init_remote, metadata_stream::stream_listing, + BucketConfig, ConsoleConfig, NodeKind, RootTarget, TenantShardTimelineId, +}; + +/// Generally we should ask safekeepers, but so far we use everywhere default 16MB. +const WAL_SEGSIZE: usize = 16 * 1024 * 1024; + +#[derive(Serialize)] +pub struct MetadataSummary { + timeline_count: usize, + with_errors: HashSet, + deleted_count: usize, +} + +impl MetadataSummary { + fn new() -> Self { + Self { + timeline_count: 0, + with_errors: HashSet::new(), + deleted_count: 0, + } + } + + pub fn summary_string(&self) -> String { + format!( + "timeline_count: {}, with_errors: {}", + self.timeline_count, + self.with_errors.len() + ) + } + + pub fn is_empty(&self) -> bool { + self.timeline_count == 0 + } + + pub fn is_fatal(&self) -> bool { + !self.with_errors.is_empty() + } +} + +/// Scan the safekeeper metadata in an S3 bucket, reporting errors and +/// statistics. +/// +/// It works by listing timelines along with timeline_start_lsn and backup_lsn +/// in debug dump in dump_db_table and verifying its s3 contents. 
If some WAL +/// segments are missing, before complaining control plane is queried to check if +/// the project wasn't deleted in the meanwhile. +pub async fn scan_safekeeper_metadata( + bucket_config: BucketConfig, + tenant_ids: Vec, + dump_db_connstr: String, + dump_db_table: String, +) -> anyhow::Result { + info!( + "checking bucket {}, region {}, dump_db_table {}", + bucket_config.bucket, bucket_config.region, dump_db_table + ); + // Use the native TLS implementation (Neon requires TLS) + let tls_connector = + postgres_native_tls::MakeTlsConnector::new(native_tls::TlsConnector::new().unwrap()); + let (client, connection) = tokio_postgres::connect(&dump_db_connstr, tls_connector).await?; + // The connection object performs the actual communication with the database, + // so spawn it off to run on its own. + tokio::spawn(async move { + if let Err(e) = connection.await { + eprintln!("connection error: {}", e); + } + }); + + let tenant_filter_clause = if !tenant_ids.is_empty() { + format!( + "and tenant_id in ({})", + tenant_ids + .iter() + .map(|t| format!("'{}'", t)) + .collect::>() + .join(", ") + ) + } else { + "".to_owned() + }; + let query = format!( + "select tenant_id, timeline_id, min(timeline_start_lsn), max(backup_lsn) from \"{}\" where not is_cancelled {} group by tenant_id, timeline_id;", + dump_db_table, tenant_filter_clause, + ); + info!("query is {}", query); + let timelines = client.query(&query, &[]).await?; + info!("loaded {} timelines", timelines.len()); + + let (s3_client, target) = init_remote(bucket_config, NodeKind::Safekeeper)?; + let console_config = ConsoleConfig::from_env()?; + let cloud_admin_api_client = CloudAdminApiClient::new(console_config); + + let checks = futures::stream::iter(timelines.iter().map(Ok)).map_ok(|row| { + let tenant_id = TenantId::from_str(row.get(0)).expect("failed to parse tenant_id"); + let timeline_id = TimelineId::from_str(row.get(1)).expect("failed to parse tenant_id"); + let timeline_start_lsn_pg: PgLsn = 
row.get(2); + let timeline_start_lsn: Lsn = Lsn(u64::from(timeline_start_lsn_pg)); + let backup_lsn_pg: PgLsn = row.get(3); + let backup_lsn: Lsn = Lsn(u64::from(backup_lsn_pg)); + let ttid = TenantTimelineId::new(tenant_id, timeline_id); + check_timeline( + &s3_client, + &target, + &cloud_admin_api_client, + ttid, + timeline_start_lsn, + backup_lsn, + ) + }); + // Run multiple check_timeline's concurrently. + const CONCURRENCY: usize = 32; + let mut timelines = checks.try_buffered(CONCURRENCY); + + let mut summary = MetadataSummary::new(); + while let Some(r) = timelines.next().await { + let res = r?; + summary.timeline_count += 1; + if !res.is_ok { + summary.with_errors.insert(res.ttid); + } + if res.is_deleted { + summary.deleted_count += 1; + } + } + + Ok(summary) +} + +struct TimelineCheckResult { + ttid: TenantTimelineId, + is_ok: bool, + is_deleted: bool, // timeline is deleted in cplane +} + +/// List s3 and check that is has all expected WAL for the ttid. Consistency +/// errors are logged to stderr; returns Ok(true) if timeline is consistent, +/// Ok(false) if not, Err if failed to check. 
+async fn check_timeline( + s3_client: &Client, + root: &RootTarget, + api_client: &CloudAdminApiClient, + ttid: TenantTimelineId, + timeline_start_lsn: Lsn, + backup_lsn: Lsn, +) -> anyhow::Result { + trace!( + "checking ttid {}, should contain WAL [{}-{}]", + ttid, + timeline_start_lsn, + backup_lsn + ); + // calculate expected segfiles + let expected_first_segno = timeline_start_lsn.segment_number(WAL_SEGSIZE); + let expected_last_segno = backup_lsn.segment_number(WAL_SEGSIZE); + let mut expected_segfiles: HashSet = HashSet::from_iter( + (expected_first_segno..expected_last_segno) + .map(|segno| XLogFileName(PG_TLI, segno, WAL_SEGSIZE)), + ); + let expected_files_num = expected_segfiles.len(); + trace!("expecting {} files", expected_segfiles.len(),); + + // now list s3 and check if it misses something + let ttshid = + TenantShardTimelineId::new(TenantShardId::unsharded(ttid.tenant_id), ttid.timeline_id); + let mut timeline_dir_target = root.timeline_root(&ttshid); + // stream_listing yields only common_prefixes if delimiter is not empty, but + // we need files, so unset it. + timeline_dir_target.delimiter = String::new(); + + let mut stream = std::pin::pin!(stream_listing(s3_client, &timeline_dir_target)); + while let Some(obj) = stream.next().await { + let obj = obj?; + let key = obj.key(); + + let seg_name = key + .strip_prefix(&timeline_dir_target.prefix_in_bucket) + .expect("failed to extract segment name"); + expected_segfiles.remove(seg_name); + } + if !expected_segfiles.is_empty() { + // Before complaining check cplane, probably timeline is already deleted. 
+ let bdata = api_client + .find_timeline_branch(ttid.tenant_id, ttid.timeline_id) + .await?; + let deleted = match bdata { + Some(bdata) => bdata.deleted, + None => { + // note: should be careful with selecting proper cplane address + info!("ttid {} not found, assuming it is deleted", ttid); + true + } + }; + if deleted { + // ok, branch is deleted + return Ok(TimelineCheckResult { + ttid, + is_ok: true, + is_deleted: true, + }); + } + error!( + "ttid {}: missing {} files out of {}, timeline_start_lsn {}, wal_backup_lsn {}", + ttid, + expected_segfiles.len(), + expected_files_num, + timeline_start_lsn, + backup_lsn, + ); + return Ok(TimelineCheckResult { + ttid, + is_ok: false, + is_deleted: false, + }); + } + Ok(TimelineCheckResult { + ttid, + is_ok: true, + is_deleted: false, + }) +} diff --git a/s3_scrubber/src/tenant_snapshot.rs b/s3_scrubber/src/tenant_snapshot.rs new file mode 100644 index 0000000000..4eccad381b --- /dev/null +++ b/s3_scrubber/src/tenant_snapshot.rs @@ -0,0 +1,293 @@ +use std::collections::HashMap; +use std::sync::Arc; + +use crate::checks::{list_timeline_blobs, BlobDataParseResult, S3TimelineBlobData}; +use crate::metadata_stream::{stream_tenant_shards, stream_tenant_timelines}; +use crate::{ + download_object_to_file, init_remote, BucketConfig, NodeKind, RootTarget, TenantShardTimelineId, +}; +use anyhow::Context; +use async_stream::stream; +use aws_sdk_s3::Client; +use camino::Utf8PathBuf; +use futures::{StreamExt, TryStreamExt}; +use pageserver::tenant::remote_timeline_client::index::IndexLayerMetadata; +use pageserver::tenant::storage_layer::LayerFileName; +use pageserver::tenant::IndexPart; +use pageserver_api::shard::TenantShardId; +use utils::generation::Generation; +use utils::id::TenantId; + +pub struct SnapshotDownloader { + s3_client: Arc, + s3_root: RootTarget, + bucket_config: BucketConfig, + tenant_id: TenantId, + output_path: Utf8PathBuf, + concurrency: usize, +} + +impl SnapshotDownloader { + pub fn new( + bucket_config: 
BucketConfig, + tenant_id: TenantId, + output_path: Utf8PathBuf, + concurrency: usize, + ) -> anyhow::Result { + let (s3_client, s3_root) = init_remote(bucket_config.clone(), NodeKind::Pageserver)?; + Ok(Self { + s3_client, + s3_root, + bucket_config, + tenant_id, + output_path, + concurrency, + }) + } + + async fn download_layer( + &self, + ttid: TenantShardTimelineId, + layer_name: LayerFileName, + layer_metadata: IndexLayerMetadata, + ) -> anyhow::Result<(LayerFileName, IndexLayerMetadata)> { + // Note this is local as in a local copy of S3 data, not local as in the pageserver's local format. They use + // different layer names (remote-style has the generation suffix) + let local_path = self.output_path.join(format!( + "{}/timelines/{}/{}{}", + ttid.tenant_shard_id, + ttid.timeline_id, + layer_name.file_name(), + layer_metadata.generation.get_suffix() + )); + + // We should only be called for layers that are owned by the input TTID + assert_eq!(layer_metadata.shard, ttid.tenant_shard_id.to_index()); + + // Assumption: we always write layer files atomically, and layer files are immutable. Therefore if the file + // already exists on local disk, we assume it is fully correct and skip it. + if tokio::fs::try_exists(&local_path).await? { + tracing::debug!("{} already exists", local_path); + return Ok((layer_name, layer_metadata)); + } else { + tracing::debug!("{} requires download...", local_path); + + let timeline_root = self.s3_root.timeline_root(&ttid); + let remote_layer_path = format!( + "{}{}{}", + timeline_root.prefix_in_bucket, + layer_name.file_name(), + layer_metadata.generation.get_suffix() + ); + + // List versions: the object might be deleted. 
+ let versions = self + .s3_client + .list_object_versions() + .bucket(self.bucket_config.bucket.clone()) + .prefix(&remote_layer_path) + .send() + .await?; + let Some(version) = versions.versions.as_ref().and_then(|v| v.first()) else { + return Err(anyhow::anyhow!("No versions found for {remote_layer_path}")); + }; + download_object_to_file( + &self.s3_client, + &self.bucket_config.bucket, + &remote_layer_path, + version.version_id.as_deref(), + &local_path, + ) + .await?; + + tracing::debug!("Downloaded successfully to {local_path}"); + } + + Ok((layer_name, layer_metadata)) + } + + /// Download many layers belonging to the same TTID, with some concurrency + async fn download_layers( + &self, + ttid: TenantShardTimelineId, + layers: Vec<(LayerFileName, IndexLayerMetadata)>, + ) -> anyhow::Result<()> { + let layer_count = layers.len(); + tracing::info!("Downloading {} layers for timeline {ttid}...", layer_count); + let layers_stream = stream! { + for (layer_name, layer_metadata) in layers { + yield self.download_layer(ttid, layer_name, layer_metadata); + } + }; + + tokio::fs::create_dir_all(self.output_path.join(format!( + "{}/timelines/{}", + ttid.tenant_shard_id, ttid.timeline_id + ))) + .await?; + + let layer_results = layers_stream.buffered(self.concurrency); + let mut layer_results = std::pin::pin!(layer_results); + + let mut err = None; + let mut download_count = 0; + while let Some(i) = layer_results.next().await { + download_count += 1; + match i { + Ok((layer_name, layer_metadata)) => { + tracing::info!( + "[{download_count}/{layer_count}] OK: {} bytes {ttid} {}", + layer_metadata.file_size, + layer_name.file_name() + ); + } + Err(e) => { + // Warn and continue: we will download what we can + tracing::warn!("Download error: {e}"); + err = Some(e); + } + } + } + if let Some(e) = err { + tracing::warn!("Some errors occurred downloading {ttid} layers, last error: {e}"); + Err(e) + } else { + Ok(()) + } + } + + async fn download_timeline( + &self, + ttid: 
TenantShardTimelineId, + index_part: IndexPart, + index_part_generation: Generation, + ancestor_layers: &mut HashMap< + TenantShardTimelineId, + HashMap, + >, + ) -> anyhow::Result<()> { + let index_bytes = serde_json::to_string(&index_part).unwrap(); + + let layers = index_part + .layer_metadata + .into_iter() + .filter_map(|(layer_name, layer_metadata)| { + if layer_metadata.shard.shard_count != ttid.tenant_shard_id.shard_count { + // Accumulate ancestor layers for later download + let ancestor_ttid = TenantShardTimelineId::new( + TenantShardId { + tenant_id: ttid.tenant_shard_id.tenant_id, + shard_number: layer_metadata.shard.shard_number, + shard_count: layer_metadata.shard.shard_count, + }, + ttid.timeline_id, + ); + let ancestor_ttid_layers = ancestor_layers.entry(ancestor_ttid).or_default(); + use std::collections::hash_map::Entry; + match ancestor_ttid_layers.entry(layer_name) { + Entry::Occupied(entry) => { + // Descendent shards that reference a layer from an ancestor should always have matching metadata, + // as their siblings, because it is read atomically during a shard split. 
+ assert_eq!(entry.get(), &layer_metadata); + } + Entry::Vacant(entry) => { + entry.insert(layer_metadata); + } + } + None + } else { + Some((layer_name, layer_metadata)) + } + }) + .collect(); + + let download_result = self.download_layers(ttid, layers).await; + + // Write index last, once all the layers it references are downloaded + let local_index_path = self.output_path.join(format!( + "{}/timelines/{}/index_part.json{}", + ttid.tenant_shard_id, + ttid.timeline_id, + index_part_generation.get_suffix() + )); + tokio::fs::write(&local_index_path, index_bytes) + .await + .context("writing index")?; + + download_result + } + + pub async fn download(&self) -> anyhow::Result<()> { + let (s3_client, target) = init_remote(self.bucket_config.clone(), NodeKind::Pageserver)?; + + // Generate a stream of TenantShardId + let shards = stream_tenant_shards(&s3_client, &target, self.tenant_id).await?; + let shards: Vec = shards.try_collect().await?; + + // Only read from shards that have the highest count: avoids redundantly downloading + // from ancestor shards. 
+ let Some(shard_count) = shards.iter().map(|s| s.shard_count).max() else { + anyhow::bail!("No shards found"); + }; + + // We will build a collection of layers in anccestor shards to download (this will only + // happen if this tenant has been split at some point) + let mut ancestor_layers: HashMap< + TenantShardTimelineId, + HashMap, + > = Default::default(); + + for shard in shards.into_iter().filter(|s| s.shard_count == shard_count) { + // Generate a stream of TenantTimelineId + let timelines = stream_tenant_timelines(&s3_client, &self.s3_root, shard).await?; + + // Generate a stream of S3TimelineBlobData + async fn load_timeline_index( + s3_client: &Client, + target: &RootTarget, + ttid: TenantShardTimelineId, + ) -> anyhow::Result<(TenantShardTimelineId, S3TimelineBlobData)> { + let data = list_timeline_blobs(s3_client, ttid, target).await?; + Ok((ttid, data)) + } + let timelines = timelines.map_ok(|ttid| load_timeline_index(&s3_client, &target, ttid)); + let mut timelines = std::pin::pin!(timelines.try_buffered(8)); + + while let Some(i) = timelines.next().await { + let (ttid, data) = i?; + match data.blob_data { + BlobDataParseResult::Parsed { + index_part, + index_part_generation, + s3_layers: _, + } => { + self.download_timeline( + ttid, + index_part, + index_part_generation, + &mut ancestor_layers, + ) + .await + .context("Downloading timeline")?; + } + BlobDataParseResult::Relic => {} + BlobDataParseResult::Incorrect(_) => { + tracing::error!("Bad metadata in timeline {ttid}"); + } + }; + } + } + + for (ttid, layers) in ancestor_layers.into_iter() { + tracing::info!( + "Downloading {} layers from ancvestor timeline {ttid}...", + layers.len() + ); + + self.download_layers(ttid, layers.into_iter().collect()) + .await?; + } + + Ok(()) + } +} diff --git a/safekeeper/src/bin/safekeeper.rs b/safekeeper/src/bin/safekeeper.rs index e53ccaeb3d..09c565ce71 100644 --- a/safekeeper/src/bin/safekeeper.rs +++ b/safekeeper/src/bin/safekeeper.rs @@ -177,6 +177,10 @@ 
struct Args { /// Controls how long backup will wait until uploading the partial segment. #[arg(long, value_parser = humantime::parse_duration, default_value = DEFAULT_PARTIAL_BACKUP_TIMEOUT, verbatim_doc_comment)] partial_backup_timeout: Duration, + /// Disable task to push messages to broker every second. Supposed to + /// be used in tests. + #[arg(long)] + disable_periodic_broker_push: bool, } // Like PathBufValueParser, but allows empty string. @@ -309,6 +313,7 @@ async fn main() -> anyhow::Result<()> { walsenders_keep_horizon: args.walsenders_keep_horizon, partial_backup_enabled: args.partial_backup_enabled, partial_backup_timeout: args.partial_backup_timeout, + disable_periodic_broker_push: args.disable_periodic_broker_push, }; // initialize sentry if SENTRY_DSN is provided diff --git a/safekeeper/src/broker.rs b/safekeeper/src/broker.rs index 2b1db2714b..98f58d3e49 100644 --- a/safekeeper/src/broker.rs +++ b/safekeeper/src/broker.rs @@ -10,11 +10,20 @@ use anyhow::Result; use storage_broker::parse_proto_ttid; use storage_broker::proto::subscribe_safekeeper_info_request::SubscriptionKey as ProtoSubscriptionKey; +use storage_broker::proto::FilterTenantTimelineId; +use storage_broker::proto::MessageType; +use storage_broker::proto::SafekeeperDiscoveryResponse; +use storage_broker::proto::SubscribeByFilterRequest; use storage_broker::proto::SubscribeSafekeeperInfoRequest; +use storage_broker::proto::TypeSubscription; +use storage_broker::proto::TypedMessage; use storage_broker::Request; +use std::sync::atomic::AtomicU64; +use std::sync::Arc; use std::time::Duration; use std::time::Instant; +use std::time::UNIX_EPOCH; use tokio::task::JoinHandle; use tokio::time::sleep; use tracing::*; @@ -31,6 +40,12 @@ const PUSH_INTERVAL_MSEC: u64 = 1000; /// Push once in a while data about all active timelines to the broker. 
async fn push_loop(conf: SafeKeeperConf) -> anyhow::Result<()> { + if conf.disable_periodic_broker_push { + info!("broker push_loop is disabled, doing nothing..."); + futures::future::pending::<()>().await; // sleep forever + return Ok(()); + } + let mut client = storage_broker::connect(conf.broker_endpoint.clone(), conf.broker_keepalive_interval)?; let push_interval = Duration::from_millis(PUSH_INTERVAL_MSEC); @@ -75,7 +90,7 @@ async fn push_loop(conf: SafeKeeperConf) -> anyhow::Result<()> { } /// Subscribe and fetch all the interesting data from the broker. -async fn pull_loop(conf: SafeKeeperConf) -> Result<()> { +async fn pull_loop(conf: SafeKeeperConf, stats: Arc) -> Result<()> { let mut client = storage_broker::connect(conf.broker_endpoint, conf.broker_keepalive_interval)?; // TODO: subscribe only to local timelines instead of all @@ -94,6 +109,8 @@ async fn pull_loop(conf: SafeKeeperConf) -> Result<()> { let err_counter = BROKER_PULLED_UPDATES.with_label_values(&["error"]); while let Some(msg) = stream.message().await? { + stats.update_pulled(); + let proto_ttid = msg .tenant_timeline_id .as_ref() @@ -119,12 +136,93 @@ async fn pull_loop(conf: SafeKeeperConf) -> Result<()> { bail!("end of stream"); } +/// Process incoming discover requests. This is done in a separate task to avoid +/// interfering with the normal pull/push loops. +async fn discover_loop(conf: SafeKeeperConf, stats: Arc) -> Result<()> { + let mut client = + storage_broker::connect(conf.broker_endpoint.clone(), conf.broker_keepalive_interval)?; + + let request = SubscribeByFilterRequest { + types: vec![TypeSubscription { + r#type: MessageType::SafekeeperDiscoveryRequest as i32, + }], + tenant_timeline_id: Some(FilterTenantTimelineId { + enabled: false, + tenant_timeline_id: None, + }), + }; + + let mut stream = client + .subscribe_by_filter(request) + .await + .context("subscribe_by_filter request failed")? 
+ .into_inner(); + + let discover_counter = BROKER_PULLED_UPDATES.with_label_values(&["discover"]); + + while let Some(typed_msg) = stream.message().await? { + stats.update_pulled(); + + match typed_msg.r#type() { + MessageType::SafekeeperDiscoveryRequest => { + let msg = typed_msg + .safekeeper_discovery_request + .expect("proto type mismatch from broker message"); + + let proto_ttid = msg + .tenant_timeline_id + .as_ref() + .ok_or_else(|| anyhow!("missing tenant_timeline_id"))?; + let ttid = parse_proto_ttid(proto_ttid)?; + if let Ok(tli) = GlobalTimelines::get(ttid) { + // we received a discovery request for a timeline we know about + discover_counter.inc(); + + // create and reply with discovery response + let sk_info = tli.get_safekeeper_info(&conf).await; + let response = SafekeeperDiscoveryResponse { + safekeeper_id: sk_info.safekeeper_id, + tenant_timeline_id: sk_info.tenant_timeline_id, + commit_lsn: sk_info.commit_lsn, + safekeeper_connstr: sk_info.safekeeper_connstr, + availability_zone: sk_info.availability_zone, + }; + + // note this is a blocking call + client + .publish_one(TypedMessage { + r#type: MessageType::SafekeeperDiscoveryResponse as i32, + safekeeper_timeline_info: None, + safekeeper_discovery_request: None, + safekeeper_discovery_response: Some(response), + }) + .await?; + } + } + + _ => { + warn!( + "unexpected message type i32 {}, {:?}", + typed_msg.r#type, + typed_msg.r#type() + ); + } + } + } + bail!("end of stream"); +} + pub async fn task_main(conf: SafeKeeperConf) -> anyhow::Result<()> { info!("started, broker endpoint {:?}", conf.broker_endpoint); let mut ticker = tokio::time::interval(Duration::from_millis(RETRY_INTERVAL_MSEC)); let mut push_handle: Option>> = None; let mut pull_handle: Option>> = None; + let mut discover_handle: Option>> = None; + + let stats = Arc::new(BrokerStats::new()); + let stats_task = task_stats(stats.clone()); + tokio::pin!(stats_task); // Selecting on JoinHandles requires some squats; is there a better 
way to // reap tasks individually? @@ -153,13 +251,77 @@ pub async fn task_main(conf: SafeKeeperConf) -> anyhow::Result<()> { }; pull_handle = None; }, + res = async { discover_handle.as_mut().unwrap().await }, if discover_handle.is_some() => { + // was it panic or normal error? + match res { + Ok(res_internal) => if let Err(err_inner) = res_internal { + warn!("discover task failed: {:?}", err_inner); + } + Err(err_outer) => { warn!("discover task panicked: {:?}", err_outer) } + }; + discover_handle = None; + }, _ = ticker.tick() => { if push_handle.is_none() { push_handle = Some(tokio::spawn(push_loop(conf.clone()))); } if pull_handle.is_none() { - pull_handle = Some(tokio::spawn(pull_loop(conf.clone()))); + pull_handle = Some(tokio::spawn(pull_loop(conf.clone(), stats.clone()))); } + if discover_handle.is_none() { + discover_handle = Some(tokio::spawn(discover_loop(conf.clone(), stats.clone()))); + } + }, + _ = &mut stats_task => {} + } + } +} + +struct BrokerStats { + /// Timestamp of the last received message from the broker. + last_pulled_ts: AtomicU64, +} + +impl BrokerStats { + fn new() -> Self { + BrokerStats { + last_pulled_ts: AtomicU64::new(0), + } + } + + fn now_millis() -> u64 { + std::time::SystemTime::now() + .duration_since(UNIX_EPOCH) + .expect("time is before epoch") + .as_millis() as u64 + } + + /// Update last_pulled timestamp to current time. + fn update_pulled(&self) { + self.last_pulled_ts + .store(Self::now_millis(), std::sync::atomic::Ordering::Relaxed); + } +} + +/// Periodically write to logs if there are issues with receiving data from the broker. +async fn task_stats(stats: Arc) { + let warn_duration = Duration::from_secs(10); + let mut ticker = tokio::time::interval(warn_duration); + + loop { + tokio::select! 
{ + _ = ticker.tick() => { + let last_pulled = stats.last_pulled_ts.load(std::sync::atomic::Ordering::SeqCst); + if last_pulled == 0 { + // no broker updates yet + continue; + } + + let now = BrokerStats::now_millis(); + if now > last_pulled && now - last_pulled > warn_duration.as_millis() as u64 { + let ts = chrono::NaiveDateTime::from_timestamp_millis(last_pulled as i64).expect("invalid timestamp"); + info!("no broker updates for some time, last update: {:?}", ts); + } } } } diff --git a/safekeeper/src/lib.rs b/safekeeper/src/lib.rs index 9b4d4dbb38..543714a54e 100644 --- a/safekeeper/src/lib.rs +++ b/safekeeper/src/lib.rs @@ -83,6 +83,7 @@ pub struct SafeKeeperConf { pub walsenders_keep_horizon: bool, pub partial_backup_enabled: bool, pub partial_backup_timeout: Duration, + pub disable_periodic_broker_push: bool, } impl SafeKeeperConf { @@ -129,6 +130,7 @@ impl SafeKeeperConf { walsenders_keep_horizon: false, partial_backup_enabled: false, partial_backup_timeout: Duration::from_secs(0), + disable_periodic_broker_push: false, } } } diff --git a/safekeeper/src/safekeeper.rs b/safekeeper/src/safekeeper.rs index f2ee0403eb..e671d4f36a 100644 --- a/safekeeper/src/safekeeper.rs +++ b/safekeeper/src/safekeeper.rs @@ -725,6 +725,18 @@ where self.state.inmem.commit_lsn ); + // Before first WAL write initialize its segment. It makes first segment + // pg_waldump'able because stream from compute doesn't include its + // segment and page headers. + // + // If we fail before first WAL write flush this action would be + // repeated, that's ok because it is idempotent. 
+ if self.wal_store.flush_lsn() == Lsn::INVALID { + self.wal_store + .initialize_first_segment(msg.start_streaming_at) + .await?; + } + // TODO: cross check divergence point, check if msg.start_streaming_at corresponds to // intersection of our history and history from msg @@ -1007,6 +1019,10 @@ mod tests { self.lsn } + async fn initialize_first_segment(&mut self, _init_lsn: Lsn) -> Result<()> { + Ok(()) + } + async fn write_wal(&mut self, startpos: Lsn, buf: &[u8]) -> Result<()> { self.lsn = startpos + buf.len() as u64; Ok(()) diff --git a/safekeeper/src/wal_backup.rs b/safekeeper/src/wal_backup.rs index e3f6a606a0..e496f07114 100644 --- a/safekeeper/src/wal_backup.rs +++ b/safekeeper/src/wal_backup.rs @@ -18,7 +18,7 @@ use std::time::Duration; use postgres_ffi::v14::xlog_utils::XLogSegNoOffsetToRecPtr; use postgres_ffi::XLogFileName; use postgres_ffi::{XLogSegNo, PG_TLI}; -use remote_storage::{GenericRemoteStorage, RemotePath, StorageMetadata}; +use remote_storage::{GenericRemoteStorage, ListingMode, RemotePath, StorageMetadata}; use tokio::fs::File; use tokio::select; @@ -601,12 +601,18 @@ pub async fn delete_timeline(ttid: &TenantTimelineId) -> Result<()> { backoff::retry( || async { // Do list-delete in batch_size batches to make progress even if there a lot of files. - // Alternatively we could make list_files return iterator, but it is more complicated and + // Alternatively we could make remote storage list return iterator, but it is more complicated and // I'm not sure deleting while iterating is expected in s3. loop { let files = storage - .list_files(Some(&remote_path), Some(batch_size), &cancel) - .await?; + .list( + Some(&remote_path), + ListingMode::NoDelimiter, + Some(batch_size), + &cancel, + ) + .await? 
+ .keys; if files.is_empty() { return Ok(()); // done } @@ -666,8 +672,9 @@ pub async fn copy_s3_segments( let cancel = CancellationToken::new(); let files = storage - .list_files(Some(&remote_path), None, &cancel) - .await?; + .list(Some(&remote_path), ListingMode::NoDelimiter, None, &cancel) + .await? + .keys; let uploaded_segments = &files .iter() diff --git a/safekeeper/src/wal_storage.rs b/safekeeper/src/wal_storage.rs index 147f318b9f..6bc8c7c3f9 100644 --- a/safekeeper/src/wal_storage.rs +++ b/safekeeper/src/wal_storage.rs @@ -38,6 +38,12 @@ pub trait Storage { /// LSN of last durably stored WAL record. fn flush_lsn(&self) -> Lsn; + /// Initialize segment by creating proper long header at the beginning of + /// the segment and short header at the page of given LSN. This is only used + /// for timeline initialization because compute will stream data only since + /// init_lsn. Other segment headers are included in compute stream. + async fn initialize_first_segment(&mut self, init_lsn: Lsn) -> Result<()>; + /// Write piece of WAL from buf to disk, but not necessarily sync it. async fn write_wal(&mut self, startpos: Lsn, buf: &[u8]) -> Result<()>; @@ -78,6 +84,8 @@ pub struct PhysicalStorage { /// Size of WAL segment in bytes. wal_seg_size: usize, + pg_version: u32, + system_id: u64, /// Written to disk, but possibly still in the cache and not fully persisted. /// Also can be ahead of record_lsn, if happen to be in the middle of a WAL record. 
@@ -169,6 +177,8 @@ impl PhysicalStorage { timeline_dir, conf: conf.clone(), wal_seg_size, + pg_version: state.server.pg_version, + system_id: state.server.system_id, write_lsn, write_record_lsn: write_lsn, flush_record_lsn: flush_lsn, @@ -324,6 +334,20 @@ impl Storage for PhysicalStorage { self.flush_record_lsn } + async fn initialize_first_segment(&mut self, init_lsn: Lsn) -> Result<()> { + let segno = init_lsn.segment_number(self.wal_seg_size); + let (mut file, _) = self.open_or_create(segno).await?; + let major_pg_version = self.pg_version / 10000; + let wal_seg = + postgres_ffi::generate_wal_segment(segno, self.system_id, major_pg_version, init_lsn)?; + file.seek(SeekFrom::Start(0)).await?; + file.write_all(&wal_seg).await?; + file.flush().await?; + info!("initialized segno {} at lsn {}", segno, init_lsn); + // note: file is *not* fsynced + Ok(()) + } + /// Write WAL to disk. async fn write_wal(&mut self, startpos: Lsn, buf: &[u8]) -> Result<()> { // Disallow any non-sequential writes, which can result in gaps or overwrites. 
diff --git a/safekeeper/tests/walproposer_sim/safekeeper.rs b/safekeeper/tests/walproposer_sim/safekeeper.rs index bc21c4d765..27e2a4453b 100644 --- a/safekeeper/tests/walproposer_sim/safekeeper.rs +++ b/safekeeper/tests/walproposer_sim/safekeeper.rs @@ -178,6 +178,7 @@ pub fn run_server(os: NodeOs, disk: Arc) -> Result<()> { walsenders_keep_horizon: false, partial_backup_enabled: false, partial_backup_timeout: Duration::from_secs(0), + disable_periodic_broker_push: false, }; let mut global = GlobalMap::new(disk, conf.clone())?; diff --git a/safekeeper/tests/walproposer_sim/safekeeper_disk.rs b/safekeeper/tests/walproposer_sim/safekeeper_disk.rs index 35bca325aa..c2db9de78a 100644 --- a/safekeeper/tests/walproposer_sim/safekeeper_disk.rs +++ b/safekeeper/tests/walproposer_sim/safekeeper_disk.rs @@ -182,6 +182,10 @@ impl wal_storage::Storage for DiskWALStorage { self.flush_record_lsn } + async fn initialize_first_segment(&mut self, _init_lsn: Lsn) -> Result<()> { + Ok(()) + } + /// Write piece of WAL from buf to disk, but not necessarily sync it. async fn write_wal(&mut self, startpos: Lsn, buf: &[u8]) -> Result<()> { if self.write_lsn != startpos { diff --git a/scripts/export_import_between_pageservers.py b/scripts/export_import_between_pageservers.py deleted file mode 100755 index 84b69cb36a..0000000000 --- a/scripts/export_import_between_pageservers.py +++ /dev/null @@ -1,730 +0,0 @@ -# -# Script to export tenants from one pageserver and import them into another page server. -# -# Outline of steps: -# 1. Get `(last_lsn, prev_lsn)` from old pageserver -# 2. Get `fullbackup` from old pageserver, which creates a basebackup tar file -# 3. This tar file might be missing relation files for empty relations, if the pageserver -# is old enough (we didn't always store those). So to recreate them, we start a local -# vanilla postgres on this basebackup and ask it what relations should exist, then touch -# any missing files and re-pack the tar. 
-# TODO This functionality is no longer needed, so we can delete it later if we don't -# end up using the same utils for the pg 15 upgrade. Not sure. -# 4. We import the patched basebackup into a new pageserver -# 5. We export again via fullbackup, now from the new pageserver and compare the returned -# tar file with the one we imported. This confirms that we imported everything that was -# exported, but doesn't guarantee correctness (what if we didn't **export** everything -# initially?) -# 6. We wait for the new pageserver's remote_consistent_lsn to catch up -# -# For more context on how to use this, see: -# https://www.notion.so/neondatabase/Storage-format-migration-9a8eba33ccf8417ea8cf50e6a0c542cf - -import argparse -import os -import shutil -import subprocess -import tempfile -import time -import uuid -from contextlib import closing -from pathlib import Path -from typing import Any, Dict, List, Optional, Tuple, cast - -import psycopg2 -import requests -from psycopg2.extensions import connection as PgConnection -from psycopg2.extensions import parse_dsn - -############################################### -### client-side utils copied from test fixtures -############################################### - -Env = Dict[str, str] - -_global_counter = 0 - - -def global_counter() -> int: - """A really dumb global counter. - This is useful for giving output files a unique number, so if we run the - same command multiple times we can keep their output separate. - """ - global _global_counter - _global_counter += 1 - return _global_counter - - -def subprocess_capture(capture_dir: str, cmd: List[str], **kwargs: Any) -> str: - """Run a process and capture its output - Output will go to files named "cmd_NNN.stdout" and "cmd_NNN.stderr" - where "cmd" is the name of the program and NNN is an incrementing - counter. - If those files already exist, we will overwrite them. - Returns basepath for files with captured output. 
- """ - assert isinstance(cmd, list) - base = f"{os.path.basename(cmd[0])}_{global_counter()}" - basepath = os.path.join(capture_dir, base) - stdout_filename = basepath + ".stdout" - stderr_filename = basepath + ".stderr" - - with open(stdout_filename, "w") as stdout_f: - with open(stderr_filename, "w") as stderr_f: - print(f'(capturing output to "{base}.stdout")') - subprocess.run(cmd, **kwargs, stdout=stdout_f, stderr=stderr_f) - - return basepath - - -class PgBin: - """A helper class for executing postgres binaries""" - - def __init__(self, log_dir: Path, pg_distrib_dir, pg_version): - self.log_dir = log_dir - self.pg_bin_path = os.path.join(str(pg_distrib_dir), f"v{pg_version}", "bin") - self.env = os.environ.copy() - self.env["LD_LIBRARY_PATH"] = os.path.join(str(pg_distrib_dir), f"v{pg_version}", "lib") - - def _fixpath(self, command: List[str]): - if "/" not in command[0]: - command[0] = os.path.join(self.pg_bin_path, command[0]) - - def _build_env(self, env_add: Optional[Env]) -> Env: - if env_add is None: - return self.env - env = self.env.copy() - env.update(env_add) - return env - - def run(self, command: List[str], env: Optional[Env] = None, cwd: Optional[str] = None): - """ - Run one of the postgres binaries. - The command should be in list form, e.g. ['pgbench', '-p', '55432'] - All the necessary environment variables will be set. - If the first argument (the command name) doesn't include a path (no '/' - characters present), then it will be edited to include the correct path. - If you want stdout/stderr captured to files, use `run_capture` instead. - """ - - self._fixpath(command) - print(f'Running command "{" ".join(command)}"') - env = self._build_env(env) - subprocess.run(command, env=env, cwd=cwd, check=True) - - def run_capture( - self, - command: List[str], - env: Optional[Env] = None, - cwd: Optional[str] = None, - **kwargs: Any, - ) -> str: - """ - Run one of the postgres binaries, with stderr and stdout redirected to a file. 
- This is just like `run`, but for chatty programs. Returns basepath for files - with captured output. - """ - - self._fixpath(command) - print(f'Running command "{" ".join(command)}"') - env = self._build_env(env) - return subprocess_capture( - str(self.log_dir), command, env=env, cwd=cwd, check=True, **kwargs - ) - - -class PgProtocol: - """Reusable connection logic""" - - def __init__(self, **kwargs): - self.default_options = kwargs - - def conn_options(self, **kwargs): - conn_options = self.default_options.copy() - if "dsn" in kwargs: - conn_options.update(parse_dsn(kwargs["dsn"])) - conn_options.update(kwargs) - - # Individual statement timeout in seconds. 2 minutes should be - # enough for our tests, but if you need a longer, you can - # change it by calling "SET statement_timeout" after - # connecting. - conn_options["options"] = f"-cstatement_timeout=120s {conn_options.get('options', '')}" - - return conn_options - - # autocommit=True here by default because that's what we need most of the time - def connect(self, autocommit=True, **kwargs) -> PgConnection: - """ - Connect to the node. - Returns psycopg2's connection object. - This method passes all extra params to connstr. - """ - conn: PgConnection = psycopg2.connect(**self.conn_options(**kwargs)) - - # WARNING: this setting affects *all* tests! - conn.autocommit = autocommit - return conn - - def safe_psql(self, query: str, **kwargs: Any) -> List[Tuple[Any, ...]]: - """ - Execute query against the node and return all rows. - This method passes all extra params to connstr. - """ - return self.safe_psql_many([query], **kwargs)[0] - - def safe_psql_many(self, queries: List[str], **kwargs: Any) -> List[List[Tuple[Any, ...]]]: - """ - Execute queries against the node and return all rows. - This method passes all extra params to connstr. 
- """ - result: List[List[Any]] = [] - with closing(self.connect(**kwargs)) as conn: - with conn.cursor() as cur: - for query in queries: - print(f"Executing query: {query}") - cur.execute(query) - - if cur.description is None: - result.append([]) # query didn't return data - else: - result.append(cast(List[Any], cur.fetchall())) - return result - - -class VanillaPostgres(PgProtocol): - def __init__(self, pgdatadir: Path, pg_bin: PgBin, port: int, init=True): - super().__init__(host="localhost", port=port, dbname="postgres") - self.pgdatadir = pgdatadir - self.pg_bin = pg_bin - self.running = False - if init: - self.pg_bin.run_capture(["initdb", "-D", str(pgdatadir)]) - self.configure([f"port = {port}\n"]) - - def configure(self, options: List[str]): - """Append lines into postgresql.conf file.""" - assert not self.running - with open(os.path.join(self.pgdatadir, "postgresql.conf"), "a") as conf_file: - conf_file.write("\n".join(options)) - - def start(self, log_path: Optional[str] = None): - assert not self.running - self.running = True - - log_path = log_path or os.path.join(self.pgdatadir, "pg.log") - - self.pg_bin.run_capture( - ["pg_ctl", "-w", "-D", str(self.pgdatadir), "-l", log_path, "start"] - ) - - def stop(self): - assert self.running - self.running = False - self.pg_bin.run_capture(["pg_ctl", "-w", "-D", str(self.pgdatadir), "stop"]) - - def __enter__(self): - return self - - def __exit__(self, exc_type, exc, tb): - if self.running: - self.stop() - - -class NeonPageserverApiException(Exception): - pass - - -class NeonPageserverHttpClient(requests.Session): - def __init__(self, host, port): - super().__init__() - self.host = host - self.port = port - - def verbose_error(self, res: requests.Response): - try: - res.raise_for_status() - except requests.RequestException as e: - try: - msg = res.json()["msg"] - except: # noqa: E722 - msg = "" - raise NeonPageserverApiException(msg) from e - - def check_status(self): - 
self.get(f"http://{self.host}:{self.port}/v1/status").raise_for_status() - - def tenant_list(self): - res = self.get(f"http://{self.host}:{self.port}/v1/tenant") - self.verbose_error(res) - res_json = res.json() - assert isinstance(res_json, list) - return res_json - - def tenant_create(self, new_tenant_id: uuid.UUID, ok_if_exists): - res = self.post( - f"http://{self.host}:{self.port}/v1/tenant", - json={"new_tenant_id": new_tenant_id.hex, "generation": 1}, - ) - - if res.status_code == 409: - if ok_if_exists: - print(f"could not create tenant: already exists for id {new_tenant_id}") - else: - res.raise_for_status() - elif res.status_code == 201: - print(f"created tenant {new_tenant_id}") - else: - self.verbose_error(res) - - return new_tenant_id - - def timeline_list(self, tenant_id: uuid.UUID): - res = self.get(f"http://{self.host}:{self.port}/v1/tenant/{tenant_id.hex}/timeline") - self.verbose_error(res) - res_json = res.json() - assert isinstance(res_json, list) - return res_json - - def timeline_detail(self, tenant_id: uuid.UUID, timeline_id: uuid.UUID) -> Dict[Any, Any]: - res = self.get( - f"http://localhost:{self.port}/v1/tenant/{tenant_id.hex}/timeline/{timeline_id.hex}?include-non-incremental-logical-size=true" - ) - self.verbose_error(res) - res_json = res.json() - assert isinstance(res_json, dict) - return res_json - - -def lsn_to_hex(num: int) -> str: - """Convert lsn from int to standard hex notation.""" - return f"{num >> 32:X}/{num & 0xFFFFFFFF:X}" - - -def lsn_from_hex(lsn_hex: str) -> int: - """Convert lsn from hex notation to int.""" - left, right = lsn_hex.split("/") - return (int(left, 16) << 32) + int(right, 16) - - -def remote_consistent_lsn( - pageserver_http_client: NeonPageserverHttpClient, tenant: uuid.UUID, timeline: uuid.UUID -) -> int: - detail = pageserver_http_client.timeline_detail(tenant, timeline) - - lsn_str = detail["remote_consistent_lsn"] - assert isinstance(lsn_str, str) - return lsn_from_hex(lsn_str) - - -def 
wait_for_upload( - pageserver_http_client: NeonPageserverHttpClient, - tenant: uuid.UUID, - timeline: uuid.UUID, - lsn: int, -): - """waits for local timeline upload up to specified lsn""" - for i in range(10): - current_lsn = remote_consistent_lsn(pageserver_http_client, tenant, timeline) - if current_lsn >= lsn: - return - print( - f"waiting for remote_consistent_lsn to reach {lsn_to_hex(lsn)}, now {lsn_to_hex(current_lsn)}, iteration {i + 1}" - ) - time.sleep(1) - - raise Exception( - f"timed out while waiting for remote_consistent_lsn to reach {lsn_to_hex(lsn)}, was {lsn_to_hex(current_lsn)}" - ) - - -############## -# End of utils -############## - - -def pack_base(log_dir, restored_dir, output_tar): - """Create tar file from basebackup, being careful to produce relative filenames.""" - tmp_tar_name = "tmp.tar" - tmp_tar_path = os.path.join(restored_dir, tmp_tar_name) - cmd = ["tar", "-cf", tmp_tar_name] + os.listdir(restored_dir) - # We actually cd into the dir and call tar from there. If we call tar from - # outside we won't encode filenames as relative, and they won't parse well - # on import. - subprocess_capture(log_dir, cmd, cwd=restored_dir) - shutil.move(tmp_tar_path, output_tar) - - -def reconstruct_paths(log_dir, pg_bin, base_tar, port: int): - """Reconstruct what relation files should exist in the datadir by querying postgres.""" - with tempfile.TemporaryDirectory() as restored_dir: - # Unpack the base tar - subprocess_capture(log_dir, ["tar", "-xf", base_tar, "-C", restored_dir]) - - # Start a vanilla postgres from the given datadir and query it to find - # what relfiles should exist, but possibly don't. 
- with VanillaPostgres(Path(restored_dir), pg_bin, port, init=False) as vanilla_pg: - vanilla_pg.configure([f"port={port}"]) - vanilla_pg.start(log_path=os.path.join(log_dir, "tmp_pg.log")) - - # Create database based on template0 because we can't connect to template0 - query = "create database template0copy template template0" - vanilla_pg.safe_psql(query, user="cloud_admin") - vanilla_pg.safe_psql("CHECKPOINT", user="cloud_admin") - - # Get all databases - query = "select oid, datname from pg_database" - oid_dbname_pairs = vanilla_pg.safe_psql(query, user="cloud_admin") - template0_oid = [ - oid for (oid, database) in oid_dbname_pairs if database == "template0" - ][0] - - # Get rel paths for each database - for oid, database in oid_dbname_pairs: - if database == "template0": - # We can't connect to template0 - continue - - query = "select relname, pg_relation_filepath(oid) from pg_class" - result = vanilla_pg.safe_psql(query, user="cloud_admin", dbname=database) - for _relname, filepath in result: - if filepath is not None: - if database == "template0copy": - # Add all template0copy paths to template0 - prefix = f"base/{oid}/" - if filepath.startswith(prefix): - suffix = filepath[len(prefix) :] - yield f"base/{template0_oid}/{suffix}" - elif filepath.startswith("global"): - print(f"skipping {database} global file {filepath}") - else: - raise AssertionError - else: - yield filepath - - -def touch_missing_rels(log_dir, corrupt_tar, output_tar, paths): - """Add the appropriate empty files to a basebadkup tar.""" - with tempfile.TemporaryDirectory() as restored_dir: - # Unpack the base tar - subprocess_capture(log_dir, ["tar", "-xf", corrupt_tar, "-C", restored_dir]) - - # Touch files that don't exist - for path in paths: - absolute_path = os.path.join(restored_dir, path) - exists = os.path.exists(absolute_path) - if not exists: - print(f"File {absolute_path} didn't exist. 
Creating..") - Path(absolute_path).touch() - - # Repackage - pack_base(log_dir, restored_dir, output_tar) - - -# HACK This is a workaround for exporting from old pageservers that -# can't export empty relations. In this case we need to start -# a vanilla postgres from the exported datadir, and query it -# to see what empty relations are missing, and then create -# those empty files before importing. -def add_missing_rels(base_tar, output_tar, log_dir, pg_bin, tmp_pg_port: int): - reconstructed_paths = set(reconstruct_paths(log_dir, pg_bin, base_tar, tmp_pg_port)) - touch_missing_rels(log_dir, base_tar, output_tar, reconstructed_paths) - - -def get_rlsn(pageserver_connstr, tenant_id, timeline_id): - with closing(psycopg2.connect(pageserver_connstr)) as conn: - conn.autocommit = True - with conn.cursor() as cur: - cmd = f"get_last_record_rlsn {tenant_id} {timeline_id}" - cur.execute(cmd) - res = cur.fetchone() - assert res is not None - prev_lsn = res[0] - last_lsn = res[1] - - return last_lsn, prev_lsn - - -def import_timeline( - args, - psql_path, - pageserver_connstr, - pageserver_http, - tenant_id, - timeline_id, - last_lsn, - prev_lsn, - tar_filename, - pg_version, -): - # Import timelines to new pageserver - import_cmd = f"import basebackup {tenant_id} {timeline_id} {last_lsn} {last_lsn} {pg_version}" - full_cmd = rf"""cat {tar_filename} | {psql_path} {pageserver_connstr} -c '{import_cmd}' """ - - stderr_filename2 = os.path.join(args.work_dir, f"import_{tenant_id}_{timeline_id}.stderr") - stdout_filename = os.path.join(args.work_dir, f"import_{tenant_id}_{timeline_id}.stdout") - - print(f"Running: {full_cmd}") - - with open(stdout_filename, "w") as stdout_f: - with open(stderr_filename2, "w") as stderr_f: - print(f"(capturing output to {stdout_filename})") - pg_bin = PgBin(args.work_dir, args.pg_distrib_dir, pg_version) - subprocess.run( - full_cmd, - stdout=stdout_f, - stderr=stderr_f, - env=pg_bin._build_env(None), - shell=True, - check=True, - ) - - 
print("Done import") - - # Wait until pageserver persists the files - wait_for_upload( - pageserver_http, uuid.UUID(tenant_id), uuid.UUID(timeline_id), lsn_from_hex(last_lsn) - ) - - -def export_timeline( - args, - psql_path, - pageserver_connstr, - tenant_id, - timeline_id, - last_lsn, - prev_lsn, - tar_filename, - pg_version, -): - # Choose filenames - incomplete_filename = tar_filename + ".incomplete" - stderr_filename = os.path.join(args.work_dir, f"{tenant_id}_{timeline_id}.stderr") - - # Construct export command - query = f"fullbackup {tenant_id} {timeline_id} {last_lsn} {prev_lsn}" - cmd = [psql_path, "--no-psqlrc", pageserver_connstr, "-c", query] - - # Run export command - print(f"Running: {cmd}") - with open(incomplete_filename, "w") as stdout_f: - with open(stderr_filename, "w") as stderr_f: - print(f"(capturing output to {incomplete_filename})") - pg_bin = PgBin(args.work_dir, args.pg_distrib_dir, pg_version) - subprocess.run( - cmd, stdout=stdout_f, stderr=stderr_f, env=pg_bin._build_env(None), check=True - ) - - # Add missing rels - pg_bin = PgBin(args.work_dir, args.pg_distrib_dir, pg_version) - add_missing_rels(incomplete_filename, tar_filename, args.work_dir, pg_bin, args.tmp_pg_port) - - # Log more info - file_size = os.path.getsize(tar_filename) - print(f"Done export: {tar_filename}, size {file_size}") - - -def main(args: argparse.Namespace): - # any psql version will do here. 
use current DEFAULT_PG_VERSION = 15 - psql_path = str(Path(args.pg_distrib_dir) / "v15" / "bin" / "psql") - - old_pageserver_host = args.old_pageserver_host - new_pageserver_host = args.new_pageserver_host - - old_http_client = NeonPageserverHttpClient(old_pageserver_host, args.old_pageserver_http_port) - old_http_client.check_status() - old_pageserver_connstr = f"postgresql://{old_pageserver_host}:{args.old_pageserver_pg_port}" - - new_http_client = NeonPageserverHttpClient(new_pageserver_host, args.new_pageserver_http_port) - new_http_client.check_status() - new_pageserver_connstr = f"postgresql://{new_pageserver_host}:{args.new_pageserver_pg_port}" - - for tenant_id in args.tenants: - print(f"Tenant: {tenant_id}") - timelines = old_http_client.timeline_list(uuid.UUID(tenant_id)) - print(f"Timelines: {timelines}") - - # Create tenant in new pageserver - if args.only_import is False and not args.timelines: - new_http_client.tenant_create(uuid.UUID(tenant_id), args.ok_if_exists) - - for timeline in timelines: - # Skip timelines we don't need to export - if args.timelines and timeline["timeline_id"] not in args.timelines: - print(f"Skipping timeline {timeline['timeline_id']}") - continue - - # Choose filenames - tar_filename = os.path.join( - args.work_dir, f"{timeline['tenant_id']}_{timeline['timeline_id']}.tar" - ) - - pg_version = timeline["pg_version"] - - # Export timeline from old pageserver - if args.only_import is False: - last_lsn, prev_lsn = get_rlsn( - old_pageserver_connstr, - timeline["tenant_id"], - timeline["timeline_id"], - ) - export_timeline( - args, - psql_path, - old_pageserver_connstr, - timeline["tenant_id"], - timeline["timeline_id"], - last_lsn, - prev_lsn, - tar_filename, - pg_version, - ) - - # Import into new pageserver - import_timeline( - args, - psql_path, - new_pageserver_connstr, - new_http_client, - timeline["tenant_id"], - timeline["timeline_id"], - last_lsn, - prev_lsn, - tar_filename, - pg_version, - ) - - # Re-export and compare 
- re_export_filename = tar_filename + ".reexport" - export_timeline( - args, - psql_path, - new_pageserver_connstr, - timeline["tenant_id"], - timeline["timeline_id"], - last_lsn, - prev_lsn, - re_export_filename, - pg_version, - ) - - # Check the size is the same - old_size = (os.path.getsize(tar_filename),) - new_size = (os.path.getsize(re_export_filename),) - if old_size != new_size: - raise AssertionError(f"Sizes don't match old: {old_size} new: {new_size}") - - -def non_zero_tcp_port(arg: Any): - port = int(arg) - if port < 1 or port > 65535: - raise argparse.ArgumentTypeError(f"invalid tcp port: {arg}") - return port - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser.add_argument( - "--tenant-id", - dest="tenants", - required=True, - nargs="+", - help="Id of the tenant to migrate. You can pass multiple arguments", - ) - parser.add_argument( - "--timeline-id", - dest="timelines", - required=False, - nargs="+", - help="Id of the timeline to migrate. You can pass multiple arguments", - ) - parser.add_argument( - "--from-host", - dest="old_pageserver_host", - required=True, - help="Host of the pageserver to migrate data from", - ) - parser.add_argument( - "--from-http-port", - dest="old_pageserver_http_port", - required=False, - type=int, - default=9898, - help="HTTP port of the pageserver to migrate data from. Default: 9898", - ) - parser.add_argument( - "--from-pg-port", - dest="old_pageserver_pg_port", - required=False, - type=int, - default=6400, - help="pg port of the pageserver to migrate data from. Default: 6400", - ) - parser.add_argument( - "--to-host", - dest="new_pageserver_host", - required=True, - help="Host of the pageserver to migrate data to", - ) - parser.add_argument( - "--to-http-port", - dest="new_pageserver_http_port", - required=False, - default=9898, - type=int, - help="HTTP port of the pageserver to migrate data to. 
Default: 9898", - ) - parser.add_argument( - "--to-pg-port", - dest="new_pageserver_pg_port", - required=False, - default=6400, - type=int, - help="pg port of the pageserver to migrate data to. Default: 6400", - ) - parser.add_argument( - "--ignore-tenant-exists", - dest="ok_if_exists", - required=False, - help="Ignore error if we are trying to create the tenant that already exists. It can be dangerous if existing tenant already contains some data.", - ) - parser.add_argument( - "--pg-distrib-dir", - dest="pg_distrib_dir", - required=False, - default="/usr/local/", - help="Path where postgres binaries are installed. Default: /usr/local/", - ) - parser.add_argument( - "--psql-path", - dest="psql_path", - required=False, - default="/usr/local/v14/bin/psql", - help="Path to the psql binary. Default: /usr/local/v14/bin/psql", - ) - parser.add_argument( - "--only-import", - dest="only_import", - required=False, - default=False, - action="store_true", - help="Skip export and tenant creation part", - ) - parser.add_argument( - "--work-dir", - dest="work_dir", - required=True, - default=False, - help="directory where temporary tar files are stored", - ) - parser.add_argument( - "--tmp-pg-port", - dest="tmp_pg_port", - required=False, - default=55439, - type=non_zero_tcp_port, - help="localhost port to use for temporary postgres instance", - ) - args = parser.parse_args() - main(args) diff --git a/scripts/flaky_tests.py b/scripts/flaky_tests.py index 853c67d218..878840fcee 100755 --- a/scripts/flaky_tests.py +++ b/scripts/flaky_tests.py @@ -15,8 +15,7 @@ FLAKY_TESTS_QUERY = """ DISTINCT parent_suite, suite, name FROM results WHERE - started_at > CURRENT_DATE - INTERVAL '10' day - AND started_at > '2024-03-11 14:50:11.845+00' -- we switched the default PAGESERVER_VIRTUAL_FILE_IO_ENGINE to `tokio-epoll-uring` from `std-fs` on this date, we want to ignore the flaky tests for `std-fs` + started_at > CURRENT_DATE - INTERVAL '%s' day AND ( (status IN ('failed', 'broken') AND 
reference = 'refs/heads/main') OR flaky diff --git a/scripts/sk_cleanup_tenants/script.py b/scripts/sk_cleanup_tenants/script.py index fa22433614..c20a4bb830 100644 --- a/scripts/sk_cleanup_tenants/script.py +++ b/scripts/sk_cleanup_tenants/script.py @@ -22,7 +22,7 @@ parser.add_argument("--safekeeper-host", required=True, type=str) args = parser.parse_args() access_key = os.getenv("CONSOLE_API_TOKEN") -endpoint: str = "https://console.stage.neon.tech/api" +endpoint: str = "https://console-stage.neon.build/api" trash_dir: Path = args.trash_dir dry_run: bool = args.dry_run diff --git a/scripts/sk_collect_dumps/readme.md b/scripts/sk_collect_dumps/readme.md index 7494a6cb78..5ae55e058b 100644 --- a/scripts/sk_collect_dumps/readme.md +++ b/scripts/sk_collect_dumps/readme.md @@ -3,7 +3,7 @@ 3. Issue admin token (add/remove .stage from url for staging/prod and setting proper API key): ``` # staging: -AUTH_TOKEN=$(curl https://console.stage.neon.tech/regions/console/api/v1/admin/issue_token -H "Accept: application/json" -H "Content-Type: application/json" -H "Authorization: Bearer $NEON_STAGING_KEY" -X POST -d '{"ttl_seconds": 43200, "scope": "safekeeperdata"}' 2>/dev/null | jq --raw-output '.jwt') +AUTH_TOKEN=$(curl https://console-stage.neon.build/regions/console/api/v1/admin/issue_token -H "Accept: application/json" -H "Content-Type: application/json" -H "Authorization: Bearer $NEON_STAGING_KEY" -X POST -d '{"ttl_seconds": 43200, "scope": "safekeeperdata"}' 2>/dev/null | jq --raw-output '.jwt') # prod: AUTH_TOKEN=$(curl https://console.neon.tech/regions/console/api/v1/admin/issue_token -H "Accept: application/json" -H "Content-Type: application/json" -H "Authorization: Bearer $NEON_PROD_KEY" -X POST -d '{"ttl_seconds": 43200, "scope": "safekeeperdata"}' 2>/dev/null | jq --raw-output '.jwt') # check diff --git a/storage_broker/src/bin/storage_broker.rs b/storage_broker/src/bin/storage_broker.rs index 4e5f8ed724..8c88b61abc 100644 --- 
a/storage_broker/src/bin/storage_broker.rs +++ b/storage_broker/src/bin/storage_broker.rs @@ -196,8 +196,13 @@ impl SubscriptionKey { /// Parse from FilterTenantTimelineId pub fn from_proto_filter_tenant_timeline_id( - f: &FilterTenantTimelineId, + opt: Option<&FilterTenantTimelineId>, ) -> Result { + if opt.is_none() { + return Ok(SubscriptionKey::All); + } + + let f = opt.unwrap(); if !f.enabled { return Ok(SubscriptionKey::All); } @@ -534,10 +539,7 @@ impl BrokerService for Broker { .remote_addr() .expect("TCPConnectInfo inserted by handler"); let proto_filter = request.into_inner(); - let ttid_filter = proto_filter - .tenant_timeline_id - .as_ref() - .ok_or_else(|| Status::new(Code::InvalidArgument, "missing tenant_timeline_id"))?; + let ttid_filter = proto_filter.tenant_timeline_id.as_ref(); let sub_key = SubscriptionKey::from_proto_filter_tenant_timeline_id(ttid_filter)?; let types_set = proto_filter diff --git a/storage_controller/Cargo.toml b/storage_controller/Cargo.toml index 165cafaf4e..194619a496 100644 --- a/storage_controller/Cargo.toml +++ b/storage_controller/Cargo.toml @@ -31,7 +31,7 @@ once_cell.workspace = true pageserver_api.workspace = true pageserver_client.workspace = true postgres_connection.workspace = true -reqwest.workspace = true +reqwest = { workspace = true, features = ["stream"] } routerify.workspace = true serde.workspace = true serde_json.workspace = true @@ -40,6 +40,8 @@ tokio.workspace = true tokio-util.workspace = true tracing.workspace = true measured.workspace = true +strum.workspace = true +strum_macros.workspace = true diesel = { version = "2.1.4", features = ["serde_json", "postgres", "r2d2"] } diesel_migrations = { version = "2.1.0" } diff --git a/storage_controller/src/compute_hook.rs b/storage_controller/src/compute_hook.rs index eb0c4472e4..9d326ef82d 100644 --- a/storage_controller/src/compute_hook.rs +++ b/storage_controller/src/compute_hook.rs @@ -3,11 +3,13 @@ use std::{collections::HashMap, time::Duration}; use 
control_plane::endpoint::{ComputeControlPlane, EndpointStatus}; use control_plane::local_env::LocalEnv; -use hyper::{Method, StatusCode}; +use futures::StreamExt; +use hyper::StatusCode; use pageserver_api::shard::{ShardCount, ShardNumber, ShardStripeSize, TenantShardId}; use postgres_connection::parse_host_port; use serde::{Deserialize, Serialize}; use tokio_util::sync::CancellationToken; +use tracing::{info_span, Instrument}; use utils::{ backoff::{self}, id::{NodeId, TenantId}, @@ -17,6 +19,8 @@ use crate::service::Config; const SLOWDOWN_DELAY: Duration = Duration::from_secs(5); +const NOTIFY_REQUEST_TIMEOUT: Duration = Duration::from_secs(10); + pub(crate) const API_CONCURRENCY: usize = 32; struct UnshardedComputeHookTenant { @@ -242,6 +246,10 @@ pub(super) struct ComputeHook { // This lock is only used in testing enviroments, to serialize calls into neon_lock neon_local_lock: tokio::sync::Mutex<()>, + + // We share a client across all notifications to enable connection re-use etc when + // sending large numbers of notifications + client: reqwest::Client, } impl ComputeHook { @@ -251,12 +259,18 @@ impl ComputeHook { .clone() .map(|jwt| format!("Bearer {}", jwt)); + let client = reqwest::ClientBuilder::new() + .timeout(NOTIFY_REQUEST_TIMEOUT) + .build() + .expect("Failed to construct HTTP client"); + Self { state: Default::default(), config, authorization_header, neon_local_lock: Default::default(), api_concurrency: tokio::sync::Semaphore::new(API_CONCURRENCY), + client, } } @@ -310,12 +324,11 @@ impl ComputeHook { async fn do_notify_iteration( &self, - client: &reqwest::Client, url: &String, reconfigure_request: &ComputeHookNotifyRequest, cancel: &CancellationToken, ) -> Result<(), NotifyError> { - let req = client.request(Method::PUT, url); + let req = self.client.request(reqwest::Method::PUT, url); let req = if let Some(value) = &self.authorization_header { req.header(reqwest::header::AUTHORIZATION, value) } else { @@ -334,8 +347,10 @@ impl ComputeHook { }; 
// Treat all 2xx responses as success - if response.status() >= StatusCode::OK && response.status() < StatusCode::MULTIPLE_CHOICES { - if response.status() != StatusCode::OK { + if response.status() >= reqwest::StatusCode::OK + && response.status() < reqwest::StatusCode::MULTIPLE_CHOICES + { + if response.status() != reqwest::StatusCode::OK { // Non-200 2xx response: it doesn't make sense to retry, but this is unexpected, so // log a warning. tracing::warn!( @@ -349,7 +364,7 @@ impl ComputeHook { // Error response codes match response.status() { - StatusCode::TOO_MANY_REQUESTS => { + reqwest::StatusCode::TOO_MANY_REQUESTS => { // TODO: 429 handling should be global: set some state visible to other requests // so that they will delay before starting, rather than all notifications trying // once before backing off. @@ -358,20 +373,30 @@ impl ComputeHook { .ok(); Err(NotifyError::SlowDown) } - StatusCode::LOCKED => { + reqwest::StatusCode::LOCKED => { // We consider this fatal, because it's possible that the operation blocking the control one is // also the one that is waiting for this reconcile. We should let the reconciler calling // this hook fail, to give control plane a chance to un-lock. 
tracing::info!("Control plane reports tenant is locked, dropping out of notify"); Err(NotifyError::Busy) } - StatusCode::SERVICE_UNAVAILABLE - | StatusCode::GATEWAY_TIMEOUT - | StatusCode::BAD_GATEWAY => Err(NotifyError::Unavailable(response.status())), - StatusCode::BAD_REQUEST | StatusCode::UNAUTHORIZED | StatusCode::FORBIDDEN => { - Err(NotifyError::Fatal(response.status())) + reqwest::StatusCode::SERVICE_UNAVAILABLE => { + Err(NotifyError::Unavailable(StatusCode::SERVICE_UNAVAILABLE)) } - _ => Err(NotifyError::Unexpected(response.status())), + reqwest::StatusCode::GATEWAY_TIMEOUT => { + Err(NotifyError::Unavailable(StatusCode::GATEWAY_TIMEOUT)) + } + reqwest::StatusCode::BAD_GATEWAY => { + Err(NotifyError::Unavailable(StatusCode::BAD_GATEWAY)) + } + + reqwest::StatusCode::BAD_REQUEST => Err(NotifyError::Fatal(StatusCode::BAD_REQUEST)), + reqwest::StatusCode::UNAUTHORIZED => Err(NotifyError::Fatal(StatusCode::UNAUTHORIZED)), + reqwest::StatusCode::FORBIDDEN => Err(NotifyError::Fatal(StatusCode::FORBIDDEN)), + status => Err(NotifyError::Unexpected( + hyper::StatusCode::from_u16(status.as_u16()) + .unwrap_or(StatusCode::INTERNAL_SERVER_ERROR), + )), } } @@ -381,8 +406,6 @@ impl ComputeHook { reconfigure_request: &ComputeHookNotifyRequest, cancel: &CancellationToken, ) -> Result<(), NotifyError> { - let client = reqwest::Client::new(); - // We hold these semaphore units across all retries, rather than only across each // HTTP request: this is to preserve fairness and avoid a situation where a retry might // time out waiting for a semaphore. @@ -394,7 +417,7 @@ impl ComputeHook { .map_err(|_| NotifyError::ShuttingDown)?; backoff::retry( - || self.do_notify_iteration(&client, url, reconfigure_request, cancel), + || self.do_notify_iteration(url, reconfigure_request, cancel), |e| { matches!( e, @@ -411,48 +434,37 @@ impl ComputeHook { .and_then(|x| x) } - /// Call this to notify the compute (postgres) tier of new pageservers to use - /// for a tenant. 
notify() is called by each shard individually, and this function - /// will decide whether an update to the tenant is sent. An update is sent on the - /// condition that: - /// - We know a pageserver for every shard. - /// - All the shards have the same shard_count (i.e. we are not mid-split) - /// - /// Cancellation token enables callers to drop out, e.g. if calling from a Reconciler - /// that is cancelled. - /// - /// This function is fallible, including in the case that the control plane is transiently - /// unavailable. A limited number of retries are done internally to efficiently hide short unavailability - /// periods, but we don't retry forever. The **caller** is responsible for handling failures and - /// ensuring that they eventually call again to ensure that the compute is eventually notified of - /// the proper pageserver nodes for a tenant. - #[tracing::instrument(skip_all, fields(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), node_id))] - pub(super) async fn notify( + /// Synchronous phase: update the per-tenant state for the next intended notification + fn notify_prepare( &self, tenant_shard_id: TenantShardId, node_id: NodeId, stripe_size: ShardStripeSize, + ) -> MaybeSendResult { + let mut state_locked = self.state.lock().unwrap(); + + use std::collections::hash_map::Entry; + let tenant = match state_locked.entry(tenant_shard_id.tenant_id) { + Entry::Vacant(e) => e.insert(ComputeHookTenant::new( + tenant_shard_id, + stripe_size, + node_id, + )), + Entry::Occupied(e) => { + let tenant = e.into_mut(); + tenant.update(tenant_shard_id, stripe_size, node_id); + tenant + } + }; + tenant.maybe_send(tenant_shard_id.tenant_id, None) + } + + async fn notify_execute( + &self, + maybe_send_result: MaybeSendResult, + tenant_shard_id: TenantShardId, cancel: &CancellationToken, ) -> Result<(), NotifyError> { - let maybe_send_result = { - let mut state_locked = self.state.lock().unwrap(); - - use std::collections::hash_map::Entry; - 
let tenant = match state_locked.entry(tenant_shard_id.tenant_id) { - Entry::Vacant(e) => e.insert(ComputeHookTenant::new( - tenant_shard_id, - stripe_size, - node_id, - )), - Entry::Occupied(e) => { - let tenant = e.into_mut(); - tenant.update(tenant_shard_id, stripe_size, node_id); - tenant - } - }; - tenant.maybe_send(tenant_shard_id.tenant_id, None) - }; - // Process result: we may get an update to send, or we may have to wait for a lock // before trying again. let (request, mut send_lock_guard) = match maybe_send_result { @@ -460,7 +472,12 @@ impl ComputeHook { return Ok(()); } MaybeSendResult::AwaitLock(send_lock) => { - let send_locked = send_lock.lock_owned().await; + let send_locked = tokio::select! { + guard = send_lock.lock_owned() => {guard}, + _ = cancel.cancelled() => { + return Err(NotifyError::ShuttingDown) + } + }; // Lock order: maybe_send is called within the `[Self::state]` lock, and takes the send lock, but here // we have acquired the send lock and take `[Self::state]` lock. This is safe because maybe_send only uses @@ -499,6 +516,94 @@ impl ComputeHook { } result } + + /// Infallible synchronous fire-and-forget version of notify(), that sends its results to + /// a channel. Something should consume the channel and arrange to try notifying again + /// if something failed. 
+ pub(super) fn notify_background( + self: &Arc, + notifications: Vec<(TenantShardId, NodeId, ShardStripeSize)>, + result_tx: tokio::sync::mpsc::Sender>, + cancel: &CancellationToken, + ) { + let mut maybe_sends = Vec::new(); + for (tenant_shard_id, node_id, stripe_size) in notifications { + let maybe_send_result = self.notify_prepare(tenant_shard_id, node_id, stripe_size); + maybe_sends.push((tenant_shard_id, maybe_send_result)) + } + + let this = self.clone(); + let cancel = cancel.clone(); + + tokio::task::spawn(async move { + // Construct an async stream of futures to invoke the compute notify function: we do this + // in order to subsequently use .buffered() on the stream to execute with bounded parallelism. The + // ComputeHook semaphore already limits concurrency, but this way we avoid constructing+polling lots of futures which + // would mostly just be waiting on that semaphore. + let mut stream = futures::stream::iter(maybe_sends) + .map(|(tenant_shard_id, maybe_send_result)| { + let this = this.clone(); + let cancel = cancel.clone(); + + async move { + this + .notify_execute(maybe_send_result, tenant_shard_id, &cancel) + .await.map_err(|e| (tenant_shard_id, e)) + }.instrument(info_span!( + "notify_background", tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug() + )) + }) + .buffered(API_CONCURRENCY); + + loop { + tokio::select! { + next = stream.next() => { + match next { + Some(r) => { + result_tx.send(r).await.ok(); + }, + None => { + tracing::info!("Finished sending background compute notifications"); + break; + } + } + }, + _ = cancel.cancelled() => { + tracing::info!("Shutdown while running background compute notifications"); + break; + } + }; + } + }); + } + + /// Call this to notify the compute (postgres) tier of new pageservers to use + /// for a tenant. notify() is called by each shard individually, and this function + /// will decide whether an update to the tenant is sent. 
An update is sent on the + /// condition that: + /// - We know a pageserver for every shard. + /// - All the shards have the same shard_count (i.e. we are not mid-split) + /// + /// Cancellation token enables callers to drop out, e.g. if calling from a Reconciler + /// that is cancelled. + /// + /// This function is fallible, including in the case that the control plane is transiently + /// unavailable. A limited number of retries are done internally to efficiently hide short unavailability + /// periods, but we don't retry forever. The **caller** is responsible for handling failures and + /// ensuring that they eventually call again to ensure that the compute is eventually notified of + /// the proper pageserver nodes for a tenant. + #[tracing::instrument(skip_all, fields(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), node_id))] + pub(super) async fn notify( + &self, + tenant_shard_id: TenantShardId, + node_id: NodeId, + stripe_size: ShardStripeSize, + cancel: &CancellationToken, + ) -> Result<(), NotifyError> { + let maybe_send_result = self.notify_prepare(tenant_shard_id, node_id, stripe_size); + self.notify_execute(maybe_send_result, tenant_shard_id, cancel) + .await + } } #[cfg(test)] diff --git a/storage_controller/src/heartbeater.rs b/storage_controller/src/heartbeater.rs index 7669680eb6..1ef97e78eb 100644 --- a/storage_controller/src/heartbeater.rs +++ b/storage_controller/src/heartbeater.rs @@ -184,6 +184,19 @@ impl HeartbeaterTask { } } } + tracing::info!( + "Heartbeat round complete for {} nodes, {} offline", + new_state.len(), + new_state + .values() + .filter(|s| match s { + PageserverState::Available { .. 
} => { + false + } + PageserverState::Offline => true, + }) + .count() + ); let mut deltas = Vec::new(); let now = Instant::now(); diff --git a/storage_controller/src/http.rs b/storage_controller/src/http.rs index c59bcaa174..604ad6fbaa 100644 --- a/storage_controller/src/http.rs +++ b/storage_controller/src/http.rs @@ -4,10 +4,12 @@ use crate::metrics::{ }; use crate::reconciler::ReconcileError; use crate::service::{Service, STARTUP_RECONCILE_TIMEOUT}; +use anyhow::Context; use futures::Future; use hyper::header::CONTENT_TYPE; use hyper::{Body, Request, Response}; use hyper::{StatusCode, Uri}; +use metrics::{BuildInfo, NeonMetrics}; use pageserver_api::models::{ TenantConfigRequest, TenantCreateRequest, TenantLocationConfigRequest, TenantShardSplitRequest, TenantTimeTravelRequest, TimelineCreateRequest, @@ -44,15 +46,19 @@ use control_plane::storage_controller::{AttachHookRequest, InspectRequest}; use routerify::Middleware; /// State available to HTTP request handlers -#[derive(Clone)] pub struct HttpState { service: Arc, auth: Option>, + neon_metrics: NeonMetrics, allowlist_routes: Vec, } impl HttpState { - pub fn new(service: Arc, auth: Option>) -> Self { + pub fn new( + service: Arc, + auth: Option>, + build_info: BuildInfo, + ) -> Self { let allowlist_routes = ["/status", "/ready", "/metrics"] .iter() .map(|v| v.parse().unwrap()) @@ -60,6 +66,7 @@ impl HttpState { Self { service, auth, + neon_metrics: NeonMetrics::new(build_info), allowlist_routes, } } @@ -252,6 +259,12 @@ async fn handle_tenant_time_travel_remote_storage( json_response(StatusCode::OK, ()) } +fn map_reqwest_hyper_status(status: reqwest::StatusCode) -> Result { + hyper::StatusCode::from_u16(status.as_u16()) + .context("invalid status code") + .map_err(ApiError::InternalServerError) +} + async fn handle_tenant_secondary_download( service: Arc, req: Request, @@ -260,7 +273,7 @@ async fn handle_tenant_secondary_download( let wait = parse_query_param(&req, "wait_ms")?.map(Duration::from_millis); 
let (status, progress) = service.tenant_secondary_download(tenant_id, wait).await?; - json_response(status, progress) + json_response(map_reqwest_hyper_status(status)?, progress) } async fn handle_tenant_delete( @@ -271,7 +284,10 @@ async fn handle_tenant_delete( check_permissions(&req, Scope::PageServerApi)?; deletion_wrapper(service, move |service| async move { - service.tenant_delete(tenant_id).await + service + .tenant_delete(tenant_id) + .await + .and_then(map_reqwest_hyper_status) }) .await } @@ -302,7 +318,10 @@ async fn handle_tenant_timeline_delete( let timeline_id: TimelineId = parse_request_param(&req, "timeline_id")?; deletion_wrapper(service, move |service| async move { - service.tenant_timeline_delete(tenant_id, timeline_id).await + service + .tenant_timeline_delete(tenant_id, timeline_id) + .await + .and_then(map_reqwest_hyper_status) }) .await } @@ -365,11 +384,9 @@ async fn handle_tenant_timeline_passthrough( } // We have a reqest::Response, would like a http::Response - let mut builder = hyper::Response::builder() - .status(resp.status()) - .version(resp.version()); + let mut builder = hyper::Response::builder().status(map_reqwest_hyper_status(resp.status())?); for (k, v) in resp.headers() { - builder = builder.header(k, v); + builder = builder.header(k.as_str(), v.as_bytes()); } let response = builder @@ -516,6 +533,18 @@ async fn handle_tenant_drop(req: Request) -> Result, ApiErr json_response(StatusCode::OK, state.service.tenant_drop(tenant_id).await?) 
} +async fn handle_tenant_import(req: Request) -> Result, ApiError> { + let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?; + check_permissions(&req, Scope::PageServerApi)?; + + let state = get_state(&req); + + json_response( + StatusCode::OK, + state.service.tenant_import(tenant_id).await?, + ) +} + async fn handle_tenants_dump(req: Request) -> Result, ApiError> { check_permissions(&req, Scope::Admin)?; @@ -672,10 +701,11 @@ fn epilogue_metrics_middleware }) } -pub async fn measured_metrics_handler(_req: Request) -> Result, ApiError> { +pub async fn measured_metrics_handler(req: Request) -> Result, ApiError> { pub const TEXT_FORMAT: &str = "text/plain; version=0.0.4"; - let payload = crate::metrics::METRICS_REGISTRY.encode(); + let state = get_state(&req); + let payload = crate::metrics::METRICS_REGISTRY.encode(&state.neon_metrics); let response = Response::builder() .status(200) .header(CONTENT_TYPE, TEXT_FORMAT) @@ -704,6 +734,7 @@ where pub fn make_router( service: Arc, auth: Option>, + build_info: BuildInfo, ) -> RouterBuilder { let mut router = endpoint::make_router() .middleware(prologue_metrics_middleware()) @@ -720,7 +751,7 @@ pub fn make_router( } router - .data(Arc::new(HttpState::new(service, auth))) + .data(Arc::new(HttpState::new(service, auth, build_info))) .get("/metrics", |r| { named_request_span(r, measured_metrics_handler, RequestName("metrics")) }) @@ -751,6 +782,13 @@ pub fn make_router( .post("/debug/v1/node/:node_id/drop", |r| { named_request_span(r, handle_node_drop, RequestName("debug_v1_node_drop")) }) + .post("/debug/v1/tenant/:tenant_id/import", |r| { + named_request_span( + r, + handle_tenant_import, + RequestName("debug_v1_tenant_import"), + ) + }) .get("/debug/v1/tenant", |r| { named_request_span(r, handle_tenants_dump, RequestName("debug_v1_tenant")) }) @@ -874,7 +912,7 @@ pub fn make_router( RequestName("v1_tenant_timeline"), ) }) - // Tenant detail GET passthrough to shard zero + // Tenant detail GET passthrough to 
shard zero: .get("/v1/tenant/:tenant_id", |r| { tenant_service_handler( r, @@ -882,13 +920,14 @@ pub fn make_router( RequestName("v1_tenant_passthrough"), ) }) - // Timeline GET passthrough to shard zero. Note that the `*` in the URL is a wildcard: any future - // timeline GET APIs will be implicitly included. - .get("/v1/tenant/:tenant_id/timeline*", |r| { + // The `*` in the URL is a wildcard: any tenant/timeline GET APIs on the pageserver + // are implicitly exposed here. This must be last in the list to avoid + // taking precedence over other GET methods we might implement by hand. + .get("/v1/tenant/:tenant_id/*", |r| { tenant_service_handler( r, handle_tenant_timeline_passthrough, - RequestName("v1_tenant_timeline_passthrough"), + RequestName("v1_tenant_passthrough"), ) }) } diff --git a/storage_controller/src/id_lock_map.rs b/storage_controller/src/id_lock_map.rs index b03700b50c..dff793289f 100644 --- a/storage_controller/src/id_lock_map.rs +++ b/storage_controller/src/id_lock_map.rs @@ -1,25 +1,64 @@ +use std::fmt::Display; +use std::time::Instant; use std::{collections::HashMap, sync::Arc}; +use std::time::Duration; + +use crate::service::RECONCILE_TIMEOUT; + +const LOCK_TIMEOUT_ALERT_THRESHOLD: Duration = RECONCILE_TIMEOUT; + +/// A wrapper around `OwnedRwLockWriteGuard` that when dropped changes the +/// current holding operation in lock. +pub struct WrappedWriteGuard { + guard: tokio::sync::OwnedRwLockWriteGuard>, + start: Instant, +} + +impl WrappedWriteGuard { + pub fn new(guard: tokio::sync::OwnedRwLockWriteGuard>) -> Self { + Self { + guard, + start: Instant::now(), + } + } +} + +impl Drop for WrappedWriteGuard { + fn drop(&mut self) { + let duration = self.start.elapsed(); + if duration > LOCK_TIMEOUT_ALERT_THRESHOLD { + tracing::warn!( + "Lock on {} was held for {:?}", + self.guard.as_ref().unwrap(), + duration + ); + } + *self.guard = None; + } +} + /// A map of locks covering some arbitrary identifiers. 
Useful if you have a collection of objects but don't /// want to embed a lock in each one, or if your locking granularity is different to your object granularity. /// For example, used in the storage controller where the objects are tenant shards, but sometimes locking /// is needed at a tenant-wide granularity. -pub(crate) struct IdLockMap +pub(crate) struct IdLockMap where T: Eq + PartialEq + std::hash::Hash, { /// A synchronous lock for getting/setting the async locks that our callers will wait on. - entities: std::sync::Mutex>>>, + entities: std::sync::Mutex>>>>, } -impl IdLockMap +impl IdLockMap where T: Eq + PartialEq + std::hash::Hash, + I: Display, { pub(crate) fn shared( &self, key: T, - ) -> impl std::future::Future> { + ) -> impl std::future::Future>> { let mut locked = self.entities.lock().unwrap(); let entry = locked.entry(key).or_default(); entry.clone().read_owned() @@ -28,21 +67,26 @@ where pub(crate) fn exclusive( &self, key: T, - ) -> impl std::future::Future> { + operation: I, + ) -> impl std::future::Future> { let mut locked = self.entities.lock().unwrap(); - let entry = locked.entry(key).or_default(); - entry.clone().write_owned() + let entry = locked.entry(key).or_default().clone(); + async move { + let mut guard = WrappedWriteGuard::new(entry.clone().write_owned().await); + *guard.guard = Some(operation); + guard + } } /// Rather than building a lock guard that re-takes the [`Self::entities`] lock, we just do /// periodic housekeeping to avoid the map growing indefinitely pub(crate) fn housekeeping(&self) { let mut locked = self.entities.lock().unwrap(); - locked.retain(|_k, lock| lock.try_write().is_err()) + locked.retain(|_k, entry| entry.try_write().is_err()) } } -impl Default for IdLockMap +impl Default for IdLockMap where T: Eq + PartialEq + std::hash::Hash, { @@ -52,3 +96,94 @@ where } } } + +pub async fn trace_exclusive_lock< + T: Clone + Display + Eq + PartialEq + std::hash::Hash, + I: Display + Clone, +>( + op_locks: &IdLockMap, + 
key: T, + operation: I, +) -> WrappedWriteGuard { + let start = Instant::now(); + let guard = op_locks.exclusive(key.clone(), operation.clone()).await; + + let duration = start.elapsed(); + if duration > LOCK_TIMEOUT_ALERT_THRESHOLD { + tracing::warn!( + "Operation {} on key {} has waited {:?} for exclusive lock", + operation, + key, + duration + ); + } + + guard +} + +pub async fn trace_shared_lock< + T: Clone + Display + Eq + PartialEq + std::hash::Hash, + I: Display, +>( + op_locks: &IdLockMap, + key: T, + operation: I, +) -> tokio::sync::OwnedRwLockReadGuard> { + let start = Instant::now(); + let guard = op_locks.shared(key.clone()).await; + + let duration = start.elapsed(); + if duration > LOCK_TIMEOUT_ALERT_THRESHOLD { + tracing::warn!( + "Operation {} on key {} has waited {:?} for shared lock", + operation, + key, + duration + ); + } + + guard +} + +#[cfg(test)] +mod tests { + use super::IdLockMap; + + #[derive(Clone, Debug, strum_macros::Display, PartialEq)] + enum Operations { + Op1, + Op2, + } + + #[tokio::test] + async fn multiple_shared_locks() { + let id_lock_map: IdLockMap = IdLockMap::default(); + + let shared_lock_1 = id_lock_map.shared(1).await; + let shared_lock_2 = id_lock_map.shared(1).await; + + assert!(shared_lock_1.is_none()); + assert!(shared_lock_2.is_none()); + } + + #[tokio::test] + async fn exclusive_locks() { + let id_lock_map = IdLockMap::default(); + let resource_id = 1; + + { + let _ex_lock = id_lock_map.exclusive(resource_id, Operations::Op1).await; + assert_eq!(_ex_lock.guard.clone().unwrap(), Operations::Op1); + + let _ex_lock_2 = tokio::time::timeout( + tokio::time::Duration::from_millis(1), + id_lock_map.exclusive(resource_id, Operations::Op2), + ) + .await; + assert!(_ex_lock_2.is_err()); + } + + let shared_lock_1 = id_lock_map.shared(resource_id).await; + assert!(shared_lock_1.is_none()); + } +} diff --git a/storage_controller/src/main.rs b/storage_controller/src/main.rs index 3c03d6efe8..f1454af533 100644 --- 
a/storage_controller/src/main.rs +++ b/storage_controller/src/main.rs @@ -3,11 +3,14 @@ use camino::Utf8PathBuf; use clap::Parser; use diesel::Connection; use metrics::launch_timestamp::LaunchTimestamp; +use metrics::BuildInfo; use std::sync::Arc; use storage_controller::http::make_router; use storage_controller::metrics::preinitialize_metrics; use storage_controller::persistence::Persistence; -use storage_controller::service::{Config, Service, MAX_UNAVAILABLE_INTERVAL_DEFAULT}; +use storage_controller::service::{ + Config, Service, MAX_UNAVAILABLE_INTERVAL_DEFAULT, RECONCILER_CONCURRENCY_DEFAULT, +}; use tokio::signal::unix::SignalKind; use tokio_util::sync::CancellationToken; use utils::auth::{JwtAuth, SwappableJwtAuth}; @@ -62,6 +65,14 @@ struct Cli { /// Grace period before marking unresponsive pageserver offline #[arg(long)] max_unavailable_interval: Option, + + /// Maximum number of reconcilers that may run in parallel + #[arg(long)] + reconciler_concurrency: Option, + + /// How long to wait for the initial database connection to be available. 
+ #[arg(long, default_value = "5s")] + db_connect_timeout: humantime::Duration, } enum StrictMode { @@ -192,6 +203,11 @@ async fn async_main() -> anyhow::Result<()> { args.listen ); + let build_info = BuildInfo { + revision: GIT_VERSION, + build_tag: BUILD_TAG, + }; + let strict_mode = if args.dev { StrictMode::Dev } else { @@ -236,9 +252,14 @@ async fn async_main() -> anyhow::Result<()> { .max_unavailable_interval .map(humantime::Duration::into) .unwrap_or(MAX_UNAVAILABLE_INTERVAL_DEFAULT), + reconciler_concurrency: args + .reconciler_concurrency + .unwrap_or(RECONCILER_CONCURRENCY_DEFAULT), }; // After loading secrets & config, but before starting anything else, apply database migrations + Persistence::await_connection(&secrets.database_url, args.db_connect_timeout.into()).await?; + migration_run(&secrets.database_url) .await .context("Running database migrations")?; @@ -253,7 +274,7 @@ async fn async_main() -> anyhow::Result<()> { let auth = secrets .public_key .map(|jwt_auth| Arc::new(SwappableJwtAuth::new(jwt_auth))); - let router = make_router(service.clone(), auth) + let router = make_router(service.clone(), auth, build_info) .build() .map_err(|err| anyhow!(err))?; let router_service = utils::http::RouterService::new(router).unwrap(); diff --git a/storage_controller/src/metrics.rs b/storage_controller/src/metrics.rs index cabf416b9f..ac9f22c739 100644 --- a/storage_controller/src/metrics.rs +++ b/storage_controller/src/metrics.rs @@ -8,10 +8,8 @@ //! The rest of the code defines label group types and deals with converting outer types to labels. //! 
use bytes::Bytes; -use measured::{ - label::{LabelValue, StaticLabelSet}, - FixedCardinalityLabel, MetricGroup, -}; +use measured::{label::LabelValue, metric::histogram, FixedCardinalityLabel, MetricGroup}; +use metrics::NeonMetrics; use once_cell::sync::Lazy; use std::sync::Mutex; @@ -26,13 +24,15 @@ pub fn preinitialize_metrics() { pub(crate) struct StorageControllerMetrics { pub(crate) metrics_group: StorageControllerMetricGroup, - encoder: Mutex, + encoder: Mutex, } #[derive(measured::MetricGroup)] +#[metric(new())] pub(crate) struct StorageControllerMetricGroup { /// Count of how many times we spawn a reconcile task pub(crate) storage_controller_reconcile_spawn: measured::Counter, + /// Reconciler tasks completed, broken down by success/failure/cancelled pub(crate) storage_controller_reconcile_complete: measured::CounterVec, @@ -43,7 +43,9 @@ pub(crate) struct StorageControllerMetricGroup { /// HTTP request status counters for handled requests pub(crate) storage_controller_http_request_status: measured::CounterVec, + /// HTTP request handler latency across all status codes + #[metric(metadata = histogram::Thresholds::exponential_buckets(0.1, 2.0))] pub(crate) storage_controller_http_request_latency: measured::HistogramVec, @@ -55,6 +57,7 @@ pub(crate) struct StorageControllerMetricGroup { /// Latency of HTTP requests to the pageserver, broken down by pageserver /// node id, request name and method. This include both successful and unsuccessful /// requests. + #[metric(metadata = histogram::Thresholds::exponential_buckets(0.1, 2.0))] pub(crate) storage_controller_pageserver_request_latency: measured::HistogramVec, @@ -66,6 +69,7 @@ pub(crate) struct StorageControllerMetricGroup { /// Latency of pass-through HTTP requests to the pageserver, broken down by pageserver /// node id, request name and method. This include both successful and unsuccessful /// requests. 
+ #[metric(metadata = histogram::Thresholds::exponential_buckets(0.1, 2.0))] pub(crate) storage_controller_passthrough_request_latency: measured::HistogramVec, @@ -74,76 +78,34 @@ pub(crate) struct StorageControllerMetricGroup { measured::CounterVec, /// Latency of database queries, broken down by operation. + #[metric(metadata = histogram::Thresholds::exponential_buckets(0.1, 2.0))] pub(crate) storage_controller_database_query_latency: measured::HistogramVec, } impl StorageControllerMetrics { - pub(crate) fn encode(&self) -> Bytes { + pub(crate) fn encode(&self, neon_metrics: &NeonMetrics) -> Bytes { let mut encoder = self.encoder.lock().unwrap(); - self.metrics_group.collect_into(&mut *encoder); + neon_metrics + .collect_group_into(&mut *encoder) + .unwrap_or_else(|infallible| match infallible {}); + self.metrics_group + .collect_group_into(&mut *encoder) + .unwrap_or_else(|infallible| match infallible {}); encoder.finish() } } impl Default for StorageControllerMetrics { fn default() -> Self { - Self { - metrics_group: StorageControllerMetricGroup::new(), - encoder: Mutex::new(measured::text::TextEncoder::new()), - } - } -} + let mut metrics_group = StorageControllerMetricGroup::new(); + metrics_group + .storage_controller_reconcile_complete + .init_all_dense(); -impl StorageControllerMetricGroup { - pub(crate) fn new() -> Self { Self { - storage_controller_reconcile_spawn: measured::Counter::new(), - storage_controller_reconcile_complete: measured::CounterVec::new( - ReconcileCompleteLabelGroupSet { - status: StaticLabelSet::new(), - }, - ), - storage_controller_schedule_optimization: measured::Counter::new(), - storage_controller_http_request_status: measured::CounterVec::new( - HttpRequestStatusLabelGroupSet { - path: lasso::ThreadedRodeo::new(), - method: StaticLabelSet::new(), - status: StaticLabelSet::new(), - }, - ), - storage_controller_http_request_latency: measured::HistogramVec::new( - measured::metric::histogram::Thresholds::exponential_buckets(0.1, 
2.0), - ), - storage_controller_pageserver_request_error: measured::CounterVec::new( - PageserverRequestLabelGroupSet { - pageserver_id: lasso::ThreadedRodeo::new(), - path: lasso::ThreadedRodeo::new(), - method: StaticLabelSet::new(), - }, - ), - storage_controller_pageserver_request_latency: measured::HistogramVec::new( - measured::metric::histogram::Thresholds::exponential_buckets(0.1, 2.0), - ), - storage_controller_passthrough_request_error: measured::CounterVec::new( - PageserverRequestLabelGroupSet { - pageserver_id: lasso::ThreadedRodeo::new(), - path: lasso::ThreadedRodeo::new(), - method: StaticLabelSet::new(), - }, - ), - storage_controller_passthrough_request_latency: measured::HistogramVec::new( - measured::metric::histogram::Thresholds::exponential_buckets(0.1, 2.0), - ), - storage_controller_database_query_error: measured::CounterVec::new( - DatabaseQueryErrorLabelGroupSet { - operation: StaticLabelSet::new(), - error_type: StaticLabelSet::new(), - }, - ), - storage_controller_database_query_latency: measured::HistogramVec::new( - measured::metric::histogram::Thresholds::exponential_buckets(0.1, 2.0), - ), + metrics_group, + encoder: Mutex::new(measured::text::BufferedTextEncoder::new()), } } } @@ -157,7 +119,7 @@ pub(crate) struct ReconcileCompleteLabelGroup { #[derive(measured::LabelGroup)] #[label(set = HttpRequestStatusLabelGroupSet)] pub(crate) struct HttpRequestStatusLabelGroup<'a> { - #[label(dynamic_with = lasso::ThreadedRodeo)] + #[label(dynamic_with = lasso::ThreadedRodeo, default)] pub(crate) path: &'a str, pub(crate) method: Method, pub(crate) status: StatusCode, @@ -166,40 +128,21 @@ pub(crate) struct HttpRequestStatusLabelGroup<'a> { #[derive(measured::LabelGroup)] #[label(set = HttpRequestLatencyLabelGroupSet)] pub(crate) struct HttpRequestLatencyLabelGroup<'a> { - #[label(dynamic_with = lasso::ThreadedRodeo)] + #[label(dynamic_with = lasso::ThreadedRodeo, default)] pub(crate) path: &'a str, pub(crate) method: Method, } -impl Default 
for HttpRequestLatencyLabelGroupSet { - fn default() -> Self { - Self { - path: lasso::ThreadedRodeo::new(), - method: StaticLabelSet::new(), - } - } -} - #[derive(measured::LabelGroup, Clone)] #[label(set = PageserverRequestLabelGroupSet)] pub(crate) struct PageserverRequestLabelGroup<'a> { - #[label(dynamic_with = lasso::ThreadedRodeo)] + #[label(dynamic_with = lasso::ThreadedRodeo, default)] pub(crate) pageserver_id: &'a str, - #[label(dynamic_with = lasso::ThreadedRodeo)] + #[label(dynamic_with = lasso::ThreadedRodeo, default)] pub(crate) path: &'a str, pub(crate) method: Method, } -impl Default for PageserverRequestLabelGroupSet { - fn default() -> Self { - Self { - pageserver_id: lasso::ThreadedRodeo::new(), - path: lasso::ThreadedRodeo::new(), - method: StaticLabelSet::new(), - } - } -} - #[derive(measured::LabelGroup)] #[label(set = DatabaseQueryErrorLabelGroupSet)] pub(crate) struct DatabaseQueryErrorLabelGroup { @@ -213,7 +156,7 @@ pub(crate) struct DatabaseQueryLatencyLabelGroup { pub(crate) operation: DatabaseOperation, } -#[derive(FixedCardinalityLabel)] +#[derive(FixedCardinalityLabel, Clone, Copy)] pub(crate) enum ReconcileOutcome { #[label(rename = "ok")] Success, @@ -221,7 +164,7 @@ pub(crate) enum ReconcileOutcome { Cancel, } -#[derive(FixedCardinalityLabel, Clone)] +#[derive(FixedCardinalityLabel, Copy, Clone)] pub(crate) enum Method { Get, Put, @@ -246,11 +189,12 @@ impl From for Method { } } +#[derive(Clone, Copy)] pub(crate) struct StatusCode(pub(crate) hyper::http::StatusCode); impl LabelValue for StatusCode { fn visit(&self, v: V) -> V::Output { - v.write_int(self.0.as_u16() as u64) + v.write_int(self.0.as_u16() as i64) } } @@ -268,7 +212,7 @@ impl FixedCardinalityLabel for StatusCode { } } -#[derive(FixedCardinalityLabel)] +#[derive(FixedCardinalityLabel, Clone, Copy)] pub(crate) enum DatabaseErrorLabel { Query, Connection, diff --git a/storage_controller/src/node.rs b/storage_controller/src/node.rs index 7ba6828deb..7b5513c908 100644 --- 
a/storage_controller/src/node.rs +++ b/storage_controller/src/node.rs @@ -1,6 +1,5 @@ use std::{str::FromStr, time::Duration}; -use hyper::StatusCode; use pageserver_api::{ controller_api::{ NodeAvailability, NodeDescribeResponse, NodeRegisterRequest, NodeSchedulingPolicy, @@ -9,6 +8,7 @@ use pageserver_api::{ shard::TenantShardId, }; use pageserver_client::mgmt_api; +use reqwest::StatusCode; use serde::Serialize; use tokio_util::sync::CancellationToken; use utils::{backoff, id::NodeId}; diff --git a/storage_controller/src/pageserver_client.rs b/storage_controller/src/pageserver_client.rs index 8237229d7b..25b6b67e12 100644 --- a/storage_controller/src/pageserver_client.rs +++ b/storage_controller/src/pageserver_client.rs @@ -1,13 +1,14 @@ use pageserver_api::{ models::{ LocationConfig, LocationConfigListResponse, PageserverUtilization, SecondaryProgress, - TenantShardSplitRequest, TenantShardSplitResponse, TimelineCreateRequest, TimelineInfo, + TenantScanRemoteStorageResponse, TenantShardSplitRequest, TenantShardSplitResponse, + TimelineCreateRequest, TimelineInfo, }, shard::TenantShardId, }; use pageserver_client::mgmt_api::{Client, Result}; use reqwest::StatusCode; -use utils::id::{NodeId, TimelineId}; +use utils::id::{NodeId, TenantId, TimelineId}; /// Thin wrapper around [`pageserver_client::mgmt_api::Client`]. It allows the storage /// controller to collect metrics in a non-intrusive manner. 
@@ -88,6 +89,18 @@ impl PageserverClient { ) } + pub(crate) async fn tenant_scan_remote_storage( + &self, + tenant_id: TenantId, + ) -> Result { + measured_request!( + "tenant_scan_remote_storage", + crate::metrics::Method::Get, + &self.node_id_label, + self.inner.tenant_scan_remote_storage(tenant_id).await + ) + } + pub(crate) async fn tenant_secondary_download( &self, tenant_id: TenantShardId, @@ -101,6 +114,27 @@ impl PageserverClient { ) } + pub(crate) async fn tenant_secondary_status( + &self, + tenant_shard_id: TenantShardId, + ) -> Result { + measured_request!( + "tenant_secondary_status", + crate::metrics::Method::Get, + &self.node_id_label, + self.inner.tenant_secondary_status(tenant_shard_id).await + ) + } + + pub(crate) async fn tenant_heatmap_upload(&self, tenant_id: TenantShardId) -> Result<()> { + measured_request!( + "tenant_heatmap_upload", + crate::metrics::Method::Post, + &self.node_id_label, + self.inner.tenant_heatmap_upload(tenant_id).await + ) + } + pub(crate) async fn location_config( &self, tenant_shard_id: TenantShardId, diff --git a/storage_controller/src/persistence.rs b/storage_controller/src/persistence.rs index 55fbfd10bc..dca37166ba 100644 --- a/storage_controller/src/persistence.rs +++ b/storage_controller/src/persistence.rs @@ -2,6 +2,7 @@ pub(crate) mod split_state; use std::collections::HashMap; use std::str::FromStr; use std::time::Duration; +use std::time::Instant; use self::split_state::SplitState; use camino::Utf8Path; @@ -79,7 +80,7 @@ pub(crate) enum DatabaseError { Logical(String), } -#[derive(measured::FixedCardinalityLabel, Clone)] +#[derive(measured::FixedCardinalityLabel, Copy, Clone)] pub(crate) enum DatabaseOperation { InsertNode, UpdateNode, @@ -144,6 +145,31 @@ impl Persistence { } } + /// A helper for use during startup, where we would like to tolerate concurrent restarts of the + /// database and the storage controller, therefore the database might not be available right away + pub async fn await_connection( + 
database_url: &str, + timeout: Duration, + ) -> Result<(), diesel::ConnectionError> { + let started_at = Instant::now(); + loop { + match PgConnection::establish(database_url) { + Ok(_) => { + tracing::info!("Connected to database."); + return Ok(()); + } + Err(e) => { + if started_at.elapsed() > timeout { + return Err(e); + } else { + tracing::info!("Database not yet available, waiting... ({e})"); + tokio::time::sleep(Duration::from_millis(100)).await; + } + } + } + } + } + /// Wraps `with_conn` in order to collect latency and error metrics async fn with_measured_conn(&self, op: DatabaseOperation, func: F) -> DatabaseResult where @@ -153,9 +179,7 @@ impl Persistence { let latency = &METRICS_REGISTRY .metrics_group .storage_controller_database_query_latency; - let _timer = latency.start_timer(DatabaseQueryLatencyLabelGroup { - operation: op.clone(), - }); + let _timer = latency.start_timer(DatabaseQueryLatencyLabelGroup { operation: op }); let res = self.with_conn(func).await; diff --git a/storage_controller/src/reconciler.rs b/storage_controller/src/reconciler.rs index 49cfaad569..fe97f724c1 100644 --- a/storage_controller/src/reconciler.rs +++ b/storage_controller/src/reconciler.rs @@ -1,12 +1,12 @@ use crate::pageserver_client::PageserverClient; use crate::persistence::Persistence; use crate::service; -use hyper::StatusCode; use pageserver_api::models::{ LocationConfig, LocationConfigMode, LocationConfigSecondary, TenantConfig, }; use pageserver_api::shard::{ShardIdentity, TenantShardId}; use pageserver_client::mgmt_api; +use reqwest::StatusCode; use std::collections::HashMap; use std::sync::Arc; use std::time::{Duration, Instant}; @@ -51,6 +51,10 @@ pub(super) struct Reconciler { /// so that we can set [`crate::tenant_shard::TenantShard::pending_compute_notification`] to ensure a later retry. pub(crate) compute_notify_failure: bool, + /// Reconciler is responsible for keeping alive semaphore units that limit concurrency on how many + /// we will spawn. 
+ pub(crate) _resource_units: ReconcileUnits, + /// A means to abort background reconciliation: it is essential to /// call this when something changes in the original TenantShard that /// will make this reconciliation impossible or unnecessary, for @@ -66,6 +70,19 @@ pub(super) struct Reconciler { pub(crate) persistence: Arc, } +/// RAII resource units granted to a Reconciler, which it should keep alive until it finishes doing I/O +pub(crate) struct ReconcileUnits { + _sem_units: tokio::sync::OwnedSemaphorePermit, +} + +impl ReconcileUnits { + pub(crate) fn new(sem_units: tokio::sync::OwnedSemaphorePermit) -> Self { + Self { + _sem_units: sem_units, + } + } +} + /// This is a snapshot of [`crate::tenant_shard::IntentState`], but it does not do any /// reference counting for Scheduler. The IntentState is what the scheduler works with, /// and the TargetState is just the instruction for a particular Reconciler run. @@ -750,7 +767,10 @@ impl Reconciler { // It is up to the caller whether they want to drop out on this error, but they don't have to: // in general we should avoid letting unavailability of the cloud control plane stop us from // making progress. - tracing::warn!("Failed to notify compute of attached pageserver {node}: {e}"); + if !matches!(e, NotifyError::ShuttingDown) { + tracing::warn!("Failed to notify compute of attached pageserver {node}: {e}"); + } + // Set this flag so that in our ReconcileResult we will set the flag on the shard that it // needs to retry at some point. 
self.compute_notify_failure = true; diff --git a/storage_controller/src/scheduler.rs b/storage_controller/src/scheduler.rs index 862ac0cbfe..3ff0d87988 100644 --- a/storage_controller/src/scheduler.rs +++ b/storage_controller/src/scheduler.rs @@ -84,6 +84,20 @@ impl std::ops::Add for AffinityScore { } } +/// Hint for whether this is a sincere attempt to schedule, or a speculative +/// check for where we _would_ schedule (done during optimization) +#[derive(Debug)] +pub(crate) enum ScheduleMode { + Normal, + Speculative, +} + +impl Default for ScheduleMode { + fn default() -> Self { + Self::Normal + } +} + // For carrying state between multiple calls to [`TenantShard::schedule`], e.g. when calling // it for many shards in the same tenant. #[derive(Debug, Default)] @@ -93,6 +107,8 @@ pub(crate) struct ScheduleContext { /// Specifically how many _attached_ locations are on each node pub(crate) attached_nodes: HashMap, + + pub(crate) mode: ScheduleMode, } impl ScheduleContext { @@ -329,27 +345,34 @@ impl Scheduler { scores.sort_by_key(|i| (i.1, i.2, i.0)); if scores.is_empty() { - // After applying constraints, no pageservers were left. We log some detail about - // the state of nodes to help understand why this happened. This is not logged as an error because - // it is legitimately possible for enough nodes to be Offline to prevent scheduling a shard. - tracing::info!("Scheduling failure, while excluding {hard_exclude:?}, node states:"); - for (node_id, node) in &self.nodes { + // After applying constraints, no pageservers were left. + if !matches!(context.mode, ScheduleMode::Speculative) { + // If this was not a speculative attempt, log details to understand why we couldn't + // schedule: this may help an engineer understand if some nodes are marked offline + // in a way that's preventing progress. 
tracing::info!( - "Node {node_id}: may_schedule={} shards={}", - node.may_schedule != MaySchedule::No, - node.shard_count + "Scheduling failure, while excluding {hard_exclude:?}, node states:" ); + for (node_id, node) in &self.nodes { + tracing::info!( + "Node {node_id}: may_schedule={} shards={}", + node.may_schedule != MaySchedule::No, + node.shard_count + ); + } } - return Err(ScheduleError::ImpossibleConstraint); } // Lowest score wins let node_id = scores.first().unwrap().0; - tracing::info!( + + if !matches!(context.mode, ScheduleMode::Speculative) { + tracing::info!( "scheduler selected node {node_id} (elegible nodes {:?}, hard exclude: {hard_exclude:?}, soft exclude: {context:?})", scores.iter().map(|i| i.0 .0).collect::>() ); + } // Note that we do not update shard count here to reflect the scheduling: that // is IntentState's job when the scheduled location is used. diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs index 010558b797..d3a53066c9 100644 --- a/storage_controller/src/service.rs +++ b/storage_controller/src/service.rs @@ -8,10 +8,14 @@ use std::{ }; use crate::{ - id_lock_map::IdLockMap, + compute_hook::NotifyError, + id_lock_map::{trace_exclusive_lock, trace_shared_lock, IdLockMap, WrappedWriteGuard}, persistence::{AbortShardSplitStatus, TenantFilter}, - reconciler::ReconcileError, - scheduler::ScheduleContext, + reconciler::{ReconcileError, ReconcileUnits}, + scheduler::{ScheduleContext, ScheduleMode}, + tenant_shard::{ + MigrateAttachment, ReconcileNeeded, ScheduleOptimization, ScheduleOptimizationAction, + }, }; use anyhow::Context; use control_plane::storage_controller::{ @@ -19,7 +23,6 @@ use control_plane::storage_controller::{ }; use diesel::result::DatabaseErrorKind; use futures::{stream::FuturesUnordered, StreamExt}; -use hyper::StatusCode; use itertools::Itertools; use pageserver_api::{ controller_api::{ @@ -31,6 +34,8 @@ use pageserver_api::{ }, models::{SecondaryProgress, TenantConfigRequest}, }; 
+use reqwest::StatusCode; +use tracing::instrument; use crate::pageserver_client::PageserverClient; use pageserver_api::{ @@ -48,11 +53,11 @@ use pageserver_api::{ }, }; use pageserver_client::mgmt_api; -use tokio::sync::OwnedRwLockWriteGuard; +use tokio::sync::mpsc::error::TrySendError; use tokio_util::sync::CancellationToken; -use tracing::instrument; use utils::{ completion::Barrier, + failpoint_support, generation::Generation, http::error::ApiError, id::{NodeId, TenantId, TimelineId}, @@ -60,7 +65,7 @@ use utils::{ }; use crate::{ - compute_hook::{self, ComputeHook}, + compute_hook::ComputeHook, heartbeater::{Heartbeater, PageserverState}, node::{AvailabilityTransition, Node}, persistence::{split_state::SplitState, DatabaseError, Persistence, TenantShardPersistence}, @@ -77,7 +82,7 @@ const SHORT_RECONCILE_TIMEOUT: Duration = Duration::from_secs(5); // For operations that might be slow, like migrating a tenant with // some data in it. -const RECONCILE_TIMEOUT: Duration = Duration::from_secs(30); +pub const RECONCILE_TIMEOUT: Duration = Duration::from_secs(30); // If we receive a call using Secondary mode initially, it will omit generation. We will initialize // tenant shards into this generation, and as long as it remains in this generation, we will accept @@ -88,7 +93,38 @@ const INITIAL_GENERATION: Generation = Generation::new(0); /// up on unresponsive pageservers and proceed. pub(crate) const STARTUP_RECONCILE_TIMEOUT: Duration = Duration::from_secs(30); -pub const MAX_UNAVAILABLE_INTERVAL_DEFAULT: Duration = Duration::from_secs(30); +/// How long a node may be unresponsive to heartbeats before we declare it offline. +/// This must be long enough to cover node restarts as well as normal operations: in future +/// it should be separated into distinct timeouts for startup vs. 
normal operation +/// (``) +pub const MAX_UNAVAILABLE_INTERVAL_DEFAULT: Duration = Duration::from_secs(300); + +#[derive(Clone, strum_macros::Display)] +enum TenantOperations { + Create, + LocationConfig, + ConfigSet, + TimeTravelRemoteStorage, + Delete, + UpdatePolicy, + ShardSplit, + SecondaryDownload, + TimelineCreate, + TimelineDelete, +} + +#[derive(Clone, strum_macros::Display)] +enum NodeOperations { + Register, + Configure, +} + +pub const RECONCILER_CONCURRENCY_DEFAULT: usize = 128; + +// Depth of the channel used to enqueue shards for reconciliation when they can't do it immediately. +// This channel is finite-size to avoid using excessive memory if we get into a state where reconciles are finishing more slowly +// than they're being pushed onto the queue. +const MAX_DELAYED_RECONCILES: usize = 10000; // Top level state available to all HTTP handlers struct ServiceState { @@ -97,6 +133,45 @@ struct ServiceState { nodes: Arc>, scheduler: Scheduler, + + /// Queue of tenants who are waiting for concurrency limits to permit them to reconcile + delayed_reconcile_rx: tokio::sync::mpsc::Receiver, +} + +/// Transform an error from a pageserver into an error to return to callers of a storage +/// controller API. 
+fn passthrough_api_error(node: &Node, e: mgmt_api::Error) -> ApiError { + match e { + mgmt_api::Error::ReceiveErrorBody(str) => { + // Presume errors receiving body are connectivity/availability issues + ApiError::ResourceUnavailable( + format!("{node} error receiving error body: {str}").into(), + ) + } + mgmt_api::Error::ReceiveBody(str) => { + // Presume errors receiving body are connectivity/availability issues + ApiError::ResourceUnavailable(format!("{node} error receiving body: {str}").into()) + } + mgmt_api::Error::ApiError(StatusCode::NOT_FOUND, msg) => { + ApiError::NotFound(anyhow::anyhow!(format!("{node}: {msg}")).into()) + } + mgmt_api::Error::ApiError(StatusCode::SERVICE_UNAVAILABLE, msg) => { + ApiError::ResourceUnavailable(format!("{node}: {msg}").into()) + } + mgmt_api::Error::ApiError(status @ StatusCode::UNAUTHORIZED, msg) + | mgmt_api::Error::ApiError(status @ StatusCode::FORBIDDEN, msg) => { + // Auth errors talking to a pageserver are not auth errors for the caller: they are + // internal server errors, showing that something is wrong with the pageserver or + // storage controller's auth configuration. + ApiError::InternalServerError(anyhow::anyhow!("{node} {status}: {msg}")) + } + mgmt_api::Error::ApiError(status, msg) => { + // Presume general case of pageserver API errors is that we tried to do something + // that can't be done right now. + ApiError::Conflict(format!("{node} {status}: {status} {msg}")) + } + mgmt_api::Error::Cancelled => ApiError::ShuttingDown, + } } impl ServiceState { @@ -104,11 +179,13 @@ impl ServiceState { nodes: HashMap, tenants: BTreeMap, scheduler: Scheduler, + delayed_reconcile_rx: tokio::sync::mpsc::Receiver, ) -> Self { Self { tenants, nodes: Arc::new(nodes), scheduler, + delayed_reconcile_rx, } } @@ -142,6 +219,9 @@ pub struct Config { /// considered active. Once the grace period elapses, the next heartbeat failure will /// mark the pagseserver offline. 
pub max_unavailable_interval: Duration, + + /// How many Reconcilers may be spawned concurrently + pub reconciler_concurrency: usize, } impl From for ApiError { @@ -174,11 +254,22 @@ pub struct Service { // Locking on a tenant granularity (covers all shards in the tenant): // - Take exclusively for rare operations that mutate the tenant's persistent state (e.g. create/delete/split) // - Take in shared mode for operations that need the set of shards to stay the same to complete reliably (e.g. timeline CRUD) - tenant_op_locks: IdLockMap, + tenant_op_locks: IdLockMap, // Locking for node-mutating operations: take exclusively for operations that modify the node's persistent state, or // that transition it to/from Active. - node_op_locks: IdLockMap, + node_op_locks: IdLockMap, + + // Limit how many Reconcilers we will spawn concurrently + reconciler_concurrency: Arc, + + /// Queue of tenants who are waiting for concurrency limits to permit them to reconcile + /// Send into this queue to promptly attempt to reconcile this shard next time units are available. + /// + /// Note that this state logically lives inside ServiceInner, but carrying Sender here makes the code simpler + /// by avoiding needing a &mut ref to something inside the ServiceInner. This could be optimized to + /// use a VecDeque instead of a channel to reduce synchronization overhead, at the cost of some code complexity. 
+ delayed_reconcile_tx: tokio::sync::mpsc::Sender, // Process shutdown will fire this token cancel: CancellationToken, @@ -239,7 +330,7 @@ struct TenantShardSplitAbort { new_shard_count: ShardCount, new_stripe_size: Option, /// Until this abort op is complete, no other operations may be done on the tenant - _tenant_lock: tokio::sync::OwnedRwLockWriteGuard<()>, + _tenant_lock: WrappedWriteGuard, } #[derive(thiserror::Error, Debug)] @@ -269,7 +360,12 @@ impl Service { /// Called once on startup, this function attempts to contact all pageservers to build an up-to-date /// view of the world, and determine which pageservers are responsive. #[instrument(skip_all)] - async fn startup_reconcile(self: &Arc) { + async fn startup_reconcile( + self: &Arc, + bg_compute_notify_result_tx: tokio::sync::mpsc::Sender< + Result<(), (TenantShardId, NotifyError)>, + >, + ) { // For all tenant shards, a vector of observed states on nodes (where None means // indeterminate, same as in [`ObservedStateLocation`]) let mut observed: HashMap)>> = @@ -288,10 +384,6 @@ impl Service { .checked_add(STARTUP_RECONCILE_TIMEOUT / 2) .expect("Reconcile timeout is a modest constant"); - let compute_notify_deadline = start_at - .checked_add((STARTUP_RECONCILE_TIMEOUT / 4) * 3) - .expect("Reconcile timeout is a modest constant"); - // Accumulate a list of any tenant locations that ought to be detached let mut cleanup = Vec::new(); @@ -317,6 +409,7 @@ impl Service { let mut compute_notifications = Vec::new(); // Populate intent and observed states for all tenants, based on reported state on pageservers + tracing::info!("Populating tenant shards' states from initial pageserver scan..."); let shard_count = { let mut locked = self.inner.write().unwrap(); let (nodes, tenants, scheduler) = locked.parts_mut(); @@ -383,28 +476,27 @@ impl Service { // Emit compute hook notifications for all tenants which are already stably attached. Other tenants // will emit compute hook notifications when they reconcile. 
// - // Ordering: we must complete these notification attempts before doing any other reconciliation for the - // tenants named here, because otherwise our calls to notify() might race with more recent values - // generated by reconciliation. - let notify_failures = self - .compute_notify_many(compute_notifications, compute_notify_deadline) - .await; - - // Compute notify is fallible. If it fails here, do not delay overall startup: set the - // flag on these shards that they have a pending notification. - // Update tenant state for any that failed to do their initial compute notify, so that they'll retry later. - { - let mut locked = self.inner.write().unwrap(); - for tenant_shard_id in notify_failures.into_iter() { - if let Some(shard) = locked.tenants.get_mut(&tenant_shard_id) { - shard.pending_compute_notification = true; - } - } - } + // Ordering: our calls to notify_background synchronously establish a relative order for these notifications vs. any later + // calls into the ComputeHook for the same tenant: we can leave these to run to completion in the background and any later + // calls will be correctly ordered wrt these. + // + // Concurrency: we call notify_background for all tenants, which will create O(N) tokio tasks, but almost all of them + // will just wait on the ComputeHook::API_CONCURRENCY semaphore immediately, so very cheap until they get that semaphore + // unit and start doing I/O. + tracing::info!( + "Sending {} compute notifications", + compute_notifications.len() + ); + self.compute_hook.notify_background( + compute_notifications, + bg_compute_notify_result_tx.clone(), + &self.cancel, + ); // Finally, now that the service is up and running, launch reconcile operations for any tenants // which require it: under normal circumstances this should only include tenants that were in some // transient state before we restarted, or any tenants whose compute hooks failed above. 
+ tracing::info!("Checking for shards in need of reconciliation..."); let reconcile_tasks = self.reconcile_all(); // We will not wait for these reconciliation tasks to run here: we're now done with startup and // normal operations may proceed. @@ -445,6 +537,7 @@ impl Service { } } + tracing::info!("Sending initial heartbeats..."); let res = self .heartbeater .heartbeat(Arc::new(nodes_to_heartbeat)) @@ -481,6 +574,7 @@ impl Service { let mut node_list_futs = FuturesUnordered::new(); + tracing::info!("Scanning shards on {} nodes...", nodes.len()); for node in nodes.values() { node_list_futs.push({ async move { @@ -600,72 +694,6 @@ impl Service { } } - /// Used during [`Self::startup_reconcile`]: issue many concurrent compute notifications. - /// - /// Returns a set of any shards for which notifications where not acked within the deadline. - async fn compute_notify_many( - &self, - notifications: Vec<(TenantShardId, NodeId, ShardStripeSize)>, - deadline: Instant, - ) -> HashSet { - let attempt_shards = notifications.iter().map(|i| i.0).collect::>(); - let mut success_shards = HashSet::new(); - - // Construct an async stream of futures to invoke the compute notify function: we do this - // in order to subsequently use .buffered() on the stream to execute with bounded parallelism. - let mut stream = futures::stream::iter(notifications.into_iter()) - .map(|(tenant_shard_id, node_id, stripe_size)| { - let compute_hook = self.compute_hook.clone(); - let cancel = self.cancel.clone(); - async move { - if let Err(e) = compute_hook - .notify(tenant_shard_id, node_id, stripe_size, &cancel) - .await - { - tracing::error!( - %tenant_shard_id, - %node_id, - "Failed to notify compute on startup for shard: {e}" - ); - None - } else { - Some(tenant_shard_id) - } - } - }) - .buffered(compute_hook::API_CONCURRENCY); - - loop { - tokio::select! 
{ - next = stream.next() => { - match next { - Some(Some(success_shard)) => { - // A notification succeeded - success_shards.insert(success_shard); - }, - Some(None) => { - // A notification that failed - }, - None => { - tracing::info!("Successfully sent all compute notifications"); - break; - } - } - }, - _ = tokio::time::sleep(deadline.duration_since(Instant::now())) => { - // Give up sending any that didn't succeed yet - tracing::info!("Reached deadline while sending compute notifications"); - break; - } - }; - } - - attempt_shards - .difference(&success_shards) - .cloned() - .collect() - } - /// Long running background task that periodically wakes up and looks for shards that need /// reconciliation. Reconciliation is fallible, so any reconciliation tasks that fail during /// e.g. a tenant create/attach/migrate must eventually be retried: this task is responsible @@ -683,7 +711,7 @@ impl Service { let reconciles_spawned = self.reconcile_all(); if reconciles_spawned == 0 { // Run optimizer only when we didn't find any other work to do - self.optimize_all(); + self.optimize_all().await; } } _ = self.cancel.cancelled() => return @@ -742,8 +770,9 @@ impl Service { } /// Apply the contents of a [`ReconcileResult`] to our in-memory state: if the reconciliation - /// was successful, this will update the observed state of the tenant such that subsequent - /// calls to [`TenantShard::maybe_reconcile`] will do nothing. + /// was successful and intent hasn't changed since the Reconciler was spawned, this will update + /// the observed state of the tenant such that subsequent calls to [`TenantShard::get_reconcile_needed`] + /// will indicate that reconciliation is not needed. 
#[instrument(skip_all, fields( tenant_id=%result.tenant_shard_id.tenant_id, shard_id=%result.tenant_shard_id.shard_slug(), sequence=%result.sequence @@ -796,36 +825,72 @@ impl Service { // Ordering: populate last_error before advancing error_seq, // so that waiters will see the correct error after waiting. - *(tenant.last_error.lock().unwrap()) = format!("{e}"); - tenant.error_waiter.advance(result.sequence); + tenant.set_last_error(result.sequence, e); for (node_id, o) in result.observed.locations { tenant.observed.locations.insert(node_id, o); } } } + + // Maybe some other work can proceed now that this job finished. + if self.reconciler_concurrency.available_permits() > 0 { + while let Ok(tenant_shard_id) = locked.delayed_reconcile_rx.try_recv() { + let (nodes, tenants, _scheduler) = locked.parts_mut(); + if let Some(shard) = tenants.get_mut(&tenant_shard_id) { + shard.delayed_reconcile = false; + self.maybe_reconcile_shard(shard, nodes); + } + + if self.reconciler_concurrency.available_permits() == 0 { + break; + } + } + } } async fn process_results( &self, mut result_rx: tokio::sync::mpsc::UnboundedReceiver, + mut bg_compute_hook_result_rx: tokio::sync::mpsc::Receiver< + Result<(), (TenantShardId, NotifyError)>, + >, ) { loop { // Wait for the next result, or for cancellation - let result = tokio::select! { + tokio::select! 
{ r = result_rx.recv() => { match r { - Some(result) => {result}, + Some(result) => {self.process_result(result);}, None => {break;} } } + _ = async{ + match bg_compute_hook_result_rx.recv().await { + Some(result) => { + if let Err((tenant_shard_id, notify_error)) = result { + tracing::warn!("Marking shard {tenant_shard_id} for notification retry, due to error {notify_error}"); + let mut locked = self.inner.write().unwrap(); + if let Some(shard) = locked.tenants.get_mut(&tenant_shard_id) { + shard.pending_compute_notification = true; + } + + } + }, + None => { + // This channel is dead, but we don't want to terminate the outer loop{}: just wait for shutdown + self.cancel.cancelled().await; + } + } + } => {}, _ = self.cancel.cancelled() => { break; } }; - - self.process_result(result); } + + // We should only fall through on shutdown + assert!(self.cancel.is_cancelled()); } async fn process_aborts( @@ -986,6 +1051,13 @@ impl Service { let (startup_completion, startup_complete) = utils::completion::channel(); + // This channel is continuously consumed by process_results, so doesn't need to be very large. 
+ let (bg_compute_notify_result_tx, bg_compute_notify_result_rx) = + tokio::sync::mpsc::channel(512); + + let (delayed_reconcile_tx, delayed_reconcile_rx) = + tokio::sync::mpsc::channel(MAX_DELAYED_RECONCILES); + let cancel = CancellationToken::new(); let heartbeater = Heartbeater::new( config.jwt_token.clone(), @@ -994,13 +1066,20 @@ impl Service { ); let this = Arc::new(Self { inner: Arc::new(std::sync::RwLock::new(ServiceState::new( - nodes, tenants, scheduler, + nodes, + tenants, + scheduler, + delayed_reconcile_rx, ))), config: config.clone(), persistence, - compute_hook: Arc::new(ComputeHook::new(config)), + compute_hook: Arc::new(ComputeHook::new(config.clone())), result_tx, heartbeater, + reconciler_concurrency: Arc::new(tokio::sync::Semaphore::new( + config.reconciler_concurrency, + )), + delayed_reconcile_tx, abort_tx, startup_complete: startup_complete.clone(), cancel, @@ -1013,7 +1092,9 @@ impl Service { tokio::task::spawn(async move { // Block shutdown until we're done (we must respect self.cancel) if let Ok(_gate) = result_task_this.gate.enter() { - result_task_this.process_results(result_rx).await + result_task_this + .process_results(result_rx, bg_compute_notify_result_rx) + .await } }); @@ -1055,7 +1136,7 @@ impl Service { return; }; - this.startup_reconcile().await; + this.startup_reconcile(bg_compute_notify_result_tx).await; drop(startup_completion); } }); @@ -1282,7 +1363,7 @@ impl Service { async fn node_activate_reconcile( &self, mut node: Node, - _lock: &OwnedRwLockWriteGuard<()>, + _lock: &WrappedWriteGuard, ) -> Result<(), ApiError> { // This Node is a mutable local copy: we will set it active so that we can use its // API client to reconcile with the node. 
The Node in [`Self::nodes`] will get updated @@ -1528,14 +1609,15 @@ impl Service { let tenant_id = create_req.new_tenant_id.tenant_id; // Exclude any concurrent attempts to create/access the same tenant ID - let _tenant_lock = self - .tenant_op_locks - .exclusive(create_req.new_tenant_id.tenant_id) - .await; - + let _tenant_lock = trace_exclusive_lock( + &self.tenant_op_locks, + create_req.new_tenant_id.tenant_id, + TenantOperations::Create, + ) + .await; let (response, waiters) = self.do_tenant_create(create_req).await?; - if let Err(e) = self.await_waiters(waiters, SHORT_RECONCILE_TIMEOUT).await { + if let Err(e) = self.await_waiters(waiters, RECONCILE_TIMEOUT).await { // Avoid deadlock: reconcile may fail while notifying compute, if the cloud control plane refuses to // accept compute notifications while it is in the process of creating. Reconciliation will // be retried in the background. @@ -1871,10 +1953,12 @@ impl Service { req: TenantLocationConfigRequest, ) -> Result { // We require an exclusive lock, because we are updating both persistent and in-memory state - let _tenant_lock = self - .tenant_op_locks - .exclusive(tenant_shard_id.tenant_id) - .await; + let _tenant_lock = trace_exclusive_lock( + &self.tenant_op_locks, + tenant_shard_id.tenant_id, + TenantOperations::LocationConfig, + ) + .await; if !tenant_shard_id.is_unsharded() { return Err(ApiError::BadRequest(anyhow::anyhow!( @@ -1992,7 +2076,12 @@ impl Service { pub(crate) async fn tenant_config_set(&self, req: TenantConfigRequest) -> Result<(), ApiError> { // We require an exclusive lock, because we are updating persistent and in-memory state - let _tenant_lock = self.tenant_op_locks.exclusive(req.tenant_id).await; + let _tenant_lock = trace_exclusive_lock( + &self.tenant_op_locks, + req.tenant_id, + TenantOperations::ConfigSet, + ) + .await; let tenant_id = req.tenant_id; let config = req.config; @@ -2081,7 +2170,12 @@ impl Service { timestamp: Cow<'_, str>, done_if_after: Cow<'_, str>, ) -> 
Result<(), ApiError> { - let _tenant_lock = self.tenant_op_locks.exclusive(tenant_id).await; + let _tenant_lock = trace_exclusive_lock( + &self.tenant_op_locks, + tenant_id, + TenantOperations::TimeTravelRemoteStorage, + ) + .await; let node = { let locked = self.inner.read().unwrap(); @@ -2172,7 +2266,12 @@ impl Service { tenant_id: TenantId, wait: Option, ) -> Result<(StatusCode, SecondaryProgress), ApiError> { - let _tenant_lock = self.tenant_op_locks.shared(tenant_id).await; + let _tenant_lock = trace_shared_lock( + &self.tenant_op_locks, + tenant_id, + TenantOperations::SecondaryDownload, + ) + .await; // Acquire lock and yield the collection of shard-node tuples which we will send requests onward to let targets = { @@ -2266,7 +2365,8 @@ impl Service { } pub(crate) async fn tenant_delete(&self, tenant_id: TenantId) -> Result { - let _tenant_lock = self.tenant_op_locks.exclusive(tenant_id).await; + let _tenant_lock = + trace_exclusive_lock(&self.tenant_op_locks, tenant_id, TenantOperations::Delete).await; self.ensure_attached_wait(tenant_id).await?; @@ -2366,7 +2466,14 @@ impl Service { req: TenantPolicyRequest, ) -> Result<(), ApiError> { // We require an exclusive lock, because we are updating persistent and in-memory state - let _tenant_lock = self.tenant_op_locks.exclusive(tenant_id).await; + let _tenant_lock = trace_exclusive_lock( + &self.tenant_op_locks, + tenant_id, + TenantOperations::UpdatePolicy, + ) + .await; + + failpoint_support::sleep_millis_async!("tenant-update-policy-exclusive-lock"); let TenantPolicyRequest { placement, @@ -2420,7 +2527,12 @@ impl Service { create_req.new_timeline_id, ); - let _tenant_lock = self.tenant_op_locks.shared(tenant_id).await; + let _tenant_lock = trace_shared_lock( + &self.tenant_op_locks, + tenant_id, + TenantOperations::TimelineCreate, + ) + .await; self.ensure_attached_wait(tenant_id).await?; @@ -2467,17 +2579,7 @@ impl Service { client .timeline_create(tenant_shard_id, &create_req) .await - .map_err(|e| match e 
{ - mgmt_api::Error::ApiError(status, msg) - if status == StatusCode::INTERNAL_SERVER_ERROR - || status == StatusCode::NOT_ACCEPTABLE => - { - // TODO: handle more error codes, e.g. 503 should be passed through. Make a general wrapper - // for pass-through API calls. - ApiError::InternalServerError(anyhow::anyhow!(msg)) - } - _ => ApiError::Conflict(format!("Failed to create timeline: {e}")), - }) + .map_err(|e| passthrough_api_error(&node, e)) } // Because the caller might not provide an explicit LSN, we must do the creation first on a single shard, and then @@ -2539,13 +2641,57 @@ impl Service { Ok(results) } + /// Concurrently invoke a pageserver API call on many shards at once + pub(crate) async fn tenant_for_shards_api( + &self, + locations: Vec<(TenantShardId, Node)>, + op: O, + warn_threshold: u32, + max_retries: u32, + timeout: Duration, + cancel: &CancellationToken, + ) -> Vec> + where + O: Fn(TenantShardId, PageserverClient) -> F + Copy, + F: std::future::Future>, + { + let mut futs = FuturesUnordered::new(); + let mut results = Vec::with_capacity(locations.len()); + + for (tenant_shard_id, node) in locations { + futs.push(async move { + node.with_client_retries( + |client| op(tenant_shard_id, client), + &self.config.jwt_token, + warn_threshold, + max_retries, + timeout, + cancel, + ) + .await + }); + } + + while let Some(r) = futs.next().await { + let r = r.unwrap_or(Err(mgmt_api::Error::Cancelled)); + results.push(r); + } + + results + } + pub(crate) async fn tenant_timeline_delete( &self, tenant_id: TenantId, timeline_id: TimelineId, ) -> Result { tracing::info!("Deleting timeline {}/{}", tenant_id, timeline_id,); - let _tenant_lock = self.tenant_op_locks.shared(tenant_id).await; + let _tenant_lock = trace_shared_lock( + &self.tenant_op_locks, + tenant_id, + TenantOperations::TimelineDelete, + ) + .await; self.ensure_attached_wait(tenant_id).await?; @@ -2744,7 +2890,7 @@ impl Service { let mut describe_shards = Vec::new(); for shard in shards { - if 
shard.tenant_shard_id.is_zero() { + if shard.tenant_shard_id.is_shard_zero() { shard_zero = Some(shard); } @@ -2752,7 +2898,14 @@ impl Service { tenant_shard_id: shard.tenant_shard_id, node_attached: *shard.intent.get_attached(), node_secondary: shard.intent.get_secondary().to_vec(), - last_error: shard.last_error.lock().unwrap().clone(), + last_error: shard + .last_error + .lock() + .unwrap() + .as_ref() + .map(|e| format!("{e}")) + .unwrap_or("".to_string()) + .clone(), is_reconciling: shard.reconciler.is_some(), is_pending_compute_notification: shard.pending_compute_notification, is_splitting: matches!(shard.splitting, SplitState::Splitting), @@ -2976,11 +3129,14 @@ impl Service { ) -> ( TenantShardSplitResponse, Vec<(TenantShardId, NodeId, ShardStripeSize)>, + Vec, ) { let mut response = TenantShardSplitResponse { new_shards: Vec::new(), }; let mut child_locations = Vec::new(); + let mut waiters = Vec::new(); + { let mut locked = self.inner.write().unwrap(); @@ -3059,14 +3215,112 @@ impl Service { tracing::warn!("Failed to schedule child shard {child}: {e}"); } // In the background, attach secondary locations for the new shards - self.maybe_reconcile_shard(&mut child_state, nodes); + if let Some(waiter) = self.maybe_reconcile_shard(&mut child_state, nodes) { + waiters.push(waiter); + } tenants.insert(child, child_state); response.new_shards.push(child); } } + (response, child_locations, waiters) + } + } - (response, child_locations) + async fn tenant_shard_split_start_secondaries( + &self, + tenant_id: TenantId, + waiters: Vec, + ) { + // Wait for initial reconcile of child shards, this creates the secondary locations + if let Err(e) = self.await_waiters(waiters, RECONCILE_TIMEOUT).await { + // This is not a failure to split: it's some issue reconciling the new child shards, perhaps + // their secondaries couldn't be attached. 
+ tracing::warn!("Failed to reconcile after split: {e}"); + return; + } + + // Take the state lock to discover the attached & secondary intents for all shards + let (attached, secondary) = { + let locked = self.inner.read().unwrap(); + let mut attached = Vec::new(); + let mut secondary = Vec::new(); + + for (tenant_shard_id, shard) in + locked.tenants.range(TenantShardId::tenant_range(tenant_id)) + { + let Some(node_id) = shard.intent.get_attached() else { + // Unexpected. Race with a PlacementPolicy change? + tracing::warn!( + "No attached node on {tenant_shard_id} immediately after shard split!" + ); + continue; + }; + + let Some(secondary_node_id) = shard.intent.get_secondary().first() else { + // No secondary location. Nothing for us to do. + continue; + }; + + let attached_node = locked + .nodes + .get(node_id) + .expect("Pageservers may not be deleted while referenced"); + + let secondary_node = locked + .nodes + .get(secondary_node_id) + .expect("Pageservers may not be deleted while referenced"); + + attached.push((*tenant_shard_id, attached_node.clone())); + secondary.push((*tenant_shard_id, secondary_node.clone())); + } + (attached, secondary) + }; + + if secondary.is_empty() { + // No secondary locations; nothing for us to do + return; + } + + for result in self + .tenant_for_shards_api( + attached, + |tenant_shard_id, client| async move { + client.tenant_heatmap_upload(tenant_shard_id).await + }, + 1, + 1, + SHORT_RECONCILE_TIMEOUT, + &self.cancel, + ) + .await + { + if let Err(e) = result { + tracing::warn!("Error calling heatmap upload after shard split: {e}"); + return; + } + } + + for result in self + .tenant_for_shards_api( + secondary, + |tenant_shard_id, client| async move { + client + .tenant_secondary_download(tenant_shard_id, Some(Duration::ZERO)) + .await + }, + 1, + 1, + SHORT_RECONCILE_TIMEOUT, + &self.cancel, + ) + .await + { + if let Err(e) = result { + tracing::warn!("Error calling secondary download after shard split: {e}"); + return; + 
} } } @@ -3077,7 +3331,12 @@ impl Service { ) -> Result { // TODO: return 503 if we get stuck waiting for this lock // (issue https://github.com/neondatabase/neon/issues/7108) - let _tenant_lock = self.tenant_op_locks.exclusive(tenant_id).await; + let _tenant_lock = trace_exclusive_lock( + &self.tenant_op_locks, + tenant_id, + TenantOperations::ShardSplit, + ) + .await; let new_shard_count = ShardCount::new(split_req.new_shard_count); let new_stripe_size = split_req.new_stripe_size; @@ -3095,8 +3354,8 @@ impl Service { .do_tenant_shard_split(tenant_id, shard_split_params) .await; - match r { - Ok(r) => Ok(r), + let (response, waiters) = match r { + Ok(r) => r, Err(e) => { // Split might be part-done, we must do work to abort it. tracing::warn!("Enqueuing background abort of split on {tenant_id}"); @@ -3109,9 +3368,17 @@ impl Service { }) // Ignore error sending: that just means we're shutting down: aborts are ephemeral so it's fine to drop it. .ok(); - Err(e) + return Err(e); } - } + }; + + // The split is now complete. As an optimization, we will trigger all the child shards to upload + // a heatmap immediately, and all their secondary locations to start downloading: this avoids waiting + // for the background heatmap/download interval before secondaries get warm enough to migrate shards + // in [`Self::optimize_all`] + self.tenant_shard_split_start_secondaries(tenant_id, waiters) + .await; + Ok(response) } fn prepare_tenant_shard_split( @@ -3261,7 +3528,7 @@ impl Service { &self, tenant_id: TenantId, params: ShardSplitParams, - ) -> Result { + ) -> Result<(TenantShardSplitResponse, Vec), ApiError> { // FIXME: we have dropped self.inner lock, and not yet written anything to the database: another // request could occur here, deleting or mutating the tenant. 
begin_shard_split checks that the // parent shards exist as expected, but it would be neater to do the above pre-checks within the @@ -3463,7 +3730,7 @@ impl Service { )); // Replace all the shards we just split with their children: this phase is infallible. - let (response, child_locations) = + let (response, child_locations, waiters) = self.tenant_shard_split_commit_inmem(tenant_id, new_shard_count, new_stripe_size); // Send compute notifications for all the new shards @@ -3490,7 +3757,7 @@ impl Service { } } - Ok(response) + Ok((response, waiters)) } pub(crate) async fn tenant_shard_migrate( @@ -3595,6 +3862,88 @@ impl Service { Ok(()) } + /// This is for debug/support only: assuming tenant data is already present in S3, we "create" a + /// tenant with a very high generation number so that it will see the existing data. + pub(crate) async fn tenant_import( + &self, + tenant_id: TenantId, + ) -> Result { + // Pick an arbitrary available pageserver to use for scanning the tenant in remote storage + let maybe_node = { + self.inner + .read() + .unwrap() + .nodes + .values() + .find(|n| n.is_available()) + .cloned() + }; + let Some(node) = maybe_node else { + return Err(ApiError::BadRequest(anyhow::anyhow!("No nodes available"))); + }; + + let client = PageserverClient::new( + node.get_id(), + node.base_url(), + self.config.jwt_token.as_deref(), + ); + + let scan_result = client + .tenant_scan_remote_storage(tenant_id) + .await + .map_err(|e| passthrough_api_error(&node, e))?; + + // A post-split tenant may contain a mixture of shard counts in remote storage: pick the highest count. 
+ let Some(shard_count) = scan_result + .shards + .iter() + .map(|s| s.tenant_shard_id.shard_count) + .max() + else { + return Err(ApiError::NotFound( + anyhow::anyhow!("No shards found").into(), + )); + }; + + // Ideally we would set each newly imported shard's generation independently, but for correctness it is sufficient + // to + let generation = scan_result + .shards + .iter() + .map(|s| s.generation) + .max() + .expect("We already validated >0 shards"); + + // FIXME: we have no way to recover the shard stripe size from contents of remote storage: this will + // only work if they were using the default stripe size. + let stripe_size = ShardParameters::DEFAULT_STRIPE_SIZE; + + let (response, waiters) = self + .do_tenant_create(TenantCreateRequest { + new_tenant_id: TenantShardId::unsharded(tenant_id), + generation, + + shard_parameters: ShardParameters { + count: shard_count, + stripe_size, + }, + placement_policy: Some(PlacementPolicy::Attached(0)), // No secondaries, for convenient debug/hacking + + // There is no way to know what the tenant's config was: revert to defaults + config: TenantConfig::default(), + }) + .await?; + + if let Err(e) = self.await_waiters(waiters, SHORT_RECONCILE_TIMEOUT).await { + // Since this is a debug/support operation, all kinds of weird issues are possible (e.g. this + // tenant doesn't exist in the control plane), so don't fail the request if it can't fully + // reconcile, as reconciliation includes notifying compute. + tracing::warn!(%tenant_id, "Reconcile not done yet while importing tenant ({e})"); + } + + Ok(response) + } + /// For debug/support: a full JSON dump of TenantShards. Returns a response so that /// we don't have to make TenantShard clonable in the return path. 
pub(crate) fn tenants_dump(&self) -> Result, ApiError> { @@ -3756,9 +4105,13 @@ impl Service { &self, register_req: NodeRegisterRequest, ) -> Result<(), ApiError> { - let _node_lock = self.node_op_locks.exclusive(register_req.node_id).await; + let _node_lock = trace_exclusive_lock( + &self.node_op_locks, + register_req.node_id, + NodeOperations::Register, + ) + .await; - // Pre-check for an already-existing node { let locked = self.inner.read().unwrap(); if let Some(node) = locked.nodes.get(®ister_req.node_id) { @@ -3845,7 +4198,8 @@ impl Service { availability: Option, scheduling: Option, ) -> Result<(), ApiError> { - let _node_lock = self.node_op_locks.exclusive(node_id).await; + let _node_lock = + trace_exclusive_lock(&self.node_op_locks, node_id, NodeOperations::Configure).await; if let Some(scheduling) = scheduling { // Scheduling is a persistent part of Node: we must write updates to the database before @@ -3978,7 +4332,7 @@ impl Service { // TODO: in the background, we should balance work back onto this pageserver } AvailabilityTransition::Unchanged => { - tracing::info!("Node {} no change during config", node_id); + tracing::debug!("Node {} no change during config", node_id); } } @@ -4053,20 +4407,64 @@ impl Service { Ok(()) } - /// Convenience wrapper around [`TenantShard::maybe_reconcile`] that provides - /// all the references to parts of Self that are needed + /// Wrap [`TenantShard`] reconciliation methods with acquisition of [`Gate`] and [`ReconcileUnits`], fn maybe_reconcile_shard( &self, shard: &mut TenantShard, nodes: &Arc>, ) -> Option { - shard.maybe_reconcile( + let reconcile_needed = shard.get_reconcile_needed(nodes); + + match reconcile_needed { + ReconcileNeeded::No => return None, + ReconcileNeeded::WaitExisting(waiter) => return Some(waiter), + ReconcileNeeded::Yes => { + // Fall through to try and acquire units for spawning reconciler + } + }; + + let units = match self.reconciler_concurrency.clone().try_acquire_owned() { + Ok(u) => 
ReconcileUnits::new(u), + Err(_) => { + tracing::info!(tenant_id=%shard.tenant_shard_id.tenant_id, shard_id=%shard.tenant_shard_id.shard_slug(), + "Concurrency limited: enqueued for reconcile later"); + if !shard.delayed_reconcile { + match self.delayed_reconcile_tx.try_send(shard.tenant_shard_id) { + Err(TrySendError::Closed(_)) => { + // Weird mid-shutdown case? + } + Err(TrySendError::Full(_)) => { + // It is safe to skip sending our ID in the channel: we will eventually get retried by the background reconcile task. + tracing::warn!( + "Many shards are waiting to reconcile: delayed_reconcile queue is full" + ); + } + Ok(()) => { + shard.delayed_reconcile = true; + } + } + } + + // We won't spawn a reconciler, but we will construct a waiter that waits for the shard's sequence + // number to advance. When this function is eventually called again and succeeds in getting units, + // it will spawn a reconciler that makes this waiter complete. + return Some(shard.future_reconcile_waiter()); + } + }; + + let Ok(gate_guard) = self.gate.enter() else { + // Gate closed: we're shutting down, drop out. + return None; + }; + + shard.spawn_reconciler( &self.result_tx, nodes, &self.compute_hook, &self.config, &self.persistence, - &self.gate, + units, + gate_guard, &self.cancel, ) } @@ -4074,7 +4472,9 @@ impl Service { /// Check all tenants for pending reconciliation work, and reconcile those in need. /// Additionally, reschedule tenants that require it. /// - /// Returns how many reconciliation tasks were started + /// Returns how many reconciliation tasks were started, or `1` if no reconciles were + /// spawned but some _would_ have been spawned if `reconciler_concurrency` units where + /// available. A return value of 0 indicates that everything is fully reconciled already. 
fn reconcile_all(&self) -> usize { let mut locked = self.inner.write().unwrap(); let (nodes, tenants, _scheduler) = locked.parts_mut(); @@ -4084,10 +4484,19 @@ impl Service { let mut reconciles_spawned = 0; for (tenant_shard_id, shard) in tenants.iter_mut() { - if tenant_shard_id.is_zero() { + if tenant_shard_id.is_shard_zero() { schedule_context = ScheduleContext::default(); } + // Skip checking if this shard is already enqueued for reconciliation + if shard.delayed_reconcile && self.reconciler_concurrency.available_permits() == 0 { + // If there is something delayed, then return a nonzero count so that + // callers like reconcile_all_now do not incorrectly get the impression + // that the system is in a quiescent state. + reconciles_spawned = std::cmp::max(1, reconciles_spawned); + continue; + } + // Eventual consistency: if an earlier reconcile job failed, and the shard is still // dirty, spawn another rone if self.maybe_reconcile_shard(shard, &pageservers).is_some() { @@ -4114,33 +4523,77 @@ impl Service { /// To put it more briefly: whereas the scheduler respects soft constraints in a ScheduleContext at /// the time of scheduling, this function looks for cases where a better-scoring location is available /// according to those same soft constraints. - fn optimize_all(&self) -> usize { - let mut locked = self.inner.write().unwrap(); - let (nodes, tenants, scheduler) = locked.parts_mut(); - let pageservers = nodes.clone(); - - let mut schedule_context = ScheduleContext::default(); - - let mut reconciles_spawned = 0; - - let mut tenant_shards: Vec<&TenantShard> = Vec::new(); - + async fn optimize_all(&self) -> usize { // Limit on how many shards' optmizations each call to this function will execute. Combined // with the frequency of background calls, this acts as an implicit rate limit that runs a small // trickle of optimizations in the background, rather than executing a large number in parallel // when a change occurs. 
- const MAX_OPTIMIZATIONS_PER_PASS: usize = 2; + const MAX_OPTIMIZATIONS_EXEC_PER_PASS: usize = 2; + + // Synchronous prepare: scan shards for possible scheduling optimizations + let candidate_work = self.optimize_all_plan(); + let candidate_work_len = candidate_work.len(); + + // Asynchronous validate: I/O to pageservers to make sure shards are in a good state to apply validation + let validated_work = self.optimize_all_validate(candidate_work).await; + + let was_work_filtered = validated_work.len() != candidate_work_len; + + // Synchronous apply: update the shards' intent states according to validated optimisations + let mut reconciles_spawned = 0; + let mut optimizations_applied = 0; + let mut locked = self.inner.write().unwrap(); + let (nodes, tenants, scheduler) = locked.parts_mut(); + for (tenant_shard_id, optimization) in validated_work { + let Some(shard) = tenants.get_mut(&tenant_shard_id) else { + // Shard was dropped between planning and execution; + continue; + }; + if shard.apply_optimization(scheduler, optimization) { + optimizations_applied += 1; + if self.maybe_reconcile_shard(shard, nodes).is_some() { + reconciles_spawned += 1; + } + } + + if optimizations_applied >= MAX_OPTIMIZATIONS_EXEC_PER_PASS { + break; + } + } + + if was_work_filtered { + // If we filtered any work out during validation, ensure we return a nonzero value to indicate + // to callers that the system is not in a truly quiet state, it's going to do some work as soon + // as these validations start passing. 
+ reconciles_spawned = std::cmp::max(reconciles_spawned, 1); + } + + reconciles_spawned + } + + fn optimize_all_plan(&self) -> Vec<(TenantShardId, ScheduleOptimization)> { + let mut schedule_context = ScheduleContext::default(); + + let mut tenant_shards: Vec<&TenantShard> = Vec::new(); + + // How many candidate optimizations we will generate, before evaluating them for readniess: setting + // this higher than the execution limit gives us a chance to execute some work even if the first + // few optimizations we find are not ready. + const MAX_OPTIMIZATIONS_PLAN_PER_PASS: usize = 8; let mut work = Vec::new(); + let mut locked = self.inner.write().unwrap(); + let (nodes, tenants, scheduler) = locked.parts_mut(); for (tenant_shard_id, shard) in tenants.iter() { - if tenant_shard_id.is_zero() { + if tenant_shard_id.is_shard_zero() { // Reset accumulators on the first shard in a tenant schedule_context = ScheduleContext::default(); + schedule_context.mode = ScheduleMode::Speculative; tenant_shards.clear(); } - if work.len() >= MAX_OPTIMIZATIONS_PER_PASS { + if work.len() >= MAX_OPTIMIZATIONS_PLAN_PER_PASS { break; } @@ -4212,18 +4665,105 @@ impl Service { } } - for (tenant_shard_id, optimization) in work { - let shard = tenants - .get_mut(&tenant_shard_id) - .expect("We held lock from place we got this ID"); - shard.apply_optimization(scheduler, optimization); + work + } - if self.maybe_reconcile_shard(shard, &pageservers).is_some() { - reconciles_spawned += 1; + async fn optimize_all_validate( + &self, + candidate_work: Vec<(TenantShardId, ScheduleOptimization)>, + ) -> Vec<(TenantShardId, ScheduleOptimization)> { + // Take a clone of the node map to use outside the lock in async validation phase + let validation_nodes = { self.inner.read().unwrap().nodes.clone() }; + + let mut want_secondary_status = Vec::new(); + + // Validate our plans: this is an async phase where we may do I/O to pageservers to + // check that the state of locations is acceptable to run the 
optimization, such as + // checking that a secondary location is sufficiently warmed-up to cleanly cut over + // in a live migration. + let mut validated_work = Vec::new(); + for (tenant_shard_id, optimization) in candidate_work { + match optimization.action { + ScheduleOptimizationAction::MigrateAttachment(MigrateAttachment { + old_attached_node_id: _, + new_attached_node_id, + }) => { + match validation_nodes.get(&new_attached_node_id) { + None => { + // Node was dropped between planning and validation + } + Some(node) => { + if !node.is_available() { + tracing::info!("Skipping optimization migration of {tenant_shard_id} to {new_attached_node_id} because node unavailable"); + } else { + // Accumulate optimizations that require fetching secondary status, so that we can execute these + // remote API requests concurrently. + want_secondary_status.push(( + tenant_shard_id, + node.clone(), + optimization, + )); + } + } + } + } + ScheduleOptimizationAction::ReplaceSecondary(_) => { + // No extra checks needed to replace a secondary: this does not interrupt client access + validated_work.push((tenant_shard_id, optimization)) + } + }; + } + + // Call into pageserver API to find out if the destination secondary location is warm enough for a reasonably smooth migration: we + // do this so that we avoid spawning a Reconciler that would have to wait minutes/hours for a destination to warm up: that reconciler + // would hold a precious reconcile semaphore unit the whole time it was waiting for the destination to warm up. 
+ let results = self + .tenant_for_shards_api( + want_secondary_status + .iter() + .map(|i| (i.0, i.1.clone())) + .collect(), + |tenant_shard_id, client| async move { + client.tenant_secondary_status(tenant_shard_id).await + }, + 1, + 1, + SHORT_RECONCILE_TIMEOUT, + &self.cancel, + ) + .await; + + for ((tenant_shard_id, node, optimization), secondary_status) in + want_secondary_status.into_iter().zip(results.into_iter()) + { + match secondary_status { + Err(e) => { + tracing::info!("Skipping migration of {tenant_shard_id} to {node}, error querying secondary: {e}"); + } + Ok(progress) => { + // We require secondary locations to have less than 10GiB of downloads pending before we will use + // them in an optimization + const DOWNLOAD_FRESHNESS_THRESHOLD: u64 = 10 * 1024 * 1024 * 1024; + + if progress.bytes_total == 0 + || progress.bytes_total < DOWNLOAD_FRESHNESS_THRESHOLD + && progress.bytes_downloaded != progress.bytes_total + || progress.bytes_total - progress.bytes_downloaded + > DOWNLOAD_FRESHNESS_THRESHOLD + { + tracing::info!("Skipping migration of {tenant_shard_id} to {node} because secondary isn't ready: {progress:?}"); + } else { + // Location looks ready: proceed + tracing::info!( + "{tenant_shard_id} secondary on {node} is warm enough for migration: {progress:?}" + ); + validated_work.push((tenant_shard_id, optimization)) + } + } } } - reconciles_spawned + validated_work } /// Useful for tests: run whatever work a background [`Self::reconcile_all`] would have done, but @@ -4231,10 +4771,12 @@ impl Service { /// put the system into a quiescent state where future background reconciliations won't do anything. 
pub(crate) async fn reconcile_all_now(&self) -> Result { let reconciles_spawned = self.reconcile_all(); - if reconciles_spawned == 0 { + let reconciles_spawned = if reconciles_spawned == 0 { // Only optimize when we are otherwise idle - self.optimize_all(); - } + self.optimize_all().await + } else { + reconciles_spawned + }; let waiters = { let mut waiters = Vec::new(); @@ -4248,8 +4790,27 @@ impl Service { }; let waiter_count = waiters.len(); - self.await_waiters(waiters, RECONCILE_TIMEOUT).await?; - Ok(waiter_count) + match self.await_waiters(waiters, RECONCILE_TIMEOUT).await { + Ok(()) => {} + Err(ReconcileWaitError::Failed(_, reconcile_error)) + if matches!(*reconcile_error, ReconcileError::Cancel) => + { + // Ignore reconciler cancel errors: this reconciler might have shut down + // because some other change superceded it. We will return a nonzero number, + // so the caller knows they might have to call again to quiesce the system. + } + Err(e) => { + return Err(e); + } + }; + + tracing::info!( + "{} reconciles in reconcile_all, {} waiters", + reconciles_spawned, + waiter_count + ); + + Ok(std::cmp::max(waiter_count, reconciles_spawned)) } pub async fn shutdown(&self) { diff --git a/storage_controller/src/tenant_shard.rs b/storage_controller/src/tenant_shard.rs index 58b8ef8d5d..dda17f9887 100644 --- a/storage_controller/src/tenant_shard.rs +++ b/storage_controller/src/tenant_shard.rs @@ -7,6 +7,7 @@ use std::{ use crate::{ metrics::{self, ReconcileCompleteLabelGroup, ReconcileOutcome}, persistence::TenantShardPersistence, + reconciler::ReconcileUnits, scheduler::{AffinityScore, MaySchedule, ScheduleContext}, }; use pageserver_api::controller_api::{PlacementPolicy, ShardSchedulingPolicy}; @@ -22,7 +23,7 @@ use utils::{ generation::Generation, id::NodeId, seqwait::{SeqWait, SeqWaitError}, - sync::gate::Gate, + sync::gate::GateGuard, }; use crate::{ @@ -37,12 +38,18 @@ use crate::{ }; /// Serialization helper -fn read_mutex_content(v: &std::sync::Mutex, 
serializer: S) -> Result +fn read_last_error(v: &std::sync::Mutex>, serializer: S) -> Result where S: serde::ser::Serializer, - T: Clone + std::fmt::Display, + T: std::fmt::Display, { - serializer.collect_str(&v.lock().unwrap()) + serializer.collect_str( + &v.lock() + .unwrap() + .as_ref() + .map(|e| format!("{e}")) + .unwrap_or("".to_string()), + ) } /// In-memory state for a particular tenant shard. @@ -95,6 +102,10 @@ pub(crate) struct TenantShard { /// reconciliation, and timeline creation. pub(crate) splitting: SplitState, + /// If a tenant was enqueued for later reconcile due to hitting concurrency limit, this flag + /// is set. This flag is cleared when the tenant is popped off the delay queue. + pub(crate) delayed_reconcile: bool, + /// Optionally wait for reconciliation to complete up to a particular /// sequence number. #[serde(skip)] @@ -106,15 +117,19 @@ pub(crate) struct TenantShard { #[serde(skip)] pub(crate) error_waiter: std::sync::Arc>, - /// The most recent error from a reconcile on this tenant + /// The most recent error from a reconcile on this tenant. This is a nested Arc + /// because: + /// - ReconcileWaiters need to Arc-clone the overall object to read it later + /// - ReconcileWaitError needs to use an `Arc` because we can construct + /// many waiters for one shard, and the underlying error types are not Clone. /// TODO: generalize to an array of recent events /// TOOD: use a ArcSwap instead of mutex for faster reads? - #[serde(serialize_with = "read_mutex_content")] - pub(crate) last_error: std::sync::Arc>, + #[serde(serialize_with = "read_last_error")] + pub(crate) last_error: std::sync::Arc>>>, /// If we have a pending compute notification that for some reason we weren't able to send, - /// set this to true. If this is set, calls to [`Self::maybe_reconcile`] will run a task to retry - /// sending it. This is the mechanism by which compute notifications are included in the scope + /// set this to true. 
If this is set, calls to [`Self::get_reconcile_needed`] will return Yes + /// and trigger a Reconciler run. This is the mechanism by which compute notifications are included in the scope /// of state that we publish externally in an eventually consistent way. pub(crate) pending_compute_notification: bool, @@ -288,18 +303,18 @@ pub(crate) struct ReconcilerWaiter { seq_wait: std::sync::Arc>, error_seq_wait: std::sync::Arc>, - error: std::sync::Arc>, + error: std::sync::Arc>>>, seq: Sequence, } #[derive(thiserror::Error, Debug)] -pub enum ReconcileWaitError { +pub(crate) enum ReconcileWaitError { #[error("Timeout waiting for shard {0}")] Timeout(TenantShardId), #[error("shutting down")] Shutdown, #[error("Reconcile error on shard {0}: {1}")] - Failed(TenantShardId, String), + Failed(TenantShardId, Arc), } #[derive(Eq, PartialEq, Debug)] @@ -310,18 +325,28 @@ pub(crate) struct ReplaceSecondary { #[derive(Eq, PartialEq, Debug)] pub(crate) struct MigrateAttachment { - old_attached_node_id: NodeId, - new_attached_node_id: NodeId, + pub(crate) old_attached_node_id: NodeId, + pub(crate) new_attached_node_id: NodeId, } #[derive(Eq, PartialEq, Debug)] -pub(crate) enum ScheduleOptimization { +pub(crate) enum ScheduleOptimizationAction { // Replace one of our secondary locations with a different node ReplaceSecondary(ReplaceSecondary), // Migrate attachment to an existing secondary location MigrateAttachment(MigrateAttachment), } +#[derive(Eq, PartialEq, Debug)] +pub(crate) struct ScheduleOptimization { + // What was the reconcile sequence when we generated this optimization? The optimization + // should only be applied if the shard's sequence is still at this value, in case other changes + // happened between planning the optimization and applying it. + sequence: Sequence, + + pub(crate) action: ScheduleOptimizationAction, +} + impl ReconcilerWaiter { pub(crate) async fn wait_timeout(&self, timeout: Duration) -> Result<(), ReconcileWaitError> { tokio::select! 
{ @@ -337,7 +362,8 @@ impl ReconcilerWaiter { SeqWaitError::Timeout => unreachable!() })?; - return Err(ReconcileWaitError::Failed(self.tenant_shard_id, self.error.lock().unwrap().clone())) + return Err(ReconcileWaitError::Failed(self.tenant_shard_id, + self.error.lock().unwrap().clone().expect("If error_seq_wait was advanced error was set").clone())) } } @@ -353,6 +379,17 @@ pub(crate) struct ReconcilerHandle { cancel: CancellationToken, } +pub(crate) enum ReconcileNeeded { + /// shard either doesn't need reconciliation, or is forbidden from spawning a reconciler + /// in its current state (e.g. shard split in progress, or ShardSchedulingPolicy forbids it) + No, + /// shard has a reconciler running, and its intent hasn't changed since that one was + /// spawned: wait for the existing reconciler rather than spawning a new one. + WaitExisting(ReconcilerWaiter), + /// shard needs reconciliation: call into [`TenantShard::spawn_reconciler`] + Yes, +} + /// When a reconcile task completes, it sends this result object /// to be applied to the primary TenantShard. 
pub(crate) struct ReconcileResult { @@ -396,6 +433,7 @@ impl TenantShard { reconciler: None, splitting: SplitState::Idle, sequence: Sequence(1), + delayed_reconcile: false, waiter: Arc::new(SeqWait::new(Sequence(0))), error_waiter: Arc::new(SeqWait::new(Sequence(0))), last_error: Arc::default(), @@ -647,10 +685,13 @@ impl TenantShard { "Identified optimization: migrate attachment {attached}->{preferred_node} (secondaries {:?})", self.intent.get_secondary() ); - return Some(ScheduleOptimization::MigrateAttachment(MigrateAttachment { - old_attached_node_id: attached, - new_attached_node_id: *preferred_node, - })); + return Some(ScheduleOptimization { + sequence: self.sequence, + action: ScheduleOptimizationAction::MigrateAttachment(MigrateAttachment { + old_attached_node_id: attached, + new_attached_node_id: *preferred_node, + }), + }); } } else { tracing::debug!( @@ -708,28 +749,37 @@ impl TenantShard { "Identified optimization: replace secondary {secondary}->{candidate_node} (current secondaries {:?})", self.intent.get_secondary() ); - return Some(ScheduleOptimization::ReplaceSecondary(ReplaceSecondary { - old_node_id: *secondary, - new_node_id: candidate_node, - })); + return Some(ScheduleOptimization { + sequence: self.sequence, + action: ScheduleOptimizationAction::ReplaceSecondary(ReplaceSecondary { + old_node_id: *secondary, + new_node_id: candidate_node, + }), + }); } } None } + /// Return true if the optimization was really applied: it will not be applied if the optimization's + /// sequence is behind this tenant shard's pub(crate) fn apply_optimization( &mut self, scheduler: &mut Scheduler, optimization: ScheduleOptimization, - ) { + ) -> bool { + if optimization.sequence != self.sequence { + return false; + } + metrics::METRICS_REGISTRY .metrics_group .storage_controller_schedule_optimization .inc(); - match optimization { - ScheduleOptimization::MigrateAttachment(MigrateAttachment { + match optimization.action { + 
ScheduleOptimizationAction::MigrateAttachment(MigrateAttachment { old_attached_node_id, new_attached_node_id, }) => { @@ -737,7 +787,7 @@ impl TenantShard { self.intent .promote_attached(scheduler, new_attached_node_id); } - ScheduleOptimization::ReplaceSecondary(ReplaceSecondary { + ScheduleOptimizationAction::ReplaceSecondary(ReplaceSecondary { old_node_id, new_node_id, }) => { @@ -745,6 +795,8 @@ impl TenantShard { self.intent.push_secondary(scheduler, new_node_id); } } + + true } /// Query whether the tenant's observed state for attached node matches its intent state, and if so, @@ -831,16 +883,10 @@ impl TenantShard { #[allow(clippy::too_many_arguments)] #[instrument(skip_all, fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug()))] - pub(crate) fn maybe_reconcile( + pub(crate) fn get_reconcile_needed( &mut self, - result_tx: &tokio::sync::mpsc::UnboundedSender, pageservers: &Arc>, - compute_hook: &Arc, - service_config: &service::Config, - persistence: &Arc, - gate: &Gate, - cancel: &CancellationToken, - ) -> Option { + ) -> ReconcileNeeded { // If there are any ambiguous observed states, and the nodes they refer to are available, // we should reconcile to clean them up. let mut dirty_observed = false; @@ -862,8 +908,8 @@ impl TenantShard { active_nodes_dirty || dirty_observed || self.pending_compute_notification; if !do_reconcile { - tracing::info!("Not dirty, no reconciliation needed."); - return None; + tracing::debug!("Not dirty, no reconciliation needed."); + return ReconcileNeeded::No; } // If we are currently splitting, then never start a reconciler task: the splitting logic @@ -871,7 +917,7 @@ impl TenantShard { // up top, so that we only log this message if we would otherwise have done a reconciliation. 
if !matches!(self.splitting, SplitState::Idle) { tracing::info!("Refusing to reconcile, splitting in progress"); - return None; + return ReconcileNeeded::No; } // Reconcile already in flight for the current sequence? @@ -881,7 +927,7 @@ impl TenantShard { "Reconciliation already in progress for sequence {:?}", self.sequence, ); - return Some(ReconcilerWaiter { + return ReconcileNeeded::WaitExisting(ReconcilerWaiter { tenant_shard_id: self.tenant_shard_id, seq_wait: self.waiter.clone(), error_seq_wait: self.error_waiter.clone(), @@ -900,10 +946,67 @@ impl TenantShard { // We only reach this point if there is work to do and we're going to skip // doing it: warn it obvious why this tenant isn't doing what it ought to. tracing::warn!("Skipping reconcile for policy {:?}", self.scheduling_policy); - return None; + return ReconcileNeeded::No; } } + ReconcileNeeded::Yes + } + + /// Ensure the sequence number is set to a value where waiting for this value will make us wait + /// for the next reconcile: i.e. it is ahead of all completed or running reconcilers. + /// + /// Constructing a ReconcilerWaiter with the resulting sequence number gives the property + /// that the waiter will not complete until some future Reconciler is constructed and run. + fn ensure_sequence_ahead(&mut self) { + // Find the highest sequence for which a Reconciler has previously run or is currently + // running + let max_seen = std::cmp::max( + self.reconciler + .as_ref() + .map(|r| r.sequence) + .unwrap_or(Sequence(0)), + std::cmp::max(self.waiter.load(), self.error_waiter.load()), + ); + + if self.sequence <= max_seen { + self.sequence = max_seen.next(); + } + } + + /// Create a waiter that will wait for some future Reconciler that hasn't been spawned yet. + /// + /// This is appropriate when you can't spawn a reconciler (e.g. due to resource limits), but + /// you would like to wait on the next reconciler that gets spawned in the background. 
+ pub(crate) fn future_reconcile_waiter(&mut self) -> ReconcilerWaiter { + self.ensure_sequence_ahead(); + + ReconcilerWaiter { + tenant_shard_id: self.tenant_shard_id, + seq_wait: self.waiter.clone(), + error_seq_wait: self.error_waiter.clone(), + error: self.last_error.clone(), + seq: self.sequence, + } + } + + #[allow(clippy::too_many_arguments)] + #[instrument(skip_all, fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug()))] + pub(crate) fn spawn_reconciler( + &mut self, + result_tx: &tokio::sync::mpsc::UnboundedSender, + pageservers: &Arc>, + compute_hook: &Arc, + service_config: &service::Config, + persistence: &Arc, + units: ReconcileUnits, + gate_guard: GateGuard, + cancel: &CancellationToken, + ) -> Option { + // Reconcile in flight for a stale sequence? Our sequence's task will wait for it before + // doing our sequence's work. + let old_handle = self.reconciler.take(); + // Build list of nodes from which the reconciler should detach let mut detach = Vec::new(); for node_id in self.observed.locations.keys() { @@ -919,18 +1022,9 @@ impl TenantShard { } } - // Reconcile in flight for a stale sequence? Our sequence's task will wait for it before - // doing our sequence's work. - let old_handle = self.reconciler.take(); - - let Ok(gate_guard) = gate.enter() else { - // Shutting down, don't start a reconciler - return None; - }; - // Advance the sequence before spawning a reconciler, so that sequence waiters // can distinguish between before+after the reconcile completes. 
- self.sequence = self.sequence.next(); + self.ensure_sequence_ahead(); let reconciler_cancel = cancel.child_token(); let reconciler_intent = TargetState::from_intent(pageservers, &self.intent); @@ -945,6 +1039,7 @@ impl TenantShard { compute_hook: compute_hook.clone(), service_config: service_config.clone(), _gate_guard: gate_guard, + _resource_units: units, cancel: reconciler_cancel.clone(), persistence: persistence.clone(), compute_notify_failure: false, @@ -1011,16 +1106,18 @@ impl TenantShard { status: outcome_label, }); - result_tx - .send(ReconcileResult { - sequence: reconcile_seq, - result, - tenant_shard_id: reconciler.tenant_shard_id, - generation: reconciler.generation, - observed: reconciler.observed, - pending_compute_notification: reconciler.compute_notify_failure, - }) - .ok(); + // Constructing result implicitly drops Reconciler, freeing any ReconcileUnits before the Service might + // try and schedule more work in response to our result. + let result = ReconcileResult { + sequence: reconcile_seq, + result, + tenant_shard_id: reconciler.tenant_shard_id, + generation: reconciler.generation, + observed: reconciler.observed, + pending_compute_notification: reconciler.compute_notify_failure, + }; + + result_tx.send(result).ok(); } .instrument(reconciler_span), ); @@ -1089,6 +1186,13 @@ impl TenantShard { &self.scheduling_policy } + pub(crate) fn set_last_error(&mut self, sequence: Sequence, error: ReconcileError) { + // Ordering: always set last_error before advancing sequence, so that sequence + // waiters are guaranteed to see a Some value when they see an error. 
+ *(self.last_error.lock().unwrap()) = Some(Arc::new(error)); + self.error_waiter.advance(sequence); + } + pub(crate) fn from_persistent( tsp: TenantShardPersistence, intent: IntentState, @@ -1111,6 +1215,7 @@ impl TenantShard { error_waiter: Arc::new(SeqWait::new(Sequence::initial())), last_error: Arc::default(), pending_compute_notification: false, + delayed_reconcile: false, scheduling_policy: serde_json::from_str(&tsp.scheduling_policy).unwrap(), }) } @@ -1347,10 +1452,13 @@ pub(crate) mod tests { // would be no other shards from the same tenant, and request to do so. assert_eq!( optimization_a, - Some(ScheduleOptimization::MigrateAttachment(MigrateAttachment { - old_attached_node_id: NodeId(1), - new_attached_node_id: NodeId(2) - })) + Some(ScheduleOptimization { + sequence: shard_a.sequence, + action: ScheduleOptimizationAction::MigrateAttachment(MigrateAttachment { + old_attached_node_id: NodeId(1), + new_attached_node_id: NodeId(2) + }) + }) ); // Note that these optimizing two shards in the same tenant with the same ScheduleContext is @@ -1361,10 +1469,13 @@ pub(crate) mod tests { let optimization_b = shard_b.optimize_attachment(&nodes, &schedule_context); assert_eq!( optimization_b, - Some(ScheduleOptimization::MigrateAttachment(MigrateAttachment { - old_attached_node_id: NodeId(1), - new_attached_node_id: NodeId(3) - })) + Some(ScheduleOptimization { + sequence: shard_b.sequence, + action: ScheduleOptimizationAction::MigrateAttachment(MigrateAttachment { + old_attached_node_id: NodeId(1), + new_attached_node_id: NodeId(3) + }) + }) ); // Applying these optimizations should result in the end state proposed @@ -1408,10 +1519,13 @@ pub(crate) mod tests { // same tenant should generate an optimization to move one away assert_eq!( optimization_a, - Some(ScheduleOptimization::ReplaceSecondary(ReplaceSecondary { - old_node_id: NodeId(3), - new_node_id: NodeId(4) - })) + Some(ScheduleOptimization { + sequence: shard_a.sequence, + action: 
ScheduleOptimizationAction::ReplaceSecondary(ReplaceSecondary { + old_node_id: NodeId(3), + new_node_id: NodeId(4) + }) + }) ); shard_a.apply_optimization(&mut scheduler, optimization_a.unwrap()); diff --git a/test_runner/fixtures/compute_reconfigure.py b/test_runner/fixtures/compute_reconfigure.py index 9dd66fe636..a883d94f73 100644 --- a/test_runner/fixtures/compute_reconfigure.py +++ b/test_runner/fixtures/compute_reconfigure.py @@ -14,10 +14,18 @@ class ComputeReconfigure: self.server = server self.control_plane_compute_hook_api = f"http://{server.host}:{server.port}/notify-attach" self.workloads = {} + self.on_notify = None def register_workload(self, workload): self.workloads[workload.tenant_id] = workload + def register_on_notify(self, fn): + """ + Add some extra work during a notification, like sleeping to slow things down, or + logging what was notified. + """ + self.on_notify = fn + @pytest.fixture(scope="function") def compute_reconfigure_listener(make_httpserver): @@ -43,6 +51,9 @@ def compute_reconfigure_listener(make_httpserver): body: dict[str, Any] = request.json log.info(f"notify-attach request: {body}") + if self.on_notify is not None: + self.on_notify(body) + try: workload = self.workloads[TenantId(body["tenant_id"])] except KeyError: diff --git a/test_runner/fixtures/metrics.py b/test_runner/fixtures/metrics.py index c615dd154f..7d34e12ca3 100644 --- a/test_runner/fixtures/metrics.py +++ b/test_runner/fixtures/metrics.py @@ -129,7 +129,7 @@ PAGESERVER_GLOBAL_METRICS: Tuple[str, ...] 
= ( "pageserver_getpage_reconstruct_seconds_sum", *[f"pageserver_basebackup_query_seconds_{x}" for x in ["bucket", "count", "sum"]], *histogram("pageserver_smgr_query_seconds_global"), - *histogram("pageserver_read_num_fs_layers"), + *histogram("pageserver_layers_visited_per_read_global"), *histogram("pageserver_getpage_get_reconstruct_data_seconds"), *histogram("pageserver_wait_lsn_seconds"), *histogram("pageserver_remote_operation_seconds"), diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 0e4a58c099..240b6ee199 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -499,6 +499,7 @@ class NeonEnvBuilder: self.config_init_force: Optional[str] = None self.top_output_dir = top_output_dir self.control_plane_compute_hook_api: Optional[str] = None + self.storage_controller_config: Optional[dict[Any, Any]] = None self.pageserver_virtual_file_io_engine: Optional[str] = pageserver_virtual_file_io_engine @@ -507,6 +508,16 @@ class NeonEnvBuilder: self.pageserver_get_vectored_impl = "vectored" log.debug('Overriding pageserver get_vectored_impl config to "vectored"') + self.pageserver_get_impl: Optional[str] = None + if os.getenv("PAGESERVER_GET_IMPL", "") == "vectored": + self.pageserver_get_impl = "vectored" + log.debug('Overriding pageserver get_impl config to "vectored"') + + self.pageserver_validate_vectored_get: Optional[bool] = None + if (validate := os.getenv("PAGESERVER_VALIDATE_VEC_GET")) is not None: + self.pageserver_validate_vectored_get = bool(validate) + log.debug(f'Overriding pageserver validate_vectored_get config to "{validate}"') + assert test_name.startswith( "test_" ), "Unexpectedly instantiated from outside a test function" @@ -1011,6 +1022,7 @@ class NeonEnv: self.pg_distrib_dir = config.pg_distrib_dir self.endpoint_counter = 0 self.pageserver_config_override = config.pageserver_config_override + self.storage_controller_config = config.storage_controller_config # 
generate initial tenant ID here instead of letting 'neon init' generate it, # so that we don't need to dig it out of the config file afterwards. @@ -1056,6 +1068,9 @@ class NeonEnv: if self.control_plane_compute_hook_api is not None: cfg["control_plane_compute_hook_api"] = self.control_plane_compute_hook_api + if self.storage_controller_config is not None: + cfg["storage_controller"] = self.storage_controller_config + # Create config for pageserver http_auth_type = "NeonJWT" if config.auth_enabled else "Trust" pg_auth_type = "NeonJWT" if config.auth_enabled else "Trust" @@ -1078,6 +1093,10 @@ class NeonEnv: ps_cfg["virtual_file_io_engine"] = self.pageserver_virtual_file_io_engine if config.pageserver_get_vectored_impl is not None: ps_cfg["get_vectored_impl"] = config.pageserver_get_vectored_impl + if config.pageserver_get_impl is not None: + ps_cfg["get_impl"] = config.pageserver_get_impl + if config.pageserver_validate_vectored_get is not None: + ps_cfg["validate_vectored_get"] = config.pageserver_validate_vectored_get # Create a corresponding NeonPageserver object self.pageservers.append( @@ -1085,7 +1104,6 @@ class NeonEnv: self, ps_id, port=pageserver_port, - config_override=self.pageserver_config_override, ) ) cfg["pageservers"].append(ps_cfg) @@ -1120,12 +1138,9 @@ class NeonEnv: # bounce through retries on startup self.storage_controller.start() - def storage_controller_ready(): - assert self.storage_controller.ready() is True - # Wait for storage controller readiness to prevent unnecessary post start-up # reconcile. 
- wait_until(30, 1, storage_controller_ready) + self.storage_controller.wait_until_ready() # Start up broker, pageserver and all safekeepers futs = [] @@ -1568,6 +1583,11 @@ class NeonCli(AbstractNeonCli): res.check_returncode() return tenant_id, timeline_id + def import_tenant(self, tenant_id: TenantId): + args = ["tenant", "import", "--tenant-id", str(tenant_id)] + res = self.raw_cli(args) + res.check_returncode() + def set_default(self, tenant_id: TenantId): """ Update default tenant for future operations that require tenant_id. @@ -1781,6 +1801,7 @@ class NeonCli(AbstractNeonCli): hot_standby: bool = False, lsn: Optional[Lsn] = None, pageserver_id: Optional[int] = None, + allow_multiple=False, ) -> "subprocess.CompletedProcess[str]": args = [ "endpoint", @@ -1804,6 +1825,8 @@ class NeonCli(AbstractNeonCli): args.extend(["--hot-standby", "true"]) if pageserver_id is not None: args.extend(["--pageserver-id", str(pageserver_id)]) + if allow_multiple: + args.extend(["--allow-multiple"]) res = self.raw_cli(args) res.check_returncode() @@ -1815,6 +1838,7 @@ class NeonCli(AbstractNeonCli): safekeepers: Optional[List[int]] = None, remote_ext_config: Optional[str] = None, pageserver_id: Optional[int] = None, + allow_multiple=False, ) -> "subprocess.CompletedProcess[str]": args = [ "endpoint", @@ -1829,6 +1853,8 @@ class NeonCli(AbstractNeonCli): args.append(endpoint_id) if pageserver_id is not None: args.extend(["--pageserver-id", str(pageserver_id)]) + if allow_multiple: + args.extend(["--allow-multiple"]) res = self.raw_cli(args) res.check_returncode() @@ -1938,6 +1964,55 @@ class Pagectl(AbstractNeonCli): return IndexPartDump.from_json(parsed) +class LogUtils: + """ + A mixin class which provides utilities for inspecting the logs of a service. 
+ """ + + def __init__(self, logfile: Path) -> None: + self.logfile = logfile + + def assert_log_contains( + self, pattern: str, offset: None | LogCursor = None + ) -> Tuple[str, LogCursor]: + """Convenient for use inside wait_until()""" + + res = self.log_contains(pattern, offset=offset) + assert res is not None + return res + + def log_contains( + self, pattern: str, offset: None | LogCursor = None + ) -> Optional[Tuple[str, LogCursor]]: + """Check that the log contains a line that matches the given regex""" + logfile = self.logfile + if not logfile.exists(): + log.warning(f"Skipping log check: {logfile} does not exist") + return None + + contains_re = re.compile(pattern) + + # XXX: Our rust logging machinery buffers the messages, so if you + # call this function immediately after it's been logged, there is + # no guarantee it is already present in the log file. This hasn't + # been a problem in practice, our python tests are not fast enough + # to hit that race condition. + skip_until_line_no = 0 if offset is None else offset._line_no + cur_line_no = 0 + with logfile.open("r") as f: + for line in f: + if cur_line_no < skip_until_line_no: + cur_line_no += 1 + continue + elif contains_re.search(line): + # found it! 
+ cur_line_no += 1 + return (line, LogCursor(cur_line_no)) + else: + cur_line_no += 1 + return None + + class StorageControllerApiException(Exception): def __init__(self, message, status_code: int): super().__init__(message) @@ -1945,12 +2020,13 @@ class StorageControllerApiException(Exception): self.status_code = status_code -class NeonStorageController(MetricsGetter): +class NeonStorageController(MetricsGetter, LogUtils): def __init__(self, env: NeonEnv, auth_enabled: bool): self.env = env self.running = False self.auth_enabled = auth_enabled self.allowed_errors: list[str] = DEFAULT_STORAGE_CONTROLLER_ALLOWED_ERRORS + self.logfile = self.workdir / "storage_controller.log" def start(self): assert not self.running @@ -2024,6 +2100,15 @@ class NeonStorageController(MetricsGetter): else: raise RuntimeError(f"Unexpected status {status} from readiness endpoint") + def wait_until_ready(self): + t1 = time.time() + + def storage_controller_ready(): + assert self.ready() is True + + wait_until(30, 1, storage_controller_ready) + return time.time() - t1 + def attach_hook_issue( self, tenant_shard_id: Union[TenantId, TenantShardId], pageserver_id: int ) -> int: @@ -2111,7 +2196,7 @@ class NeonStorageController(MetricsGetter): shard_count: Optional[int] = None, shard_stripe_size: Optional[int] = None, tenant_config: Optional[Dict[Any, Any]] = None, - placement_policy: Optional[str] = None, + placement_policy: Optional[Union[Dict[Any, Any] | str]] = None, ): """ Use this rather than pageserver_api() when you need to include shard parameters @@ -2200,6 +2285,13 @@ class NeonStorageController(MetricsGetter): headers=self.headers(TokenScope.ADMIN), ) + def tenant_import(self, tenant_id: TenantId): + self.request( + "POST", + f"{self.env.storage_controller_api}/debug/v1/tenant/{tenant_id}/import", + headers=self.headers(TokenScope.ADMIN), + ) + def reconcile_all(self): r = self.request( "POST", @@ -2214,10 +2306,21 @@ class NeonStorageController(MetricsGetter): def 
reconcile_until_idle(self, timeout_secs=30): start_at = time.time() n = 1 + delay_sec = 0.5 + delay_max = 5 while n > 0: n = self.reconcile_all() - if time.time() - start_at > timeout_secs: + if n == 0: + break + elif time.time() - start_at > timeout_secs: raise RuntimeError("Timeout in reconcile_until_idle") + else: + # Don't call again right away: if we're waiting for many reconciles that + # are blocked on the concurrency limit, it slows things down to call + # reconcile_all frequently. + time.sleep(delay_sec) + delay_sec *= 2 + delay_sec = min(delay_sec, delay_max) def consistency_check(self): """ @@ -2247,6 +2350,10 @@ class NeonStorageController(MetricsGetter): log.info(f"Got failpoints request response code {res.status_code}") res.raise_for_status() + @property + def workdir(self) -> Path: + return self.env.repo_dir + def __enter__(self) -> "NeonStorageController": return self @@ -2264,24 +2371,21 @@ class LogCursor: _line_no: int -class NeonPageserver(PgProtocol): +class NeonPageserver(PgProtocol, LogUtils): """ An object representing a running pageserver. """ TEMP_FILE_SUFFIX = "___temp" - def __init__( - self, env: NeonEnv, id: int, port: PageserverPort, config_override: Optional[str] = None - ): + def __init__(self, env: NeonEnv, id: int, port: PageserverPort): super().__init__(host="localhost", port=port.pg, user="cloud_admin") self.env = env self.id = id self.running = False self.service_port = port - self.config_override = config_override self.version = env.get_binary_version("pageserver") - + self.logfile = self.workdir / "pageserver.log" # After a test finishes, we will scrape the log to see if there are any # unexpected error messages. If your test expects an error, add it to # 'allowed_errors' in the test with something like: @@ -2291,20 +2395,24 @@ class NeonPageserver(PgProtocol): # The entries in the list are regular experessions. 
self.allowed_errors: List[str] = list(DEFAULT_PAGESERVER_ALLOWED_ERRORS) - def timeline_dir(self, tenant_id: TenantId, timeline_id: Optional[TimelineId] = None) -> Path: + def timeline_dir( + self, + tenant_shard_id: Union[TenantId, TenantShardId], + timeline_id: Optional[TimelineId] = None, + ) -> Path: """Get a timeline directory's path based on the repo directory of the test environment""" if timeline_id is None: - return self.tenant_dir(tenant_id) / "timelines" - return self.tenant_dir(tenant_id) / "timelines" / str(timeline_id) + return self.tenant_dir(tenant_shard_id) / "timelines" + return self.tenant_dir(tenant_shard_id) / "timelines" / str(timeline_id) def tenant_dir( self, - tenant_id: Optional[TenantId] = None, + tenant_shard_id: Optional[Union[TenantId, TenantShardId]] = None, ) -> Path: """Get a tenant directory's path based on the repo directory of the test environment""" - if tenant_id is None: + if tenant_shard_id is None: return self.workdir / "tenants" - return self.workdir / "tenants" / str(tenant_id) + return self.workdir / "tenants" / str(tenant_shard_id) def start( self, @@ -2417,44 +2525,6 @@ class NeonPageserver(PgProtocol): value = self.http_client().get_metric_value(metric) assert value == 0, f"Nonzero {metric} == {value}" - def assert_log_contains( - self, pattern: str, offset: None | LogCursor = None - ) -> Tuple[str, LogCursor]: - """Convenient for use inside wait_until()""" - - res = self.log_contains(pattern, offset=offset) - assert res is not None - return res - - def log_contains( - self, pattern: str, offset: None | LogCursor = None - ) -> Optional[Tuple[str, LogCursor]]: - """Check that the pageserver log contains a line that matches the given regex""" - logfile = self.workdir / "pageserver.log" - if not logfile.exists(): - log.warning(f"Skipping log check: {logfile} does not exist") - return None - - contains_re = re.compile(pattern) - - # XXX: Our rust logging machinery buffers the messages, so if you - # call this function 
immediately after it's been logged, there is - # no guarantee it is already present in the log file. This hasn't - # been a problem in practice, our python tests are not fast enough - # to hit that race condition. - skip_until_line_no = 0 if offset is None else offset._line_no - cur_line_no = 0 - with logfile.open("r") as f: - for line in f: - if cur_line_no < skip_until_line_no: - cur_line_no += 1 - continue - if contains_re.search(line): - # found it! - cur_line_no += 1 - return (line, LogCursor(cur_line_no)) - return None - def tenant_attach( self, tenant_id: TenantId, @@ -2489,8 +2559,10 @@ class NeonPageserver(PgProtocol): client = self.http_client() return client.tenant_location_conf(tenant_id, config, **kwargs) - def read_tenant_location_conf(self, tenant_id: TenantId) -> dict[str, Any]: - path = self.tenant_dir(tenant_id) / "config-v1" + def read_tenant_location_conf( + self, tenant_shard_id: Union[TenantId, TenantShardId] + ) -> dict[str, Any]: + path = self.tenant_dir(tenant_shard_id) / "config-v1" log.info(f"Reading location conf from {path}") bytes = open(path, "r").read() try: @@ -3233,6 +3305,7 @@ class Endpoint(PgProtocol): lsn: Optional[Lsn] = None, config_lines: Optional[List[str]] = None, pageserver_id: Optional[int] = None, + allow_multiple: bool = False, ) -> "Endpoint": """ Create a new Postgres endpoint. @@ -3255,6 +3328,7 @@ class Endpoint(PgProtocol): pg_port=self.pg_port, http_port=self.http_port, pageserver_id=pageserver_id, + allow_multiple=allow_multiple, ) path = Path("endpoints") / self.endpoint_id / "pgdata" self.pgdata_dir = os.path.join(self.env.repo_dir, path) @@ -3271,7 +3345,10 @@ class Endpoint(PgProtocol): return self def start( - self, remote_ext_config: Optional[str] = None, pageserver_id: Optional[int] = None + self, + remote_ext_config: Optional[str] = None, + pageserver_id: Optional[int] = None, + allow_multiple: bool = False, ) -> "Endpoint": """ Start the Postgres instance. 
@@ -3287,6 +3364,7 @@ class Endpoint(PgProtocol): safekeepers=self.active_safekeepers, remote_ext_config=remote_ext_config, pageserver_id=pageserver_id, + allow_multiple=allow_multiple, ) self.running = True @@ -3416,6 +3494,7 @@ class Endpoint(PgProtocol): config_lines: Optional[List[str]] = None, remote_ext_config: Optional[str] = None, pageserver_id: Optional[int] = None, + allow_multiple=False, ) -> "Endpoint": """ Create an endpoint, apply config, and start Postgres. @@ -3431,7 +3510,12 @@ class Endpoint(PgProtocol): hot_standby=hot_standby, lsn=lsn, pageserver_id=pageserver_id, - ).start(remote_ext_config=remote_ext_config, pageserver_id=pageserver_id) + allow_multiple=allow_multiple, + ).start( + remote_ext_config=remote_ext_config, + pageserver_id=pageserver_id, + allow_multiple=allow_multiple, + ) log.info(f"Postgres startup took {time.time() - started_at} seconds") @@ -3694,13 +3778,15 @@ class S3Scrubber: log.warning(f"Scrub environment: {env}") log.warning(f"Output at: {output_path}") - raise RuntimeError("Remote storage scrub failed") + raise RuntimeError(f"Scrubber failed while running {args}") assert stdout is not None return stdout def scan_metadata(self) -> Any: - stdout = self.scrubber_cli(["scan-metadata", "--json"], timeout=30) + stdout = self.scrubber_cli( + ["scan-metadata", "--node-kind", "pageserver", "--json"], timeout=30 + ) try: return json.loads(stdout) @@ -3709,6 +3795,13 @@ class S3Scrubber: log.error(stdout) raise + def tenant_snapshot(self, tenant_id: TenantId, output_path: Path): + stdout = self.scrubber_cli( + ["tenant-snapshot", "--tenant-id", str(tenant_id), "--output-path", str(output_path)], + timeout=30, + ) + log.info(f"tenant-snapshot output: {stdout}") + def _get_test_dir(request: FixtureRequest, top_output_dir: Path, prefix: str) -> Path: """Compute the path to a working directory for an individual test.""" diff --git a/test_runner/fixtures/pageserver/http.py b/test_runner/fixtures/pageserver/http.py index 
b899b0dac8..231ffd898e 100644 --- a/test_runner/fixtures/pageserver/http.py +++ b/test_runner/fixtures/pageserver/http.py @@ -293,7 +293,6 @@ class PageserverHttpClient(requests.Session, MetricsGetter): lazy: Optional[bool] = None, ): body = location_conf.copy() - body["tenant_id"] = str(tenant_id) params = {} if flush_ms is not None: diff --git a/test_runner/fixtures/remote_storage.py b/test_runner/fixtures/remote_storage.py index 60591d8d46..83f9f26837 100644 --- a/test_runner/fixtures/remote_storage.py +++ b/test_runner/fixtures/remote_storage.py @@ -252,8 +252,11 @@ class S3Storage: log.info(f"deleted {cnt} objects from remote storage") + def tenants_path(self) -> str: + return f"{self.prefix_in_bucket}/tenants" + def tenant_path(self, tenant_id: TenantId) -> str: - return f"{self.prefix_in_bucket}/tenants/{tenant_id}" + return f"{self.tenants_path()}/{tenant_id}" def heatmap_key(self, tenant_id: TenantId) -> str: return f"{self.tenant_path(tenant_id)}/{TENANT_HEATMAP_FILE_NAME}" @@ -262,6 +265,9 @@ class S3Storage: r = self.client.get_object(Bucket=self.bucket_name, Key=self.heatmap_key(tenant_id)) return json.loads(r["Body"].read().decode("utf-8")) + def mock_remote_tenant_path(self, tenant_id: TenantId): + assert self.real is False + RemoteStorage = Union[LocalFsStorage, S3Storage] diff --git a/test_runner/fixtures/types.py b/test_runner/fixtures/types.py index 80c9b9ce9a..b5458b5c26 100644 --- a/test_runner/fixtures/types.py +++ b/test_runner/fixtures/types.py @@ -156,7 +156,11 @@ class TenantShardId: raise ValueError(f"Invalid TenantShardId '{input}'") def __str__(self): - return f"{self.tenant_id}-{self.shard_number:02x}{self.shard_count:02x}" + if self.shard_count > 0: + return f"{self.tenant_id}-{self.shard_number:02x}{self.shard_count:02x}" + else: + # Unsharded case: equivalent of Rust TenantShardId::unsharded(tenant_id) + return str(self.tenant_id) def __repr__(self): return self.__str__() diff --git 
a/test_runner/performance/test_storage_controller_scale.py b/test_runner/performance/test_storage_controller_scale.py new file mode 100644 index 0000000000..17dc96dabe --- /dev/null +++ b/test_runner/performance/test_storage_controller_scale.py @@ -0,0 +1,198 @@ +import concurrent.futures +import random +import time + +import pytest +from fixtures.compute_reconfigure import ComputeReconfigure +from fixtures.log_helper import log +from fixtures.neon_fixtures import ( + NeonEnvBuilder, +) +from fixtures.pageserver.http import PageserverHttpClient +from fixtures.pg_version import PgVersion +from fixtures.types import TenantId, TenantShardId, TimelineId + + +@pytest.mark.timeout(3600) # super long running test: should go down as we optimize +def test_storage_controller_many_tenants( + neon_env_builder: NeonEnvBuilder, compute_reconfigure_listener: ComputeReconfigure +): + """ + Check that we cope well with a not-totally-trivial number of tenants. + + This is checking for: + - Obvious concurrency bugs from issuing many tenant creations/modifications + concurrently. + - Obvious scaling bugs like O(N^2) scaling that would be so slow that even + a basic test starts failing from slowness. + + This is _not_ a comprehensive scale test: just a basic sanity check that + we don't fall over for a thousand shards. + """ + + neon_env_builder.num_pageservers = 5 + neon_env_builder.storage_controller_config = { + # Default neon_local uses a small timeout: use a longer one to tolerate longer pageserver restarts. + # TODO: tune this down as restarts get faster (https://github.com/neondatabase/neon/pull/7553), to + # guard against regressions in restart time. 
+ "max_unavailable": "300s" + } + neon_env_builder.control_plane_compute_hook_api = ( + compute_reconfigure_listener.control_plane_compute_hook_api + ) + + # A small sleep on each call into the notify hook, to simulate the latency of doing a database write + compute_reconfigure_listener.register_on_notify(lambda body: time.sleep(0.01)) + + env = neon_env_builder.init_start() + + # We will intentionally stress reconciler concurrrency, which triggers a warning when lots + # of shards are hitting the delayed path. + env.storage_controller.allowed_errors.append(".*Many shards are waiting to reconcile") + + for ps in env.pageservers: + # This can happen because when we do a loop over all pageservers and mark them offline/active, + # reconcilers might get cancelled, and the next reconcile can follow a not-so-elegant path of + # bumping generation before other attachments are detached. + # + # We could clean this up by making reconcilers respect the .observed of their predecessor, if + # we spawn with a wait for the predecessor. + ps.allowed_errors.append(".*Dropped remote consistent LSN updates.*") + + # Storage controller is allowed to drop pageserver requests when the cancellation token + # for a Reconciler fires. 
+ ps.allowed_errors.append(".*request was dropped before completing.*") + + # Total tenants + tenant_count = 4000 + + # Shards per tenant + shard_count = 2 + stripe_size = 1024 + + tenants = set(TenantId.generate() for _i in range(0, tenant_count)) + + virtual_ps_http = PageserverHttpClient(env.storage_controller_port, lambda: True) + + def check_memory(): + # Shards should be cheap_ in memory, as we will have very many of them + expect_memory_per_shard = 128 * 1024 + + rss = env.storage_controller.get_metric_value("process_resident_memory_bytes") + assert rss is not None + log.info(f"Resident memory: {rss} ({ rss / (shard_count * tenant_count)} per shard)") + assert rss < expect_memory_per_shard * shard_count * tenant_count + + # We use a fixed seed to make the test somewhat reproducible: we want a randomly + # chosen order in the sense that it's arbitrary, but not in the sense that it should change every run. + rng = random.Random(1234) + + # Issue more concurrent operations than the storage controller's reconciler concurrency semaphore + # permits, to ensure that we are exercising stressing that. + api_concurrency = 135 + + # We will create tenants directly via API, not via neon_local, to avoid any false + # serialization of operations in neon_local (it e.g. 
loads/saves a config file on each call) + with concurrent.futures.ThreadPoolExecutor(max_workers=api_concurrency) as executor: + futs = [] + t1 = time.time() + for tenant_id in tenants: + f = executor.submit( + env.storage_controller.tenant_create, + tenant_id, + shard_count, + stripe_size, + placement_policy={"Attached": 1}, + ) + futs.append(f) + + # Wait for creations to finish + for f in futs: + f.result() + log.info( + f"Created {len(tenants)} tenants in {time.time() - t1}, {len(tenants) / (time.time() - t1)}/s" + ) + + run_ops = api_concurrency * 4 + assert run_ops < len(tenants) + op_tenants = list(tenants)[0:run_ops] + + # Generate a mixture of operations and dispatch them all concurrently + futs = [] + for tenant_id in op_tenants: + op = rng.choice([0, 1, 2]) + if op == 0: + # A fan-out write operation to all shards in a tenant (timeline creation) + f = executor.submit( + virtual_ps_http.timeline_create, + PgVersion.NOT_SET, + tenant_id, + TimelineId.generate(), + ) + elif op == 1: + # A reconciler operation: migrate a shard. 
+ shard_number = rng.randint(0, shard_count - 1) + tenant_shard_id = TenantShardId(tenant_id, shard_number, shard_count) + dest_ps_id = rng.choice([ps.id for ps in env.pageservers]) + f = executor.submit( + env.storage_controller.tenant_shard_migrate, tenant_shard_id, dest_ps_id + ) + elif op == 2: + # A passthrough read to shard zero + f = executor.submit(virtual_ps_http.tenant_status, tenant_id) + + futs.append(f) + + # Wait for mixed ops to finish + for f in futs: + f.result() + + # Consistency check is safe here: all the previous operations waited for reconcile before completing + env.storage_controller.consistency_check() + check_memory() + + # This loop waits for reconcile_all to indicate no pending work, and then calls it once more to time + # how long the call takes when idle: this iterates over shards while doing no I/O and should be reliably fast: if + # it isn't, that's a sign that we have made some algorithmic mistake (e.g. O(N**2) scheduling) + # + # We do not require that the system is quiescent already here, although at present in this point in the test + # that may be the case. + while True: + t1 = time.time() + reconcilers = env.storage_controller.reconcile_all() + if reconcilers == 0: + # Time how long a no-op background reconcile takes: this measures how long it takes to + # loop over all the shards looking for work to do. + runtime = time.time() - t1 + log.info(f"No-op call to reconcile_all took {runtime}s") + assert runtime < 1 + break + + # Restart the storage controller + env.storage_controller.stop() + env.storage_controller.start() + + # See how long the controller takes to pass its readiness check. This should be fast because + # all the nodes are online: offline pageservers are the only thing that's allowed to delay + # startup. 
+ readiness_period = env.storage_controller.wait_until_ready() + assert readiness_period < 5 + + # Consistency check is safe here: the storage controller's restart should not have caused any reconcilers + # to run, as it was in a stable state before restart. If it did, that's a bug. + env.storage_controller.consistency_check() + check_memory() + + # Restart pageservers: this exercises the /re-attach API + for pageserver in env.pageservers: + pageserver.stop() + pageserver.start() + + # Consistency check is safe here: restarting pageservers should not have caused any Reconcilers to spawn, + # as they were not offline long enough to trigger any scheduling changes. + env.storage_controller.consistency_check() + check_memory() + + # Stop the storage controller before tearing down fixtures, because it otherwise might log + # errors trying to call our `ComputeReconfigure`. + env.storage_controller.stop() diff --git a/test_runner/regress/test_attach_tenant_config.py b/test_runner/regress/test_attach_tenant_config.py index 909d25980b..59461cc095 100644 --- a/test_runner/regress/test_attach_tenant_config.py +++ b/test_runner/regress/test_attach_tenant_config.py @@ -190,6 +190,7 @@ def test_fully_custom_config(positive_env: NeonEnv): "trace_read_requests": True, "walreceiver_connect_timeout": "13m", "image_layer_creation_check_threshold": 1, + "switch_to_aux_file_v2": True, } ps_http = env.pageserver.http_client() diff --git a/test_runner/regress/test_bad_connection.py b/test_runner/regress/test_bad_connection.py index c808fa0f54..82a3a05c2b 100644 --- a/test_runner/regress/test_bad_connection.py +++ b/test_runner/regress/test_bad_connection.py @@ -1,10 +1,13 @@ import random import time +import psycopg2.errors +import pytest from fixtures.log_helper import log from fixtures.neon_fixtures import NeonEnvBuilder +@pytest.mark.timeout(600) def test_compute_pageserver_connection_stress(neon_env_builder: NeonEnvBuilder): env = neon_env_builder.init_start() 
env.pageserver.allowed_errors.append(".*simulated connection error.*") @@ -20,12 +23,20 @@ def test_compute_pageserver_connection_stress(neon_env_builder: NeonEnvBuilder): pg_conn = endpoint.connect() cur = pg_conn.cursor() + def execute_retry_on_timeout(query): + while True: + try: + cur.execute(query) + return + except psycopg2.errors.QueryCanceled: + log.info(f"Query '{query}' timed out - retrying") + # Create table, and insert some rows. Make it big enough that it doesn't fit in # shared_buffers, otherwise the SELECT after restart will just return answer # from shared_buffers without hitting the page server, which defeats the point # of this test. - cur.execute("CREATE TABLE foo (t text)") - cur.execute( + execute_retry_on_timeout("CREATE TABLE foo (t text)") + execute_retry_on_timeout( """ INSERT INTO foo SELECT 'long string to consume some space' || g @@ -34,7 +45,7 @@ def test_compute_pageserver_connection_stress(neon_env_builder: NeonEnvBuilder): ) # Verify that the table is larger than shared_buffers - cur.execute( + execute_retry_on_timeout( """ select setting::int * pg_size_bytes(unit) as shared_buffers, pg_relation_size('foo') as tbl_size from pg_settings where name = 'shared_buffers' @@ -45,16 +56,16 @@ def test_compute_pageserver_connection_stress(neon_env_builder: NeonEnvBuilder): log.info(f"shared_buffers is {row[0]}, table size {row[1]}") assert int(row[0]) < int(row[1]) - cur.execute("SELECT count(*) FROM foo") + execute_retry_on_timeout("SELECT count(*) FROM foo") assert cur.fetchone() == (100000,) end_time = time.time() + 30 times_executed = 0 while time.time() < end_time: if random.random() < 0.5: - cur.execute("INSERT INTO foo VALUES ('stas'), ('heikki')") + execute_retry_on_timeout("INSERT INTO foo VALUES ('stas'), ('heikki')") else: - cur.execute("SELECT t FROM foo ORDER BY RANDOM() LIMIT 10") + execute_retry_on_timeout("SELECT t FROM foo ORDER BY RANDOM() LIMIT 10") cur.fetchall() times_executed += 1 log.info(f"Workload executed 
{times_executed} times") diff --git a/test_runner/regress/test_branching.py b/test_runner/regress/test_branching.py index 5b69649007..9fe9f77fea 100644 --- a/test_runner/regress/test_branching.py +++ b/test_runner/regress/test_branching.py @@ -1,6 +1,7 @@ import random import threading import time +from concurrent.futures import ThreadPoolExecutor from typing import List import pytest @@ -405,6 +406,29 @@ def test_duplicate_creation(neon_env_builder: NeonEnvBuilder): assert len(ps_http.timeline_list(tenant_id=env.initial_tenant)) == 1 +def test_branching_while_stuck_find_gc_cutoffs(neon_env_builder: NeonEnvBuilder): + env = neon_env_builder.init_start() + + client = env.pageserver.http_client() + + failpoint = "Timeline::find_gc_cutoffs-pausable" + + client.configure_failpoints((failpoint, "pause")) + + with ThreadPoolExecutor(max_workers=1) as exec: + completion = exec.submit(client.timeline_gc, env.initial_tenant, env.initial_timeline, None) + + wait_until_paused(env, failpoint) + + env.neon_cli.create_branch( + tenant_id=env.initial_tenant, ancestor_branch_name="main", new_branch_name="branch" + ) + + client.configure_failpoints((failpoint, "off")) + + completion.result() + + def wait_until_paused(env: NeonEnv, failpoint: str): found = False msg = f"at failpoint {failpoint}" diff --git a/test_runner/regress/test_broken_timeline.py b/test_runner/regress/test_broken_timeline.py index 804ad135ce..1279c1bf81 100644 --- a/test_runner/regress/test_broken_timeline.py +++ b/test_runner/regress/test_broken_timeline.py @@ -17,11 +17,16 @@ from fixtures.types import TenantId, TimelineId # Test restarting page server, while safekeeper and compute node keep # running. 
def test_local_corruption(neon_env_builder: NeonEnvBuilder): + if neon_env_builder.pageserver_get_impl == "vectored": + reconstruct_function_name = "get_values_reconstruct_data" + else: + reconstruct_function_name = "get_value_reconstruct_data" + env = neon_env_builder.init_start() env.pageserver.allowed_errors.extend( [ - ".*get_value_reconstruct_data for layer .*", + f".*{reconstruct_function_name} for layer .*", ".*could not find data for key.*", ".*is not active. Current state: Broken.*", ".*will not become active. Current state: Broken.*", @@ -84,7 +89,7 @@ def test_local_corruption(neon_env_builder: NeonEnvBuilder): # (We don't check layer file contents on startup, when loading the timeline) # # This will change when we implement checksums for layers - with pytest.raises(Exception, match="get_value_reconstruct_data for layer ") as err: + with pytest.raises(Exception, match=f"{reconstruct_function_name} for layer ") as err: pg2.start() log.info( f"As expected, compute startup failed for timeline {tenant2}/{timeline2} with corrupt layers: {err}" diff --git a/test_runner/regress/test_compaction.py b/test_runner/regress/test_compaction.py new file mode 100644 index 0000000000..43a3323462 --- /dev/null +++ b/test_runner/regress/test_compaction.py @@ -0,0 +1,192 @@ +import json +import os +from typing import Optional + +import pytest +from fixtures.log_helper import log +from fixtures.neon_fixtures import NeonEnvBuilder +from fixtures.workload import Workload + +AGGRESIVE_COMPACTION_TENANT_CONF = { + # Disable gc and compaction. The test runs compaction manually. 
+ "gc_period": "0s", + "compaction_period": "0s", + # Small checkpoint distance to create many layers + "checkpoint_distance": 1024**2, + # Compact small layers + "compaction_target_size": 1024**2, + "image_creation_threshold": 2, +} + + +@pytest.mark.skipif(os.environ.get("BUILD_TYPE") == "debug", reason="only run with release build") +def test_pageserver_compaction_smoke(neon_env_builder: NeonEnvBuilder): + """ + This is a smoke test that compaction kicks in. The workload repeatedly churns + a small number of rows and manually instructs the pageserver to run compaction + between iterations. At the end of the test validate that the average number of + layers visited to gather reconstruct data for a given key is within the empirically + observed bounds. + """ + + # Effectively disable the page cache to rely only on image layers + # to shorten reads. + neon_env_builder.pageserver_config_override = """ +page_cache_size=10 +""" + + env = neon_env_builder.init_start(initial_tenant_conf=AGGRESIVE_COMPACTION_TENANT_CONF) + + tenant_id = env.initial_tenant + timeline_id = env.initial_timeline + + row_count = 10000 + churn_rounds = 100 + + ps_http = env.pageserver.http_client() + + workload = Workload(env, tenant_id, timeline_id) + workload.init(env.pageserver.id) + + log.info("Writing initial data ...") + workload.write_rows(row_count, env.pageserver.id) + + for i in range(1, churn_rounds + 1): + if i % 10 == 0: + log.info(f"Running churn round {i}/{churn_rounds} ...") + + workload.churn_rows(row_count, env.pageserver.id) + ps_http.timeline_compact(tenant_id, timeline_id) + + log.info("Validating at workload end ...") + workload.validate(env.pageserver.id) + + log.info("Checking layer access metrics ...") + + layer_access_metric_names = [ + "pageserver_layers_visited_per_read_global_sum", + "pageserver_layers_visited_per_read_global_count", + "pageserver_layers_visited_per_read_global_bucket", + "pageserver_layers_visited_per_vectored_read_global_sum", + 
"pageserver_layers_visited_per_vectored_read_global_count", + "pageserver_layers_visited_per_vectored_read_global_bucket", + ] + + metrics = env.pageserver.http_client().get_metrics() + for name in layer_access_metric_names: + layer_access_metrics = metrics.query_all(name) + log.info(f"Got metrics: {layer_access_metrics}") + + non_vectored_sum = metrics.query_one("pageserver_layers_visited_per_read_global_sum") + non_vectored_count = metrics.query_one("pageserver_layers_visited_per_read_global_count") + non_vectored_average = non_vectored_sum.value / non_vectored_count.value + + vectored_sum = metrics.query_one("pageserver_layers_visited_per_vectored_read_global_sum") + vectored_count = metrics.query_one("pageserver_layers_visited_per_vectored_read_global_count") + vectored_average = vectored_sum.value / vectored_count.value + + log.info(f"{non_vectored_average=} {vectored_average=}") + + # The upper bound for average number of layer visits below (8) + # was chosen empirically for this workload. + assert non_vectored_average < 8 + assert vectored_average < 8 + + +# Stripe sizes in number of pages. +TINY_STRIPES = 16 +LARGE_STRIPES = 32768 + + +@pytest.mark.parametrize( + "shard_count,stripe_size", [(None, None), (4, TINY_STRIPES), (4, LARGE_STRIPES)] +) +def test_sharding_compaction( + neon_env_builder: NeonEnvBuilder, stripe_size: int, shard_count: Optional[int] +): + """ + Use small stripes, small layers, and small compaction thresholds to exercise how compaction + and image layer generation interacts with sharding. + + We are looking for bugs that might emerge from the way sharding uses sparse layer files that + only contain some of the keys in the key range covered by the layer, such as errors estimating + the size of layers that might result in too-small layer files. 
+ """ + + compaction_target_size = 128 * 1024 + + TENANT_CONF = { + # small checkpointing and compaction targets to ensure we generate many upload operations + "checkpoint_distance": f"{128 * 1024}", + "compaction_threshold": "1", + "compaction_target_size": f"{compaction_target_size}", + # no PITR horizon, we specify the horizon when we request on-demand GC + "pitr_interval": "0s", + # disable background compaction and GC. We invoke it manually when we want it to happen. + "gc_period": "0s", + "compaction_period": "0s", + # create image layers eagerly: we want to exercise image layer creation in this test. + "image_creation_threshold": "1", + "image_layer_creation_check_threshold": 0, + } + + neon_env_builder.num_pageservers = 1 if shard_count is None else shard_count + env = neon_env_builder.init_start( + initial_tenant_conf=TENANT_CONF, + initial_tenant_shard_count=shard_count, + initial_tenant_shard_stripe_size=stripe_size, + ) + + tenant_id = env.initial_tenant + timeline_id = env.initial_timeline + + workload = Workload(env, tenant_id, timeline_id) + workload.init() + workload.write_rows(64) + for _i in range(0, 10): + # Each of these does some writes then a checkpoint: because we set image_creation_threshold to 1, + # these should result in image layers each time we write some data into a shard, and also shards + # recieving less data hitting their "empty image layer" path (wherre they should skip writing the layer, + # rather than asserting) + workload.churn_rows(64) + + # Assert that we got some image layers: this is important because this test's purpose is to exercise the sharding changes + # to Timeline::create_image_layers, so if we weren't creating any image layers we wouldn't be doing our job. 
+ shard_has_image_layers = [] + for shard in env.storage_controller.locate(tenant_id): + pageserver = env.get_pageserver(shard["node_id"]) + shard_id = shard["shard_id"] + layer_map = pageserver.http_client().layer_map_info(shard_id, timeline_id) + image_layer_sizes = {} + for layer in layer_map.historic_layers: + if layer.kind == "Image": + image_layer_sizes[layer.layer_file_name] = layer.layer_file_size + + # Pageserver should assert rather than emit an empty layer file, but double check here + assert layer.layer_file_size is not None + assert layer.layer_file_size > 0 + + shard_has_image_layers.append(len(image_layer_sizes) > 1) + log.info(f"Shard {shard_id} image layer sizes: {json.dumps(image_layer_sizes, indent=2)}") + + if stripe_size == TINY_STRIPES: + # Checking the average size validates that our keyspace partitioning is properly respecting sharding: if + # it was not, we would tend to get undersized layers because the partitioning would overestimate the physical + # data in a keyrange. + # + # We only do this check with tiny stripes, because large stripes may not give all shards enough + # data to have statistically significant image layers + avg_size = sum(v for v in image_layer_sizes.values()) / len(image_layer_sizes) # type: ignore + log.info(f"Shard {shard_id} average image layer size: {avg_size}") + assert avg_size > compaction_target_size / 2 + + if stripe_size == TINY_STRIPES: + # Expect writes were scattered across all pageservers: they should all have compacted some image layers + assert all(shard_has_image_layers) + else: + # With large stripes, it is expected that most of our writes went to one pageserver, so we just require + # that at least one of them has some image layers. 
+ assert any(shard_has_image_layers) + + # Assert that everything is still readable + workload.validate() diff --git a/test_runner/regress/test_compatibility.py b/test_runner/regress/test_compatibility.py index 208263a22a..e1ccb3e0c6 100644 --- a/test_runner/regress/test_compatibility.py +++ b/test_runner/regress/test_compatibility.py @@ -192,9 +192,6 @@ def test_backward_compatibility( assert not breaking_changes_allowed, "Breaking changes are allowed by ALLOW_BACKWARD_COMPATIBILITY_BREAKAGE, but the test has passed without any breakage" -# Forward compatibility is broken due to https://github.com/neondatabase/neon/pull/6530 -# The test is disabled until the next release deployment -@pytest.mark.xfail @check_ondisk_data_compatibility_if_enabled @pytest.mark.xdist_group("compatibility") @pytest.mark.order(after="test_create_snapshot") @@ -229,6 +226,12 @@ def test_forward_compatibility( ) try: + # Previous version neon_local and pageserver are not aware + # of the new config. + # TODO: remove these once the previous version of neon local supports them + neon_env_builder.pageserver_get_impl = None + neon_env_builder.pageserver_validate_vectored_get = None + neon_env_builder.num_safekeepers = 3 neon_local_binpath = neon_env_builder.neon_binpath env = neon_env_builder.from_repo_dir( diff --git a/test_runner/regress/test_crafted_wal_end.py b/test_runner/regress/test_crafted_wal_end.py index 01ecc2b95f..30f8d81890 100644 --- a/test_runner/regress/test_crafted_wal_end.py +++ b/test_runner/regress/test_crafted_wal_end.py @@ -19,6 +19,12 @@ from fixtures.neon_fixtures import NeonEnvBuilder, WalCraft def test_crafted_wal_end(neon_env_builder: NeonEnvBuilder, wal_type: str): env = neon_env_builder.init_start() env.neon_cli.create_branch("test_crafted_wal_end") + env.pageserver.allowed_errors.extend( + [ + # seems like pageserver stop triggers these + ".*initial size calculation failed.*Bad state (not active).*", + ] + ) endpoint = env.endpoints.create("test_crafted_wal_end") 
wal_craft = WalCraft(env) diff --git a/test_runner/regress/test_hot_standby.py b/test_runner/regress/test_hot_standby.py index ac3315b86f..179cc273ec 100644 --- a/test_runner/regress/test_hot_standby.py +++ b/test_runner/regress/test_hot_standby.py @@ -3,7 +3,7 @@ import re import time from fixtures.log_helper import log -from fixtures.neon_fixtures import NeonEnv, wait_replica_caughtup +from fixtures.neon_fixtures import NeonEnv, NeonEnvBuilder, tenant_get_shards, wait_replica_caughtup # Check for corrupted WAL messages which might otherwise go unnoticed if @@ -102,3 +102,80 @@ def test_2_replicas_start(neon_simple_env: NeonEnv): ) as secondary2: wait_replica_caughtup(primary, secondary1) wait_replica_caughtup(primary, secondary2) + + +# We had an issue that a standby server made GetPage requests with an +# old LSN, based on the last-written LSN cache, to avoid waits in the +# pageserver. However, requesting a page with a very old LSN, such +# that the GC horizon has already advanced past it, results in an +# error from the pageserver: +# "Bad request: tried to request a page version that was garbage collected" +# +# To avoid that, the compute<-> pageserver protocol was updated so +# that that the standby now sends two LSNs, the old last-written LSN +# and the current replay LSN. +# +# https://github.com/neondatabase/neon/issues/6211 +def test_hot_standby_gc(neon_env_builder: NeonEnvBuilder): + tenant_conf = { + # set PITR interval to be small, so we can do GC + "pitr_interval": "0 s", + } + env = neon_env_builder.init_start(initial_tenant_conf=tenant_conf) + timeline_id = env.initial_timeline + tenant_id = env.initial_tenant + + with env.endpoints.create_start( + branch_name="main", + endpoint_id="primary", + ) as primary: + with env.endpoints.new_replica_start( + origin=primary, + endpoint_id="secondary", + # Protocol version 2 was introduced to fix the issue + # that this test exercises. With protocol version 1 it + # fails. 
+ config_lines=["neon.protocol_version=2"], + ) as secondary: + p_cur = primary.connect().cursor() + p_cur.execute("CREATE EXTENSION neon_test_utils") + p_cur.execute("CREATE TABLE test (id int primary key) WITH (autovacuum_enabled=false)") + p_cur.execute("INSERT INTO test SELECT generate_series(1, 10000) AS g") + + wait_replica_caughtup(primary, secondary) + + s_cur = secondary.connect().cursor() + + s_cur.execute("SELECT 1 WHERE pg_is_in_recovery()") + res = s_cur.fetchone() + assert res is not None + + s_cur.execute("SELECT COUNT(*) FROM test") + res = s_cur.fetchone() + assert res[0] == 10000 + + # Clear the cache in the standby, so that when we + # re-execute the query, it will make GetPage + # requests. This does not clear the last-written LSN cache + # so we still remember the LSNs of the pages. + s_cur.execute("SELECT clear_buffer_cache()") + + # Do other stuff on the primary, to advance the WAL + p_cur.execute("CREATE TABLE test2 AS SELECT generate_series(1, 1000000) AS g") + + # Run GC. The PITR interval is very small, so this advances the GC cutoff LSN + # very close to the primary's current insert LSN. + shards = tenant_get_shards(env, tenant_id, None) + for tenant_shard_id, pageserver in shards: + client = pageserver.http_client() + client.timeline_checkpoint(tenant_shard_id, timeline_id) + client.timeline_compact(tenant_shard_id, timeline_id) + client.timeline_gc(tenant_shard_id, timeline_id, 0) + + # Re-execute the query. The GetPage requests that this + # generates use old not_modified_since LSNs, older than + # the GC cutoff, but new request LSNs. (In protocol + # version 1 there was only one LSN, and this failed.) 
+ s_cur.execute("SELECT COUNT(*) FROM test") + res = s_cur.fetchone() + assert res[0] == 10000 diff --git a/test_runner/regress/test_local_file_cache.py b/test_runner/regress/test_local_file_cache.py index 38f2034c18..76c6581448 100644 --- a/test_runner/regress/test_local_file_cache.py +++ b/test_runner/regress/test_local_file_cache.py @@ -4,16 +4,21 @@ import threading import time from typing import List -from fixtures.neon_fixtures import NeonEnv +from fixtures.neon_fixtures import DEFAULT_BRANCH_NAME, NeonEnvBuilder from fixtures.utils import query_scalar -def test_local_file_cache_unlink(neon_simple_env: NeonEnv): - env = neon_simple_env +def test_local_file_cache_unlink(neon_env_builder: NeonEnvBuilder, build_type: str): + if build_type == "debug": + # Disable vectored read path cross validation since it makes the test time out. + neon_env_builder.pageserver_config_override = "validate_vectored_get=false" + + env = neon_env_builder.init_start() cache_dir = os.path.join(env.repo_dir, "file_cache") os.mkdir(cache_dir) + env.neon_cli.create_branch("empty", ancestor_branch_name=DEFAULT_BRANCH_NAME) env.neon_cli.create_branch("test_local_file_cache_unlink", "empty") endpoint = env.endpoints.create_start( diff --git a/test_runner/regress/test_lsn_mapping.py b/test_runner/regress/test_lsn_mapping.py index 5813231aab..5c99ca6733 100644 --- a/test_runner/regress/test_lsn_mapping.py +++ b/test_runner/regress/test_lsn_mapping.py @@ -1,3 +1,4 @@ +import re import time from datetime import datetime, timedelta, timezone @@ -109,6 +110,8 @@ def test_lsn_mapping(neon_env_builder: NeonEnvBuilder): # Test pageserver get_timestamp_of_lsn API def test_ts_of_lsn_api(neon_env_builder: NeonEnvBuilder): + key_not_found_error = r".*could not find data for key.*" + env = neon_env_builder.init_start() new_timeline_id = env.neon_cli.create_branch("test_ts_of_lsn_api") @@ -177,8 +180,8 @@ def test_ts_of_lsn_api(neon_env_builder: NeonEnvBuilder): raise RuntimeError("there should have been 
an 'could not find data for key' error") except PageserverApiException as error: assert error.status_code == 500 - assert str(error).startswith("could not find data for key") - env.pageserver.allowed_errors.append(".*could not find data for key.*") + assert re.match(key_not_found_error, str(error)) + env.pageserver.allowed_errors.append(key_not_found_error) # Probe a bunch of timestamps in the valid range step_size = 100 diff --git a/test_runner/regress/test_ondemand_download.py b/test_runner/regress/test_ondemand_download.py index ba0d53704b..6c2556f6a2 100644 --- a/test_runner/regress/test_ondemand_download.py +++ b/test_runner/regress/test_ondemand_download.py @@ -333,6 +333,17 @@ def test_download_remote_layers_api( } ) + # This test triggers layer download failures on demand. It is possible to modify the failpoint + # during a `Timeline::get_vectored` right between the vectored read and it's validation read. + # This means that one of the reads can fail while the other one succeeds and vice versa. + # TODO(vlad): Remove this block once the vectored read path validation goes away. 
+    env.pageserver.allowed_errors.extend(
+        [
+            ".*initial_size_calculation.*Vectored get failed with downloading evicted layer file failed, but sequential get did not.*",
+            ".*initial_size_calculation.*Sequential get failed with downloading evicted layer file failed, but vectored get did not.*",
+        ]
+    )
+
     endpoint = env.endpoints.create_start("main")
 
     client = env.pageserver.http_client()
diff --git a/test_runner/regress/test_ondemand_slru_download.py b/test_runner/regress/test_ondemand_slru_download.py
new file mode 100644
index 0000000000..0b36b32552
--- /dev/null
+++ b/test_runner/regress/test_ondemand_slru_download.py
@@ -0,0 +1,131 @@
+from typing import Optional
+
+import pytest
+from fixtures.log_helper import log
+from fixtures.neon_fixtures import NeonEnvBuilder, tenant_get_shards
+from fixtures.types import Lsn
+from fixtures.utils import query_scalar
+
+
+#
+# Test on-demand download of the pg_xact SLRUs
+#
+@pytest.mark.parametrize("shard_count", [None, 4])
+def test_ondemand_download_pg_xact(neon_env_builder: NeonEnvBuilder, shard_count: Optional[int]):
+    if shard_count is not None:
+        neon_env_builder.num_pageservers = shard_count
+
+    tenant_conf = {
+        "lazy_slru_download": "true",
+        # set PITR interval to be small, so we can do GC
+        "pitr_interval": "0 s",
+    }
+    env = neon_env_builder.init_start(
+        initial_tenant_conf=tenant_conf, initial_tenant_shard_count=shard_count
+    )
+
+    timeline_id = env.initial_timeline
+    tenant_id = env.initial_tenant
+    endpoint = env.endpoints.create_start("main")
+
+    pg_conn = endpoint.connect()
+    cur = pg_conn.cursor()
+
+    cur.execute("CREATE EXTENSION neon_test_utils")
+
+    # Create a test table
+    cur.execute("CREATE TABLE clogtest (id integer)")
+    cur.execute("INSERT INTO clogtest VALUES (1)")
+
+    # Consume a lot of XIDs, to create more pg_xact segments
+    for _ in range(1000):
+        cur.execute("select test_consume_xids(10000);")
+    cur.execute("INSERT INTO clogtest VALUES (2)")
+    for _ in range(1000):
+        cur.execute("select 
test_consume_xids(10000);") + cur.execute("INSERT INTO clogtest VALUES (2)") + for _ in range(1000): + cur.execute("select test_consume_xids(10000);") + cur.execute("INSERT INTO clogtest VALUES (3)") + + # Restart postgres. After restart, the new instance will download the + # pg_xact segments lazily. + endpoint.stop() + endpoint.start() + pg_conn = endpoint.connect() + cur = pg_conn.cursor() + + # Consume more WAL, so that the pageserver can compact and GC older data, + # including the LSN that we started the new endpoint at, + cur.execute("CREATE TABLE anothertable (i int, t text)") + cur.execute( + "INSERT INTO anothertable SELECT g, 'long string to consume some space' || g FROM generate_series(1, 10000) g" + ) + + # Run GC + shards = tenant_get_shards(env, tenant_id, None) + for tenant_shard_id, pageserver in shards: + client = pageserver.http_client() + client.timeline_checkpoint(tenant_shard_id, timeline_id) + client.timeline_compact(tenant_shard_id, timeline_id) + client.timeline_gc(tenant_shard_id, timeline_id, 0) + + # Test that this can still on-demand download the old pg_xact segments + cur.execute("select xmin, xmax, * from clogtest") + tup = cur.fetchall() + log.info(f"tuples = {tup}") + + +@pytest.mark.parametrize("shard_count", [None, 4]) +def test_ondemand_download_replica(neon_env_builder: NeonEnvBuilder, shard_count: Optional[int]): + if shard_count is not None: + neon_env_builder.num_pageservers = shard_count + + tenant_conf = { + "lazy_slru_download": "true", + } + env = neon_env_builder.init_start( + initial_tenant_conf=tenant_conf, initial_tenant_shard_count=shard_count + ) + + endpoint = env.endpoints.create_start("main") + + pg_conn = endpoint.connect() + cur = pg_conn.cursor() + + cur.execute("CREATE EXTENSION neon_test_utils") + + # Create a test table + cur.execute("CREATE TABLE clogtest (id integer)") + cur.execute("INSERT INTO clogtest VALUES (1)") + + # Consume a lot of XIDs, to create more pg_xact segments + for _ in range(1000): + 
cur.execute("select test_consume_xids(10000);") + + # Open a new connection and insert another row, but leave + # the transaction open + pg_conn2 = endpoint.connect() + cur2 = pg_conn2.cursor() + cur2.execute("BEGIN") + cur2.execute("INSERT INTO clogtest VALUES (2)") + + # Another insert on the first connection, which is committed. + for _ in range(1000): + cur.execute("select test_consume_xids(10000);") + cur.execute("INSERT INTO clogtest VALUES (3)") + + # Start standby at this point in time + lsn = Lsn(query_scalar(cur, "SELECT pg_current_wal_insert_lsn()")) + endpoint_at_lsn = env.endpoints.create_start( + branch_name="main", endpoint_id="ep-at-lsn", lsn=lsn + ) + + # Commit transaction 2, after the standby was launched. + cur2.execute("COMMIT") + + # The replica should not see transaction 2 as committed. + conn_replica = endpoint_at_lsn.connect() + cur_replica = conn_replica.cursor() + cur_replica.execute("SELECT * FROM clogtest") + assert cur_replica.fetchall() == [(1,), (3,)] diff --git a/test_runner/regress/test_pageserver_config.py b/test_runner/regress/test_pageserver_config.py new file mode 100644 index 0000000000..c04348b488 --- /dev/null +++ b/test_runner/regress/test_pageserver_config.py @@ -0,0 +1,35 @@ +import pytest +from fixtures.neon_fixtures import ( + NeonEnvBuilder, + last_flush_lsn_upload, +) + + +@pytest.mark.parametrize("kind", ["sync", "async"]) +def test_walredo_process_kind_config(neon_env_builder: NeonEnvBuilder, kind: str): + neon_env_builder.pageserver_config_override = f"walredo_process_kind = '{kind}'" + # ensure it starts + env = neon_env_builder.init_start() + # ensure the metric is set + ps_http = env.pageserver.http_client() + metrics = ps_http.get_metrics() + samples = metrics.query_all("pageserver_wal_redo_process_kind") + assert [(s.labels, s.value) for s in samples] == [({"kind": kind}, 1)] + # ensure default tenant's config kind matches + # => write some data to force-spawn walredo + ep = env.endpoints.create_start("main") 
+ with ep.connect() as conn: + with conn.cursor() as cur: + cur.execute("create table foo(bar text)") + cur.execute("insert into foo select from generate_series(1, 100)") + last_flush_lsn_upload(env, ep, env.initial_tenant, env.initial_timeline) + ep.stop() + ep.start() + with ep.connect() as conn: + with conn.cursor() as cur: + cur.execute("select count(*) from foo") + [(count,)] = cur.fetchall() + assert count == 100 + + status = ps_http.tenant_status(env.initial_tenant) + assert status["walredo"]["process"]["kind"] == kind diff --git a/test_runner/regress/test_pageserver_layer_rolling.py b/test_runner/regress/test_pageserver_layer_rolling.py index c7e1e88468..c5dc0f2919 100644 --- a/test_runner/regress/test_pageserver_layer_rolling.py +++ b/test_runner/regress/test_pageserver_layer_rolling.py @@ -1,6 +1,7 @@ import asyncio import os -from typing import Tuple +import time +from typing import Optional, Tuple import psutil import pytest @@ -20,20 +21,30 @@ ENTRIES_PER_TIMELINE = 10_000 CHECKPOINT_TIMEOUT_SECONDS = 60 -async def run_worker(env: NeonEnv, tenant_conf, entries: int) -> Tuple[TenantId, TimelineId, Lsn]: - tenant, timeline = env.neon_cli.create_tenant(conf=tenant_conf) +async def run_worker_for_tenant( + env: NeonEnv, entries: int, tenant: TenantId, offset: Optional[int] = None +) -> Lsn: + if offset is None: + offset = 0 + with env.endpoints.create_start("main", tenant_id=tenant) as ep: conn = await ep.connect_async() try: await conn.execute("CREATE TABLE IF NOT EXISTS t(key serial primary key, value text)") await conn.execute( - f"INSERT INTO t SELECT i, CONCAT('payload_', i) FROM generate_series(0,{entries}) as i" + f"INSERT INTO t SELECT i, CONCAT('payload_', i) FROM generate_series({offset},{entries}) as i" ) finally: await conn.close(timeout=10) last_flush_lsn = Lsn(ep.safe_psql("SELECT pg_current_wal_flush_lsn()")[0][0]) - return tenant, timeline, last_flush_lsn + return last_flush_lsn + + +async def run_worker(env: NeonEnv, tenant_conf, entries: 
int) -> Tuple[TenantId, TimelineId, Lsn]: + tenant, timeline = env.neon_cli.create_tenant(conf=tenant_conf) + last_flush_lsn = await run_worker_for_tenant(env, entries, tenant) + return tenant, timeline, last_flush_lsn async def workload( @@ -89,7 +100,9 @@ def assert_dirty_bytes(env, v): def assert_dirty_bytes_nonzero(env): - assert get_dirty_bytes(env) > 0 + dirty_bytes = get_dirty_bytes(env) + assert dirty_bytes > 0 + return dirty_bytes @pytest.mark.parametrize("immediate_shutdown", [True, False]) @@ -182,6 +195,31 @@ def test_idle_checkpoints(neon_env_builder: NeonEnvBuilder): log.info("Waiting for background checkpoints...") wait_until(CHECKPOINT_TIMEOUT_SECONDS * 2, 1, lambda: assert_dirty_bytes(env, 0)) # type: ignore + # The code below verifies that we do not flush on the first write + # after an idle period longer than the checkpoint timeout. + + # Sit quietly for longer than the checkpoint timeout + time.sleep(CHECKPOINT_TIMEOUT_SECONDS + CHECKPOINT_TIMEOUT_SECONDS / 2) + + # Restart the safekeepers and write a bit of extra data into one tenant + for sk in env.safekeepers: + sk.start() + + tenant_with_extra_writes = last_flush_lsns[0][0] + asyncio.run( + run_worker_for_tenant(env, 5, tenant_with_extra_writes, offset=ENTRIES_PER_TIMELINE) + ) + + dirty_after_write = wait_until(10, 1, lambda: assert_dirty_bytes_nonzero(env)) # type: ignore + + # We shouldn't flush since we've just opened a new layer + waited_for = 0 + while waited_for < CHECKPOINT_TIMEOUT_SECONDS // 4: + time.sleep(5) + waited_for += 5 + + assert get_dirty_bytes(env) >= dirty_after_write + @pytest.mark.skipif( # We have to use at least ~100MB of data to hit the lowest limit we can configure, which is diff --git a/test_runner/regress/test_pageserver_secondary.py b/test_runner/regress/test_pageserver_secondary.py index 345abdc072..8f194e5dda 100644 --- a/test_runner/regress/test_pageserver_secondary.py +++ b/test_runner/regress/test_pageserver_secondary.py @@ -1,6 +1,7 @@ import json import 
os import random +import time from pathlib import Path from typing import Any, Dict, Optional @@ -582,6 +583,91 @@ def test_secondary_downloads(neon_env_builder: NeonEnvBuilder): ) +def test_secondary_background_downloads(neon_env_builder: NeonEnvBuilder): + """ + Slow test that runs in realtime, checks that the background scheduling of secondary + downloads happens as expected. + """ + neon_env_builder.num_pageservers = 2 + env = neon_env_builder.init_configs() + env.start() + + # Create this many tenants, each with two timelines + tenant_count = 4 + tenant_timelines = {} + + # This mirrors a constant in `downloader.rs` + freshen_interval_secs = 60 + + for _i in range(0, tenant_count): + tenant_id = TenantId.generate() + timeline_a = TimelineId.generate() + timeline_b = TimelineId.generate() + env.neon_cli.create_tenant( + tenant_id, + timeline_a, + placement_policy='{"Attached":1}', + # Run with a low heatmap period so that we can avoid having to do synthetic API calls + # to trigger the upload promptly. + conf={"heatmap_period": "1s"}, + ) + env.neon_cli.create_timeline("main2", tenant_id, timeline_b) + + tenant_timelines[tenant_id] = [timeline_a, timeline_b] + + t_start = time.time() + + # Wait long enough that the background downloads should happen; we expect all the inital layers + # of all the initial timelines to show up on the secondary location of each tenant. 
+ time.sleep(freshen_interval_secs * 1.5) + + for tenant_id, timelines in tenant_timelines.items(): + attached_to_id = env.storage_controller.locate(tenant_id)[0]["node_id"] + ps_attached = env.get_pageserver(attached_to_id) + # We only have two: the other one must be secondary + ps_secondary = next(p for p in env.pageservers if p != ps_attached) + + for timeline_id in timelines: + log.info(f"Checking for secondary timeline {timeline_id} on node {ps_secondary.id}") + # One or more layers should be present for all timelines + assert list_layers(ps_secondary, tenant_id, timeline_id) + + # Delete the second timeline: this should be reflected later on the secondary + env.storage_controller.pageserver_api().timeline_delete(tenant_id, timelines[1]) + + # Wait long enough for the secondary locations to see the deletion + time.sleep(freshen_interval_secs * 1.5) + + for tenant_id, timelines in tenant_timelines.items(): + attached_to_id = env.storage_controller.locate(tenant_id)[0]["node_id"] + ps_attached = env.get_pageserver(attached_to_id) + # We only have two: the other one must be secondary + ps_secondary = next(p for p in env.pageservers if p != ps_attached) + + # This one was not deleted + assert list_layers(ps_secondary, tenant_id, timelines[0]) + + # This one was deleted + assert not list_layers(ps_secondary, tenant_id, timelines[1]) + + t_end = time.time() + + # Measure how many heatmap downloads we did in total: this checks that we succeeded with + # proper scheduling, and not some bug that just runs downloads in a loop. 
+ total_heatmap_downloads = 0 + for ps in env.pageservers: + v = ps.http_client().get_metric_value("pageserver_secondary_download_heatmap_total") + assert v is not None + total_heatmap_downloads += int(v) + + download_rate = (total_heatmap_downloads / tenant_count) / (t_end - t_start) + + expect_download_rate = 1.0 / freshen_interval_secs + log.info(f"Download rate: {download_rate * 60}/min vs expected {expect_download_rate * 60}/min") + + assert download_rate < expect_download_rate * 2 + + @pytest.mark.skipif(os.environ.get("BUILD_TYPE") == "debug", reason="only run with release build") @pytest.mark.parametrize("via_controller", [True, False]) def test_slow_secondary_downloads(neon_env_builder: NeonEnvBuilder, via_controller: bool): diff --git a/test_runner/regress/test_pg_regress.py b/test_runner/regress/test_pg_regress.py index e4219ec7a6..2b1b7fff34 100644 --- a/test_runner/regress/test_pg_regress.py +++ b/test_runner/regress/test_pg_regress.py @@ -18,6 +18,7 @@ from fixtures.remote_storage import s3_storage def test_pg_regress( neon_env_builder: NeonEnvBuilder, test_output_dir: Path, + build_type: str, pg_bin, capsys, base_dir: Path, @@ -30,6 +31,11 @@ def test_pg_regress( """ if shard_count is not None: neon_env_builder.num_pageservers = shard_count + + if build_type == "debug": + # Disable vectored read path cross validation since it makes the test time out. 
+ neon_env_builder.pageserver_config_override = "validate_vectored_get=false" + neon_env_builder.enable_pageserver_remote_storage(s3_storage()) neon_env_builder.enable_scrub_on_exit() env = neon_env_builder.init_start(initial_tenant_shard_count=shard_count) diff --git a/test_runner/regress/test_pg_waldump.py b/test_runner/regress/test_pg_waldump.py new file mode 100644 index 0000000000..8e80efd9ba --- /dev/null +++ b/test_runner/regress/test_pg_waldump.py @@ -0,0 +1,61 @@ +import os +import shutil + +from fixtures.neon_fixtures import NeonEnv, PgBin +from fixtures.utils import subprocess_capture + + +def check_wal_segment(pg_waldump_path: str, segment_path: str, test_output_dir): + # use special --ignore option to ignore the validation checks in pg_waldump + # this is necessary, because neon WAL files contain gap at the beginning + output_path, _, _ = subprocess_capture( + test_output_dir, [pg_waldump_path, "--ignore", segment_path] + ) + + with open(f"{output_path}.stdout", "r") as f: + stdout = f.read() + assert "ABORT" in stdout + assert "COMMIT" in stdout + + +# Simple test to check that pg_waldump works with neon WAL files +def test_pg_waldump(neon_simple_env: NeonEnv, test_output_dir, pg_bin: PgBin): + env = neon_simple_env + tenant_id = env.initial_tenant + timeline_id = env.neon_cli.create_branch("test_pg_waldump", "empty") + endpoint = env.endpoints.create_start("test_pg_waldump") + + cur = endpoint.connect().cursor() + cur.execute( + """ + BEGIN; + CREATE TABLE t1(i int primary key, n_updated int); + INSERT INTO t1 select g, 0 from generate_series(1, 50) g; + ROLLBACK; + """ + ) + + cur.execute( + """ + BEGIN; + CREATE TABLE t1(i int primary key, n_updated int); + INSERT INTO t1 select g, 0 from generate_series(1, 50) g; + COMMIT; + """ + ) + + # stop the endpoint to make sure that WAL files are flushed and won't change + endpoint.stop() + + assert endpoint.pgdata_dir + wal_path = os.path.join(endpoint.pgdata_dir, "pg_wal/000000010000000000000001") + 
pg_waldump_path = os.path.join(pg_bin.pg_bin_path, "pg_waldump") + # check segment on compute + check_wal_segment(pg_waldump_path, wal_path, test_output_dir) + + # Check file on safekeepers as well. pg_waldump is strict about file naming, so remove .partial suffix. + sk = env.safekeepers[0] + sk_tli_dir = sk.timeline_dir(tenant_id, timeline_id) + non_partial_path = os.path.join(sk_tli_dir, "000000010000000000000001") + shutil.copyfile(os.path.join(sk_tli_dir, "000000010000000000000001.partial"), non_partial_path) + check_wal_segment(pg_waldump_path, non_partial_path, test_output_dir) diff --git a/test_runner/regress/test_proxy_rate_limiter.py b/test_runner/regress/test_proxy_rate_limiter.py deleted file mode 100644 index f39f0cad07..0000000000 --- a/test_runner/regress/test_proxy_rate_limiter.py +++ /dev/null @@ -1,84 +0,0 @@ -import asyncio -import time -from pathlib import Path -from typing import Iterator - -import pytest -from fixtures.neon_fixtures import ( - PSQL, - NeonProxy, -) -from fixtures.port_distributor import PortDistributor -from pytest_httpserver import HTTPServer -from werkzeug.wrappers.response import Response - - -def waiting_handler(status_code: int) -> Response: - # wait more than timeout to make sure that both (two) connections are open. - # It would be better to use a barrier here, but I don't know how to do that together with pytest-httpserver. 
- time.sleep(2) - return Response(status=status_code) - - -@pytest.fixture(scope="function") -def proxy_with_rate_limit( - port_distributor: PortDistributor, - neon_binpath: Path, - httpserver_listen_address, - test_output_dir: Path, -) -> Iterator[NeonProxy]: - """Neon proxy that routes directly to vanilla postgres.""" - - proxy_port = port_distributor.get_port() - mgmt_port = port_distributor.get_port() - http_port = port_distributor.get_port() - external_http_port = port_distributor.get_port() - (host, port) = httpserver_listen_address - endpoint = f"http://{host}:{port}/billing/api/v1/usage_events" - - with NeonProxy( - neon_binpath=neon_binpath, - test_output_dir=test_output_dir, - proxy_port=proxy_port, - http_port=http_port, - mgmt_port=mgmt_port, - external_http_port=external_http_port, - auth_backend=NeonProxy.Console(endpoint, fixed_rate_limit=5), - ) as proxy: - proxy.start() - yield proxy - - -@pytest.mark.asyncio -async def test_proxy_rate_limit( - httpserver: HTTPServer, - proxy_with_rate_limit: NeonProxy, -): - uri = "/billing/api/v1/usage_events/proxy_get_role_secret" - # mock control plane service - httpserver.expect_ordered_request(uri, method="GET").respond_with_handler( - lambda _: Response(status=200) - ) - httpserver.expect_ordered_request(uri, method="GET").respond_with_handler( - lambda _: waiting_handler(429) - ) - httpserver.expect_ordered_request(uri, method="GET").respond_with_handler( - lambda _: waiting_handler(500) - ) - - psql = PSQL(host=proxy_with_rate_limit.host, port=proxy_with_rate_limit.proxy_port) - f = await psql.run("select 42;") - await proxy_with_rate_limit.find_auth_link(uri, f) - # Limit should be 2. - - # Run two queries in parallel. - f1, f2 = await asyncio.gather(psql.run("select 42;"), psql.run("select 42;")) - await proxy_with_rate_limit.find_auth_link(uri, f1) - await proxy_with_rate_limit.find_auth_link(uri, f2) - - # Now limit should be 0. 
- f = await psql.run("select 42;") - await proxy_with_rate_limit.find_auth_link(uri, f) - - # There last query shouldn't reach the http-server. - assert httpserver.assertions == [] diff --git a/test_runner/regress/test_read_validation.py b/test_runner/regress/test_read_validation.py index 868b80a561..2437c8f806 100644 --- a/test_runner/regress/test_read_validation.py +++ b/test_runner/regress/test_read_validation.py @@ -17,7 +17,14 @@ def test_read_validation(neon_simple_env: NeonEnv): env = neon_simple_env env.neon_cli.create_branch("test_read_validation", "empty") - endpoint = env.endpoints.create_start("test_read_validation") + endpoint = env.endpoints.create_start( + "test_read_validation", + # Use protocol version 2, because the code that constructs the V1 messages + # assumes that a primary always wants to read the latest version of a page, + # and therefore doesn't work with the test functions below to read an older + # page version. + config_lines=["neon.protocol_version=2"], + ) with closing(endpoint.connect()) as con: with con.cursor() as c: @@ -64,7 +71,7 @@ def test_read_validation(neon_simple_env: NeonEnv): log.info("Cache is clear, reading stale page version") c.execute( - f"select lsn, lower, upper from page_header(get_raw_page_at_lsn('foo', 'main', 0, '{first[0]}'))" + f"select lsn, lower, upper from page_header(get_raw_page_at_lsn('foo', 'main', 0, '{first[0]}', NULL))" ) direct_first = c.fetchone() assert first == direct_first, "Failed fetch page at historic lsn" @@ -77,7 +84,7 @@ def test_read_validation(neon_simple_env: NeonEnv): log.info("Cache is clear, reading latest page version without cache") c.execute( - "select lsn, lower, upper from page_header(get_raw_page_at_lsn('foo', 'main', 0, NULL))" + "select lsn, lower, upper from page_header(get_raw_page_at_lsn('foo', 'main', 0, NULL, NULL))" ) direct_latest = c.fetchone() assert second == direct_latest, "Failed fetch page at latest lsn" @@ -92,7 +99,7 @@ def 
test_read_validation(neon_simple_env: NeonEnv): ) c.execute( - f"select lsn, lower, upper from page_header(get_raw_page_at_lsn({reln[0]}, {reln[1]}, {reln[2]}, 0, 0, '{first[0]}'))" + f"select lsn, lower, upper from page_header(get_raw_page_at_lsn({reln[0]}, {reln[1]}, {reln[2]}, 0, 0, '{first[0]}', NULL))" ) direct_first = c.fetchone() assert first == direct_first, "Failed fetch page at historic lsn using oid" @@ -102,7 +109,7 @@ def test_read_validation(neon_simple_env: NeonEnv): ) c.execute( - f"select lsn, lower, upper from page_header(get_raw_page_at_lsn({reln[0]}, {reln[1]}, {reln[2]}, 0, 0, NULL))" + f"select lsn, lower, upper from page_header(get_raw_page_at_lsn({reln[0]}, {reln[1]}, {reln[2]}, 0, 0, NULL, NULL))" ) direct_latest = c.fetchone() assert second == direct_latest, "Failed fetch page at latest lsn" @@ -114,7 +121,7 @@ def test_read_validation(neon_simple_env: NeonEnv): ) c.execute( - f"select lsn, lower, upper from page_header(get_raw_page_at_lsn({reln[0]}, {reln[1]}, {reln[2]}, 0, 0, '{first[0]}'))" + f"select lsn, lower, upper from page_header(get_raw_page_at_lsn({reln[0]}, {reln[1]}, {reln[2]}, 0, 0, '{first[0]}', NULL))" ) direct_first = c.fetchone() assert first == direct_first, "Failed fetch page at historic lsn using oid" @@ -133,7 +140,14 @@ def test_read_validation_neg(neon_simple_env: NeonEnv): env.pageserver.allowed_errors.append(".*invalid LSN\\(0\\) in request.*") - endpoint = env.endpoints.create_start("test_read_validation_neg") + endpoint = env.endpoints.create_start( + "test_read_validation_neg", + # Use protocol version 2, because the code that constructs the V1 messages + # assumes that a primary always wants to read the latest version of a page, + # and therefore doesn't work with the test functions below to read an older + # page version. 
+ config_lines=["neon.protocol_version=2"], + ) with closing(endpoint.connect()) as con: with con.cursor() as c: @@ -143,7 +157,7 @@ def test_read_validation_neg(neon_simple_env: NeonEnv): log.info("read a page of a missing relation") try: c.execute( - "select lsn, lower, upper from page_header(get_raw_page_at_lsn('Unknown', 'main', 0, '0/0'))" + "select lsn, lower, upper from page_header(get_raw_page_at_lsn('Unknown', 'main', 0, '0/0', NULL))" ) raise AssertionError("query should have failed") except UndefinedTable as e: @@ -155,7 +169,7 @@ def test_read_validation_neg(neon_simple_env: NeonEnv): log.info("read a page at lsn 0") try: c.execute( - "select lsn, lower, upper from page_header(get_raw_page_at_lsn('foo', 'main', 0, '0/0'))" + "select lsn, lower, upper from page_header(get_raw_page_at_lsn('foo', 'main', 0, '0/0', NULL))" ) raise AssertionError("query should have failed") except IoError as e: @@ -164,22 +178,22 @@ def test_read_validation_neg(neon_simple_env: NeonEnv): log.info("Pass NULL as an input") expected = (None, None, None) c.execute( - "select lsn, lower, upper from page_header(get_raw_page_at_lsn(NULL, 'main', 0, '0/0'))" + "select lsn, lower, upper from page_header(get_raw_page_at_lsn(NULL, 'main', 0, '0/0', NULL))" ) assert c.fetchone() == expected, "Expected null output" c.execute( - "select lsn, lower, upper from page_header(get_raw_page_at_lsn('foo', NULL, 0, '0/0'))" + "select lsn, lower, upper from page_header(get_raw_page_at_lsn('foo', NULL, 0, '0/0', NULL))" ) assert c.fetchone() == expected, "Expected null output" c.execute( - "select lsn, lower, upper from page_header(get_raw_page_at_lsn('foo', 'main', NULL, '0/0'))" + "select lsn, lower, upper from page_header(get_raw_page_at_lsn('foo', 'main', NULL, '0/0', NULL))" ) assert c.fetchone() == expected, "Expected null output" # This check is currently failing, reading beyond EOF is returning a 0-page log.info("Read beyond EOF") c.execute( - "select lsn, lower, upper from 
page_header(get_raw_page_at_lsn('foo', 'main', 1, NULL))" + "select lsn, lower, upper from page_header(get_raw_page_at_lsn('foo', 'main', 1, NULL, NULL))" ) diff --git a/test_runner/regress/test_s3_scrubber.py b/test_runner/regress/test_s3_scrubber.py new file mode 100644 index 0000000000..018c1637d0 --- /dev/null +++ b/test_runner/regress/test_s3_scrubber.py @@ -0,0 +1,111 @@ +import os +import shutil +from typing import Optional + +import pytest +from fixtures.neon_fixtures import ( + NeonEnvBuilder, + S3Scrubber, +) +from fixtures.remote_storage import S3Storage, s3_storage +from fixtures.types import TenantShardId +from fixtures.workload import Workload + + +@pytest.mark.parametrize("shard_count", [None, 4]) +def test_scrubber_tenant_snapshot(neon_env_builder: NeonEnvBuilder, shard_count: Optional[int]): + """ + Test the `tenant-snapshot` subcommand, which grabs data from remote storage + + This is only a support/debug tool, but worth testing to ensure the tool does not regress. + """ + + neon_env_builder.enable_pageserver_remote_storage(s3_storage()) + neon_env_builder.num_pageservers = shard_count if shard_count is not None else 1 + + env = neon_env_builder.init_start() + tenant_id = env.initial_tenant + timeline_id = env.initial_timeline + branch = "main" + + # Do some work + workload = Workload(env, tenant_id, timeline_id, branch) + workload.init() + + # Multiple write/flush passes to generate multiple layers + for _n in range(0, 3): + workload.write_rows(128) + + # Do some more work after a restart, so that we have multiple generations + for pageserver in env.pageservers: + pageserver.stop() + pageserver.start() + + for _n in range(0, 3): + workload.write_rows(128) + + # If we're doing multiple shards, split: this is important to exercise + # the scrubber's ability to understand the references from child shards to parent shard's layers + if shard_count is not None: + tenant_shard_ids = env.storage_controller.tenant_shard_split( + tenant_id, 
shard_count=shard_count + ) + + # Write after shard split: this will result in shards containing a mixture of owned + # and parent layers in their index. + workload.write_rows(128) + else: + tenant_shard_ids = [TenantShardId(tenant_id, 0, 0)] + + output_path = neon_env_builder.test_output_dir / "snapshot" + os.makedirs(output_path) + + scrubber = S3Scrubber(neon_env_builder) + scrubber.tenant_snapshot(tenant_id, output_path) + + assert len(os.listdir(output_path)) > 0 + + workload.stop() + + # Stop pageservers + for pageserver in env.pageservers: + pageserver.stop() + + # Drop all shards' local storage + for tenant_shard_id in tenant_shard_ids: + pageserver = env.get_tenant_pageserver(tenant_shard_id) + shutil.rmtree(pageserver.timeline_dir(tenant_shard_id, timeline_id)) + + # Replace remote storage contents with the snapshot we downloaded + assert isinstance(env.pageserver_remote_storage, S3Storage) + + remote_tenant_path = env.pageserver_remote_storage.tenant_path(tenant_id) + + # Delete current remote storage contents + bucket = env.pageserver_remote_storage.bucket_name + remote_client = env.pageserver_remote_storage.client + deleted = 0 + for object in remote_client.list_objects_v2(Bucket=bucket, Prefix=remote_tenant_path)[ + "Contents" + ]: + key = object["Key"] + remote_client.delete_object(Key=key, Bucket=bucket) + deleted += 1 + assert deleted > 0 + + # Upload from snapshot + for root, _dirs, files in os.walk(output_path): + for file in files: + full_local_path = os.path.join(root, file) + full_remote_path = ( + env.pageserver_remote_storage.tenants_path() + + "/" + + full_local_path.removeprefix(f"{output_path}/") + ) + remote_client.upload_file(full_local_path, bucket, full_remote_path) + + for pageserver in env.pageservers: + pageserver.start() + + # Check we can read everything + workload.validate() diff --git a/test_runner/regress/test_sharding.py b/test_runner/regress/test_sharding.py index bfaab9125f..258377f8a2 100644 --- 
a/test_runner/regress/test_sharding.py +++ b/test_runner/regress/test_sharding.py @@ -287,6 +287,11 @@ def test_sharding_split_smoke( == shard_count ) + # Make secondary downloads slow: this exercises the storage controller logic for not migrating an attachment + # during post-split optimization until the secondary is ready + for ps in env.pageservers: + ps.http_client().configure_failpoints([("secondary-layer-download-sleep", "return(1000)")]) + env.storage_controller.tenant_shard_split(tenant_id, shard_count=split_shard_count) post_split_pageserver_ids = [loc["node_id"] for loc in env.storage_controller.locate(tenant_id)] @@ -300,7 +305,7 @@ def test_sharding_split_smoke( # Enough background reconciliations should result in the shards being properly distributed. # Run this before the workload, because its LSN-waiting code presumes stable locations. - env.storage_controller.reconcile_until_idle() + env.storage_controller.reconcile_until_idle(timeout_secs=60) workload.validate() @@ -342,6 +347,10 @@ def test_sharding_split_smoke( assert cancelled_reconciles is not None and int(cancelled_reconciles) == 0 assert errored_reconciles is not None and int(errored_reconciles) == 0 + # We should see that the migration of shards after the split waited for secondaries to warm up + # before happening + assert env.storage_controller.log_contains(".*Skipping.*because secondary isn't ready.*") + env.storage_controller.consistency_check() def get_node_shard_counts(env: NeonEnv, tenant_ids): @@ -928,6 +937,8 @@ def test_sharding_split_failures( ".*Reconcile error: receive body: error sending request for url.*", # Node offline cases will fail inside reconciler when detaching secondaries ".*Reconcile error on shard.*: receive body: error sending request for url.*", + # Node offline cases may eventually cancel reconcilers when the heartbeater realizes nodes are offline + ".*Reconcile error.*Cancelled.*", # While parent shard's client is stopped during split, flush loop updating LSNs 
will emit this warning ".*Failed to schedule metadata upload after updating disk_consistent_lsn.*", ] @@ -1069,6 +1080,17 @@ def test_sharding_split_failures( finish_split() assert_split_done() + if isinstance(failure, StorageControllerFailpoint) and "post-complete" in failure.failpoint: + # On a post-complete failure, the controller will recover the post-split state + # after restart, but it will have missed the optimization part of the split function + # where secondary downloads are kicked off. This means that reconcile_until_idle + # will take a very long time if we wait for all optimizations to complete, because + # those optimizations will wait for secondary downloads. + # + # Avoid that by configuring the tenant into Essential scheduling mode, so that it will + # skip optimizations when we're exercising this particular failpoint. + env.storage_controller.tenant_policy_update(tenant_id, {"scheduling": "Essential"}) + # Having completed the split, pump the background reconciles to ensure that # the scheduler reaches an idle state env.storage_controller.reconcile_until_idle(timeout_secs=30) @@ -1201,3 +1223,45 @@ def test_sharding_backpressure(neon_env_builder: NeonEnvBuilder): max_lsn = max(Lsn(info["last_record_lsn"]) for info in infos) diff = max_lsn - min_lsn assert diff < 2 * 1024 * 1024, f"LSN diff={diff}, expected diff < 2MB due to backpressure" + + +def test_sharding_unlogged_relation(neon_env_builder: NeonEnvBuilder): + """ + Check that an unlogged relation is handled properly on a sharded tenant + + Reproducer for https://github.com/neondatabase/neon/issues/7451 + """ + + neon_env_builder.num_pageservers = 2 + env = neon_env_builder.init_configs() + neon_env_builder.start() + + tenant_id = TenantId.generate() + timeline_id = TimelineId.generate() + env.neon_cli.create_tenant(tenant_id, timeline_id, shard_count=8) + + # We will create many tables to ensure it's overwhelmingly likely that at least one + # of them doesn't land on shard 0 + table_names = 
[f"my_unlogged_{i}" for i in range(0, 16)] + + with env.endpoints.create_start("main", tenant_id=tenant_id) as ep: + for table_name in table_names: + ep.safe_psql(f"CREATE UNLOGGED TABLE {table_name} (id integer, value varchar(64));") + ep.safe_psql(f"INSERT INTO {table_name} VALUES (1, 'foo')") + result = ep.safe_psql(f"SELECT * from {table_name};") + assert result == [(1, "foo")] + ep.safe_psql(f"CREATE INDEX ON {table_name} USING btree (value);") + + wait_for_last_flush_lsn(env, ep, tenant_id, timeline_id) + + with env.endpoints.create_start("main", tenant_id=tenant_id) as ep: + for table_name in table_names: + # Check that table works: we can select and insert + result = ep.safe_psql(f"SELECT * from {table_name};") + assert result == [] + ep.safe_psql(f"INSERT INTO {table_name} VALUES (2, 'bar');") + result = ep.safe_psql(f"SELECT * from {table_name};") + assert result == [(2, "bar")] + + # Ensure that post-endpoint-restart modifications are ingested happily by pageserver + wait_for_last_flush_lsn(env, ep, tenant_id, timeline_id) diff --git a/test_runner/regress/test_storage_controller.py b/test_runner/regress/test_storage_controller.py index 840f354142..fdcb4cf9a4 100644 --- a/test_runner/regress/test_storage_controller.py +++ b/test_runner/regress/test_storage_controller.py @@ -1,4 +1,5 @@ import json +import threading import time from collections import defaultdict from datetime import datetime, timezone @@ -26,6 +27,7 @@ from fixtures.pg_version import PgVersion from fixtures.remote_storage import RemoteStorageKind, s3_storage from fixtures.types import TenantId, TenantShardId, TimelineId from fixtures.utils import run_pg_bench_small, subprocess_capture, wait_until +from fixtures.workload import Workload from mypy_boto3_s3.type_defs import ( ObjectTypeDef, ) @@ -228,6 +230,10 @@ def test_storage_controller_passthrough( } assert status["state"]["slug"] == "Active" + (synthetic_size, size_inputs) = client.tenant_size_and_modelinputs(env.initial_tenant) + 
assert synthetic_size > 0 + assert "segments" in size_inputs + env.storage_controller.consistency_check() @@ -273,7 +279,8 @@ def test_storage_controller_onboarding(neon_env_builder: NeonEnvBuilder, warm_up but imports the generation number. """ - neon_env_builder.num_pageservers = 2 + # One pageserver to simulate legacy environment, two to be managed by storage controller + neon_env_builder.num_pageservers = 3 # Start services by hand so that we can skip registration on one of the pageservers env = neon_env_builder.init_configs() @@ -288,10 +295,10 @@ def test_storage_controller_onboarding(neon_env_builder: NeonEnvBuilder, warm_up ) origin_ps = env.pageservers[0] - # This is the pageserver managed by the sharding service, where the tenant + # These are the pageservers managed by the sharding service, where the tenant # will be attached after onboarding env.pageservers[1].start() - dest_ps = env.pageservers[1] + env.pageservers[2].start() virtual_ps_http = PageserverHttpClient(env.storage_controller_port, lambda: True) for sk in env.safekeepers: @@ -330,6 +337,9 @@ def test_storage_controller_onboarding(neon_env_builder: NeonEnvBuilder, warm_up ) virtual_ps_http.tenant_secondary_download(tenant_id) + warm_up_ps = env.storage_controller.tenant_describe(tenant_id)["shards"][0][ + "node_secondary" + ][0] # Call into storage controller to onboard the tenant generation += 1 @@ -344,6 +354,18 @@ def test_storage_controller_onboarding(neon_env_builder: NeonEnvBuilder, warm_up ) assert len(r["shards"]) == 1 + describe = env.storage_controller.tenant_describe(tenant_id)["shards"][0] + dest_ps_id = describe["node_attached"] + dest_ps = env.get_pageserver(dest_ps_id) + if warm_up: + # The storage controller should have attached the tenant to the same placce + # it had a secondary location, otherwise there was no point warming it up + assert dest_ps_id == warm_up_ps + + # It should have been given a new secondary location as well + assert len(describe["node_secondary"]) == 1 + 
assert describe["node_secondary"][0] != warm_up_ps + # As if doing a live migration, detach the original pageserver origin_ps.http_client().tenant_location_conf( tenant_id, @@ -415,6 +437,9 @@ def test_storage_controller_onboarding(neon_env_builder: NeonEnvBuilder, warm_up dest_tenant_after_conf_change["generation"] == dest_tenant_before_conf_change["generation"] ) dest_tenant_conf_after = dest_ps.http_client().tenant_config(tenant_id) + + # Storage controller auto-sets heatmap period, ignore it for the comparison + del dest_tenant_conf_after.tenant_specific_overrides["heatmap_period"] assert dest_tenant_conf_after.tenant_specific_overrides == modified_tenant_conf env.storage_controller.consistency_check() @@ -1237,3 +1262,132 @@ def test_storcon_cli(neon_env_builder: NeonEnvBuilder): # Quiesce any background reconciliation before doing consistency check env.storage_controller.reconcile_until_idle(timeout_secs=10) env.storage_controller.consistency_check() + + +def test_lock_time_tracing(neon_env_builder: NeonEnvBuilder): + """ + Check that when lock on resource (tenants, nodes) is held for too long it is + traced in logs. 
+ """ + env = neon_env_builder.init_start() + tenant_id = env.initial_tenant + env.storage_controller.allowed_errors.extend( + [ + ".*Lock on.*", + ".*Scheduling is disabled by policy.*", + f".*Operation TimelineCreate on key {tenant_id} has waited.*", + ] + ) + + # Apply failpoint + env.storage_controller.configure_failpoints( + ("tenant-update-policy-exclusive-lock", "return(31000)") + ) + + # This will hold the exclusive for enough time to cause an warning + def update_tenent_policy(): + env.storage_controller.tenant_policy_update( + tenant_id=tenant_id, + body={ + "scheduling": "Stop", + }, + ) + + thread_update_tenant_policy = threading.Thread(target=update_tenent_policy) + thread_update_tenant_policy.start() + + # Make sure the update policy thread has started + time.sleep(1) + # This will not be able to access and will log a warning + timeline_id = TimelineId.generate() + env.storage_controller.pageserver_api().timeline_create( + pg_version=PgVersion.NOT_SET, tenant_id=tenant_id, new_timeline_id=timeline_id + ) + thread_update_tenant_policy.join(timeout=10) + + env.storage_controller.assert_log_contains("Lock on UpdatePolicy was held for") + env.storage_controller.assert_log_contains( + f"Operation TimelineCreate on key {tenant_id} has waited" + ) + + +@pytest.mark.parametrize("remote_storage", [RemoteStorageKind.LOCAL_FS, s3_storage()]) +@pytest.mark.parametrize("shard_count", [None, 4]) +def test_tenant_import(neon_env_builder: NeonEnvBuilder, shard_count, remote_storage): + """ + Tenant import is a support/debug tool for recovering a tenant from remote storage + if we don't have any metadata for it in the storage controller. + """ + + # This test is parametrized on remote storage because it exercises the relatively rare + # code path of listing with a prefix that is not a directory name: this helps us notice + # quickly if local_fs or s3_bucket implementations diverge. 
+ neon_env_builder.enable_pageserver_remote_storage(remote_storage) + + # Use multiple pageservers because some test helpers assume single sharded tenants + # if there is only one pageserver. + neon_env_builder.num_pageservers = 2 + + env = neon_env_builder.init_start(initial_tenant_shard_count=shard_count) + tenant_id = env.initial_tenant + + # Create a second timeline to ensure that import finds both + timeline_a = env.initial_timeline + timeline_b = env.neon_cli.create_branch("branch_b", tenant_id=tenant_id) + + workload_a = Workload(env, tenant_id, timeline_a, branch_name="main") + workload_a.init() + + workload_b = Workload(env, tenant_id, timeline_b, branch_name="branch_b") + workload_b.init() + + # Write some data + workload_a.write_rows(72) + expect_rows_a = workload_a.expect_rows + workload_a.stop() + del workload_a + + # Bump generation to make sure generation recovery works properly + for pageserver in env.pageservers: + pageserver.stop() + pageserver.start() + + # Write some data in the higher generation into the other branch + workload_b.write_rows(107) + expect_rows_b = workload_b.expect_rows + workload_b.stop() + del workload_b + + # Detach from pageservers + env.storage_controller.tenant_policy_update( + tenant_id, + { + "placement": "Detached", + }, + ) + env.storage_controller.reconcile_until_idle(timeout_secs=10) + + # Force-drop it from the storage controller + env.storage_controller.request( + "POST", + f"{env.storage_controller_api}/debug/v1/tenant/{tenant_id}/drop", + headers=env.storage_controller.headers(TokenScope.ADMIN), + ) + + # Now import it again + env.neon_cli.import_tenant(tenant_id) + + # Check we found the shards + describe = env.storage_controller.tenant_describe(tenant_id) + literal_shard_count = 1 if shard_count is None else shard_count + assert len(describe["shards"]) == literal_shard_count + + # Check the data is still there: this implicitly proves that we recovered generation numbers + # properly, for the timeline which was 
written to after a generation bump. + for timeline, branch, expect_rows in [ + (timeline_a, "main", expect_rows_a), + (timeline_b, "branch_1", expect_rows_b), + ]: + workload = Workload(env, tenant_id, timeline, branch_name=branch) + workload.expect_rows = expect_rows + workload.validate() diff --git a/test_runner/regress/test_tenant_delete.py b/test_runner/regress/test_tenant_delete.py index a164c7f60a..c115c0375b 100644 --- a/test_runner/regress/test_tenant_delete.py +++ b/test_runner/regress/test_tenant_delete.py @@ -469,7 +469,8 @@ def test_tenant_delete_concurrent( ): """ Validate that concurrent delete requests to the same tenant behave correctly: - exactly one should succeed. + exactly one should execute: the rest should give 202 responses but not start + another deletion. This is a reproducer for https://github.com/neondatabase/neon/issues/5936 """ @@ -484,14 +485,10 @@ def test_tenant_delete_concurrent( run_pg_bench_small(pg_bin, endpoint.connstr()) last_flush_lsn_upload(env, endpoint, tenant_id, timeline_id) - CONFLICT_MESSAGE = "Precondition failed: Invalid state Stopping. Expected Active or Broken" - env.pageserver.allowed_errors.extend( [ # lucky race with stopping from flushing a layer we fail to schedule any uploads ".*layer flush task.+: could not flush frozen layer: update_metadata_file", - # Errors logged from our 4xx requests - f".*{CONFLICT_MESSAGE}.*", ] ) @@ -507,7 +504,7 @@ def test_tenant_delete_concurrent( return ps_http.tenant_delete(tenant_id) def hit_remove_failpoint(): - env.pageserver.assert_log_contains(f"at failpoint {BEFORE_REMOVE_FAILPOINT}") + return env.pageserver.assert_log_contains(f"at failpoint {BEFORE_REMOVE_FAILPOINT}")[1] def hit_run_failpoint(): env.pageserver.assert_log_contains(f"at failpoint {BEFORE_RUN_FAILPOINT}") @@ -518,11 +515,14 @@ def test_tenant_delete_concurrent( # Wait until the first request completes its work and is blocked on removing # the TenantSlot from tenant manager. 
- wait_until(100, 0.1, hit_remove_failpoint) + log_cursor = wait_until(100, 0.1, hit_remove_failpoint) + assert log_cursor is not None - # Start another request: this should fail when it sees a tenant in Stopping state - with pytest.raises(PageserverApiException, match=CONFLICT_MESSAGE): - ps_http.tenant_delete(tenant_id) + # Start another request: this should succeed without actually entering the deletion code + ps_http.tenant_delete(tenant_id) + assert not env.pageserver.log_contains( + f"at failpoint {BEFORE_RUN_FAILPOINT}", offset=log_cursor + ) # Start another background request, which will pause after acquiring a TenantSlotGuard # but before completing. @@ -539,8 +539,10 @@ def test_tenant_delete_concurrent( # Permit the duplicate background request to run to completion and fail. ps_http.configure_failpoints((BEFORE_RUN_FAILPOINT, "off")) - with pytest.raises(PageserverApiException, match=CONFLICT_MESSAGE): - background_4xx_req.result(timeout=10) + background_4xx_req.result(timeout=10) + assert not env.pageserver.log_contains( + f"at failpoint {BEFORE_RUN_FAILPOINT}", offset=log_cursor + ) # Physical deletion should have happened assert_prefix_empty( diff --git a/test_runner/regress/test_tenant_detach.py b/test_runner/regress/test_tenant_detach.py index d3f24cb06e..0ba0108651 100644 --- a/test_runner/regress/test_tenant_detach.py +++ b/test_runner/regress/test_tenant_detach.py @@ -132,7 +132,7 @@ def test_tenant_reattach(neon_env_builder: NeonEnvBuilder, mode: str): assert query_scalar(cur, "SELECT count(*) FROM t") == 100000 # Check that we had to retry the downloads - assert env.pageserver.log_contains(".*list timelines.*failed, will retry.*") + assert env.pageserver.log_contains(".*list identifiers.*failed, will retry.*") assert env.pageserver.log_contains(".*download.*failed, will retry.*") diff --git a/test_runner/regress/test_tenant_relocation.py b/test_runner/regress/test_tenant_relocation.py index 9def3ad1c2..68d9d9a660 100644 --- 
a/test_runner/regress/test_tenant_relocation.py +++ b/test_runner/regress/test_tenant_relocation.py @@ -16,7 +16,6 @@ from fixtures.pageserver.utils import ( wait_for_upload, wait_tenant_status_404, ) -from fixtures.port_distributor import PortDistributor from fixtures.remote_storage import ( LocalFsStorage, RemoteStorageKind, @@ -24,7 +23,6 @@ from fixtures.remote_storage import ( from fixtures.types import Lsn, TenantId, TimelineId from fixtures.utils import ( query_scalar, - subprocess_capture, wait_until, ) @@ -184,20 +182,14 @@ def post_migration_check(endpoint: Endpoint, sum_before_migration: int, old_loca # A minor migration involves no storage breaking changes. # It is done by attaching the tenant to a new pageserver. "minor", - # A major migration involves exporting a postgres datadir - # basebackup and importing it into the new pageserver. - # This kind of migration can tolerate breaking changes - # to storage format - "major", + # In the unlikely and unfortunate event that we have to break + # the storage format, extend this test with the param below. 
+ # "major", ], ) @pytest.mark.parametrize("with_load", ["with_load", "without_load"]) def test_tenant_relocation( neon_env_builder: NeonEnvBuilder, - port_distributor: PortDistributor, - test_output_dir: Path, - neon_binpath: Path, - base_dir: Path, method: str, with_load: str, ): @@ -299,40 +291,7 @@ def test_tenant_relocation( current_lsn=current_lsn_second, ) - # Migrate either by attaching from s3 or import/export basebackup - if method == "major": - cmd = [ - "poetry", - "run", - "python", - str(base_dir / "scripts/export_import_between_pageservers.py"), - "--tenant-id", - str(tenant_id), - "--from-host", - "localhost", - "--from-http-port", - str(origin_http.port), - "--from-pg-port", - str(origin_ps.service_port.pg), - "--to-host", - "localhost", - "--to-http-port", - str(destination_http.port), - "--to-pg-port", - str(destination_ps.service_port.pg), - "--pg-distrib-dir", - str(neon_env_builder.pg_distrib_dir), - "--work-dir", - str(test_output_dir), - "--tmp-pg-port", - str(port_distributor.get_port()), - ] - subprocess_capture(test_output_dir, cmd, check=True) - - destination_ps.allowed_errors.append( - ".*ignored .* unexpected bytes after the tar archive.*" - ) - elif method == "minor": + if method == "minor": # call to attach timeline to new pageserver destination_ps.tenant_attach(tenant_id) diff --git a/test_runner/regress/test_tenant_size.py b/test_runner/regress/test_tenant_size.py index 4c8fd4b0e5..53da548524 100644 --- a/test_runner/regress/test_tenant_size.py +++ b/test_runner/regress/test_tenant_size.py @@ -1,4 +1,5 @@ import os +from concurrent.futures import ThreadPoolExecutor from pathlib import Path from typing import List, Tuple @@ -11,13 +12,15 @@ from fixtures.neon_fixtures import ( wait_for_last_flush_lsn, wait_for_wal_insert_lsn, ) -from fixtures.pageserver.http import PageserverHttpClient +from fixtures.pageserver.http import PageserverApiException, PageserverHttpClient from fixtures.pageserver.utils import ( + 
tenant_delete_wait_completed, timeline_delete_wait_completed, wait_until_tenant_active, ) from fixtures.pg_version import PgVersion from fixtures.types import Lsn, TenantId, TimelineId +from fixtures.utils import wait_until def test_empty_tenant_size(neon_env_builder: NeonEnvBuilder): @@ -292,33 +295,12 @@ def test_single_branch_get_tenant_size_grows( Operate on single branch reading the tenants size after each transaction. """ - # Disable automatic gc and compaction. - # The pitr_interval here is quite problematic, so we cannot really use it. - # it'd have to be calibrated per test executing env. - - # there was a bug which was hidden if the create table and first batch of - # inserts is larger than gc_horizon. for example 0x20000 here hid the fact - # that there next_gc_cutoff could be smaller than initdb_lsn, which will - # obviously lead to issues when calculating the size. - gc_horizon = 0x3BA00 - - # it's a bit of a hack, but different versions of postgres have different - # amount of WAL generated for the same amount of data. so we need to - # adjust the gc_horizon accordingly. 
- if pg_version == PgVersion.V14: - gc_horizon = 0x4A000 - elif pg_version == PgVersion.V15: - gc_horizon = 0x3BA00 - elif pg_version == PgVersion.V16: - gc_horizon = 210000 - else: - raise NotImplementedError(pg_version) - + # Disable automatic compaction and GC, and set a long PITR interval: we will expect + # size to always increase with writes as all writes remain within the PITR tenant_config = { "compaction_period": "0s", "gc_period": "0s", - "pitr_interval": "0s", - "gc_horizon": gc_horizon, + "pitr_interval": "3600s", } env = neon_env_builder.init_start(initial_tenant_conf=tenant_config) @@ -332,18 +314,6 @@ def test_single_branch_get_tenant_size_grows( size_debug_file = open(test_output_dir / "size_debug.html", "w") - def check_size_change( - current_lsn: Lsn, initdb_lsn: Lsn, gc_horizon: int, size: int, prev_size: int - ): - if current_lsn - initdb_lsn >= gc_horizon: - assert ( - size >= prev_size - ), f"tenant_size may grow or not grow, because we only add gc_horizon amount of WAL to initial snapshot size (Currently at: {current_lsn}, Init at: {initdb_lsn})" - else: - assert ( - size > prev_size - ), f"tenant_size should grow, because we continue to add WAL to initial snapshot size (Currently at: {current_lsn}, Init at: {initdb_lsn})" - def get_current_consistent_size( env: NeonEnv, endpoint: Endpoint, @@ -412,14 +382,6 @@ def test_single_branch_get_tenant_size_grows( ) prev_size = collected_responses[-1][2] - - # branch start shouldn't be past gc_horizon yet - # thus the size should grow as we insert more data - # "gc_horizon" is tuned so that it kicks in _after_ the - # insert phase, but before the update phase ends. 
- assert ( - current_lsn - initdb_lsn <= gc_horizon - ), "Tuning of GC window is likely out-of-date" assert size > prev_size collected_responses.append(("INSERT", current_lsn, size)) @@ -439,8 +401,7 @@ def test_single_branch_get_tenant_size_grows( ) prev_size = collected_responses[-1][2] - - check_size_change(current_lsn, initdb_lsn, gc_horizon, size, prev_size) + assert size > prev_size collected_responses.append(("UPDATE", current_lsn, size)) @@ -457,8 +418,7 @@ def test_single_branch_get_tenant_size_grows( ) prev_size = collected_responses[-1][2] - - check_size_change(current_lsn, initdb_lsn, gc_horizon, size, prev_size) + assert size > prev_size collected_responses.append(("DELETE", current_lsn, size)) @@ -469,20 +429,20 @@ def test_single_branch_get_tenant_size_grows( with endpoint.cursor() as cur: cur.execute("DROP TABLE t0") - # Without setting a PITR interval, dropping the table doesn't reclaim any space - # from the user's point of view, because the DROP transaction is too small - # to fall out of gc_horizon. + # Dropping the table doesn't reclaim any space + # from the user's point of view, because the DROP transaction is still + # within pitr_interval. 
(current_lsn, size) = get_current_consistent_size( env, endpoint, size_debug_file, http_client, tenant_id, timeline_id ) - prev_size = collected_responses[-1][2] - check_size_change(current_lsn, initdb_lsn, gc_horizon, size, prev_size) + assert size >= prev_size + prev_size = size - # Set a tiny PITR interval to allow the DROP to impact the synthetic size + # Set a zero PITR interval to allow the DROP to impact the synthetic size # Because synthetic size calculation uses pitr interval when available, # when our tenant is configured with a tiny pitr interval, dropping a table should # cause synthetic size to go down immediately - tenant_config["pitr_interval"] = "1ms" + tenant_config["pitr_interval"] = "0s" env.pageserver.http_client().set_tenant_config(tenant_id, tenant_config) (current_lsn, size) = get_current_consistent_size( env, endpoint, size_debug_file, http_client, tenant_id, timeline_id @@ -494,10 +454,6 @@ def test_single_branch_get_tenant_size_grows( # defined by gc_horizon. collected_responses.append(("DROP", current_lsn, size)) - # Should have gone past gc_horizon, otherwise gc_horizon is too large - bytes_written = current_lsn - initdb_lsn - assert bytes_written > gc_horizon - # this isn't too many lines to forget for a while. observed while # developing these tests that locally the value is a bit more than what we # get in the ci. @@ -663,6 +619,68 @@ def test_get_tenant_size_with_multiple_branches( size_debug_file.write(size_debug) +def test_synthetic_size_while_deleting(neon_env_builder: NeonEnvBuilder): + """ + Makes sure synthetic size can still be calculated even if one of the + timelines is deleted or the tenant is deleted. 
+ """ + + env = neon_env_builder.init_start() + failpoint = "Timeline::find_gc_cutoffs-pausable" + client = env.pageserver.http_client() + + orig_size = client.tenant_size(env.initial_tenant) + + branch_id = env.neon_cli.create_branch( + tenant_id=env.initial_tenant, ancestor_branch_name="main", new_branch_name="branch" + ) + client.configure_failpoints((failpoint, "pause")) + + with ThreadPoolExecutor(max_workers=1) as exec: + completion = exec.submit(client.tenant_size, env.initial_tenant) + _, last_offset = wait_until( + 10, 1.0, lambda: env.pageserver.assert_log_contains(f"at failpoint {failpoint}") + ) + + timeline_delete_wait_completed(client, env.initial_tenant, branch_id) + + client.configure_failpoints((failpoint, "off")) + size = completion.result() + + assert_size_approx_equal(orig_size, size) + + branch_id = env.neon_cli.create_branch( + tenant_id=env.initial_tenant, ancestor_branch_name="main", new_branch_name="branch2" + ) + client.configure_failpoints((failpoint, "pause")) + + with ThreadPoolExecutor(max_workers=1) as exec: + completion = exec.submit(client.tenant_size, env.initial_tenant) + wait_until( + 10, + 1.0, + lambda: env.pageserver.assert_log_contains( + f"at failpoint {failpoint}", offset=last_offset + ), + ) + + tenant_delete_wait_completed(client, env.initial_tenant, 10) + + client.configure_failpoints((failpoint, "off")) + + with pytest.raises( + PageserverApiException, match="Failed to refresh gc_info before gathering inputs" + ): + completion.result() + + # this happens on both cases + env.pageserver.allowed_errors.append( + ".*ignoring failure to find gc cutoffs: timeline shutting down.*" + ) + # this happens only in the case of deletion (http response logging) + env.pageserver.allowed_errors.append(".*Failed to refresh gc_info before gathering inputs.*") + + # Helper for tests that compare timeline_inputs # We don't want to compare the exact values, because they can be unstable # and cause flaky tests. 
So replace the values with useful invariants. diff --git a/test_runner/regress/test_vm_bits.py b/test_runner/regress/test_vm_bits.py index eff103ca09..06f2a8befd 100644 --- a/test_runner/regress/test_vm_bits.py +++ b/test_runner/regress/test_vm_bits.py @@ -173,7 +173,9 @@ def test_vm_bit_clear_on_heap_lock(neon_env_builder: NeonEnvBuilder): # which changes the LSN on the page. cur.execute("select get_raw_page( 'vmtest_lock', 'vm', 0 )") vm_page_in_cache = (cur.fetchall()[0][0])[8:100].hex() - cur.execute("select get_raw_page_at_lsn( 'vmtest_lock', 'vm', 0, pg_current_wal_insert_lsn() )") + cur.execute( + "select get_raw_page_at_lsn( 'vmtest_lock', 'vm', 0, pg_current_wal_insert_lsn(), NULL )" + ) vm_page_at_pageserver = (cur.fetchall()[0][0])[8:100].hex() assert vm_page_at_pageserver == vm_page_in_cache diff --git a/test_runner/regress/test_wal_acceptor.py b/test_runner/regress/test_wal_acceptor.py index ac1a747df3..967d133e18 100644 --- a/test_runner/regress/test_wal_acceptor.py +++ b/test_runner/regress/test_wal_acceptor.py @@ -1828,7 +1828,7 @@ def test_idle_reconnections(neon_env_builder: NeonEnvBuilder): env = neon_env_builder.init_start() tenant_id = env.initial_tenant - timeline_id = env.neon_cli.create_branch("test_sk_auth_restart_endpoint") + timeline_id = env.neon_cli.create_branch("test_idle_reconnections") def collect_stats() -> Dict[str, float]: # we need to collect safekeeper_pg_queries_received_total metric from all safekeepers @@ -1859,7 +1859,7 @@ def test_idle_reconnections(neon_env_builder: NeonEnvBuilder): collect_stats() - endpoint = env.endpoints.create_start("test_sk_auth_restart_endpoint") + endpoint = env.endpoints.create_start("test_idle_reconnections") # just write something to the timeline endpoint.safe_psql("create table t(i int)") collect_stats() @@ -2007,3 +2007,47 @@ def test_patch_control_file(neon_env_builder: NeonEnvBuilder): ) log.info(f"dump_control_file response: {res}") assert 
res["timelines"][0]["control_file"]["timeline_start_lsn"] == "0/1" + + +# Test disables periodic pushes from safekeeper to the broker and checks that +# pageserver can still discover safekeepers with discovery requests. +def test_broker_discovery(neon_env_builder: NeonEnvBuilder): + neon_env_builder.num_safekeepers = 3 + neon_env_builder.enable_safekeeper_remote_storage(RemoteStorageKind.LOCAL_FS) + env = neon_env_builder.init_start() + + env.neon_cli.create_branch("test_broker_discovery") + + endpoint = env.endpoints.create_start( + "test_broker_discovery", + config_lines=["shared_buffers=1MB"], + ) + endpoint.safe_psql("create table t(i int, payload text)") + # Install extension containing function needed to clear buffer + endpoint.safe_psql("CREATE EXTENSION neon_test_utils") + + def do_something(): + time.sleep(1) + # generate some data to commit WAL on safekeepers + endpoint.safe_psql("insert into t select generate_series(1,100), 'action'") + # clear the buffers + endpoint.safe_psql("select clear_buffer_cache()") + # read data to fetch pages from pageserver + endpoint.safe_psql("select sum(i) from t") + + do_something() + do_something() + + for sk in env.safekeepers: + # Disable periodic broker push, so pageserver won't be able to discover + # safekeepers without sending a discovery request + sk.stop().start(extra_opts=["--disable-periodic-broker-push"]) + + do_something() + do_something() + + # restart pageserver and check how everything works + env.pageserver.stop().start() + + do_something() + do_something() diff --git a/test_runner/regress/test_wal_acceptor_async.py b/test_runner/regress/test_wal_acceptor_async.py index 5902eb3217..dce5616ac6 100644 --- a/test_runner/regress/test_wal_acceptor_async.py +++ b/test_runner/regress/test_wal_acceptor_async.py @@ -254,7 +254,9 @@ def test_restarts_frequent_checkpoints(neon_env_builder: NeonEnvBuilder): ) -def endpoint_create_start(env: NeonEnv, branch: str, pgdir_name: Optional[str]): +def endpoint_create_start( 
+ env: NeonEnv, branch: str, pgdir_name: Optional[str], allow_multiple: bool = False +): endpoint = Endpoint( env, tenant_id=env.initial_tenant, @@ -268,14 +270,23 @@ def endpoint_create_start(env: NeonEnv, branch: str, pgdir_name: Optional[str]): # embed current time in endpoint ID endpoint_id = pgdir_name or f"ep-{time.time()}" return endpoint.create_start( - branch_name=branch, endpoint_id=endpoint_id, config_lines=["log_statement=all"] + branch_name=branch, + endpoint_id=endpoint_id, + config_lines=["log_statement=all"], + allow_multiple=allow_multiple, ) async def exec_compute_query( - env: NeonEnv, branch: str, query: str, pgdir_name: Optional[str] = None + env: NeonEnv, + branch: str, + query: str, + pgdir_name: Optional[str] = None, + allow_multiple: bool = False, ): - with endpoint_create_start(env, branch=branch, pgdir_name=pgdir_name) as endpoint: + with endpoint_create_start( + env, branch=branch, pgdir_name=pgdir_name, allow_multiple=allow_multiple + ) as endpoint: before_conn = time.time() conn = await endpoint.connect_async() res = await conn.fetch(query) @@ -347,6 +358,7 @@ class BackgroundCompute(object): self.branch, f"INSERT INTO query_log(index, verify_key) VALUES ({self.index}, {verify_key}) RETURNING verify_key", pgdir_name=f"bgcompute{self.index}_key{verify_key}", + allow_multiple=True, ) log.info(f"result: {res}") if len(res) != 1: diff --git a/trace/src/main.rs b/trace/src/main.rs index 4605c124e9..049f922b6f 100644 --- a/trace/src/main.rs +++ b/trace/src/main.rs @@ -7,7 +7,9 @@ use std::{ io::BufReader, }; -use pageserver_api::models::{PagestreamFeMessage, PagestreamGetPageRequest}; +use pageserver_api::models::{ + PagestreamFeMessage, PagestreamGetPageRequest, PagestreamProtocolVersion, +}; use utils::id::{ConnectionId, TenantId, TimelineId}; use clap::{Parser, Subcommand}; @@ -56,7 +58,7 @@ fn analyze_trace(mut reader: R) { let mut prev: Option = None; // Compute stats - while let Ok(msg) = PagestreamFeMessage::parse(&mut reader) { + 
while let Ok(msg) = PagestreamFeMessage::parse(&mut reader, PagestreamProtocolVersion::V2) { match msg { PagestreamFeMessage::Exists(_) => {} PagestreamFeMessage::Nblocks(_) => {} @@ -89,7 +91,7 @@ fn analyze_trace(mut reader: R) { } fn dump_trace(mut reader: R) { - while let Ok(msg) = PagestreamFeMessage::parse(&mut reader) { + while let Ok(msg) = PagestreamFeMessage::parse(&mut reader, PagestreamProtocolVersion::V2) { println!("{msg:?}"); } } diff --git a/vendor/postgres-v14 b/vendor/postgres-v14 index a7b4c66156..d6f7e2c604 160000 --- a/vendor/postgres-v14 +++ b/vendor/postgres-v14 @@ -1 +1 @@ -Subproject commit a7b4c66156bce00afa60e5592d4284ba9e40b4cf +Subproject commit d6f7e2c604bfc7cbc4c46bcea0a8e800f4bc778a diff --git a/vendor/postgres-v15 b/vendor/postgres-v15 index 64b8c7bccc..f0d6b0ef75 160000 --- a/vendor/postgres-v15 +++ b/vendor/postgres-v15 @@ -1 +1 @@ -Subproject commit 64b8c7bccc6b77e04795e2d4cf6ad82dc8d987ed +Subproject commit f0d6b0ef7581bd78011832e23d8420a7d2c8a83a diff --git a/vendor/postgres-v16 b/vendor/postgres-v16 index 3946b2e2ea..8ef3c33aa0 160000 --- a/vendor/postgres-v16 +++ b/vendor/postgres-v16 @@ -1 +1 @@ -Subproject commit 3946b2e2ea71d07af092099cb5bcae76a69b90d6 +Subproject commit 8ef3c33aa01631e17cb24a122776349fcc777b46 diff --git a/vendor/revisions.json b/vendor/revisions.json index 75dc095168..a353fde8fd 100644 --- a/vendor/revisions.json +++ b/vendor/revisions.json @@ -1,5 +1,5 @@ { - "postgres-v16": "3946b2e2ea71d07af092099cb5bcae76a69b90d6", - "postgres-v15": "64b8c7bccc6b77e04795e2d4cf6ad82dc8d987ed", - "postgres-v14": "a7b4c66156bce00afa60e5592d4284ba9e40b4cf" + "postgres-v16": "8ef3c33aa01631e17cb24a122776349fcc777b46", + "postgres-v15": "f0d6b0ef7581bd78011832e23d8420a7d2c8a83a", + "postgres-v14": "d6f7e2c604bfc7cbc4c46bcea0a8e800f4bc778a" } diff --git a/vm-image-spec.yaml b/vm-image-spec.yaml index c760744491..41ca16f16b 100644 --- a/vm-image-spec.yaml +++ b/vm-image-spec.yaml @@ -5,6 +5,12 @@ commands: user: root 
sysvInitAction: sysinit shell: 'cgconfigparser -l /etc/cgconfig.conf -s 1664' + # restrict permissions on /neonvm/bin/resize-swap, because we grant access to compute_ctl for + # running it as root. + - name: chmod-resize-swap + user: root + sysvInitAction: sysinit + shell: 'chmod 711 /neonvm/bin/resize-swap' - name: pgbouncer user: postgres sysvInitAction: respawn @@ -16,10 +22,19 @@ commands: - name: sql-exporter user: nobody sysvInitAction: respawn - shell: '/bin/sql_exporter -config.file=/etc/sql_exporter.yml' + shell: '/bin/sql_exporter -config.file=/etc/sql_exporter.yml -web.listen-address=:9399' + - name: sql-exporter-autoscaling + user: nobody + sysvInitAction: respawn + shell: '/bin/sql_exporter -config.file=/etc/sql_exporter_autoscaling.yml -web.listen-address=:9499' shutdownHook: | su -p postgres --session-command '/usr/local/bin/pg_ctl stop -D /var/db/postgres/compute/pgdata -m fast --wait -t 10' files: + - filename: compute_ctl-resize-swap + content: | + # Allow postgres user (which is what compute_ctl runs as) to run /neonvm/bin/resize-swap + # as root without requiring entering a password (NOPASSWD), regardless of hostname (ALL) + postgres ALL=(root) NOPASSWD: /neonvm/bin/resize-swap - filename: pgbouncer.ini content: | [databases] @@ -88,6 +103,41 @@ files: # Glob patterns are supported (see for syntax). collector_files: - "neon_collector.yml" + - filename: sql_exporter_autoscaling.yml + content: | + # Configuration for sql_exporter for autoscaling-agent + # Global defaults. + global: + # If scrape_timeout <= 0, no timeout is set unless Prometheus provides one. The default is 10s. + scrape_timeout: 10s + # Subtracted from Prometheus' scrape_timeout to give us some headroom and prevent Prometheus from timing out first. + scrape_timeout_offset: 500ms + # Minimum interval between collector runs: by default (0s) collectors are executed on every scrape. + min_interval: 0s + # Maximum number of open connections to any one target. 
Metric queries will run concurrently on multiple connections, + # as will concurrent scrapes. + max_connections: 1 + # Maximum number of idle connections to any one target. Unless you use very long collection intervals, this should + # always be the same as max_connections. + max_idle_connections: 1 + # Maximum number of maximum amount of time a connection may be reused. Expired connections may be closed lazily before reuse. + # If 0, connections are not closed due to a connection's age. + max_connection_lifetime: 5m + + # The target to monitor and the collectors to execute on it. + target: + # Data source name always has a URI schema that matches the driver name. In some cases (e.g. MySQL) + # the schema gets dropped or replaced to match the driver expected DSN format. + data_source_name: 'postgresql://cloud_admin@127.0.0.1:5432/postgres?sslmode=disable' + + # Collectors (referenced by name) to execute on the target. + # Glob patterns are supported (see for syntax). + collectors: [neon_collector_autoscaling] + + # Collector files specifies a list of globs. One collector definition is read from each matching file. + # Glob patterns are supported (see for syntax). 
+ collector_files: + - "neon_collector_autoscaling.yml" - filename: neon_collector.yml content: | collector_name: neon_collector @@ -194,6 +244,57 @@ files: values: [approximate_working_set_size] query: | select neon.approximate_working_set_size(false) as approximate_working_set_size; + - filename: neon_collector_autoscaling.yml + content: | + collector_name: neon_collector_autoscaling + metrics: + - metric_name: lfc_misses + type: gauge + help: 'lfc_misses' + key_labels: + values: [lfc_misses] + query: | + select lfc_value as lfc_misses from neon.neon_lfc_stats where lfc_key='file_cache_misses'; + + - metric_name: lfc_used + type: gauge + help: 'LFC chunks used (chunk = 1MB)' + key_labels: + values: [lfc_used] + query: | + select lfc_value as lfc_used from neon.neon_lfc_stats where lfc_key='file_cache_used'; + + - metric_name: lfc_hits + type: gauge + help: 'lfc_hits' + key_labels: + values: [lfc_hits] + query: | + select lfc_value as lfc_hits from neon.neon_lfc_stats where lfc_key='file_cache_hits'; + + - metric_name: lfc_writes + type: gauge + help: 'lfc_writes' + key_labels: + values: [lfc_writes] + query: | + select lfc_value as lfc_writes from neon.neon_lfc_stats where lfc_key='file_cache_writes'; + + - metric_name: lfc_cache_size_limit + type: gauge + help: 'LFC cache size limit in bytes' + key_labels: + values: [lfc_cache_size_limit] + query: | + select pg_size_bytes(current_setting('neon.file_cache_size_limit')) as lfc_cache_size_limit; + + - metric_name: lfc_approximate_working_set_size + type: gauge + help: 'Approximate working set size in pages of 8192 bytes' + key_labels: + values: [approximate_working_set_size] + query: | + select neon.approximate_working_set_size(false) as approximate_working_set_size; build: | # Build cgroup-tools @@ -263,17 +364,32 @@ merge: | && echo 'root - nofile 1048576' >>/etc/security/limits.conf \ ) + # Allow postgres user (compute_ctl) to run swap resizer. + # Need to install sudo in order to allow this. 
+ # + # Also, remove the 'read' permission from group/other on /neonvm/bin/resize-swap, just to be safe. + RUN set -e \ + && apt update \ + && apt install --no-install-recommends -y \ + sudo \ + && rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* + COPY compute_ctl-resize-swap /etc/sudoers.d/compute_ctl-resize-swap + COPY cgconfig.conf /etc/cgconfig.conf COPY pgbouncer.ini /etc/pgbouncer.ini COPY sql_exporter.yml /etc/sql_exporter.yml COPY neon_collector.yml /etc/neon_collector.yml + COPY sql_exporter_autoscaling.yml /etc/sql_exporter_autoscaling.yml + COPY neon_collector_autoscaling.yml /etc/neon_collector_autoscaling.yml RUN set -e \ && chown postgres:postgres /etc/pgbouncer.ini \ && chmod 0666 /etc/pgbouncer.ini \ && chmod 0644 /etc/cgconfig.conf \ && chmod 0644 /etc/sql_exporter.yml \ - && chmod 0644 /etc/neon_collector.yml + && chmod 0644 /etc/neon_collector.yml \ + && chmod 0644 /etc/sql_exporter_autoscaling.yml \ + && chmod 0644 /etc/neon_collector_autoscaling.yml COPY --from=libcgroup-builder /libcgroup-install/bin/* /usr/bin/ COPY --from=libcgroup-builder /libcgroup-install/lib/* /usr/lib/ diff --git a/workspace_hack/Cargo.toml b/workspace_hack/Cargo.toml index 7b8228a082..b2da33e44a 100644 --- a/workspace_hack/Cargo.toml +++ b/workspace_hack/Cargo.toml @@ -37,8 +37,7 @@ futures-io = { version = "0.3" } futures-sink = { version = "0.3" } futures-util = { version = "0.3", features = ["channel", "io", "sink"] } getrandom = { version = "0.2", default-features = false, features = ["std"] } -hashbrown-582f2526e08bb6a0 = { package = "hashbrown", version = "0.14", features = ["raw"] } -hashbrown-594e8ee84c453af0 = { package = "hashbrown", version = "0.13", features = ["raw"] } +hashbrown = { version = "0.14", features = ["raw"] } hex = { version = "0.4", features = ["serde"] } hmac = { version = "0.12", default-features = false, features = ["reset"] } hyper = { version = "0.14", features = ["full"] } @@ -58,14 +57,16 @@ rand = { version = "0.8", features = 
["small_rng"] } regex = { version = "1" } regex-automata = { version = "0.4", default-features = false, features = ["dfa-onepass", "hybrid", "meta", "nfa-backtrack", "perf-inline", "perf-literal", "unicode"] } regex-syntax = { version = "0.8" } -reqwest = { version = "0.11", default-features = false, features = ["blocking", "default-tls", "json", "multipart", "rustls-tls", "stream"] } +reqwest-5ef9efb8ec2df382 = { package = "reqwest", version = "0.12", default-features = false, features = ["blocking", "json", "rustls-tls", "stream"] } +reqwest-a6292c17cd707f01 = { package = "reqwest", version = "0.11", default-features = false, features = ["blocking", "default-tls", "stream"] } rustls = { version = "0.21", features = ["dangerous_configuration"] } scopeguard = { version = "1" } serde = { version = "1", features = ["alloc", "derive"] } serde_json = { version = "1", features = ["raw_value"] } sha2 = { version = "0.10", features = ["asm"] } -smallvec = { version = "1", default-features = false, features = ["write"] } +smallvec = { version = "1", default-features = false, features = ["const_new", "write"] } subtle = { version = "2" } +sync_wrapper = { version = "0.1", default-features = false, features = ["futures"] } time = { version = "0.3", features = ["local-offset", "macros", "serde-well-known"] } tokio = { version = "1", features = ["fs", "io-std", "io-util", "macros", "net", "process", "rt-multi-thread", "signal", "test-util"] } tokio-rustls = { version = "0.24" } @@ -76,7 +77,6 @@ tonic = { version = "0.9", features = ["tls-roots"] } tower = { version = "0.4", default-features = false, features = ["balance", "buffer", "limit", "log", "timeout", "util"] } tracing = { version = "0.1", features = ["log"] } tracing-core = { version = "0.1" } -tungstenite = { version = "0.20" } url = { version = "2", features = ["serde"] } uuid = { version = "1", features = ["serde", "v4", "v7"] } zeroize = { version = "1", features = ["derive"] } @@ -91,7 +91,7 @@ cc = { version = 
"1", default-features = false, features = ["parallel"] } chrono = { version = "0.4", default-features = false, features = ["clock", "serde", "wasmbind"] } either = { version = "1" } getrandom = { version = "0.2", default-features = false, features = ["std"] } -hashbrown-582f2526e08bb6a0 = { package = "hashbrown", version = "0.14", features = ["raw"] } +hashbrown = { version = "0.14", features = ["raw"] } indexmap = { version = "1", default-features = false, features = ["std"] } itertools = { version = "0.10" } libc = { version = "0.2", features = ["extra_traits", "use_std"] }