diff --git a/.dockerignore b/.dockerignore index 8b378b5dab..f7a6232ba1 100644 --- a/.dockerignore +++ b/.dockerignore @@ -22,6 +22,7 @@ !s3_scrubber/ !safekeeper/ !storage_broker/ +!storage_controller/ !trace/ !vendor/postgres-*/ !workspace_hack/ diff --git a/.github/actions/allure-report-generate/action.yml b/.github/actions/allure-report-generate/action.yml index 1ecb5ecc7e..f84beff20c 100644 --- a/.github/actions/allure-report-generate/action.yml +++ b/.github/actions/allure-report-generate/action.yml @@ -150,7 +150,7 @@ runs: # Use aws s3 cp (instead of aws s3 sync) to keep files from previous runs to make old URLs work, # and to keep files on the host to upload them to the database - time aws s3 cp --recursive --only-show-errors "${WORKDIR}/report" "s3://${BUCKET}/${REPORT_PREFIX}/${GITHUB_RUN_ID}" + time s5cmd --log error cp "${WORKDIR}/report/*" "s3://${BUCKET}/${REPORT_PREFIX}/${GITHUB_RUN_ID}/" # Generate redirect cat < ${WORKDIR}/index.html diff --git a/.github/actions/neon-branch-create/action.yml b/.github/actions/neon-branch-create/action.yml index f1eea34ab9..dea3fc2357 100644 --- a/.github/actions/neon-branch-create/action.yml +++ b/.github/actions/neon-branch-create/action.yml @@ -10,7 +10,7 @@ inputs: required: true api_host: desctiption: 'Neon API host' - default: console.stage.neon.tech + default: console-stage.neon.build outputs: dsn: description: 'Created Branch DSN (for main database)' diff --git a/.github/actions/neon-branch-delete/action.yml b/.github/actions/neon-branch-delete/action.yml index f8cd351dd9..8acba7ad00 100644 --- a/.github/actions/neon-branch-delete/action.yml +++ b/.github/actions/neon-branch-delete/action.yml @@ -13,7 +13,7 @@ inputs: required: true api_host: desctiption: 'Neon API host' - default: console.stage.neon.tech + default: console-stage.neon.build runs: using: "composite" diff --git a/.github/actions/neon-project-create/action.yml b/.github/actions/neon-project-create/action.yml index ae6464990e..7f0e599b97 100644 --- a/.github/actions/neon-project-create/action.yml +++ b/.github/actions/neon-project-create/action.yml @@ -13,7 +13,7 @@ inputs: default: 15 api_host: desctiption: 'Neon API host' - default: console.stage.neon.tech + default: console-stage.neon.build provisioner: desctiption: 'k8s-pod or k8s-neonvm' default: 'k8s-pod' diff --git a/.github/actions/neon-project-delete/action.yml b/.github/actions/neon-project-delete/action.yml index adc8510a34..b8ec6cac70 100644 --- a/.github/actions/neon-project-delete/action.yml +++ b/.github/actions/neon-project-delete/action.yml @@ -10,7 +10,7 @@ inputs: required: true api_host: desctiption: 'Neon API host' - default: console.stage.neon.tech + default: console-stage.neon.build runs: using: "composite" diff --git a/.github/workflows/approved-for-ci-run.yml b/.github/workflows/approved-for-ci-run.yml index 69c48d86b9..ab616d17e2 100644 --- a/.github/workflows/approved-for-ci-run.yml +++ b/.github/workflows/approved-for-ci-run.yml @@ -18,6 +18,7 @@ on: concurrency: group: ${{ github.workflow }}-${{ github.event.pull_request.number }} + cancel-in-progress: false env: GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} diff --git a/.github/workflows/benchmarking.yml b/.github/workflows/benchmarking.yml index 2e56bf909f..1eaf05cd54 100644 --- a/.github/workflows/benchmarking.yml +++ b/.github/workflows/benchmarking.yml @@ -147,15 +147,16 @@ jobs: "neonvm-captest-new" ], "db_size": [ "10gb" ], - "include": [{ "platform": "neon-captest-freetier", "db_size": "3gb" }, - { "platform": "neon-captest-new", "db_size": "50gb" }, - { "platform": "neonvm-captest-freetier", "db_size": "3gb" }, - { "platform": "neonvm-captest-new", "db_size": "50gb" }] + "include": [{ "platform": "neon-captest-freetier", "db_size": "3gb" }, + { "platform": "neon-captest-new", "db_size": "50gb" }, + { "platform": "neonvm-captest-freetier", "db_size": "3gb" }, + { "platform": "neonvm-captest-new", "db_size": "50gb" }, + { "platform": "neonvm-captest-sharding-reuse", "db_size": "50gb" }] }' if [ "$(date +%A)" = "Saturday" ]; then matrix=$(echo "$matrix" | jq '.include += [{ "platform": "rds-postgres", "db_size": "10gb"}, - { "platform": "rds-aurora", "db_size": "50gb"}]') + { "platform": "rds-aurora", "db_size": "50gb"}]') fi echo "matrix=$(echo "$matrix" | jq --compact-output '.')" >> $GITHUB_OUTPUT @@ -171,7 +172,7 @@ jobs: if [ "$(date +%A)" = "Saturday" ] || [ ${RUN_AWS_RDS_AND_AURORA} = "true" ]; then matrix=$(echo "$matrix" | jq '.include += [{ "platform": "rds-postgres" }, - { "platform": "rds-aurora" }]') + { "platform": "rds-aurora" }]') fi echo "matrix=$(echo "$matrix" | jq --compact-output '.')" >> $GITHUB_OUTPUT @@ -190,7 +191,7 @@ jobs: if [ "$(date +%A)" = "Saturday" ] || [ ${RUN_AWS_RDS_AND_AURORA} = "true" ]; then matrix=$(echo "$matrix" | jq '.include += [{ "platform": "rds-postgres", "scale": "10" }, - { "platform": "rds-aurora", "scale": "10" }]') + { "platform": "rds-aurora", "scale": "10" }]') fi echo "matrix=$(echo "$matrix" | jq --compact-output '.')" >> $GITHUB_OUTPUT @@ -253,6 +254,9 @@ jobs: neon-captest-reuse) CONNSTR=${{ secrets.BENCHMARK_CAPTEST_CONNSTR }} ;; + neonvm-captest-sharding-reuse) + CONNSTR=${{ secrets.BENCHMARK_CAPTEST_SHARDING_CONNSTR }} + ;; neon-captest-new | neon-captest-freetier | neonvm-captest-new | neonvm-captest-freetier) CONNSTR=${{ steps.create-neon-project.outputs.dsn }} ;; @@ -270,11 +274,15 @@ jobs: echo "connstr=${CONNSTR}" >> $GITHUB_OUTPUT - QUERY="SELECT version();" + QUERIES=("SELECT version()") if [[ "${PLATFORM}" = "neon"* ]]; then - QUERY="${QUERY} SHOW neon.tenant_id; SHOW neon.timeline_id;" + QUERIES+=("SHOW neon.tenant_id") + QUERIES+=("SHOW neon.timeline_id") fi - psql ${CONNSTR} -c "${QUERY}" + + for q in "${QUERIES[@]}"; do + psql ${CONNSTR} -c "${q}" + done - name: Benchmark init uses: ./.github/actions/run-python-test-set @@ -401,11 +409,15 @@ jobs: echo "connstr=${CONNSTR}" >> $GITHUB_OUTPUT - QUERY="SELECT version();" + QUERIES=("SELECT version()") if [[ "${PLATFORM}" = "neon"* ]]; then - QUERY="${QUERY} SHOW neon.tenant_id; SHOW neon.timeline_id;" + QUERIES+=("SHOW neon.tenant_id") + QUERIES+=("SHOW neon.timeline_id") fi - psql ${CONNSTR} -c "${QUERY}" + + for q in "${QUERIES[@]}"; do + psql ${CONNSTR} -c "${q}" + done - name: ClickBench benchmark uses: ./.github/actions/run-python-test-set @@ -507,11 +519,15 @@ jobs: echo "connstr=${CONNSTR}" >> $GITHUB_OUTPUT - QUERY="SELECT version();" + QUERIES=("SELECT version()") if [[ "${PLATFORM}" = "neon"* ]]; then - QUERY="${QUERY} SHOW neon.tenant_id; SHOW neon.timeline_id;" + QUERIES+=("SHOW neon.tenant_id") + QUERIES+=("SHOW neon.timeline_id") fi - psql ${CONNSTR} -c "${QUERY}" + + for q in "${QUERIES[@]}"; do + psql ${CONNSTR} -c "${q}" + done - name: Run TPC-H benchmark uses: ./.github/actions/run-python-test-set @@ -597,11 +613,15 @@ jobs: echo "connstr=${CONNSTR}" >> $GITHUB_OUTPUT - QUERY="SELECT version();" + QUERIES=("SELECT version()") if [[ "${PLATFORM}" = "neon"* ]]; then - QUERY="${QUERY} SHOW neon.tenant_id; SHOW neon.timeline_id;" + QUERIES+=("SHOW neon.tenant_id") + QUERIES+=("SHOW neon.timeline_id") fi - psql ${CONNSTR} -c "${QUERY}" + + for q in "${QUERIES[@]}"; do + psql ${CONNSTR} -c "${q}" + done - name: Run user examples uses: ./.github/actions/run-python-test-set diff --git a/.github/workflows/build-build-tools-image.yml b/.github/workflows/build-build-tools-image.yml index 251423e701..c527cef1ac 100644 --- a/.github/workflows/build-build-tools-image.yml +++ b/.github/workflows/build-build-tools-image.yml @@ -21,6 +21,7 @@ defaults: concurrency: group: build-build-tools-image-${{ inputs.image-tag }} + cancel-in-progress: false # No permission for GITHUB_TOKEN by default; the **minimal required** set of permissions should be granted in each job. permissions: {} diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index d27713f083..1d35fa9223 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -1127,15 +1127,15 @@ jobs: -f deployProxy=false \ -f deployStorage=true \ -f deployStorageBroker=true \ + -f deployStorageController=true \ -f branch=main \ -f dockerTag=${{needs.tag.outputs.build-tag}} \ -f deployPreprodRegion=true gh workflow --repo neondatabase/aws run deploy-prod.yml --ref main \ - -f deployPgSniRouter=false \ - -f deployProxy=false \ -f deployStorage=true \ -f deployStorageBroker=true \ + -f deployStorageController=true \ -f branch=main \ -f dockerTag=${{needs.tag.outputs.build-tag}} elif [[ "$GITHUB_REF_NAME" == "release-proxy" ]]; then @@ -1144,6 +1144,7 @@ jobs: -f deployProxy=true \ -f deployStorage=false \ -f deployStorageBroker=false \ + -f deployStorageController=false \ -f branch=main \ -f dockerTag=${{needs.tag.outputs.build-tag}} \ -f deployPreprodRegion=true diff --git a/.github/workflows/check-build-tools-image.yml b/.github/workflows/check-build-tools-image.yml index 28646dfc19..a1e22cf93f 100644 --- a/.github/workflows/check-build-tools-image.yml +++ b/.github/workflows/check-build-tools-image.yml @@ -28,7 +28,9 @@ jobs: - name: Get build-tools image tag for the current commit id: get-build-tools-tag env: - COMMIT_SHA: ${{ github.event.pull_request.head.sha || github.sha }} + # Usually, for COMMIT_SHA, we use `github.event.pull_request.head.sha || github.sha`, but here, even for PRs, + # we want to use `github.sha` i.e. point to a phantom merge commit to determine the image tag correctly. + COMMIT_SHA: ${{ github.sha }} GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} run: | LAST_BUILD_TOOLS_SHA=$( diff --git a/.github/workflows/pin-build-tools-image.yml b/.github/workflows/pin-build-tools-image.yml index c941692066..d495a158e8 100644 --- a/.github/workflows/pin-build-tools-image.yml +++ b/.github/workflows/pin-build-tools-image.yml @@ -20,6 +20,7 @@ defaults: concurrency: group: pin-build-tools-image-${{ inputs.from-tag }} + cancel-in-progress: false permissions: {} diff --git a/CODEOWNERS b/CODEOWNERS index 9a23e8c958..af2fa6088e 100644 --- a/CODEOWNERS +++ b/CODEOWNERS @@ -1,5 +1,5 @@ /compute_tools/ @neondatabase/control-plane @neondatabase/compute -/control_plane/attachment_service @neondatabase/storage +/storage_controller @neondatabase/storage /libs/pageserver_api/ @neondatabase/storage /libs/postgres_ffi/ @neondatabase/compute @neondatabase/safekeepers /libs/remote_storage/ @neondatabase/storage diff --git a/Cargo.lock b/Cargo.lock index c1c245fa9c..6faf4b72f0 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -271,42 +271,10 @@ dependencies = [ ] [[package]] -name = "attachment_service" -version = "0.1.0" -dependencies = [ - "anyhow", - "aws-config", - "bytes", - "camino", - "clap", - "control_plane", - "diesel", - "diesel_migrations", - "fail", - "futures", - "git-version", - "hex", - "humantime", - "hyper", - "lasso", - "measured", - "metrics", - "once_cell", - "pageserver_api", - "pageserver_client", - "postgres_connection", - "r2d2", - "reqwest", - "routerify", - "serde", - "serde_json", - "thiserror", - "tokio", - "tokio-util", - "tracing", - "utils", - "workspace_hack", -] +name = "atomic-take" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a8ab6b55fe97976e46f91ddbed8d147d966475dc29b2032757ba47e02376fbc3" [[package]] name = "autocfg" @@ -336,7 +304,7 @@ dependencies = [ "fastrand 2.0.0", "hex", "http 0.2.9", - "hyper", + "hyper 0.14.26", "ring 0.17.6", "time", "tokio", @@ -373,7 +341,7 @@ dependencies = [ "bytes", "fastrand 2.0.0", "http 0.2.9", - "http-body", + "http-body 0.4.5", "percent-encoding", "pin-project-lite", "tracing", @@ -424,7 +392,7 @@ dependencies = [ "aws-types", "bytes", "http 0.2.9", - "http-body", + "http-body 0.4.5", "once_cell", "percent-encoding", "regex-lite", @@ -552,7 +520,7 @@ dependencies = [ "crc32fast", "hex", "http 0.2.9", - "http-body", + "http-body 0.4.5", "md-5", "pin-project-lite", "sha1", @@ -584,7 +552,7 @@ dependencies = [ "bytes-utils", "futures-core", "http 0.2.9", - "http-body", + "http-body 0.4.5", "once_cell", "percent-encoding", "pin-project-lite", @@ -623,10 +591,10 @@ dependencies = [ "aws-smithy-types", "bytes", "fastrand 2.0.0", - "h2", + "h2 0.3.26", "http 0.2.9", - "http-body", - "hyper", + "http-body 0.4.5", + "hyper 0.14.26", "hyper-rustls", "once_cell", "pin-project-lite", @@ -664,7 +632,7 @@ dependencies = [ "bytes-utils", "futures-core", "http 0.2.9", - "http-body", + "http-body 0.4.5", "itoa", "num-integer", "pin-project-lite", @@ -713,8 +681,8 @@ dependencies = [ "bytes", "futures-util", "http 0.2.9", - "http-body", - "hyper", + "http-body 0.4.5", + "hyper 0.14.26", "itoa", "matchit", "memchr", @@ -729,7 +697,7 @@ dependencies = [ "sha1", "sync_wrapper", "tokio", - "tokio-tungstenite", + "tokio-tungstenite 0.20.0", "tower", "tower-layer", "tower-service", @@ -745,7 +713,7 @@ dependencies = [ "bytes", "futures-util", "http 0.2.9", - "http-body", + "http-body 0.4.5", "mime", "rustversion", "tower-layer", @@ -1162,7 +1130,7 @@ version = "4.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "191d9573962933b4027f932c600cd252ce27a8ad5979418fe78e43c07996f27b" dependencies = [ - "heck", + "heck 0.4.1", "proc-macro2", "quote", "syn 2.0.52", @@ -1234,7 +1202,7 @@ dependencies = [ "compute_api", "flate2", "futures", - "hyper", + "hyper 0.14.26", "nix 0.27.1", "notify", "num_cpus", @@ -1351,7 +1319,7 @@ dependencies = [ "git-version", "hex", "humantime", - "hyper", + "hyper 0.14.26", "nix 0.27.1", "once_cell", "pageserver_api", @@ -1500,12 +1468,9 @@ dependencies = [ [[package]] name = "crossbeam-utils" -version = "0.8.15" +version = "0.8.19" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3c063cd8cc95f5c377ed0d4b49a4b21f632396ff690e8470c29b3359b346984b" -dependencies = [ - "cfg-if", -] +checksum = "248e3bacc7dc6baa3b21e405ee045c3047101a49145e7e9eca583ab4c2ca5345" [[package]] name = "crossterm" @@ -1878,23 +1843,12 @@ checksum = "5443807d6dff69373d433ab9ef5378ad8df50ca6298caf15de6e52e24aaf54d5" [[package]] name = "errno" -version = "0.3.1" +version = "0.3.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4bcfec3a70f97c962c307b2d2c56e358cf1d00b558d74262b5f929ee8cc7e73a" +checksum = "a258e46cdc063eb8519c00b9fc845fc47bcfca4130e2f08e88665ceda8474245" dependencies = [ - "errno-dragonfly", - "libc", - "windows-sys 0.48.0", -] - -[[package]] -name = "errno-dragonfly" -version = "0.1.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "aa68f1b12764fab894d2755d2518754e71b4fd80ecfb822714a1206c2aab39bf" -dependencies = [ - "cc", "libc", + "windows-sys 0.52.0", ] [[package]] @@ -2234,9 +2188,9 @@ dependencies = [ [[package]] name = "h2" -version = "0.3.24" +version = "0.3.26" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bb2c4422095b67ee78da96fbb51a4cc413b3b25883c7717ff7ca1ab31022c9c9" +checksum = "81fe527a889e1532da5c525686d96d4c2e74cdd345badf8dfef9f6b39dd5f5e8" dependencies = [ "bytes", "fnv", @@ -2251,6 +2205,25 @@ dependencies = [ "tracing", ] +[[package]] +name = "h2" +version = "0.4.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "816ec7294445779408f36fe57bc5b7fc1cf59664059096c65f905c1c61f58069" +dependencies = [ + "bytes", + "fnv", + "futures-core", + "futures-sink", + "futures-util", + "http 1.1.0", + "indexmap 2.0.1", + "slab", + "tokio", + "tokio-util", + "tracing", +] + [[package]] name = "half" version = "1.8.2" @@ -2332,6 +2305,12 @@ version = "0.4.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "95505c38b4572b2d910cecb0281560f54b440a19336cbbcb27bf6ce6adc6f5a8" +[[package]] +name = "heck" +version = "0.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea" + [[package]] name = "hermit-abi" version = "0.3.3" @@ -2416,6 +2395,29 @@ dependencies = [ "pin-project-lite", ] +[[package]] +name = "http-body" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1cac85db508abc24a2e48553ba12a996e87244a0395ce011e62b37158745d643" +dependencies = [ + "bytes", + "http 1.1.0", +] + +[[package]] +name = "http-body-util" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "41cb79eb393015dadd30fc252023adb0b2400a0caee0fa2a077e6e21a551e840" +dependencies = [ + "bytes", + "futures-util", + "http 1.1.0", + "http-body 1.0.0", + "pin-project-lite", +] + [[package]] name = "http-types" version = "2.12.0" @@ -2474,9 +2476,9 @@ dependencies = [ "futures-channel", "futures-core", "futures-util", - "h2", + "h2 0.3.26", "http 0.2.9", - "http-body", + "http-body 0.4.5", "httparse", "httpdate", "itoa", @@ -2488,6 +2490,26 @@ dependencies = [ "want", ] +[[package]] +name = "hyper" +version = "1.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "186548d73ac615b32a73aafe38fb4f56c0d340e110e5a200bcadbaf2e199263a" +dependencies = [ + "bytes", + "futures-channel", + "futures-util", + "h2 0.4.4", + "http 1.1.0", + "http-body 1.0.0", + "httparse", + "httpdate", + "itoa", + "pin-project-lite", + "smallvec", + "tokio", +] + [[package]] name = "hyper-rustls" version = "0.24.0" @@ -2495,7 +2517,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0646026eb1b3eea4cd9ba47912ea5ce9cc07713d105b1a14698f4e6433d348b7" dependencies = [ "http 0.2.9", - "hyper", + "hyper 0.14.26", "log", "rustls 0.21.9", "rustls-native-certs 0.6.2", @@ -2509,7 +2531,7 @@ version = "0.4.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "bbb958482e8c7be4bc3cf272a766a2b0bf1a6755e7a6ae777f017a31d11b13b1" dependencies = [ - "hyper", + "hyper 0.14.26", "pin-project-lite", "tokio", "tokio-io-timeout", @@ -2522,7 +2544,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d6183ddfa99b85da61a140bea0efc93fdf56ceaa041b37d553518030827f9905" dependencies = [ "bytes", - "hyper", + "hyper 0.14.26", "native-tls", "tokio", "tokio-native-tls", @@ -2530,15 +2552,33 @@ dependencies = [ [[package]] name = "hyper-tungstenite" -version = "0.11.1" +version = "0.13.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7cc7dcb1ab67cd336f468a12491765672e61a3b6b148634dbfe2fe8acd3fe7d9" +checksum = "7a343d17fe7885302ed7252767dc7bb83609a874b6ff581142241ec4b73957ad" dependencies = [ - "hyper", + "http-body-util", + "hyper 1.2.0", + "hyper-util", "pin-project-lite", "tokio", - "tokio-tungstenite", - "tungstenite", + "tokio-tungstenite 0.21.0", + "tungstenite 0.21.0", +] + +[[package]] +name = "hyper-util" +version = "0.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ca38ef113da30126bbff9cd1705f9273e15d45498615d138b0c20279ac7a76aa" +dependencies = [ + "bytes", + "futures-util", + "http 1.1.0", + "http-body 1.0.0", + "hyper 1.2.0", + "pin-project-lite", + "socket2 0.5.5", + "tokio", ] [[package]] @@ -2832,6 +2872,12 @@ version = "0.3.8" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ef53942eb7bf7ff43a617b3e2c1c4a5ecf5944a7c1bc12d7ee39bbb15e5c1519" +[[package]] +name = "linux-raw-sys" +version = "0.4.13" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "01cda141df6706de531b6c46c3a33ecca755538219bd484262fa09410c13539c" + [[package]] name = "lock_api" version = "0.4.10" @@ -2886,11 +2932,12 @@ checksum = "490cc448043f947bae3cbee9c203358d62dbee0db12107a74be5c30ccfd09771" [[package]] name = "measured" -version = "0.0.13" +version = "0.0.21" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f246648d027839a34b420e27c7de1165ace96e19ef894985d0a6ff89a7840a9f" +checksum = "652bc741286361c06de8cb4d89b21a6437f120c508c51713663589eeb9928ac5" dependencies = [ "bytes", + "crossbeam-utils", "hashbrown 0.14.0", "itoa", "lasso", @@ -2903,16 +2950,27 @@ dependencies = [ [[package]] name = "measured-derive" -version = "0.0.13" +version = "0.0.21" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "edaa5cc22d99d5d6d7d99c3b5b5f7e7f8034c22f1b5d62a1adecd2ed005d9b80" +checksum = "6ea497f33e1e856a376c32ad916f69a0bd3c597db1f912a399f842b01a4a685d" dependencies = [ - "heck", + "heck 0.5.0", "proc-macro2", "quote", "syn 2.0.52", ] +[[package]] +name = "measured-process" +version = "0.0.21" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b364ccb66937a814b6b2ad751d1a2f7a9d5a78c761144036825fb36bb0771000" +dependencies = [ + "libc", + "measured", + "procfs 0.16.0", +] + [[package]] name = "memchr" version = "2.6.4" @@ -2952,8 +3010,10 @@ version = "0.1.0" dependencies = [ "chrono", "libc", + "measured", + "measured-process", "once_cell", - "procfs", + "procfs 0.14.2", "prometheus", "rand 0.8.5", "rand_distr", @@ -3435,9 +3495,9 @@ dependencies = [ [[package]] name = "ordered-multimap" -version = "0.7.1" +version = "0.7.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a4d6a8c22fc714f0c2373e6091bf6f5e9b37b1bc0b1184874b7e0a4e303d318f" +checksum = "49203cdcae0030493bad186b28da2fa25645fa276a51b6fec8010d281e02ef79" dependencies = [ "dlv-list", "hashbrown 0.14.0", @@ -3503,12 +3563,17 @@ dependencies = [ "camino", "clap", "git-version", + "humantime", "pageserver", + "pageserver_api", "postgres_ffi", + "remote_storage", "serde", "serde_json", "svg_fmt", "tokio", + "tokio-util", + "toml_edit", "utils", "workspace_hack", ] @@ -3544,7 +3609,7 @@ dependencies = [ "hex-literal", "humantime", "humantime-serde", - "hyper", + "hyper 0.14.26", "itertools", "leaky-bucket", "md5", @@ -3563,7 +3628,7 @@ dependencies = [ "postgres_connection", "postgres_ffi", "pq_proto", - "procfs", + "procfs 0.14.2", "rand 0.8.5", "regex", "remote_storage", @@ -3654,7 +3719,6 @@ dependencies = [ "anyhow", "async-compression", "async-stream", - "async-trait", "byteorder", "bytes", "chrono", @@ -4124,6 +4188,29 @@ dependencies = [ "rustix 0.36.16", ] +[[package]] +name = "procfs" +version = "0.16.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "731e0d9356b0c25f16f33b5be79b1c57b562f141ebfcdb0ad8ac2c13a24293b4" +dependencies = [ + "bitflags 2.4.1", + "hex", + "lazy_static", + "procfs-core", + "rustix 0.38.28", +] + +[[package]] +name = "procfs-core" +version = "0.16.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2d3554923a69f4ce04c4a754260c338f505ce22642d3830e049a399fc2059a29" +dependencies = [ + "bitflags 2.4.1", + "hex", +] + [[package]] name = "prometheus" version = "0.13.3" @@ -4136,7 +4223,7 @@ dependencies = [ "libc", "memchr", "parking_lot 0.12.1", - "procfs", + "procfs 0.14.2", "thiserror", ] @@ -4157,7 +4244,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "119533552c9a7ffacc21e099c24a0ac8bb19c2a2a3f363de84cd9b844feab270" dependencies = [ "bytes", - "heck", + "heck 0.4.1", "itertools", "lazy_static", "log", @@ -4199,7 +4286,9 @@ name = "proxy" version = "0.1.0" dependencies = [ "anyhow", + "async-compression", "async-trait", + "atomic-take", "aws-config", "aws-sdk-iam", "aws-sigv4", @@ -4223,13 +4312,17 @@ dependencies = [ "hmac", "hostname", "http 1.1.0", + "http-body-util", "humantime", - "hyper", + "hyper 0.14.26", + "hyper 1.2.0", "hyper-tungstenite", + "hyper-util", "ipnet", "itertools", "lasso", "md5", + "measured", "metrics", "native-tls", "once_cell", @@ -4558,7 +4651,7 @@ dependencies = [ "futures-util", "http-types", "humantime", - "hyper", + "hyper 0.14.26", "itertools", "metrics", "once_cell", @@ -4588,10 +4681,10 @@ dependencies = [ "encoding_rs", "futures-core", "futures-util", - "h2", + "h2 0.3.26", "http 0.2.9", - "http-body", - "hyper", + "http-body 0.4.5", + "hyper 0.14.26", "hyper-rustls", "hyper-tls", "ipnet", @@ -4649,7 +4742,7 @@ dependencies = [ "futures", "getrandom 0.2.11", "http 0.2.9", - "hyper", + "hyper 0.14.26", "parking_lot 0.11.2", "reqwest", "reqwest-middleware", @@ -4736,7 +4829,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "496c1d3718081c45ba9c31fbfc07417900aa96f4070ff90dc29961836b7a9945" dependencies = [ "http 0.2.9", - "hyper", + "hyper 0.14.26", "lazy_static", "percent-encoding", "regex", @@ -4848,6 +4941,19 @@ dependencies = [ "windows-sys 0.48.0", ] +[[package]] +name = "rustix" +version = "0.38.28" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "72e572a5e8ca657d7366229cdde4bd14c4eb5499a9573d4d366fe1b599daa316" +dependencies = [ + "bitflags 2.4.1", + "errno", + "libc", + "linux-raw-sys 0.4.13", + "windows-sys 0.52.0", +] + [[package]] name = "rustls" version = "0.21.9" @@ -5028,7 +5134,7 @@ dependencies = [ "git-version", "hex", "humantime", - "hyper", + "hyper 0.14.26", "metrics", "once_cell", "parking_lot 0.12.1", @@ -5513,9 +5619,9 @@ dependencies = [ [[package]] name = "smallvec" -version = "1.11.0" +version = "1.13.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "62bb4feee49fdd9f707ef802e22365a35de4b7b299de4763d44bfea899442ff9" +checksum = "e6ecd384b10a64542d77071bd64bd7b231f4ed5940fba55e98c3de13824cf3d7" [[package]] name = "smol_str" @@ -5607,7 +5713,7 @@ dependencies = [ "futures-util", "git-version", "humantime", - "hyper", + "hyper 0.14.26", "metrics", "once_cell", "parking_lot 0.12.1", @@ -5621,6 +5727,65 @@ dependencies = [ "workspace_hack", ] +[[package]] +name = "storage_controller" +version = "0.1.0" +dependencies = [ + "anyhow", + "aws-config", + "bytes", + "camino", + "clap", + "control_plane", + "diesel", + "diesel_migrations", + "fail", + "futures", + "git-version", + "hex", + "humantime", + "hyper 0.14.26", + "itertools", + "lasso", + "measured", + "metrics", + "once_cell", + "pageserver_api", + "pageserver_client", + "postgres_connection", + "r2d2", + "reqwest", + "routerify", + "serde", + "serde_json", + "thiserror", + "tokio", + "tokio-util", + "tracing", + "utils", + "workspace_hack", +] + +[[package]] +name = "storcon_cli" +version = "0.1.0" +dependencies = [ + "anyhow", + "clap", + "comfy-table", + "hyper 0.14.26", + "pageserver_api", + "pageserver_client", + "reqwest", + "serde", + "serde_json", + "thiserror", + "tokio", + "tracing", + "utils", + "workspace_hack", +] + [[package]] name = "stringprep" version = "0.1.2" @@ -5649,7 +5814,7 @@ version = "0.24.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1e385be0d24f186b4ce2f9982191e7101bb737312ad61c1f2f984f34bcf85d59" dependencies = [ - "heck", + "heck 0.4.1", "proc-macro2", "quote", "rustversion", @@ -5777,23 +5942,23 @@ dependencies = [ [[package]] name = "test-context" -version = "0.1.4" +version = "0.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "055831a02a4f5aa28fede67f2902014273eb8c21b958ac5ebbd59b71ef30dbc3" +checksum = "6676ab8513edfd2601a108621103fdb45cac9098305ca25ec93f7023b06b05d9" dependencies = [ - "async-trait", "futures", "test-context-macros", ] [[package]] name = "test-context-macros" -version = "0.1.4" +version = "0.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8901a55b0a7a06ebc4a674dcca925170da8e613fa3b163a1df804ed10afb154d" +checksum = "78ea17a2dc368aeca6f554343ced1b1e31f76d63683fa8016e5844bd7a5144a1" dependencies = [ + "proc-macro2", "quote", - "syn 1.0.109", + "syn 2.0.52", ] [[package]] @@ -5934,9 +6099,9 @@ checksum = "1f3ccbac311fea05f86f61904b462b55fb3df8837a366dfc601a0161d0532f20" [[package]] name = "tokio" -version = "1.36.0" +version = "1.37.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "61285f6515fa018fb2d1e46eb21223fff441ee8db5d0f1435e8ab4f5cdb80931" +checksum = "1adbebffeca75fcfd058afa480fb6c0b81e165a0323f9c9d39c9697e37c46787" dependencies = [ "backtrace", "bytes", @@ -6091,7 +6256,19 @@ dependencies = [ "futures-util", "log", "tokio", - "tungstenite", + "tungstenite 0.20.1", +] + +[[package]] +name = "tokio-tungstenite" +version = "0.21.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c83b561d025642014097b66e6c1bb422783339e0909e4429cde4749d1990bc38" +dependencies = [ + "futures-util", + "log", + "tokio", + "tungstenite 0.21.0", ] [[package]] @@ -6158,10 +6335,10 @@ dependencies = [ "bytes", "futures-core", "futures-util", - "h2", + "h2 0.3.26", "http 0.2.9", - "http-body", - "hyper", + "http-body 0.4.5", + "hyper 0.14.26", "hyper-timeout", "percent-encoding", "pin-project", @@ -6347,7 +6524,7 @@ dependencies = [ name = "tracing-utils" version = "0.1.0" dependencies = [ - "hyper", + "hyper 0.14.26", "opentelemetry", "opentelemetry-otlp", "opentelemetry-semantic-conventions", @@ -6384,6 +6561,25 @@ dependencies = [ "utf-8", ] +[[package]] +name = "tungstenite" +version = "0.21.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9ef1a641ea34f399a848dea702823bbecfb4c486f911735368f1f137cb8257e1" +dependencies = [ + "byteorder", + "bytes", + "data-encoding", + "http 1.1.0", + "httparse", + "log", + "rand 0.8.5", + "sha1", + "thiserror", + "url", + "utf-8", +] + [[package]] name = "twox-hash" version = "1.6.3" @@ -6548,7 +6744,8 @@ dependencies = [ "heapless", "hex", "hex-literal", - "hyper", + "humantime", + "hyper 0.14.26", "jsonwebtoken", "leaky-bucket", "metrics", @@ -6908,6 +7105,15 @@ dependencies = [ "windows-targets 0.48.0", ] +[[package]] +name = "windows-sys" +version = "0.52.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "282be5f36a8ce781fad8c8ae18fa3f9beff57ec1b52cb3de0789201425d9a33d" +dependencies = [ + "windows-targets 0.52.4", +] + [[package]] name = "windows-targets" version = "0.42.2" @@ -6938,6 +7144,21 @@ dependencies = [ "windows_x86_64_msvc 0.48.0", ] +[[package]] +name = "windows-targets" +version = "0.52.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7dd37b7e5ab9018759f893a1952c9420d060016fc19a472b4bb20d1bdd694d1b" +dependencies = [ + "windows_aarch64_gnullvm 0.52.4", + "windows_aarch64_msvc 0.52.4", + "windows_i686_gnu 0.52.4", + "windows_i686_msvc 0.52.4", + "windows_x86_64_gnu 0.52.4", + "windows_x86_64_gnullvm 0.52.4", + "windows_x86_64_msvc 0.52.4", +] + [[package]] name = "windows_aarch64_gnullvm" version = "0.42.2" @@ -6950,6 +7171,12 @@ version = "0.48.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "91ae572e1b79dba883e0d315474df7305d12f569b400fcf90581b06062f7e1bc" +[[package]] +name = "windows_aarch64_gnullvm" +version = "0.52.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bcf46cf4c365c6f2d1cc93ce535f2c8b244591df96ceee75d8e83deb70a9cac9" + [[package]] name = "windows_aarch64_msvc" version = "0.42.2" @@ -6962,6 +7189,12 @@ version = "0.48.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b2ef27e0d7bdfcfc7b868b317c1d32c641a6fe4629c171b8928c7b08d98d7cf3" +[[package]] +name = "windows_aarch64_msvc" +version = "0.52.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "da9f259dd3bcf6990b55bffd094c4f7235817ba4ceebde8e6d11cd0c5633b675" + [[package]] name = "windows_i686_gnu" version = "0.42.2" @@ -6974,6 +7207,12 @@ version = "0.48.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "622a1962a7db830d6fd0a69683c80a18fda201879f0f447f065a3b7467daa241" +[[package]] +name = "windows_i686_gnu" +version = "0.52.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b474d8268f99e0995f25b9f095bc7434632601028cf86590aea5c8a5cb7801d3" + [[package]] name = "windows_i686_msvc" version = "0.42.2" @@ -6986,6 +7225,12 @@ version = "0.48.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4542c6e364ce21bf45d69fdd2a8e455fa38d316158cfd43b3ac1c5b1b19f8e00" +[[package]] +name = "windows_i686_msvc" +version = "0.52.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1515e9a29e5bed743cb4415a9ecf5dfca648ce85ee42e15873c3cd8610ff8e02" + [[package]] name = "windows_x86_64_gnu" version = "0.42.2" @@ -6998,6 +7243,12 @@ version = "0.48.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ca2b8a661f7628cbd23440e50b05d705db3686f894fc9580820623656af974b1" +[[package]] +name = "windows_x86_64_gnu" +version = "0.52.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5eee091590e89cc02ad514ffe3ead9eb6b660aedca2183455434b93546371a03" + [[package]] name = "windows_x86_64_gnullvm" version = "0.42.2" @@ -7010,6 +7261,12 @@ version = "0.48.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7896dbc1f41e08872e9d5e8f8baa8fdd2677f29468c4e156210174edc7f7b953" +[[package]] +name = "windows_x86_64_gnullvm" +version = "0.52.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "77ca79f2451b49fa9e2af39f0747fe999fcda4f5e241b2898624dca97a1f2177" + [[package]] name = "windows_x86_64_msvc" version = "0.42.2" @@ -7022,6 +7279,12 @@ version = "0.48.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1a515f5799fe4961cb532f983ce2b23082366b898e52ffbce459c86f67c8378a" +[[package]] +name = "windows_x86_64_msvc" +version = "0.52.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "32b752e52a2da0ddfbdbcc6fceadfeede4c939ed16d13e648833a61dfb611ed8" + [[package]] name = "winnow" version = "0.4.6" @@ -7070,11 +7333,10 @@ dependencies = [ "futures-sink", "futures-util", "getrandom 0.2.11", - "hashbrown 0.13.2", "hashbrown 0.14.0", "hex", "hmac", - "hyper", + "hyper 0.14.26", "indexmap 1.9.3", "itertools", "libc", @@ -7112,7 +7374,6 @@ dependencies = [ "tower", "tracing", "tracing-core", - "tungstenite", "url", "uuid", "zeroize", diff --git a/Cargo.toml b/Cargo.toml index 309ebbe119..8310d2d522 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -3,7 +3,7 @@ resolver = "2" members = [ "compute_tools", "control_plane", - "control_plane/attachment_service", + "control_plane/storcon_cli", "pageserver", "pageserver/compaction", "pageserver/ctl", @@ -12,6 +12,7 @@ members = [ "proxy", "safekeeper", "storage_broker", + "storage_controller", "s3_scrubber", "workspace_hack", "trace", @@ -43,6 +44,7 @@ license = "Apache-2.0" anyhow = { version = "1.0", features = ["backtrace"] } arc-swap = "1.6" async-compression = { version = "0.4.0", features = ["tokio", "gzip", "zstd"] } +atomic-take = "1.1.0" azure_core = "0.18" azure_identity = "0.18" azure_storage = "0.18" @@ -96,7 +98,7 @@ http-types = { version = "2", default-features = false } humantime = "2.1" humantime-serde = "1.1.1" hyper = "0.14" -hyper-tungstenite = "0.11" +hyper-tungstenite = "0.13.0" inotify = "0.10.2" ipnet = "2.9.0" itertools = "0.10" @@ -105,7 +107,8 @@ lasso = "0.7" leaky-bucket = "1.0.1" libc = "0.2" md5 = "0.7.0" -measured = { version = "0.0.13", features=["default", "lasso"] } +measured = { version = "0.0.21", features=["lasso"] } +measured-process = { version = "0.0.21" } memoffset = "0.8" native-tls = "0.2" nix = { version = "0.27", features = ["fs", "process", "socket", "signal", "poll"] } @@ -158,7 +161,7 @@ svg_fmt = "0.4.1" sync_wrapper = "0.1.2" tar = "0.4" task-local-extensions = "0.1.4" -test-context = "0.1" +test-context = "0.3" thiserror = "1.0" tikv-jemallocator = "0.5" tikv-jemalloc-ctl = "0.5" diff --git a/Dockerfile.build-tools b/Dockerfile.build-tools index 1ed6f87473..a082f15c34 100644 --- a/Dockerfile.build-tools +++ b/Dockerfile.build-tools @@ -58,6 +58,12 @@ RUN curl -fsSL "https://github.com/protocolbuffers/protobuf/releases/download/v$ && mv protoc/include/google /usr/local/include/google \ && rm -rf protoc.zip protoc +# s5cmd +ENV S5CMD_VERSION=2.2.2 +RUN curl -sL "https://github.com/peak/s5cmd/releases/download/v${S5CMD_VERSION}/s5cmd_${S5CMD_VERSION}_Linux-$(uname -m | sed 's/x86_64/64bit/g' | sed 's/aarch64/arm64/g').tar.gz" | tar zxvf - s5cmd \ + && chmod +x s5cmd \ + && mv s5cmd /usr/local/bin/s5cmd + # LLVM ENV LLVM_VERSION=17 RUN curl -fsSL 'https://apt.llvm.org/llvm-snapshot.gpg.key' | apt-key add - \ diff --git a/Dockerfile.compute-node b/Dockerfile.compute-node index c73b9ce5c9..bd4534ce1d 100644 --- a/Dockerfile.compute-node +++ b/Dockerfile.compute-node @@ -944,6 +944,9 @@ RUN mkdir /var/db && useradd -m -d /var/db/postgres postgres && \ COPY --from=postgres-cleanup-layer --chown=postgres /usr/local/pgsql /usr/local COPY --from=compute-tools --chown=postgres /home/nonroot/target/release-line-debug-size-lto/compute_ctl /usr/local/bin/compute_ctl +# Create remote extension download directory +RUN mkdir /usr/local/download_extensions && chown -R postgres:postgres /usr/local/download_extensions + # Install: # libreadline8 for psql # libicu67, locales for collations (including ICU and plpgsql_check) diff --git a/compute_tools/src/compute.rs b/compute_tools/src/compute.rs index 0fa315682d..40060f4117 100644 --- a/compute_tools/src/compute.rs +++ b/compute_tools/src/compute.rs @@ -818,9 +818,15 @@ impl ComputeNode { Client::connect(zenith_admin_connstr.as_str(), NoTls) .context("broken cloud_admin credential: tried connecting with cloud_admin but could not authenticate, and zenith_admin does not work either")?; // Disable forwarding so that users don't get a cloud_admin role - client.simple_query("SET neon.forward_ddl = false")?; - client.simple_query("CREATE USER cloud_admin WITH SUPERUSER")?; - client.simple_query("GRANT zenith_admin TO cloud_admin")?; + + let mut func = || { + client.simple_query("SET neon.forward_ddl = false")?; + client.simple_query("CREATE USER cloud_admin WITH SUPERUSER")?; + client.simple_query("GRANT zenith_admin TO cloud_admin")?; + Ok::<_, anyhow::Error>(()) + }; + func().context("apply_config setup cloud_admin")?; + drop(client); // reconnect with connstring with expected name @@ -832,24 +838,29 @@ impl ComputeNode { }; // Disable DDL forwarding because control plane already knows about these roles/databases. - client.simple_query("SET neon.forward_ddl = false")?; + client + .simple_query("SET neon.forward_ddl = false") + .context("apply_config SET neon.forward_ddl = false")?; // Proceed with post-startup configuration. Note, that order of operations is important. let spec = &compute_state.pspec.as_ref().expect("spec must be set").spec; - create_neon_superuser(spec, &mut client)?; - cleanup_instance(&mut client)?; - handle_roles(spec, &mut client)?; - handle_databases(spec, &mut client)?; - handle_role_deletions(spec, connstr.as_str(), &mut client)?; + create_neon_superuser(spec, &mut client).context("apply_config create_neon_superuser")?; + cleanup_instance(&mut client).context("apply_config cleanup_instance")?; + handle_roles(spec, &mut client).context("apply_config handle_roles")?; + handle_databases(spec, &mut client).context("apply_config handle_databases")?; + handle_role_deletions(spec, connstr.as_str(), &mut client) + .context("apply_config handle_role_deletions")?; handle_grants( spec, &mut client, connstr.as_str(), self.has_feature(ComputeFeature::AnonExtension), - )?; - handle_extensions(spec, &mut client)?; - handle_extension_neon(&mut client)?; - create_availability_check_data(&mut client)?; + ) + .context("apply_config handle_grants")?; + handle_extensions(spec, &mut client).context("apply_config handle_extensions")?; + handle_extension_neon(&mut client).context("apply_config handle_extension_neon")?; + create_availability_check_data(&mut client) + .context("apply_config create_availability_check_data")?; // 'Close' connection drop(client); @@ -857,7 +868,7 @@ impl ComputeNode { // Run migrations separately to not hold up cold starts thread::spawn(move || { let mut client = Client::connect(connstr.as_str(), NoTls)?; - handle_migrations(&mut client) + handle_migrations(&mut client).context("apply_config handle_migrations") }); Ok(()) } @@ -1262,10 +1273,12 @@ LIMIT 100", .await .map_err(DownloadError::Other); - self.ext_download_progress - .write() - .expect("bad lock") - .insert(ext_archive_name.to_string(), (download_start, true)); + if download_size.is_ok() { + self.ext_download_progress + .write() + .expect("bad lock") + .insert(ext_archive_name.to_string(), (download_start, true)); + } download_size } diff --git a/compute_tools/src/config.rs b/compute_tools/src/config.rs index f1fd8637f5..89c866b20c 100644 --- a/compute_tools/src/config.rs +++ b/compute_tools/src/config.rs @@ -6,8 +6,8 @@ use std::path::Path; use anyhow::Result; use crate::pg_helpers::escape_conf_value; -use crate::pg_helpers::PgOptionsSerialize; -use compute_api::spec::{ComputeMode, ComputeSpec}; +use crate::pg_helpers::{GenericOptionExt, PgOptionsSerialize}; +use compute_api::spec::{ComputeMode, ComputeSpec, GenericOption}; /// Check that `line` is inside a text file and put it there if it is not. /// Create file if it doesn't exist. @@ -92,6 +92,27 @@ pub fn write_postgres_conf( } } + if cfg!(target_os = "linux") { + // Check /proc/sys/vm/overcommit_memory -- if it equals 2 (i.e. linux memory overcommit is + // disabled), then the control plane has enabled swap and we should set + // dynamic_shared_memory_type = 'mmap'. + // + // This is (maybe?) temporary - for more, see https://github.com/neondatabase/cloud/issues/12047. + let overcommit_memory_contents = std::fs::read_to_string("/proc/sys/vm/overcommit_memory") + // ignore any errors - they may be expected to occur under certain situations (e.g. when + // not running in Linux). + .unwrap_or_else(|_| String::new()); + if overcommit_memory_contents.trim() == "2" { + let opt = GenericOption { + name: "dynamic_shared_memory_type".to_owned(), + value: Some("mmap".to_owned()), + vartype: "enum".to_owned(), + }; + + write!(file, "{}", opt.to_pg_setting())?; + } + } + // If there are any extra options in the 'settings' field, append those if spec.cluster.settings.is_some() { writeln!(file, "# Managed by compute_ctl: begin")?; diff --git a/compute_tools/src/pg_helpers.rs b/compute_tools/src/pg_helpers.rs index 5deb50d6b7..fa0822748b 100644 --- a/compute_tools/src/pg_helpers.rs +++ b/compute_tools/src/pg_helpers.rs @@ -44,7 +44,7 @@ pub fn escape_conf_value(s: &str) -> String { format!("'{}'", res) } -trait GenericOptionExt { +pub trait GenericOptionExt { fn to_pg_option(&self) -> String; fn to_pg_setting(&self) -> String; } diff --git a/compute_tools/src/spec.rs b/compute_tools/src/spec.rs index 4006062fc2..269177ee16 100644 --- a/compute_tools/src/spec.rs +++ b/compute_tools/src/spec.rs @@ -2,7 +2,7 @@ use std::fs::File; use std::path::Path; use std::str::FromStr; -use anyhow::{anyhow, bail, Result}; +use anyhow::{anyhow, bail, Context, Result}; use postgres::config::Config; use postgres::{Client, NoTls}; use reqwest::StatusCode; @@ -698,7 +698,8 @@ pub fn handle_grants( // it is important to run this after all grants if enable_anon_extension { - handle_extension_anon(spec, &db.owner, &mut db_client, false)?; + handle_extension_anon(spec, &db.owner, &mut db_client, false) + .context("handle_grants handle_extension_anon")?; } } @@ -743,21 +744,24 @@ pub fn handle_extension_neon(client: &mut Client) -> Result<()> { // which may happen in two cases: // - extension was just installed // - extension was already installed and is up to date - // DISABLED due to compute node unpinning epic - // let query = "ALTER EXTENSION neon UPDATE"; - // info!("update neon extension version with query: {}", query); - // client.simple_query(query)?; + let query = "ALTER EXTENSION neon UPDATE"; + info!("update neon extension version with query: {}", query); + if let Err(e) = client.simple_query(query) { + error!( + "failed to upgrade neon extension during `handle_extension_neon`: {}", + e + ); + } Ok(()) } #[instrument(skip_all)] -pub fn handle_neon_extension_upgrade(_client: &mut Client) -> Result<()> { - info!("handle neon extension upgrade (not really)"); - // DISABLED due to compute node unpinning epic - // let query = "ALTER EXTENSION neon UPDATE"; - // info!("update neon extension version with query: {}", query); - // client.simple_query(query)?; +pub fn handle_neon_extension_upgrade(client: &mut Client) -> Result<()> { + info!("handle neon extension upgrade"); + let query = "ALTER EXTENSION neon UPDATE"; + info!("update neon extension version with query: {}", query); + client.simple_query(query)?; Ok(()) } @@ -810,28 +814,36 @@ $$;"#, // Add new migrations below. ]; - let mut query = "CREATE SCHEMA IF NOT EXISTS neon_migration"; - client.simple_query(query)?; + let mut func = || { + let query = "CREATE SCHEMA IF NOT EXISTS neon_migration"; + client.simple_query(query)?; - query = "CREATE TABLE IF NOT EXISTS neon_migration.migration_id (key INT NOT NULL PRIMARY KEY, id bigint NOT NULL DEFAULT 0)"; - client.simple_query(query)?; + let query = "CREATE TABLE IF NOT EXISTS neon_migration.migration_id (key INT NOT NULL PRIMARY KEY, id bigint NOT NULL DEFAULT 0)"; + client.simple_query(query)?; - query = "INSERT INTO neon_migration.migration_id VALUES (0, 0) ON CONFLICT DO NOTHING"; - client.simple_query(query)?; + let query = "INSERT INTO neon_migration.migration_id VALUES (0, 0) ON CONFLICT DO NOTHING"; + client.simple_query(query)?; - query = "ALTER SCHEMA neon_migration OWNER TO cloud_admin"; - client.simple_query(query)?; + let query = "ALTER SCHEMA neon_migration OWNER TO cloud_admin"; + client.simple_query(query)?; - query = "REVOKE ALL ON SCHEMA neon_migration FROM PUBLIC"; - client.simple_query(query)?; + let query = "REVOKE ALL ON SCHEMA neon_migration FROM PUBLIC"; + client.simple_query(query)?; + Ok::<_, anyhow::Error>(()) + }; + func().context("handle_migrations prepare")?; - query = "SELECT id FROM neon_migration.migration_id"; - let row = client.query_one(query, &[])?; + let query = "SELECT id FROM neon_migration.migration_id"; + let row = client + .query_one(query, &[]) + .context("handle_migrations get migration_id")?; let mut current_migration: usize = row.get::<&str, i64>("id") as usize; let starting_migration_id = current_migration; - query = "BEGIN"; - client.simple_query(query)?; + let query = "BEGIN"; + client + .simple_query(query) + .context("handle_migrations begin")?; while current_migration < migrations.len() { let migration = &migrations[current_migration]; @@ -839,7 +851,9 @@ $$;"#, info!("Skip migration id={}", current_migration); } else { info!("Running migration:\n{}\n", migration); - client.simple_query(migration)?; + client.simple_query(migration).with_context(|| { + format!("handle_migrations current_migration={}", current_migration) + })?; } current_migration += 1; } @@ -847,10 +861,14 @@ $$;"#, "UPDATE neon_migration.migration_id SET id={}", migrations.len() ); - client.simple_query(&setval)?; + client + .simple_query(&setval) + .context("handle_migrations update id")?; - query = "COMMIT"; - client.simple_query(query)?; + let query = "COMMIT"; + client + .simple_query(query) + .context("handle_migrations commit")?; info!( "Ran {} migrations", diff --git a/control_plane/src/background_process.rs b/control_plane/src/background_process.rs index 2fced7d778..94666f2870 100644 --- a/control_plane/src/background_process.rs +++ b/control_plane/src/background_process.rs @@ -86,7 +86,10 @@ where .stdout(process_log_file) .stderr(same_file_for_stderr) .args(args); - let filled_cmd = fill_remote_storage_secrets_vars(fill_rust_env_vars(background_command)); + + let filled_cmd = fill_env_vars_prefixed_neon(fill_remote_storage_secrets_vars( + fill_rust_env_vars(background_command), + )); filled_cmd.envs(envs); let pid_file_to_check = match &initial_pid_file { @@ -268,6 +271,15 @@ fn fill_remote_storage_secrets_vars(mut cmd: &mut Command) -> &mut Command { cmd } +fn fill_env_vars_prefixed_neon(mut cmd: &mut Command) -> &mut Command { + for (var, val) in std::env::vars() { + if var.starts_with("NEON_PAGESERVER_") { + cmd = cmd.env(var, val); + } + } + cmd +} + /// Add a `pre_exec` to the cmd that, inbetween fork() and exec(), /// 1. Claims a pidfile with a fcntl lock on it and /// 2. Sets up the pidfile's file descriptor so that it (and the lock) diff --git a/control_plane/src/bin/neon_local.rs b/control_plane/src/bin/neon_local.rs index 401feae706..7f8f6d21e0 100644 --- a/control_plane/src/bin/neon_local.rs +++ b/control_plane/src/bin/neon_local.rs @@ -14,9 +14,7 @@ use control_plane::pageserver::{PageServerNode, PAGESERVER_REMOTE_STORAGE_DIR}; use control_plane::safekeeper::SafekeeperNode; use control_plane::storage_controller::StorageController; use control_plane::{broker, local_env}; -use pageserver_api::controller_api::{ - NodeAvailability, NodeConfigureRequest, NodeSchedulingPolicy, PlacementPolicy, -}; +use pageserver_api::controller_api::PlacementPolicy; use pageserver_api::models::{ ShardParameters, TenantCreateRequest, TimelineCreateRequest, TimelineInfo, }; @@ -1060,21 +1058,6 @@ async fn handle_pageserver(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> } } - Some(("set-state", subcommand_args)) => { - let pageserver = get_pageserver(env, subcommand_args)?; - let scheduling = subcommand_args.get_one("scheduling"); - let availability = subcommand_args.get_one("availability"); - - let storage_controller = StorageController::from_env(env); - storage_controller - .node_configure(NodeConfigureRequest { - node_id: pageserver.conf.id, - scheduling: scheduling.cloned(), - availability: availability.cloned(), - }) - .await?; - } - Some(("status", subcommand_args)) => { match get_pageserver(env, subcommand_args)?.check_status().await { Ok(_) => println!("Page server is up and running"), @@ -1248,7 +1231,7 @@ async fn try_stop_all(env: &local_env::LocalEnv, immediate: bool) { match ComputeControlPlane::load(env.clone()) { Ok(cplane) => { for (_k, node) in cplane.endpoints { - if let Err(e) = node.stop(if immediate { "immediate" } else { "fast " }, false) { + if let Err(e) = node.stop(if immediate { "immediate" } else { "fast" }, false) { eprintln!("postgres stop failed: {e:#}"); } } @@ -1434,6 +1417,7 @@ fn cli() -> Command { .subcommand( Command::new("timeline") .about("Manage timelines") + .arg_required_else_help(true) .subcommand(Command::new("list") .about("List all timelines, available to this pageserver") .arg(tenant_id_arg.clone())) @@ -1515,12 +1499,6 @@ fn cli() -> Command { .about("Restart local pageserver") .arg(pageserver_config_args.clone()) ) - .subcommand(Command::new("set-state") - .arg(Arg::new("availability").value_parser(value_parser!(NodeAvailability)).long("availability").action(ArgAction::Set).help("Availability state: offline,active")) - .arg(Arg::new("scheduling").value_parser(value_parser!(NodeSchedulingPolicy)).long("scheduling").action(ArgAction::Set).help("Scheduling state: draining,pause,filling,active")) - .about("Set scheduling or availability state of pageserver node") - .arg(pageserver_config_args.clone()) - ) ) .subcommand( Command::new("storage_controller") diff --git a/control_plane/src/local_env.rs b/control_plane/src/local_env.rs index bd3dbef453..38b7fffd09 100644 --- a/control_plane/src/local_env.rs +++ b/control_plane/src/local_env.rs @@ -156,6 +156,7 @@ pub struct SafekeeperConf { pub remote_storage: Option, pub backup_threads: Option, pub auth_enabled: bool, + pub listen_addr: Option, } impl Default for SafekeeperConf { @@ -169,6 +170,7 @@ impl Default for SafekeeperConf { remote_storage: None, backup_threads: None, auth_enabled: false, + listen_addr: None, } } } diff --git a/control_plane/src/pageserver.rs b/control_plane/src/pageserver.rs index c5eabc46db..abf815f07a 100644 --- a/control_plane/src/pageserver.rs +++ b/control_plane/src/pageserver.rs @@ -389,6 +389,10 @@ impl PageServerNode { .remove("image_creation_threshold") .map(|x| x.parse::()) .transpose()?, + image_layer_creation_check_threshold: settings + .remove("image_layer_creation_check_threshold") + .map(|x| x.parse::()) + .transpose()?, pitr_interval: settings.remove("pitr_interval").map(|x| x.to_string()), walreceiver_connect_timeout: settings .remove("walreceiver_connect_timeout") @@ -501,6 +505,12 @@ impl PageServerNode { .map(|x| x.parse::()) .transpose() .context("Failed to parse 'image_creation_threshold' as non zero integer")?, + image_layer_creation_check_threshold: settings + .remove("image_layer_creation_check_threshold") + .map(|x| x.parse::()) + .transpose() + .context("Failed to parse 'image_creation_check_threshold' as integer")?, + pitr_interval: settings.remove("pitr_interval").map(|x| x.to_string()), walreceiver_connect_timeout: settings .remove("walreceiver_connect_timeout") diff --git a/control_plane/src/safekeeper.rs b/control_plane/src/safekeeper.rs index 6ac71dfe51..d62a2e80b5 100644 --- a/control_plane/src/safekeeper.rs +++ b/control_plane/src/safekeeper.rs @@ -70,24 +70,31 @@ pub struct SafekeeperNode { pub pg_connection_config: PgConnectionConfig, pub env: LocalEnv, pub http_client: reqwest::Client, + pub listen_addr: String, pub http_base_url: String, } impl SafekeeperNode { pub fn from_env(env: &LocalEnv, conf: &SafekeeperConf) -> SafekeeperNode { + let listen_addr = if let Some(ref listen_addr) = conf.listen_addr { + listen_addr.clone() + } else { + "127.0.0.1".to_string() + }; SafekeeperNode { id: conf.id, conf: conf.clone(), - pg_connection_config: Self::safekeeper_connection_config(conf.pg_port), + pg_connection_config: Self::safekeeper_connection_config(&listen_addr, conf.pg_port), env: env.clone(), http_client: reqwest::Client::new(), - http_base_url: format!("http://127.0.0.1:{}/v1", conf.http_port), + http_base_url: format!("http://{}:{}/v1", listen_addr, conf.http_port), + listen_addr, } } /// Construct libpq connection string for connecting to this safekeeper. - fn safekeeper_connection_config(port: u16) -> PgConnectionConfig { - PgConnectionConfig::new_host_port(url::Host::parse("127.0.0.1").unwrap(), port) + fn safekeeper_connection_config(addr: &str, port: u16) -> PgConnectionConfig { + PgConnectionConfig::new_host_port(url::Host::parse(addr).unwrap(), port) } pub fn datadir_path_by_id(env: &LocalEnv, sk_id: NodeId) -> PathBuf { @@ -111,8 +118,8 @@ impl SafekeeperNode { ); io::stdout().flush().unwrap(); - let listen_pg = format!("127.0.0.1:{}", self.conf.pg_port); - let listen_http = format!("127.0.0.1:{}", self.conf.http_port); + let listen_pg = format!("{}:{}", self.listen_addr, self.conf.pg_port); + let listen_http = format!("{}:{}", self.listen_addr, self.conf.http_port); let id = self.id; let datadir = self.datadir_path(); @@ -139,7 +146,7 @@ impl SafekeeperNode { availability_zone, ]; if let Some(pg_tenant_only_port) = self.conf.pg_tenant_only_port { - let listen_pg_tenant_only = format!("127.0.0.1:{}", pg_tenant_only_port); + let listen_pg_tenant_only = format!("{}:{}", self.listen_addr, pg_tenant_only_port); args.extend(["--listen-pg-tenant-only".to_owned(), listen_pg_tenant_only]); } if !self.conf.sync { diff --git a/control_plane/storcon_cli/Cargo.toml b/control_plane/storcon_cli/Cargo.toml new file mode 100644 index 0000000000..61eb7fa4e4 --- /dev/null +++ b/control_plane/storcon_cli/Cargo.toml @@ -0,0 +1,23 @@ +[package] +name = "storcon_cli" +version = "0.1.0" +edition.workspace = true +license.workspace = true + + +[dependencies] +anyhow.workspace = true +clap.workspace = true +comfy-table.workspace = true +hyper.workspace = true +pageserver_api.workspace = true +pageserver_client.workspace = true +reqwest.workspace = true +serde.workspace = true +serde_json = { workspace = true, features = ["raw_value"] } +thiserror.workspace = true +tokio.workspace = true +tracing.workspace = true +utils.workspace = true +workspace_hack.workspace = true + diff --git a/control_plane/storcon_cli/src/main.rs b/control_plane/storcon_cli/src/main.rs new file mode 100644 index 0000000000..2edd09eac1 --- /dev/null +++ b/control_plane/storcon_cli/src/main.rs @@ -0,0 +1,587 @@ +use std::{collections::HashMap, str::FromStr}; + +use clap::{Parser, Subcommand}; +use hyper::Method; +use pageserver_api::{ + controller_api::{ + NodeAvailabilityWrapper, NodeDescribeResponse, ShardSchedulingPolicy, + TenantDescribeResponse, TenantPolicyRequest, + }, + models::{ + ShardParameters, TenantConfig, TenantConfigRequest, TenantCreateRequest, + TenantShardSplitRequest, TenantShardSplitResponse, + }, + shard::{ShardStripeSize, TenantShardId}, +}; +use pageserver_client::mgmt_api::{self, ResponseErrorMessageExt}; +use reqwest::Url; +use serde::{de::DeserializeOwned, Serialize}; +use utils::id::{NodeId, TenantId}; + +use pageserver_api::controller_api::{ + NodeConfigureRequest, NodeRegisterRequest, NodeSchedulingPolicy, PlacementPolicy, + TenantLocateResponse, TenantShardMigrateRequest, TenantShardMigrateResponse, +}; + +#[derive(Subcommand, Debug)] +enum Command { + /// Register a pageserver with the storage controller. This shouldn't usually be necessary, + /// since pageservers auto-register when they start up + NodeRegister { + #[arg(long)] + node_id: NodeId, + + #[arg(long)] + listen_pg_addr: String, + #[arg(long)] + listen_pg_port: u16, + + #[arg(long)] + listen_http_addr: String, + #[arg(long)] + listen_http_port: u16, + }, + + /// Modify a node's configuration in the storage controller + NodeConfigure { + #[arg(long)] + node_id: NodeId, + + /// Availability is usually auto-detected based on heartbeats. Set 'offline' here to + /// manually mark a node offline + #[arg(long)] + availability: Option, + /// Scheduling policy controls whether tenant shards may be scheduled onto this node. + #[arg(long)] + scheduling: Option, + }, + /// Modify a tenant's policies in the storage controller + TenantPolicy { + #[arg(long)] + tenant_id: TenantId, + /// Placement policy controls whether a tenant is `detached`, has only a secondary location (`secondary`), + /// or is in the normal attached state with N secondary locations (`attached:N`) + #[arg(long)] + placement: Option, + /// Scheduling policy enables pausing the controller's scheduling activity involving this tenant. `active` is normal, + /// `essential` disables optimization scheduling changes, `pause` disables all scheduling changes, and `stop` prevents + /// all reconciliation activity including for scheduling changes already made. `pause` and `stop` can make a tenant + /// unavailable, and are only for use in emergencies. + #[arg(long)] + scheduling: Option, + }, + /// List nodes known to the storage controller + Nodes {}, + /// List tenants known to the storage controller + Tenants {}, + /// Create a new tenant in the storage controller, and by extension on pageservers. + TenantCreate { + #[arg(long)] + tenant_id: TenantId, + }, + /// Delete a tenant in the storage controller, and by extension on pageservers. + TenantDelete { + #[arg(long)] + tenant_id: TenantId, + }, + /// Split an existing tenant into a higher number of shards than its current shard count. + TenantShardSplit { + #[arg(long)] + tenant_id: TenantId, + #[arg(long)] + shard_count: u8, + /// Optional, in 8kiB pages. e.g. set 2048 for 16MB stripes. + #[arg(long)] + stripe_size: Option, + }, + /// Migrate the attached location for a tenant shard to a specific pageserver. + TenantShardMigrate { + #[arg(long)] + tenant_shard_id: TenantShardId, + #[arg(long)] + node: NodeId, + }, + /// Modify the pageserver tenant configuration of a tenant: this is the configuration structure + /// that is passed through to pageservers, and does not affect storage controller behavior. + TenantConfig { + #[arg(long)] + tenant_id: TenantId, + #[arg(long)] + config: String, + }, + /// Attempt to balance the locations for a tenant across pageservers. This is a client-side + /// alternative to the storage controller's scheduling optimization behavior. + TenantScatter { + #[arg(long)] + tenant_id: TenantId, + }, + /// Print details about a particular tenant, including all its shards' states. + TenantDescribe { + #[arg(long)] + tenant_id: TenantId, + }, +} + +#[derive(Parser)] +#[command( + author, + version, + about, + long_about = "CLI for Storage Controller Support/Debug" +)] +#[command(arg_required_else_help(true))] +struct Cli { + #[arg(long)] + /// URL to storage controller. e.g. http://127.0.0.1:1234 when using `neon_local` + api: Url, + + #[arg(long)] + /// JWT token for authenticating with storage controller. Depending on the API used, this + /// should have either `pageserverapi` or `admin` scopes: for convenience, you should mint + /// a token with both scopes to use with this tool. + jwt: Option, + + #[command(subcommand)] + command: Command, +} + +#[derive(Debug, Clone)] +struct PlacementPolicyArg(PlacementPolicy); + +impl FromStr for PlacementPolicyArg { + type Err = anyhow::Error; + + fn from_str(s: &str) -> Result { + match s { + "detached" => Ok(Self(PlacementPolicy::Detached)), + "secondary" => Ok(Self(PlacementPolicy::Secondary)), + _ if s.starts_with("attached:") => { + let mut splitter = s.split(':'); + let _prefix = splitter.next().unwrap(); + match splitter.next().and_then(|s| s.parse::().ok()) { + Some(n) => Ok(Self(PlacementPolicy::Attached(n))), + None => Err(anyhow::anyhow!( + "Invalid format '{s}', a valid example is 'attached:1'" + )), + } + } + _ => Err(anyhow::anyhow!( + "Unknown placement policy '{s}', try detached,secondary,attached:" + )), + } + } +} + +#[derive(Debug, Clone)] +struct ShardSchedulingPolicyArg(ShardSchedulingPolicy); + +impl FromStr for ShardSchedulingPolicyArg { + type Err = anyhow::Error; + + fn from_str(s: &str) -> Result { + match s { + "active" => Ok(Self(ShardSchedulingPolicy::Active)), + "essential" => Ok(Self(ShardSchedulingPolicy::Essential)), + "pause" => Ok(Self(ShardSchedulingPolicy::Pause)), + "stop" => Ok(Self(ShardSchedulingPolicy::Stop)), + _ => Err(anyhow::anyhow!( + "Unknown scheduling policy '{s}', try active,essential,pause,stop" + )), + } + } +} + +#[derive(Debug, Clone)] +struct NodeAvailabilityArg(NodeAvailabilityWrapper); + +impl FromStr for NodeAvailabilityArg { + type Err = anyhow::Error; + + fn from_str(s: &str) -> Result { + match s { + "active" => Ok(Self(NodeAvailabilityWrapper::Active)), + "offline" => Ok(Self(NodeAvailabilityWrapper::Offline)), + _ => Err(anyhow::anyhow!("Unknown availability state '{s}'")), + } + } +} + +struct Client { + base_url: Url, + jwt_token: Option, + client: reqwest::Client, +} + +impl Client { + fn new(base_url: Url, jwt_token: Option) -> Self { + Self { + base_url, + jwt_token, + client: reqwest::ClientBuilder::new() + .build() + .expect("Failed to construct http client"), + } + } + + /// Simple HTTP request wrapper for calling into storage controller + async fn dispatch( + &self, + method: hyper::Method, + path: String, + body: Option, + ) -> mgmt_api::Result + where + RQ: Serialize + Sized, + RS: DeserializeOwned + Sized, + { + // The configured URL has the /upcall path prefix for pageservers to use: we will strip that out + // for general purpose API access. + let url = Url::from_str(&format!( + "http://{}:{}/{path}", + self.base_url.host_str().unwrap(), + self.base_url.port().unwrap() + )) + .unwrap(); + + let mut builder = self.client.request(method, url); + if let Some(body) = body { + builder = builder.json(&body) + } + if let Some(jwt_token) = &self.jwt_token { + builder = builder.header( + reqwest::header::AUTHORIZATION, + format!("Bearer {jwt_token}"), + ); + } + + let response = builder.send().await.map_err(mgmt_api::Error::ReceiveBody)?; + let response = response.error_from_body().await?; + + response + .json() + .await + .map_err(pageserver_client::mgmt_api::Error::ReceiveBody) + } +} + +#[tokio::main] +async fn main() -> anyhow::Result<()> { + let cli = Cli::parse(); + + let storcon_client = Client::new(cli.api.clone(), cli.jwt.clone()); + + let mut trimmed = cli.api.to_string(); + trimmed.pop(); + let vps_client = mgmt_api::Client::new(trimmed, cli.jwt.as_deref()); + + match cli.command { + Command::NodeRegister { + node_id, + listen_pg_addr, + listen_pg_port, + listen_http_addr, + listen_http_port, + } => { + storcon_client + .dispatch::<_, ()>( + Method::POST, + "control/v1/node".to_string(), + Some(NodeRegisterRequest { + node_id, + listen_pg_addr, + listen_pg_port, + listen_http_addr, + listen_http_port, + }), + ) + .await?; + } + Command::TenantCreate { tenant_id } => { + vps_client + .tenant_create(&TenantCreateRequest { + new_tenant_id: TenantShardId::unsharded(tenant_id), + generation: None, + shard_parameters: ShardParameters::default(), + placement_policy: Some(PlacementPolicy::Attached(1)), + config: TenantConfig::default(), + }) + .await?; + } + Command::TenantDelete { tenant_id } => { + let status = vps_client + .tenant_delete(TenantShardId::unsharded(tenant_id)) + .await?; + tracing::info!("Delete status: {}", status); + } + Command::Nodes {} => { + let resp = storcon_client + .dispatch::<(), Vec>( + Method::GET, + "control/v1/node".to_string(), + None, + ) + .await?; + let mut table = comfy_table::Table::new(); + table.set_header(["Id", "Hostname", "Scheduling", "Availability"]); + for node in resp { + table.add_row([ + format!("{}", node.id), + node.listen_http_addr, + format!("{:?}", node.scheduling), + format!("{:?}", node.availability), + ]); + } + println!("{table}"); + } + Command::NodeConfigure { + node_id, + availability, + scheduling, + } => { + let req = NodeConfigureRequest { + node_id, + availability: availability.map(|a| a.0), + scheduling, + }; + storcon_client + .dispatch::<_, ()>( + Method::PUT, + format!("control/v1/node/{node_id}/config"), + Some(req), + ) + .await?; + } + Command::Tenants {} => { + let resp = storcon_client + .dispatch::<(), Vec>( + Method::GET, + "control/v1/tenant".to_string(), + None, + ) + .await?; + let mut table = comfy_table::Table::new(); + table.set_header([ + "TenantId", + "ShardCount", + "StripeSize", + "Placement", + "Scheduling", + ]); + for tenant in resp { + let shard_zero = tenant.shards.into_iter().next().unwrap(); + table.add_row([ + format!("{}", tenant.tenant_id), + format!("{}", shard_zero.tenant_shard_id.shard_count.literal()), + format!("{:?}", tenant.stripe_size), + format!("{:?}", tenant.policy), + format!("{:?}", shard_zero.scheduling_policy), + ]); + } + + println!("{table}"); + } + Command::TenantPolicy { + tenant_id, + placement, + scheduling, + } => { + let req = TenantPolicyRequest { + scheduling: scheduling.map(|s| s.0), + placement: placement.map(|p| p.0), + }; + storcon_client + .dispatch::<_, ()>( + Method::PUT, + format!("control/v1/tenant/{tenant_id}/policy"), + Some(req), + ) + .await?; + } + Command::TenantShardSplit { + tenant_id, + shard_count, + stripe_size, + } => { + let req = TenantShardSplitRequest { + new_shard_count: shard_count, + new_stripe_size: stripe_size.map(ShardStripeSize), + }; + + let response = storcon_client + .dispatch::( + Method::PUT, + format!("control/v1/tenant/{tenant_id}/shard_split"), + Some(req), + ) + .await?; + println!( + "Split tenant {} into {} shards: {}", + tenant_id, + shard_count, + response + .new_shards + .iter() + .map(|s| format!("{:?}", s)) + .collect::>() + .join(",") + ); + } + Command::TenantShardMigrate { + tenant_shard_id, + node, + } => { + let req = TenantShardMigrateRequest { + tenant_shard_id, + node_id: node, + }; + + storcon_client + .dispatch::( + Method::PUT, + format!("control/v1/tenant/{tenant_shard_id}/migrate"), + Some(req), + ) + .await?; + } + Command::TenantConfig { tenant_id, config } => { + let tenant_conf = serde_json::from_str(&config)?; + + vps_client + .tenant_config(&TenantConfigRequest { + tenant_id, + config: tenant_conf, + }) + .await?; + } + Command::TenantScatter { tenant_id } => { + // Find the shards + let locate_response = storcon_client + .dispatch::<(), TenantLocateResponse>( + Method::GET, + format!("control/v1/tenant/{tenant_id}/locate"), + None, + ) + .await?; + let shards = locate_response.shards; + + let mut node_to_shards: HashMap> = HashMap::new(); + let shard_count = shards.len(); + for s in shards { + let entry = node_to_shards.entry(s.node_id).or_default(); + entry.push(s.shard_id); + } + + // Load list of available nodes + let nodes_resp = storcon_client + .dispatch::<(), Vec>( + Method::GET, + "control/v1/node".to_string(), + None, + ) + .await?; + + for node in nodes_resp { + if matches!(node.availability, NodeAvailabilityWrapper::Active) { + node_to_shards.entry(node.id).or_default(); + } + } + + let max_shard_per_node = shard_count / node_to_shards.len(); + + loop { + let mut migrate_shard = None; + for shards in node_to_shards.values_mut() { + if shards.len() > max_shard_per_node { + // Pick the emptiest + migrate_shard = Some(shards.pop().unwrap()); + } + } + let Some(migrate_shard) = migrate_shard else { + break; + }; + + // Pick the emptiest node to migrate to + let mut destinations = node_to_shards + .iter() + .map(|(k, v)| (k, v.len())) + .collect::>(); + destinations.sort_by_key(|i| i.1); + let (destination_node, destination_count) = *destinations.first().unwrap(); + if destination_count + 1 > max_shard_per_node { + // Even the emptiest destination doesn't have space: we're done + break; + } + let destination_node = *destination_node; + + node_to_shards + .get_mut(&destination_node) + .unwrap() + .push(migrate_shard); + + println!("Migrate {} -> {} ...", migrate_shard, destination_node); + + storcon_client + .dispatch::( + Method::PUT, + format!("control/v1/tenant/{migrate_shard}/migrate"), + Some(TenantShardMigrateRequest { + tenant_shard_id: migrate_shard, + node_id: destination_node, + }), + ) + .await?; + println!("Migrate {} -> {} OK", migrate_shard, destination_node); + } + + // Spread the shards across the nodes + } + Command::TenantDescribe { tenant_id } => { + let describe_response = storcon_client + .dispatch::<(), TenantDescribeResponse>( + Method::GET, + format!("control/v1/tenant/{tenant_id}"), + None, + ) + .await?; + let shards = describe_response.shards; + let mut table = comfy_table::Table::new(); + table.set_header(["Shard", "Attached", "Secondary", "Last error", "status"]); + for shard in shards { + let secondary = shard + .node_secondary + .iter() + .map(|n| format!("{}", n)) + .collect::>() + .join(","); + + let mut status_parts = Vec::new(); + if shard.is_reconciling { + status_parts.push("reconciling"); + } + + if shard.is_pending_compute_notification { + status_parts.push("pending_compute"); + } + + if shard.is_splitting { + status_parts.push("splitting"); + } + let status = status_parts.join(","); + + table.add_row([ + format!("{}", shard.tenant_shard_id), + shard + .node_attached + .map(|n| format!("{}", n)) + .unwrap_or(String::new()), + secondary, + shard.last_error, + status, + ]); + } + println!("{table}"); + } + } + + Ok(()) +} diff --git a/diesel.toml b/diesel.toml index 30ed4444d7..558c54a1e1 100644 --- a/diesel.toml +++ b/diesel.toml @@ -2,8 +2,8 @@ # see https://diesel.rs/guides/configuring-diesel-cli [print_schema] -file = "control_plane/attachment_service/src/schema.rs" +file = "storage_controller/src/schema.rs" custom_type_derives = ["diesel::query_builder::QueryId"] [migrations_directory] -dir = "control_plane/attachment_service/migrations" +dir = "storage_controller/migrations" diff --git a/docs/sourcetree.md b/docs/sourcetree.md index 12fa80349e..3732bfdab2 100644 --- a/docs/sourcetree.md +++ b/docs/sourcetree.md @@ -7,6 +7,11 @@ Below you will find a brief overview of each subdir in the source tree in alphab Neon storage broker, providing messaging between safekeepers and pageservers. [storage_broker.md](./storage_broker.md) +`storage_controller`: + +Neon storage controller, manages a cluster of pageservers and exposes an API that enables +managing a many-sharded tenant as a single entity. + `/control_plane`: Local control plane. diff --git a/libs/metrics/Cargo.toml b/libs/metrics/Cargo.toml index f6a49a0166..0bd804051c 100644 --- a/libs/metrics/Cargo.toml +++ b/libs/metrics/Cargo.toml @@ -10,11 +10,13 @@ libc.workspace = true once_cell.workspace = true chrono.workspace = true twox-hash.workspace = true +measured.workspace = true workspace_hack.workspace = true [target.'cfg(target_os = "linux")'.dependencies] procfs.workspace = true +measured-process.workspace = true [dev-dependencies] rand = "0.8" diff --git a/libs/metrics/src/hll.rs b/libs/metrics/src/hll.rs index dfb4461ce9..f53511ab5c 100644 --- a/libs/metrics/src/hll.rs +++ b/libs/metrics/src/hll.rs @@ -7,14 +7,19 @@ //! use significantly less memory than this, but can only approximate the cardinality. use std::{ - collections::HashMap, - hash::{BuildHasher, BuildHasherDefault, Hash, Hasher}, - sync::{atomic::AtomicU8, Arc, RwLock}, + hash::{BuildHasher, BuildHasherDefault, Hash}, + sync::atomic::AtomicU8, }; -use prometheus::{ - core::{self, Describer}, - proto, Opts, +use measured::{ + label::{LabelGroupVisitor, LabelName, LabelValue, LabelVisitor}, + metric::{ + group::{Encoding, MetricValue}, + name::MetricNameEncoder, + Metric, MetricType, MetricVec, + }, + text::TextEncoder, + LabelGroup, }; use twox_hash::xxh3; @@ -93,203 +98,25 @@ macro_rules! register_hll { /// ``` /// /// See for estimates on alpha -#[derive(Clone)] -pub struct HyperLogLogVec { - core: Arc>, +pub type HyperLogLogVec = MetricVec, L>; +pub type HyperLogLog = Metric>; + +pub struct HyperLogLogState { + shards: [AtomicU8; N], } - -struct HyperLogLogVecCore { - pub children: RwLock, BuildHasherDefault>>, - pub desc: core::Desc, - pub opts: Opts, -} - -impl core::Collector for HyperLogLogVec { - fn desc(&self) -> Vec<&core::Desc> { - vec![&self.core.desc] - } - - fn collect(&self) -> Vec { - let mut m = proto::MetricFamily::default(); - m.set_name(self.core.desc.fq_name.clone()); - m.set_help(self.core.desc.help.clone()); - m.set_field_type(proto::MetricType::GAUGE); - - let mut metrics = Vec::new(); - for child in self.core.children.read().unwrap().values() { - child.core.collect_into(&mut metrics); - } - m.set_metric(metrics); - - vec![m] +impl Default for HyperLogLogState { + fn default() -> Self { + #[allow(clippy::declare_interior_mutable_const)] + const ZERO: AtomicU8 = AtomicU8::new(0); + Self { shards: [ZERO; N] } } } -impl HyperLogLogVec { - /// Create a new [`HyperLogLogVec`] based on the provided - /// [`Opts`] and partitioned by the given label names. At least one label name must be - /// provided. - pub fn new(opts: Opts, label_names: &[&str]) -> prometheus::Result { - assert!(N.is_power_of_two()); - let variable_names = label_names.iter().map(|s| (*s).to_owned()).collect(); - let opts = opts.variable_labels(variable_names); - - let desc = opts.describe()?; - let v = HyperLogLogVecCore { - children: RwLock::new(HashMap::default()), - desc, - opts, - }; - - Ok(Self { core: Arc::new(v) }) - } - - /// `get_metric_with_label_values` returns the [`HyperLogLog

`] for the given slice - /// of label values (same order as the VariableLabels in Desc). If that combination of - /// label values is accessed for the first time, a new [`HyperLogLog

`] is created. - /// - /// An error is returned if the number of label values is not the same as the - /// number of VariableLabels in Desc. - pub fn get_metric_with_label_values( - &self, - vals: &[&str], - ) -> prometheus::Result> { - self.core.get_metric_with_label_values(vals) - } - - /// `with_label_values` works as `get_metric_with_label_values`, but panics if an error - /// occurs. - pub fn with_label_values(&self, vals: &[&str]) -> HyperLogLog { - self.get_metric_with_label_values(vals).unwrap() - } +impl MetricType for HyperLogLogState { + type Metadata = (); } -impl HyperLogLogVecCore { - pub fn get_metric_with_label_values( - &self, - vals: &[&str], - ) -> prometheus::Result> { - let h = self.hash_label_values(vals)?; - - if let Some(metric) = self.children.read().unwrap().get(&h).cloned() { - return Ok(metric); - } - - self.get_or_create_metric(h, vals) - } - - pub(crate) fn hash_label_values(&self, vals: &[&str]) -> prometheus::Result { - if vals.len() != self.desc.variable_labels.len() { - return Err(prometheus::Error::InconsistentCardinality { - expect: self.desc.variable_labels.len(), - got: vals.len(), - }); - } - - let mut h = xxh3::Hash64::default(); - for val in vals { - h.write(val.as_bytes()); - } - - Ok(h.finish()) - } - - fn get_or_create_metric( - &self, - hash: u64, - label_values: &[&str], - ) -> prometheus::Result> { - let mut children = self.children.write().unwrap(); - // Check exist first. - if let Some(metric) = children.get(&hash).cloned() { - return Ok(metric); - } - - let metric = HyperLogLog::with_opts_and_label_values(&self.opts, label_values)?; - children.insert(hash, metric.clone()); - Ok(metric) - } -} - -/// HLL is a probabilistic cardinality measure. -/// -/// How to use this time-series for a metric name `my_metrics_total_hll`: -/// -/// ```promql -/// # harmonic mean -/// 1 / ( -/// sum ( -/// 2 ^ -( -/// # HLL merge operation -/// max (my_metrics_total_hll{}) by (hll_shard, other_labels...) -/// ) -/// ) without (hll_shard) -/// ) -/// * alpha -/// * shards_count -/// * shards_count -/// ``` -/// -/// If you want an estimate over time, you can use the following query: -/// -/// ```promql -/// # harmonic mean -/// 1 / ( -/// sum ( -/// 2 ^ -( -/// # HLL merge operation -/// max ( -/// max_over_time(my_metrics_total_hll{}[$__rate_interval]) -/// ) by (hll_shard, other_labels...) -/// ) -/// ) without (hll_shard) -/// ) -/// * alpha -/// * shards_count -/// * shards_count -/// ``` -/// -/// In the case of low cardinality, you might want to use the linear counting approximation: -/// -/// ```promql -/// # LinearCounting(m, V) = m log (m / V) -/// shards_count * ln(shards_count / -/// # calculate V = how many shards contain a 0 -/// count(max (proxy_connecting_endpoints{}) by (hll_shard, protocol) == 0) without (hll_shard) -/// ) -/// ``` -/// -/// See for estimates on alpha -#[derive(Clone)] -pub struct HyperLogLog { - core: Arc>, -} - -impl HyperLogLog { - /// Create a [`HyperLogLog`] with the `name` and `help` arguments. - pub fn new, S2: Into>(name: S1, help: S2) -> prometheus::Result { - assert!(N.is_power_of_two()); - let opts = Opts::new(name, help); - Self::with_opts(opts) - } - - /// Create a [`HyperLogLog`] with the `opts` options. - pub fn with_opts(opts: Opts) -> prometheus::Result { - Self::with_opts_and_label_values(&opts, &[]) - } - - fn with_opts_and_label_values(opts: &Opts, label_values: &[&str]) -> prometheus::Result { - let desc = opts.describe()?; - let labels = make_label_pairs(&desc, label_values)?; - - let v = HyperLogLogCore { - shards: [0; N].map(AtomicU8::new), - desc, - labels, - }; - Ok(Self { core: Arc::new(v) }) - } - +impl HyperLogLogState { pub fn measure(&self, item: &impl Hash) { // changing the hasher will break compatibility with previous measurements. self.record(BuildHasherDefault::::default().hash_one(item)); @@ -299,42 +126,11 @@ impl HyperLogLog { let p = N.ilog2() as u8; let j = hash & (N as u64 - 1); let rho = (hash >> p).leading_zeros() as u8 + 1 - p; - self.core.shards[j as usize].fetch_max(rho, std::sync::atomic::Ordering::Relaxed); - } -} - -struct HyperLogLogCore { - shards: [AtomicU8; N], - desc: core::Desc, - labels: Vec, -} - -impl core::Collector for HyperLogLog { - fn desc(&self) -> Vec<&core::Desc> { - vec![&self.core.desc] + self.shards[j as usize].fetch_max(rho, std::sync::atomic::Ordering::Relaxed); } - fn collect(&self) -> Vec { - let mut m = proto::MetricFamily::default(); - m.set_name(self.core.desc.fq_name.clone()); - m.set_help(self.core.desc.help.clone()); - m.set_field_type(proto::MetricType::GAUGE); - - let mut metrics = Vec::new(); - self.core.collect_into(&mut metrics); - m.set_metric(metrics); - - vec![m] - } -} - -impl HyperLogLogCore { - fn collect_into(&self, metrics: &mut Vec) { - self.shards.iter().enumerate().for_each(|(i, x)| { - let mut shard_label = proto::LabelPair::default(); - shard_label.set_name("hll_shard".to_owned()); - shard_label.set_value(format!("{i}")); - + fn take_sample(&self) -> [u8; N] { + self.shards.each_ref().map(|x| { // We reset the counter to 0 so we can perform a cardinality measure over any time slice in prometheus. // This seems like it would be a race condition, @@ -344,85 +140,90 @@ impl HyperLogLogCore { // TODO: maybe we shouldn't reset this on every collect, instead, only after a time window. // this would mean that a dev port-forwarding the metrics url won't break the sampling. - let v = x.swap(0, std::sync::atomic::Ordering::Relaxed); - - let mut m = proto::Metric::default(); - let mut c = proto::Gauge::default(); - c.set_value(v as f64); - m.set_gauge(c); - - let mut labels = Vec::with_capacity(self.labels.len() + 1); - labels.extend_from_slice(&self.labels); - labels.push(shard_label); - - m.set_label(labels); - metrics.push(m); + x.swap(0, std::sync::atomic::Ordering::Relaxed) }) } } - -fn make_label_pairs( - desc: &core::Desc, - label_values: &[&str], -) -> prometheus::Result> { - if desc.variable_labels.len() != label_values.len() { - return Err(prometheus::Error::InconsistentCardinality { - expect: desc.variable_labels.len(), - got: label_values.len(), - }); +impl measured::metric::MetricEncoding> + for HyperLogLogState +{ + fn write_type( + name: impl MetricNameEncoder, + enc: &mut TextEncoder, + ) -> Result<(), std::io::Error> { + enc.write_type(&name, measured::text::MetricType::Gauge) } + fn collect_into( + &self, + _: &(), + labels: impl LabelGroup, + name: impl MetricNameEncoder, + enc: &mut TextEncoder, + ) -> Result<(), std::io::Error> { + struct I64(i64); + impl LabelValue for I64 { + fn visit(&self, v: V) -> V::Output { + v.write_int(self.0) + } + } - let total_len = desc.variable_labels.len() + desc.const_label_pairs.len(); - if total_len == 0 { - return Ok(vec![]); - } + struct HllShardLabel { + hll_shard: i64, + } - if desc.variable_labels.is_empty() { - return Ok(desc.const_label_pairs.clone()); - } + impl LabelGroup for HllShardLabel { + fn visit_values(&self, v: &mut impl LabelGroupVisitor) { + const LE: &LabelName = LabelName::from_str("hll_shard"); + v.write_value(LE, &I64(self.hll_shard)); + } + } - let mut label_pairs = Vec::with_capacity(total_len); - for (i, n) in desc.variable_labels.iter().enumerate() { - let mut label_pair = proto::LabelPair::default(); - label_pair.set_name(n.clone()); - label_pair.set_value(label_values[i].to_owned()); - label_pairs.push(label_pair); + self.take_sample() + .into_iter() + .enumerate() + .try_for_each(|(hll_shard, val)| { + enc.write_metric_value( + name.by_ref(), + labels.by_ref().compose_with(HllShardLabel { + hll_shard: hll_shard as i64, + }), + MetricValue::Int(val as i64), + ) + }) } - - for label_pair in &desc.const_label_pairs { - label_pairs.push(label_pair.clone()); - } - label_pairs.sort(); - Ok(label_pairs) } #[cfg(test)] mod tests { use std::collections::HashSet; - use prometheus::{proto, Opts}; + use measured::{label::StaticLabelSet, FixedCardinalityLabel}; use rand::{rngs::StdRng, Rng, SeedableRng}; use rand_distr::{Distribution, Zipf}; use crate::HyperLogLogVec; - fn collect(hll: &HyperLogLogVec<32>) -> Vec { - let mut metrics = vec![]; - hll.core - .children - .read() - .unwrap() - .values() - .for_each(|c| c.core.collect_into(&mut metrics)); - metrics + #[derive(FixedCardinalityLabel, Clone, Copy)] + #[label(singleton = "x")] + enum Label { + A, + B, } - fn get_cardinality(metrics: &[proto::Metric], filter: impl Fn(&proto::Metric) -> bool) -> f64 { + + fn collect(hll: &HyperLogLogVec, 32>) -> ([u8; 32], [u8; 32]) { + // cannot go through the `hll.collect_family_into` interface yet... + // need to see if I can fix the conflicting impls problem in measured. + ( + hll.get_metric(hll.with_labels(Label::A)).take_sample(), + hll.get_metric(hll.with_labels(Label::B)).take_sample(), + ) + } + + fn get_cardinality(samples: &[[u8; 32]]) -> f64 { let mut buckets = [0.0; 32]; - for metric in metrics.chunks_exact(32) { - if filter(&metric[0]) { - for (i, m) in metric.iter().enumerate() { - buckets[i] = f64::max(buckets[i], m.get_gauge().get_value()); - } + for &sample in samples { + for (i, m) in sample.into_iter().enumerate() { + buckets[i] = f64::max(buckets[i], m as f64); } } @@ -437,7 +238,7 @@ mod tests { } fn test_cardinality(n: usize, dist: impl Distribution) -> ([usize; 3], [f64; 3]) { - let hll = HyperLogLogVec::<32>::new(Opts::new("foo", "bar"), &["x"]).unwrap(); + let hll = HyperLogLogVec::, 32>::new(); let mut iter = StdRng::seed_from_u64(0x2024_0112).sample_iter(dist); let mut set_a = HashSet::new(); @@ -445,18 +246,20 @@ mod tests { for x in iter.by_ref().take(n) { set_a.insert(x.to_bits()); - hll.with_label_values(&["a"]).measure(&x.to_bits()); + hll.get_metric(hll.with_labels(Label::A)) + .measure(&x.to_bits()); } for x in iter.by_ref().take(n) { set_b.insert(x.to_bits()); - hll.with_label_values(&["b"]).measure(&x.to_bits()); + hll.get_metric(hll.with_labels(Label::B)) + .measure(&x.to_bits()); } let merge = &set_a | &set_b; - let metrics = collect(&hll); - let len = get_cardinality(&metrics, |_| true); - let len_a = get_cardinality(&metrics, |l| l.get_label()[0].get_value() == "a"); - let len_b = get_cardinality(&metrics, |l| l.get_label()[0].get_value() == "b"); + let (a, b) = collect(&hll); + let len = get_cardinality(&[a, b]); + let len_a = get_cardinality(&[a]); + let len_b = get_cardinality(&[b]); ([merge.len(), set_a.len(), set_b.len()], [len, len_a, len_b]) } diff --git a/libs/metrics/src/lib.rs b/libs/metrics/src/lib.rs index 22b0a18933..2cf3cdeaa7 100644 --- a/libs/metrics/src/lib.rs +++ b/libs/metrics/src/lib.rs @@ -4,6 +4,17 @@ //! a default registry. #![deny(clippy::undocumented_unsafe_blocks)] +use measured::{ + label::{LabelGroupSet, LabelGroupVisitor, LabelName, NoLabels}, + metric::{ + counter::CounterState, + gauge::GaugeState, + group::{Encoding, MetricValue}, + name::{MetricName, MetricNameEncoder}, + MetricEncoding, MetricFamilyEncoding, + }, + FixedCardinalityLabel, LabelGroup, MetricGroup, +}; use once_cell::sync::Lazy; use prometheus::core::{ Atomic, AtomicU64, Collector, GenericCounter, GenericCounterVec, GenericGauge, GenericGaugeVec, @@ -11,6 +22,7 @@ use prometheus::core::{ pub use prometheus::opts; pub use prometheus::register; pub use prometheus::Error; +use prometheus::Registry; pub use prometheus::{core, default_registry, proto}; pub use prometheus::{exponential_buckets, linear_buckets}; pub use prometheus::{register_counter_vec, Counter, CounterVec}; @@ -23,13 +35,12 @@ pub use prometheus::{register_int_counter_vec, IntCounterVec}; pub use prometheus::{register_int_gauge, IntGauge}; pub use prometheus::{register_int_gauge_vec, IntGaugeVec}; pub use prometheus::{Encoder, TextEncoder}; -use prometheus::{Registry, Result}; pub mod launch_timestamp; mod wrappers; pub use wrappers::{CountedReader, CountedWriter}; mod hll; -pub use hll::{HyperLogLog, HyperLogLogVec}; +pub use hll::{HyperLogLog, HyperLogLogState, HyperLogLogVec}; #[cfg(target_os = "linux")] pub mod more_process_metrics; @@ -59,7 +70,7 @@ static INTERNAL_REGISTRY: Lazy = Lazy::new(Registry::new); /// Register a collector in the internal registry. MUST be called before the first call to `gather()`. /// Otherwise, we can have a deadlock in the `gather()` call, trying to register a new collector /// while holding the lock. -pub fn register_internal(c: Box) -> Result<()> { +pub fn register_internal(c: Box) -> prometheus::Result<()> { INTERNAL_REGISTRY.register(c) } @@ -96,6 +107,127 @@ pub const DISK_WRITE_SECONDS_BUCKETS: &[f64] = &[ 0.000_050, 0.000_100, 0.000_500, 0.001, 0.003, 0.005, 0.01, 0.05, 0.1, 0.3, 0.5, ]; +pub struct BuildInfo { + pub revision: &'static str, + pub build_tag: &'static str, +} + +// todo: allow label group without the set +impl LabelGroup for BuildInfo { + fn visit_values(&self, v: &mut impl LabelGroupVisitor) { + const REVISION: &LabelName = LabelName::from_str("revision"); + v.write_value(REVISION, &self.revision); + const BUILD_TAG: &LabelName = LabelName::from_str("build_tag"); + v.write_value(BUILD_TAG, &self.build_tag); + } +} + +impl MetricFamilyEncoding for BuildInfo +where + GaugeState: MetricEncoding, +{ + fn collect_family_into( + &self, + name: impl measured::metric::name::MetricNameEncoder, + enc: &mut T, + ) -> Result<(), T::Err> { + enc.write_help(&name, "Build/version information")?; + GaugeState::write_type(&name, enc)?; + GaugeState { + count: std::sync::atomic::AtomicI64::new(1), + } + .collect_into(&(), self, name, enc) + } +} + +#[derive(MetricGroup)] +#[metric(new(build_info: BuildInfo))] +pub struct NeonMetrics { + #[cfg(target_os = "linux")] + #[metric(namespace = "process")] + #[metric(init = measured_process::ProcessCollector::for_self())] + process: measured_process::ProcessCollector, + + #[metric(namespace = "libmetrics")] + #[metric(init = LibMetrics::new(build_info))] + libmetrics: LibMetrics, +} + +#[derive(MetricGroup)] +#[metric(new(build_info: BuildInfo))] +pub struct LibMetrics { + #[metric(init = build_info)] + build_info: BuildInfo, + + #[metric(flatten)] + rusage: Rusage, + + serve_count: CollectionCounter, +} + +fn write_gauge( + x: i64, + labels: impl LabelGroup, + name: impl MetricNameEncoder, + enc: &mut Enc, +) -> Result<(), Enc::Err> { + enc.write_metric_value(name, labels, MetricValue::Int(x)) +} + +#[derive(Default)] +struct Rusage; + +#[derive(FixedCardinalityLabel, Clone, Copy)] +#[label(singleton = "io_operation")] +enum IoOp { + Read, + Write, +} + +impl MetricGroup for Rusage +where + GaugeState: MetricEncoding, +{ + fn collect_group_into(&self, enc: &mut T) -> Result<(), T::Err> { + const DISK_IO: &MetricName = MetricName::from_str("disk_io_bytes_total"); + const MAXRSS: &MetricName = MetricName::from_str("maxrss_kb"); + + let ru = get_rusage_stats(); + + enc.write_help( + DISK_IO, + "Bytes written and read from disk, grouped by the operation (read|write)", + )?; + GaugeState::write_type(DISK_IO, enc)?; + write_gauge(ru.ru_inblock * BYTES_IN_BLOCK, IoOp::Read, DISK_IO, enc)?; + write_gauge(ru.ru_oublock * BYTES_IN_BLOCK, IoOp::Write, DISK_IO, enc)?; + + enc.write_help(MAXRSS, "Memory usage (Maximum Resident Set Size)")?; + GaugeState::write_type(MAXRSS, enc)?; + write_gauge(ru.ru_maxrss, IoOp::Read, MAXRSS, enc)?; + + Ok(()) + } +} + +#[derive(Default)] +struct CollectionCounter(CounterState); + +impl MetricFamilyEncoding for CollectionCounter +where + CounterState: MetricEncoding, +{ + fn collect_family_into( + &self, + name: impl measured::metric::name::MetricNameEncoder, + enc: &mut T, + ) -> Result<(), T::Err> { + self.0.inc(); + enc.write_help(&name, "Number of metric requests made")?; + self.0.collect_into(&(), NoLabels, name, enc) + } +} + pub fn set_build_info_metric(revision: &str, build_tag: &str) { let metric = register_int_gauge_vec!( "libmetrics_build_info", @@ -105,6 +237,7 @@ pub fn set_build_info_metric(revision: &str, build_tag: &str) { .expect("Failed to register build info metric"); metric.with_label_values(&[revision, build_tag]).set(1); } +const BYTES_IN_BLOCK: i64 = 512; // Records I/O stats in a "cross-platform" way. // Compiles both on macOS and Linux, but current macOS implementation always returns 0 as values for I/O stats. @@ -117,7 +250,6 @@ pub fn set_build_info_metric(revision: &str, build_tag: &str) { fn update_rusage_metrics() { let rusage_stats = get_rusage_stats(); - const BYTES_IN_BLOCK: i64 = 512; DISK_IO_BYTES .with_label_values(&["read"]) .set(rusage_stats.ru_inblock * BYTES_IN_BLOCK); @@ -151,6 +283,7 @@ macro_rules! register_int_counter_pair_vec { } }}; } + /// Create an [`IntCounterPair`] and registers to default registry. #[macro_export(local_inner_macros)] macro_rules! register_int_counter_pair { @@ -188,7 +321,10 @@ impl GenericCounterPairVec

{ /// /// An error is returned if the number of label values is not the same as the /// number of VariableLabels in Desc. - pub fn get_metric_with_label_values(&self, vals: &[&str]) -> Result> { + pub fn get_metric_with_label_values( + &self, + vals: &[&str], + ) -> prometheus::Result> { Ok(GenericCounterPair { inc: self.inc.get_metric_with_label_values(vals)?, dec: self.dec.get_metric_with_label_values(vals)?, @@ -201,7 +337,7 @@ impl GenericCounterPairVec

{ self.get_metric_with_label_values(vals).unwrap() } - pub fn remove_label_values(&self, res: &mut [Result<()>; 2], vals: &[&str]) { + pub fn remove_label_values(&self, res: &mut [prometheus::Result<()>; 2], vals: &[&str]) { res[0] = self.inc.remove_label_values(vals); res[1] = self.dec.remove_label_values(vals); } @@ -285,3 +421,171 @@ pub type IntCounterPair = GenericCounterPair; /// A guard for [`IntCounterPair`] that will decrement the gauge on drop pub type IntCounterPairGuard = GenericCounterPairGuard; + +pub trait CounterPairAssoc { + const INC_NAME: &'static MetricName; + const DEC_NAME: &'static MetricName; + + const INC_HELP: &'static str; + const DEC_HELP: &'static str; + + type LabelGroupSet: LabelGroupSet; +} + +pub struct CounterPairVec { + vec: measured::metric::MetricVec, +} + +impl Default for CounterPairVec +where + A::LabelGroupSet: Default, +{ + fn default() -> Self { + Self { + vec: Default::default(), + } + } +} + +impl CounterPairVec { + pub fn guard( + &self, + labels: ::Group<'_>, + ) -> MeasuredCounterPairGuard<'_, A> { + let id = self.vec.with_labels(labels); + self.vec.get_metric(id).inc.inc(); + MeasuredCounterPairGuard { vec: &self.vec, id } + } + pub fn inc(&self, labels: ::Group<'_>) { + let id = self.vec.with_labels(labels); + self.vec.get_metric(id).inc.inc(); + } + pub fn dec(&self, labels: ::Group<'_>) { + let id = self.vec.with_labels(labels); + self.vec.get_metric(id).dec.inc(); + } + pub fn remove_metric( + &self, + labels: ::Group<'_>, + ) -> Option { + let id = self.vec.with_labels(labels); + self.vec.remove_metric(id) + } +} + +impl ::measured::metric::group::MetricGroup for CounterPairVec +where + T: ::measured::metric::group::Encoding, + A: CounterPairAssoc, + ::measured::metric::counter::CounterState: ::measured::metric::MetricEncoding, +{ + fn collect_group_into(&self, enc: &mut T) -> Result<(), T::Err> { + // write decrement first to avoid a race condition where inc - dec < 0 + T::write_help(enc, A::DEC_NAME, A::DEC_HELP)?; + self.vec + .collect_family_into(A::DEC_NAME, &mut Dec(&mut *enc))?; + + T::write_help(enc, A::INC_NAME, A::INC_HELP)?; + self.vec + .collect_family_into(A::INC_NAME, &mut Inc(&mut *enc))?; + + Ok(()) + } +} + +#[derive(MetricGroup, Default)] +pub struct MeasuredCounterPairState { + pub inc: CounterState, + pub dec: CounterState, +} + +impl measured::metric::MetricType for MeasuredCounterPairState { + type Metadata = (); +} + +pub struct MeasuredCounterPairGuard<'a, A: CounterPairAssoc> { + vec: &'a measured::metric::MetricVec, + id: measured::metric::LabelId, +} + +impl Drop for MeasuredCounterPairGuard<'_, A> { + fn drop(&mut self) { + self.vec.get_metric(self.id).dec.inc(); + } +} + +/// [`MetricEncoding`] for [`MeasuredCounterPairState`] that only writes the inc counter to the inner encoder. +struct Inc(T); +/// [`MetricEncoding`] for [`MeasuredCounterPairState`] that only writes the dec counter to the inner encoder. +struct Dec(T); + +impl Encoding for Inc { + type Err = T::Err; + + fn write_help(&mut self, name: impl MetricNameEncoder, help: &str) -> Result<(), Self::Err> { + self.0.write_help(name, help) + } + + fn write_metric_value( + &mut self, + name: impl MetricNameEncoder, + labels: impl LabelGroup, + value: MetricValue, + ) -> Result<(), Self::Err> { + self.0.write_metric_value(name, labels, value) + } +} + +impl MetricEncoding> for MeasuredCounterPairState +where + CounterState: MetricEncoding, +{ + fn write_type(name: impl MetricNameEncoder, enc: &mut Inc) -> Result<(), T::Err> { + CounterState::write_type(name, &mut enc.0) + } + fn collect_into( + &self, + metadata: &(), + labels: impl LabelGroup, + name: impl MetricNameEncoder, + enc: &mut Inc, + ) -> Result<(), T::Err> { + self.inc.collect_into(metadata, labels, name, &mut enc.0) + } +} + +impl Encoding for Dec { + type Err = T::Err; + + fn write_help(&mut self, name: impl MetricNameEncoder, help: &str) -> Result<(), Self::Err> { + self.0.write_help(name, help) + } + + fn write_metric_value( + &mut self, + name: impl MetricNameEncoder, + labels: impl LabelGroup, + value: MetricValue, + ) -> Result<(), Self::Err> { + self.0.write_metric_value(name, labels, value) + } +} + +/// Write the dec counter to the encoder +impl MetricEncoding> for MeasuredCounterPairState +where + CounterState: MetricEncoding, +{ + fn write_type(name: impl MetricNameEncoder, enc: &mut Dec) -> Result<(), T::Err> { + CounterState::write_type(name, &mut enc.0) + } + fn collect_into( + &self, + metadata: &(), + labels: impl LabelGroup, + name: impl MetricNameEncoder, + enc: &mut Dec, + ) -> Result<(), T::Err> { + self.dec.collect_into(metadata, labels, name, &mut enc.0) + } +} diff --git a/libs/pageserver_api/src/controller_api.rs b/libs/pageserver_api/src/controller_api.rs index e33bd0f486..1278f17ad2 100644 --- a/libs/pageserver_api/src/controller_api.rs +++ b/libs/pageserver_api/src/controller_api.rs @@ -2,9 +2,9 @@ use std::str::FromStr; /// Request/response types for the storage controller /// API (`/control/v1` prefix). Implemented by the server -/// in [`attachment_service::http`] +/// in [`storage_controller::http`] use serde::{Deserialize, Serialize}; -use utils::id::NodeId; +use utils::id::{NodeId, TenantId}; use crate::{ models::{ShardParameters, TenantConfig}, @@ -42,6 +42,12 @@ pub struct NodeConfigureRequest { pub scheduling: Option, } +#[derive(Serialize, Deserialize)] +pub struct TenantPolicyRequest { + pub placement: Option, + pub scheduling: Option, +} + #[derive(Serialize, Deserialize, Debug)] pub struct TenantLocateResponseShard { pub shard_id: TenantShardId, @@ -62,12 +68,27 @@ pub struct TenantLocateResponse { #[derive(Serialize, Deserialize)] pub struct TenantDescribeResponse { + pub tenant_id: TenantId, pub shards: Vec, pub stripe_size: ShardStripeSize, pub policy: PlacementPolicy, pub config: TenantConfig, } +#[derive(Serialize, Deserialize)] +pub struct NodeDescribeResponse { + pub id: NodeId, + + pub availability: NodeAvailabilityWrapper, + pub scheduling: NodeSchedulingPolicy, + + pub listen_http_addr: String, + pub listen_http_port: u16, + + pub listen_pg_addr: String, + pub listen_pg_port: u16, +} + #[derive(Serialize, Deserialize)] pub struct TenantDescribeResponseShard { pub tenant_shard_id: TenantShardId, @@ -83,6 +104,8 @@ pub struct TenantDescribeResponseShard { pub is_pending_compute_notification: bool, /// A shard split is currently underway pub is_splitting: bool, + + pub scheduling_policy: ShardSchedulingPolicy, } /// Explicitly migrating a particular shard is a low level operation @@ -97,7 +120,7 @@ pub struct TenantShardMigrateRequest { /// Utilisation score indicating how good a candidate a pageserver /// is for scheduling the next tenant. See [`crate::models::PageserverUtilization`]. /// Lower values are better. -#[derive(Serialize, Deserialize, Clone, Copy, Eq, PartialEq, PartialOrd, Ord)] +#[derive(Serialize, Deserialize, Clone, Copy, Eq, PartialEq, PartialOrd, Ord, Debug)] pub struct UtilizationScore(pub u64); impl UtilizationScore { @@ -106,7 +129,7 @@ impl UtilizationScore { } } -#[derive(Serialize, Clone, Copy)] +#[derive(Serialize, Deserialize, Clone, Copy, Debug)] #[serde(into = "NodeAvailabilityWrapper")] pub enum NodeAvailability { // Normal, happy state @@ -129,7 +152,7 @@ impl Eq for NodeAvailability {} // This wrapper provides serde functionality and it should only be used to // communicate with external callers which don't know or care about the // utilisation score of the pageserver it is targeting. -#[derive(Serialize, Deserialize, Clone)] +#[derive(Serialize, Deserialize, Clone, Copy, Debug)] pub enum NodeAvailabilityWrapper { Active, Offline, @@ -155,22 +178,33 @@ impl From for NodeAvailabilityWrapper { } } -impl FromStr for NodeAvailability { - type Err = anyhow::Error; +#[derive(Serialize, Deserialize, Clone, Copy, Eq, PartialEq, Debug)] +pub enum ShardSchedulingPolicy { + // Normal mode: the tenant's scheduled locations may be updated at will, including + // for non-essential optimization. + Active, - fn from_str(s: &str) -> Result { - match s { - // This is used when parsing node configuration requests from neon-local. - // Assume the worst possible utilisation score - // and let it get updated via the heartbeats. - "active" => Ok(Self::Active(UtilizationScore::worst())), - "offline" => Ok(Self::Offline), - _ => Err(anyhow::anyhow!("Unknown availability state '{s}'")), - } + // Disable optimizations, but permit scheduling when necessary to fulfil the PlacementPolicy. + // For example, this still permits a node's attachment location to change to a secondary in + // response to a node failure, or to assign a new secondary if a node was removed. + Essential, + + // No scheduling: leave the shard running wherever it currently is. Even if the shard is + // unavailable, it will not be rescheduled to another node. + Pause, + + // No reconciling: we will make no location_conf API calls to pageservers at all. If the + // shard is unavailable, it stays that way. If a node fails, this shard doesn't get failed over. + Stop, +} + +impl Default for ShardSchedulingPolicy { + fn default() -> Self { + Self::Active } } -#[derive(Serialize, Deserialize, Clone, Copy, Eq, PartialEq)] +#[derive(Serialize, Deserialize, Clone, Copy, Eq, PartialEq, Debug)] pub enum NodeSchedulingPolicy { Active, Filling, diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs index aad4cc97fc..f441d1ff1a 100644 --- a/libs/pageserver_api/src/models.rs +++ b/libs/pageserver_api/src/models.rs @@ -20,6 +20,7 @@ use utils::{ history_buffer::HistoryBufferWithDropCounter, id::{NodeId, TenantId, TimelineId}, lsn::Lsn, + serde_system_time, }; use crate::controller_api::PlacementPolicy; @@ -301,6 +302,7 @@ pub struct TenantConfig { pub heatmap_period: Option, pub lazy_slru_download: Option, pub timeline_get_throttle: Option, + pub image_layer_creation_check_threshold: Option, } #[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)] @@ -745,10 +747,18 @@ pub struct TimelineGcRequest { pub gc_horizon: Option, } +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct WalRedoManagerProcessStatus { + pub pid: u32, + /// The strum-generated `into::<&'static str>()` for `pageserver::walredo::ProcessKind`. + /// `ProcessKind` are a transitory thing, so, they have no enum representation in `pageserver_api`. + pub kind: Cow<'static, str>, +} + #[derive(Debug, Clone, Serialize, Deserialize)] pub struct WalRedoManagerStatus { pub last_redo_at: Option>, - pub pid: Option, + pub process: Option, } /// The progress of a secondary tenant is mostly useful when doing a long running download: e.g. initiating @@ -757,11 +767,7 @@ pub struct WalRedoManagerStatus { #[derive(Default, Debug, Serialize, Deserialize, Clone)] pub struct SecondaryProgress { /// The remote storage LastModified time of the heatmap object we last downloaded. - #[serde( - serialize_with = "opt_ser_rfc3339_millis", - deserialize_with = "opt_deser_rfc3339_millis" - )] - pub heatmap_mtime: Option, + pub heatmap_mtime: Option, /// The number of layers currently on-disk pub layers_downloaded: usize, @@ -774,29 +780,6 @@ pub struct SecondaryProgress { pub bytes_total: u64, } -fn opt_ser_rfc3339_millis( - ts: &Option, - serializer: S, -) -> Result { - match ts { - Some(ts) => serializer.collect_str(&humantime::format_rfc3339_millis(*ts)), - None => serializer.serialize_none(), - } -} - -fn opt_deser_rfc3339_millis<'de, D>(deserializer: D) -> Result, D::Error> -where - D: serde::de::Deserializer<'de>, -{ - let s: Option = serde::de::Deserialize::deserialize(deserializer)?; - match s { - None => Ok(None), - Some(s) => humantime::parse_rfc3339(&s) - .map_err(serde::de::Error::custom) - .map(Some), - } -} - pub mod virtual_file { #[derive( Copy, diff --git a/libs/pageserver_api/src/models/utilization.rs b/libs/pageserver_api/src/models/utilization.rs index f5984dff5d..e88cab5d6a 100644 --- a/libs/pageserver_api/src/models/utilization.rs +++ b/libs/pageserver_api/src/models/utilization.rs @@ -1,4 +1,4 @@ -use std::time::SystemTime; +use utils::serde_system_time::SystemTime; /// Pageserver current utilization and scoring for how good candidate the pageserver would be for /// the next tenant. @@ -21,28 +21,9 @@ pub struct PageserverUtilization { /// When was this snapshot captured, pageserver local time. /// /// Use millis to give confidence that the value is regenerated often enough. - #[serde( - serialize_with = "ser_rfc3339_millis", - deserialize_with = "deser_rfc3339_millis" - )] pub captured_at: SystemTime, } -fn ser_rfc3339_millis( - ts: &SystemTime, - serializer: S, -) -> Result { - serializer.collect_str(&humantime::format_rfc3339_millis(*ts)) -} - -fn deser_rfc3339_millis<'de, D>(deserializer: D) -> Result -where - D: serde::de::Deserializer<'de>, -{ - let s: String = serde::de::Deserialize::deserialize(deserializer)?; - humantime::parse_rfc3339(&s).map_err(serde::de::Error::custom) -} - /// openapi knows only `format: int64`, so avoid outputting a non-parseable value by generated clients. /// /// Instead of newtype, use this because a newtype would get require handling deserializing values @@ -69,7 +50,9 @@ mod tests { disk_usage_bytes: u64::MAX, free_space_bytes: 0, utilization_score: u64::MAX, - captured_at: SystemTime::UNIX_EPOCH + Duration::from_secs(1708509779), + captured_at: SystemTime( + std::time::SystemTime::UNIX_EPOCH + Duration::from_secs(1708509779), + ), }; let s = serde_json::to_string(&doc).unwrap(); diff --git a/libs/pageserver_api/src/shard.rs b/libs/pageserver_api/src/shard.rs index a2a9165184..c293ad705b 100644 --- a/libs/pageserver_api/src/shard.rs +++ b/libs/pageserver_api/src/shard.rs @@ -8,12 +8,89 @@ use hex::FromHex; use serde::{Deserialize, Serialize}; use utils::id::TenantId; +/// See docs/rfcs/031-sharding-static.md for an overview of sharding. +/// +/// This module contains a variety of types used to represent the concept of sharding +/// a Neon tenant across multiple physical shards. Since there are quite a few of these, +/// we provide an summary here. +/// +/// Types used to describe shards: +/// - [`ShardCount`] describes how many shards make up a tenant, plus the magic `unsharded` value +/// which identifies a tenant which is not shard-aware. This means its storage paths do not include +/// a shard suffix. +/// - [`ShardNumber`] is simply the zero-based index of a shard within a tenant. +/// - [`ShardIndex`] is the 2-tuple of `ShardCount` and `ShardNumber`, it's just like a `TenantShardId` +/// without the tenant ID. This is useful for things that are implicitly scoped to a particular +/// tenant, such as layer files. +/// - [`ShardIdentity`]` is the full description of a particular shard's parameters, in sufficient +/// detail to convert a [`Key`] to a [`ShardNumber`] when deciding where to write/read. +/// - The [`ShardSlug`] is a terse formatter for ShardCount and ShardNumber, written as +/// four hex digits. An unsharded tenant is `0000`. +/// - [`TenantShardId`] is the unique ID of a particular shard within a particular tenant +/// +/// Types used to describe the parameters for data distribution in a sharded tenant: +/// - [`ShardStripeSize`] controls how long contiguous runs of [`Key`]s (stripes) are when distributed across +/// multiple shards. Its value is given in 8kiB pages. +/// - [`ShardLayout`] describes the data distribution scheme, and at time of writing is +/// always zero: this is provided for future upgrades that might introduce different +/// data distribution schemes. +/// +/// Examples: +/// - A legacy unsharded tenant has one shard with ShardCount(0), ShardNumber(0), and its slug is 0000 +/// - A single sharded tenant has one shard with ShardCount(1), ShardNumber(0), and its slug is 0001 +/// - In a tenant with 4 shards, each shard has ShardCount(N), ShardNumber(i) where i in 0..N-1 (inclusive), +/// and their slugs are 0004, 0104, 0204, and 0304. + #[derive(Ord, PartialOrd, Eq, PartialEq, Clone, Copy, Serialize, Deserialize, Debug, Hash)] pub struct ShardNumber(pub u8); #[derive(Ord, PartialOrd, Eq, PartialEq, Clone, Copy, Serialize, Deserialize, Debug, Hash)] pub struct ShardCount(u8); +/// Combination of ShardNumber and ShardCount. For use within the context of a particular tenant, +/// when we need to know which shard we're dealing with, but do not need to know the full +/// ShardIdentity (because we won't be doing any page->shard mapping), and do not need to know +/// the fully qualified TenantShardId. +#[derive(Eq, PartialEq, PartialOrd, Ord, Clone, Copy, Hash)] +pub struct ShardIndex { + pub shard_number: ShardNumber, + pub shard_count: ShardCount, +} + +/// The ShardIdentity contains enough information to map a [`Key`] to a [`ShardNumber`], +/// and to check whether that [`ShardNumber`] is the same as the current shard. +#[derive(Clone, Copy, Serialize, Deserialize, Eq, PartialEq, Debug)] +pub struct ShardIdentity { + pub number: ShardNumber, + pub count: ShardCount, + pub stripe_size: ShardStripeSize, + layout: ShardLayout, +} + +/// Formatting helper, for generating the `shard_id` label in traces. +struct ShardSlug<'a>(&'a TenantShardId); + +/// TenantShardId globally identifies a particular shard in a particular tenant. +/// +/// These are written as `-`, for example: +/// # The second shard in a two-shard tenant +/// 072f1291a5310026820b2fe4b2968934-0102 +/// +/// If the `ShardCount` is _unsharded_, the `TenantShardId` is written without +/// a shard suffix and is equivalent to the encoding of a `TenantId`: this enables +/// an unsharded [`TenantShardId`] to be used interchangably with a [`TenantId`]. +/// +/// The human-readable encoding of an unsharded TenantShardId, such as used in API URLs, +/// is both forward and backward compatible with TenantId: a legacy TenantId can be +/// decoded as a TenantShardId, and when re-encoded it will be parseable +/// as a TenantId. +#[derive(Eq, PartialEq, PartialOrd, Ord, Clone, Copy, Hash)] +pub struct TenantShardId { + pub tenant_id: TenantId, + pub shard_number: ShardNumber, + pub shard_count: ShardCount, +} + impl ShardCount { pub const MAX: Self = Self(u8::MAX); @@ -38,6 +115,7 @@ impl ShardCount { self.0 } + /// pub fn is_unsharded(&self) -> bool { self.0 == 0 } @@ -53,33 +131,6 @@ impl ShardNumber { pub const MAX: Self = Self(u8::MAX); } -/// TenantShardId identify the units of work for the Pageserver. -/// -/// These are written as `-`, for example: -/// -/// # The second shard in a two-shard tenant -/// 072f1291a5310026820b2fe4b2968934-0102 -/// -/// Historically, tenants could not have multiple shards, and were identified -/// by TenantId. To support this, TenantShardId has a special legacy -/// mode where `shard_count` is equal to zero: this represents a single-sharded -/// tenant which should be written as a TenantId with no suffix. -/// -/// The human-readable encoding of TenantShardId, such as used in API URLs, -/// is both forward and backward compatible: a legacy TenantId can be -/// decoded as a TenantShardId, and when re-encoded it will be parseable -/// as a TenantId. -/// -/// Note that the binary encoding is _not_ backward compatible, because -/// at the time sharding is introduced, there are no existing binary structures -/// containing TenantId that we need to handle. -#[derive(Eq, PartialEq, PartialOrd, Ord, Clone, Copy, Hash)] -pub struct TenantShardId { - pub tenant_id: TenantId, - pub shard_number: ShardNumber, - pub shard_count: ShardCount, -} - impl TenantShardId { pub fn unsharded(tenant_id: TenantId) -> Self { Self { @@ -111,10 +162,13 @@ impl TenantShardId { } /// Convenience for code that has special behavior on the 0th shard. - pub fn is_zero(&self) -> bool { + pub fn is_shard_zero(&self) -> bool { self.shard_number == ShardNumber(0) } + /// The "unsharded" value is distinct from simply having a single shard: it represents + /// a tenant which is not shard-aware at all, and whose storage paths will not include + /// a shard suffix. pub fn is_unsharded(&self) -> bool { self.shard_number == ShardNumber(0) && self.shard_count.is_unsharded() } @@ -150,9 +204,6 @@ impl TenantShardId { } } -/// Formatting helper -struct ShardSlug<'a>(&'a TenantShardId); - impl<'a> std::fmt::Display for ShardSlug<'a> { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { write!( @@ -222,16 +273,6 @@ impl From<[u8; 18]> for TenantShardId { } } -/// For use within the context of a particular tenant, when we need to know which -/// shard we're dealing with, but do not need to know the full ShardIdentity (because -/// we won't be doing any page->shard mapping), and do not need to know the fully qualified -/// TenantShardId. -#[derive(Eq, PartialEq, PartialOrd, Ord, Clone, Copy, Hash)] -pub struct ShardIndex { - pub shard_number: ShardNumber, - pub shard_count: ShardCount, -} - impl ShardIndex { pub fn new(number: ShardNumber, count: ShardCount) -> Self { Self { @@ -246,6 +287,9 @@ impl ShardIndex { } } + /// The "unsharded" value is distinct from simply having a single shard: it represents + /// a tenant which is not shard-aware at all, and whose storage paths will not include + /// a shard suffix. pub fn is_unsharded(&self) -> bool { self.shard_number == ShardNumber(0) && self.shard_count == ShardCount(0) } @@ -313,6 +357,8 @@ impl Serialize for TenantShardId { if serializer.is_human_readable() { serializer.collect_str(self) } else { + // Note: while human encoding of [`TenantShardId`] is backward and forward + // compatible, this binary encoding is not. let mut packed: [u8; 18] = [0; 18]; packed[0..16].clone_from_slice(&self.tenant_id.as_arr()); packed[16] = self.shard_number.0; @@ -390,16 +436,6 @@ const LAYOUT_BROKEN: ShardLayout = ShardLayout(255); /// Default stripe size in pages: 256MiB divided by 8kiB page size. const DEFAULT_STRIPE_SIZE: ShardStripeSize = ShardStripeSize(256 * 1024 / 8); -/// The ShardIdentity contains the information needed for one member of map -/// to resolve a key to a shard, and then check whether that shard is ==self. -#[derive(Clone, Copy, Serialize, Deserialize, Eq, PartialEq, Debug)] -pub struct ShardIdentity { - pub number: ShardNumber, - pub count: ShardCount, - pub stripe_size: ShardStripeSize, - layout: ShardLayout, -} - #[derive(thiserror::Error, Debug, PartialEq, Eq)] pub enum ShardConfigError { #[error("Invalid shard count")] @@ -439,6 +475,9 @@ impl ShardIdentity { } } + /// The "unsharded" value is distinct from simply having a single shard: it represents + /// a tenant which is not shard-aware at all, and whose storage paths will not include + /// a shard suffix. pub fn is_unsharded(&self) -> bool { self.number == ShardNumber(0) && self.count == ShardCount(0) } @@ -487,6 +526,8 @@ impl ShardIdentity { } /// Return true if the key should be ingested by this shard + /// + /// Shards must ingest _at least_ keys which return true from this check. pub fn is_key_local(&self, key: &Key) -> bool { assert!(!self.is_broken()); if self.count < ShardCount(2) || (key_is_shard0(key) && self.number == ShardNumber(0)) { @@ -497,7 +538,9 @@ impl ShardIdentity { } /// Return true if the key should be discarded if found in this shard's - /// data store, e.g. during compaction after a split + /// data store, e.g. during compaction after a split. + /// + /// Shards _may_ drop keys which return false here, but are not obliged to. pub fn is_key_disposable(&self, key: &Key) -> bool { if key_is_shard0(key) { // Q: Why can't we dispose of shard0 content if we're not shard 0? @@ -523,7 +566,7 @@ impl ShardIdentity { /// Convenience for checking if this identity is the 0th shard in a tenant, /// for special cases on shard 0 such as ingesting relation sizes. - pub fn is_zero(&self) -> bool { + pub fn is_shard_zero(&self) -> bool { self.number == ShardNumber(0) } } diff --git a/libs/remote_storage/src/lib.rs b/libs/remote_storage/src/lib.rs index ab2035f19a..e708854be2 100644 --- a/libs/remote_storage/src/lib.rs +++ b/libs/remote_storage/src/lib.rs @@ -565,6 +565,16 @@ impl GenericRemoteStorage { #[derive(Debug, Clone, PartialEq, Eq)] pub struct StorageMetadata(HashMap); +impl From<[(&str, &str); N]> for StorageMetadata { + fn from(arr: [(&str, &str); N]) -> Self { + let map: HashMap = arr + .iter() + .map(|(k, v)| (k.to_string(), v.to_string())) + .collect(); + Self(map) + } +} + /// External backup storage configuration, enough for creating a client for that storage. #[derive(Debug, Clone, PartialEq, Eq)] pub struct RemoteStorageConfig { diff --git a/libs/remote_storage/tests/test_real_azure.rs b/libs/remote_storage/tests/test_real_azure.rs index 6adddf52a9..6aa02868e6 100644 --- a/libs/remote_storage/tests/test_real_azure.rs +++ b/libs/remote_storage/tests/test_real_azure.rs @@ -57,7 +57,6 @@ enum MaybeEnabledStorage { Disabled, } -#[async_trait::async_trait] impl AsyncTestContext for MaybeEnabledStorage { async fn setup() -> Self { ensure_logging_ready(); @@ -86,7 +85,6 @@ struct AzureWithTestBlobs { remote_blobs: HashSet, } -#[async_trait::async_trait] impl AsyncTestContext for MaybeEnabledStorageWithTestBlobs { async fn setup() -> Self { ensure_logging_ready(); @@ -148,7 +146,6 @@ struct AzureWithSimpleTestBlobs { remote_blobs: HashSet, } -#[async_trait::async_trait] impl AsyncTestContext for MaybeEnabledStorageWithSimpleTestBlobs { async fn setup() -> Self { ensure_logging_ready(); diff --git a/libs/remote_storage/tests/test_real_s3.rs b/libs/remote_storage/tests/test_real_s3.rs index bc5e40e70f..c5d5216f00 100644 --- a/libs/remote_storage/tests/test_real_s3.rs +++ b/libs/remote_storage/tests/test_real_s3.rs @@ -219,7 +219,6 @@ enum MaybeEnabledStorage { Disabled, } -#[async_trait::async_trait] impl AsyncTestContext for MaybeEnabledStorage { async fn setup() -> Self { ensure_logging_ready(); @@ -248,7 +247,6 @@ struct S3WithTestBlobs { remote_blobs: HashSet, } -#[async_trait::async_trait] impl AsyncTestContext for MaybeEnabledStorageWithTestBlobs { async fn setup() -> Self { ensure_logging_ready(); @@ -310,7 +308,6 @@ struct S3WithSimpleTestBlobs { remote_blobs: HashSet, } -#[async_trait::async_trait] impl AsyncTestContext for MaybeEnabledStorageWithSimpleTestBlobs { async fn setup() -> Self { ensure_logging_ready(); diff --git a/libs/utils/Cargo.toml b/libs/utils/Cargo.toml index c2d9d9d396..a6a081c5c1 100644 --- a/libs/utils/Cargo.toml +++ b/libs/utils/Cargo.toml @@ -22,6 +22,7 @@ camino.workspace = true chrono.workspace = true heapless.workspace = true hex = { workspace = true, features = ["serde"] } +humantime.workspace = true hyper = { workspace = true, features = ["full"] } fail.workspace = true futures = { workspace = true} diff --git a/libs/utils/src/env.rs b/libs/utils/src/env.rs new file mode 100644 index 0000000000..b3e326bfd0 --- /dev/null +++ b/libs/utils/src/env.rs @@ -0,0 +1,21 @@ +//! Wrapper around `std::env::var` for parsing environment variables. + +use std::{fmt::Display, str::FromStr}; + +pub fn var(varname: &str) -> Option +where + V: FromStr, + E: Display, +{ + match std::env::var(varname) { + Ok(s) => Some( + s.parse() + .map_err(|e| format!("failed to parse env var {varname}: {e:#}")) + .unwrap(), + ), + Err(std::env::VarError::NotPresent) => None, + Err(std::env::VarError::NotUnicode(_)) => { + panic!("env var {varname} is not unicode") + } + } +} diff --git a/libs/utils/src/lib.rs b/libs/utils/src/lib.rs index 04ce0626c8..2953f0aad4 100644 --- a/libs/utils/src/lib.rs +++ b/libs/utils/src/lib.rs @@ -63,6 +63,7 @@ pub mod measured_stream; pub mod serde_percent; pub mod serde_regex; +pub mod serde_system_time; pub mod pageserver_feedback; @@ -89,6 +90,10 @@ pub mod yielding_loop; pub mod zstd; +pub mod env; + +pub mod poison; + /// This is a shortcut to embed git sha into binaries and avoid copying the same build script to all packages /// /// we have several cases: diff --git a/libs/utils/src/poison.rs b/libs/utils/src/poison.rs new file mode 100644 index 0000000000..0bf5664f47 --- /dev/null +++ b/libs/utils/src/poison.rs @@ -0,0 +1,121 @@ +//! Protect a piece of state from reuse after it is left in an inconsistent state. +//! +//! # Example +//! +//! ``` +//! # tokio_test::block_on(async { +//! use utils::poison::Poison; +//! use std::time::Duration; +//! +//! struct State { +//! clean: bool, +//! } +//! let state = tokio::sync::Mutex::new(Poison::new("mystate", State { clean: true })); +//! +//! let mut mutex_guard = state.lock().await; +//! let mut poison_guard = mutex_guard.check_and_arm()?; +//! let state = poison_guard.data_mut(); +//! state.clean = false; +//! // If we get cancelled at this await point, subsequent check_and_arm() calls will fail. +//! tokio::time::sleep(Duration::from_secs(10)).await; +//! state.clean = true; +//! poison_guard.disarm(); +//! # Ok::<(), utils::poison::Error>(()) +//! # }); +//! ``` + +use tracing::warn; + +pub struct Poison { + what: &'static str, + state: State, + data: T, +} + +#[derive(Clone, Copy)] +enum State { + Clean, + Armed, + Poisoned { at: chrono::DateTime }, +} + +impl Poison { + /// We log `what` `warning!` level if the [`Guard`] gets dropped without being [`Guard::disarm`]ed. + pub fn new(what: &'static str, data: T) -> Self { + Self { + what, + state: State::Clean, + data, + } + } + + /// Check for poisoning and return a [`Guard`] that provides access to the wrapped state. + pub fn check_and_arm(&mut self) -> Result, Error> { + match self.state { + State::Clean => { + self.state = State::Armed; + Ok(Guard(self)) + } + State::Armed => unreachable!("transient state"), + State::Poisoned { at } => Err(Error::Poisoned { + what: self.what, + at, + }), + } + } +} + +/// Use [`Self::data`] and [`Self::data_mut`] to access the wrapped state. +/// Once modifications are done, use [`Self::disarm`]. +/// If [`Guard`] gets dropped instead of calling [`Self::disarm`], the state is poisoned +/// and subsequent calls to [`Poison::check_and_arm`] will fail with an error. +pub struct Guard<'a, T>(&'a mut Poison); + +impl<'a, T> Guard<'a, T> { + pub fn data(&self) -> &T { + &self.0.data + } + pub fn data_mut(&mut self) -> &mut T { + &mut self.0.data + } + + pub fn disarm(self) { + match self.0.state { + State::Clean => unreachable!("we set it to Armed in check_and_arm()"), + State::Armed => { + self.0.state = State::Clean; + } + State::Poisoned { at } => { + unreachable!("we fail check_and_arm() if it's in that state: {at}") + } + } + } +} + +impl<'a, T> Drop for Guard<'a, T> { + fn drop(&mut self) { + match self.0.state { + State::Clean => { + // set by disarm() + } + State::Armed => { + // still armed => poison it + let at = chrono::Utc::now(); + self.0.state = State::Poisoned { at }; + warn!(at=?at, "poisoning {}", self.0.what); + } + State::Poisoned { at } => { + unreachable!("we fail check_and_arm() if it's in that state: {at}") + } + } + } +} + +#[derive(thiserror::Error, Debug)] +pub enum Error { + #[error("poisoned at {at}: {what}")] + Poisoned { + what: &'static str, + at: chrono::DateTime, + }, +} diff --git a/libs/utils/src/seqwait.rs b/libs/utils/src/seqwait.rs index b7301776eb..0544c5be03 100644 --- a/libs/utils/src/seqwait.rs +++ b/libs/utils/src/seqwait.rs @@ -182,6 +182,18 @@ where } } + /// Check if [`Self::wait_for`] or [`Self::wait_for_timeout`] would wait if called with `num`. + pub fn would_wait_for(&self, num: V) -> Result<(), V> { + let internal = self.internal.lock().unwrap(); + let cnt = internal.current.cnt_value(); + drop(internal); + if cnt >= num { + Ok(()) + } else { + Err(cnt) + } + } + /// Register and return a channel that will be notified when a number arrives, /// or None, if it has already arrived. fn queue_for_wait(&self, num: V) -> Result>, SeqWaitError> { diff --git a/libs/utils/src/serde_system_time.rs b/libs/utils/src/serde_system_time.rs new file mode 100644 index 0000000000..b0f6934e87 --- /dev/null +++ b/libs/utils/src/serde_system_time.rs @@ -0,0 +1,55 @@ +//! A `serde::{Deserialize,Serialize}` type for SystemTime with RFC3339 format and millisecond precision. + +#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, serde::Serialize, serde::Deserialize)] +#[serde(transparent)] +pub struct SystemTime( + #[serde( + deserialize_with = "deser_rfc3339_millis", + serialize_with = "ser_rfc3339_millis" + )] + pub std::time::SystemTime, +); + +fn ser_rfc3339_millis( + ts: &std::time::SystemTime, + serializer: S, +) -> Result { + serializer.collect_str(&humantime::format_rfc3339_millis(*ts)) +} + +fn deser_rfc3339_millis<'de, D>(deserializer: D) -> Result +where + D: serde::de::Deserializer<'de>, +{ + let s: String = serde::de::Deserialize::deserialize(deserializer)?; + humantime::parse_rfc3339(&s).map_err(serde::de::Error::custom) +} + +#[cfg(test)] +mod tests { + use super::*; + + /// Helper function to make a SystemTime have millisecond precision by truncating additional nanoseconds. + fn to_millisecond_precision(time: SystemTime) -> SystemTime { + match time.0.duration_since(std::time::SystemTime::UNIX_EPOCH) { + Ok(duration) => { + let total_millis = duration.as_secs() * 1_000 + u64::from(duration.subsec_millis()); + SystemTime( + std::time::SystemTime::UNIX_EPOCH + + std::time::Duration::from_millis(total_millis), + ) + } + Err(_) => time, + } + } + + #[test] + fn test_serialize_deserialize() { + let input = SystemTime(std::time::SystemTime::now()); + let expected_serialized = format!("\"{}\"", humantime::format_rfc3339_millis(input.0)); + let serialized = serde_json::to_string(&input).unwrap(); + assert_eq!(expected_serialized, serialized); + let deserialized: SystemTime = serde_json::from_str(&expected_serialized).unwrap(); + assert_eq!(to_millisecond_precision(input), deserialized); + } +} diff --git a/pageserver/benches/bench_walredo.rs b/pageserver/benches/bench_walredo.rs index 3efad546a6..5b871c5d5e 100644 --- a/pageserver/benches/bench_walredo.rs +++ b/pageserver/benches/bench_walredo.rs @@ -27,30 +27,50 @@ //! //! # Reference Numbers //! -//! 2024-03-20 on i3en.3xlarge +//! 2024-04-15 on i3en.3xlarge //! //! ```text -//! short/1 time: [26.483 µs 26.614 µs 26.767 µs] -//! short/2 time: [32.223 µs 32.465 µs 32.767 µs] -//! short/4 time: [47.203 µs 47.583 µs 47.984 µs] -//! short/8 time: [89.135 µs 89.612 µs 90.139 µs] -//! short/16 time: [190.12 µs 191.52 µs 192.88 µs] -//! short/32 time: [380.96 µs 382.63 µs 384.20 µs] -//! short/64 time: [736.86 µs 741.07 µs 745.03 µs] -//! short/128 time: [1.4106 ms 1.4206 ms 1.4294 ms] -//! medium/1 time: [111.81 µs 112.25 µs 112.79 µs] -//! medium/2 time: [158.26 µs 159.13 µs 160.21 µs] -//! medium/4 time: [334.65 µs 337.14 µs 340.07 µs] -//! medium/8 time: [675.32 µs 679.91 µs 685.25 µs] -//! medium/16 time: [1.2929 ms 1.2996 ms 1.3067 ms] -//! medium/32 time: [2.4295 ms 2.4461 ms 2.4623 ms] -//! medium/64 time: [4.3973 ms 4.4458 ms 4.4875 ms] -//! medium/128 time: [7.5955 ms 7.7847 ms 7.9481 ms] +//! async-short/1 time: [24.584 µs 24.737 µs 24.922 µs] +//! async-short/2 time: [33.479 µs 33.660 µs 33.888 µs] +//! async-short/4 time: [42.713 µs 43.046 µs 43.440 µs] +//! async-short/8 time: [71.814 µs 72.478 µs 73.240 µs] +//! async-short/16 time: [132.73 µs 134.45 µs 136.22 µs] +//! async-short/32 time: [258.31 µs 260.73 µs 263.27 µs] +//! async-short/64 time: [511.61 µs 514.44 µs 517.51 µs] +//! async-short/128 time: [992.64 µs 998.23 µs 1.0042 ms] +//! async-medium/1 time: [110.11 µs 110.50 µs 110.96 µs] +//! async-medium/2 time: [153.06 µs 153.85 µs 154.99 µs] +//! async-medium/4 time: [317.51 µs 319.92 µs 322.85 µs] +//! async-medium/8 time: [638.30 µs 644.68 µs 652.12 µs] +//! async-medium/16 time: [1.2651 ms 1.2773 ms 1.2914 ms] +//! async-medium/32 time: [2.5117 ms 2.5410 ms 2.5720 ms] +//! async-medium/64 time: [4.8088 ms 4.8555 ms 4.9047 ms] +//! async-medium/128 time: [8.8311 ms 8.9849 ms 9.1263 ms] +//! sync-short/1 time: [25.503 µs 25.626 µs 25.771 µs] +//! sync-short/2 time: [30.850 µs 31.013 µs 31.208 µs] +//! sync-short/4 time: [45.543 µs 45.856 µs 46.193 µs] +//! sync-short/8 time: [84.114 µs 84.639 µs 85.220 µs] +//! sync-short/16 time: [185.22 µs 186.15 µs 187.13 µs] +//! sync-short/32 time: [377.43 µs 378.87 µs 380.46 µs] +//! sync-short/64 time: [756.49 µs 759.04 µs 761.70 µs] +//! sync-short/128 time: [1.4825 ms 1.4874 ms 1.4923 ms] +//! sync-medium/1 time: [105.66 µs 106.01 µs 106.43 µs] +//! sync-medium/2 time: [153.10 µs 153.84 µs 154.72 µs] +//! sync-medium/4 time: [327.13 µs 329.44 µs 332.27 µs] +//! sync-medium/8 time: [654.26 µs 658.73 µs 663.63 µs] +//! sync-medium/16 time: [1.2682 ms 1.2748 ms 1.2816 ms] +//! sync-medium/32 time: [2.4456 ms 2.4595 ms 2.4731 ms] +//! sync-medium/64 time: [4.6523 ms 4.6890 ms 4.7256 ms] +//! sync-medium/128 time: [8.7215 ms 8.8323 ms 8.9344 ms] //! ``` use bytes::{Buf, Bytes}; use criterion::{BenchmarkId, Criterion}; -use pageserver::{config::PageServerConf, walrecord::NeonWalRecord, walredo::PostgresRedoManager}; +use pageserver::{ + config::PageServerConf, + walrecord::NeonWalRecord, + walredo::{PostgresRedoManager, ProcessKind}, +}; use pageserver_api::{key::Key, shard::TenantShardId}; use std::{ sync::Arc, @@ -60,33 +80,39 @@ use tokio::{sync::Barrier, task::JoinSet}; use utils::{id::TenantId, lsn::Lsn}; fn bench(c: &mut Criterion) { - { - let nclients = [1, 2, 4, 8, 16, 32, 64, 128]; - for nclients in nclients { - let mut group = c.benchmark_group("short"); - group.bench_with_input( - BenchmarkId::from_parameter(nclients), - &nclients, - |b, nclients| { - let redo_work = Arc::new(Request::short_input()); - b.iter_custom(|iters| bench_impl(Arc::clone(&redo_work), iters, *nclients)); - }, - ); + for process_kind in &[ProcessKind::Async, ProcessKind::Sync] { + { + let nclients = [1, 2, 4, 8, 16, 32, 64, 128]; + for nclients in nclients { + let mut group = c.benchmark_group(format!("{process_kind}-short")); + group.bench_with_input( + BenchmarkId::from_parameter(nclients), + &nclients, + |b, nclients| { + let redo_work = Arc::new(Request::short_input()); + b.iter_custom(|iters| { + bench_impl(*process_kind, Arc::clone(&redo_work), iters, *nclients) + }); + }, + ); + } } - } - { - let nclients = [1, 2, 4, 8, 16, 32, 64, 128]; - for nclients in nclients { - let mut group = c.benchmark_group("medium"); - group.bench_with_input( - BenchmarkId::from_parameter(nclients), - &nclients, - |b, nclients| { - let redo_work = Arc::new(Request::medium_input()); - b.iter_custom(|iters| bench_impl(Arc::clone(&redo_work), iters, *nclients)); - }, - ); + { + let nclients = [1, 2, 4, 8, 16, 32, 64, 128]; + for nclients in nclients { + let mut group = c.benchmark_group(format!("{process_kind}-medium")); + group.bench_with_input( + BenchmarkId::from_parameter(nclients), + &nclients, + |b, nclients| { + let redo_work = Arc::new(Request::medium_input()); + b.iter_custom(|iters| { + bench_impl(*process_kind, Arc::clone(&redo_work), iters, *nclients) + }); + }, + ); + } } } } @@ -94,10 +120,16 @@ criterion::criterion_group!(benches, bench); criterion::criterion_main!(benches); // Returns the sum of each client's wall-clock time spent executing their share of the n_redos. -fn bench_impl(redo_work: Arc, n_redos: u64, nclients: u64) -> Duration { +fn bench_impl( + process_kind: ProcessKind, + redo_work: Arc, + n_redos: u64, + nclients: u64, +) -> Duration { let repo_dir = camino_tempfile::tempdir_in(env!("CARGO_TARGET_TMPDIR")).unwrap(); - let conf = PageServerConf::dummy_conf(repo_dir.path().to_path_buf()); + let mut conf = PageServerConf::dummy_conf(repo_dir.path().to_path_buf()); + conf.walredo_process_kind = process_kind; let conf = Box::leak(Box::new(conf)); let tenant_shard_id = TenantShardId::unsharded(TenantId::generate()); @@ -113,25 +145,40 @@ fn bench_impl(redo_work: Arc, n_redos: u64, nclients: u64) -> Duration let manager = PostgresRedoManager::new(conf, tenant_shard_id); let manager = Arc::new(manager); + // divide the amount of work equally among the clients. + let nredos_per_client = n_redos / nclients; for _ in 0..nclients { rt.block_on(async { tasks.spawn(client( Arc::clone(&manager), Arc::clone(&start), Arc::clone(&redo_work), - // divide the amount of work equally among the clients - n_redos / nclients, + nredos_per_client, )) }); } - rt.block_on(async move { - let mut total_wallclock_time = std::time::Duration::from_millis(0); + let elapsed = rt.block_on(async move { + let mut total_wallclock_time = Duration::ZERO; while let Some(res) = tasks.join_next().await { total_wallclock_time += res.unwrap(); } total_wallclock_time - }) + }); + + // consistency check to ensure process kind setting worked + if nredos_per_client > 0 { + assert_eq!( + manager + .status() + .process + .map(|p| p.kind) + .expect("the benchmark work causes a walredo process to be spawned"), + std::borrow::Cow::Borrowed(process_kind.into()) + ); + } + + elapsed } async fn client( diff --git a/pageserver/client/src/mgmt_api.rs b/pageserver/client/src/mgmt_api.rs index ab55d2b0a3..3c9982ffb8 100644 --- a/pageserver/client/src/mgmt_api.rs +++ b/pageserver/client/src/mgmt_api.rs @@ -128,12 +128,12 @@ impl Client { pub async fn timeline_info( &self, - tenant_id: TenantId, + tenant_shard_id: TenantShardId, timeline_id: TimelineId, force_await_logical_size: ForceAwaitLogicalSize, ) -> Result { let uri = format!( - "{}/v1/tenant/{tenant_id}/timeline/{timeline_id}", + "{}/v1/tenant/{tenant_shard_id}/timeline/{timeline_id}", self.mgmt_api_endpoint ); @@ -151,11 +151,11 @@ impl Client { pub async fn keyspace( &self, - tenant_id: TenantId, + tenant_shard_id: TenantShardId, timeline_id: TimelineId, ) -> Result { let uri = format!( - "{}/v1/tenant/{tenant_id}/timeline/{timeline_id}/keyspace", + "{}/v1/tenant/{tenant_shard_id}/timeline/{timeline_id}/keyspace", self.mgmt_api_endpoint ); self.get(&uri) diff --git a/pageserver/compaction/Cargo.toml b/pageserver/compaction/Cargo.toml index 47f318db63..0fd1d81845 100644 --- a/pageserver/compaction/Cargo.toml +++ b/pageserver/compaction/Cargo.toml @@ -11,7 +11,6 @@ default = [] anyhow.workspace = true async-compression.workspace = true async-stream.workspace = true -async-trait.workspace = true byteorder.workspace = true bytes.workspace = true chrono = { workspace = true, features = ["serde"] } diff --git a/pageserver/compaction/src/compact_tiered.rs b/pageserver/compaction/src/compact_tiered.rs index 60fc7ac925..5261746b22 100644 --- a/pageserver/compaction/src/compact_tiered.rs +++ b/pageserver/compaction/src/compact_tiered.rs @@ -43,7 +43,8 @@ pub async fn compact_tiered( fanout: u64, ctx: &E::RequestContext, ) -> anyhow::Result<()> { - assert!(fanout >= 2); + assert!(fanout >= 1, "fanout needs to be at least 1 but is {fanout}"); + let exp_base = fanout.max(2); // Start at L0 let mut current_level_no = 0; let mut current_level_target_height = target_file_size; @@ -106,7 +107,7 @@ pub async fn compact_tiered( break; } current_level_no += 1; - current_level_target_height = current_level_target_height.saturating_mul(fanout); + current_level_target_height = current_level_target_height.saturating_mul(exp_base); } Ok(()) } diff --git a/pageserver/compaction/src/helpers.rs b/pageserver/compaction/src/helpers.rs index 22a410b4af..9de6363d6e 100644 --- a/pageserver/compaction/src/helpers.rs +++ b/pageserver/compaction/src/helpers.rs @@ -180,7 +180,7 @@ where match top.deref_mut() { LazyLoadLayer::Unloaded(ref mut l) => { let fut = l.load_keys(this.ctx); - this.load_future.set(Some(fut)); + this.load_future.set(Some(Box::pin(fut))); continue; } LazyLoadLayer::Loaded(ref mut entries) => { diff --git a/pageserver/compaction/src/interface.rs b/pageserver/compaction/src/interface.rs index 2bb2e749c0..5dc62e506f 100644 --- a/pageserver/compaction/src/interface.rs +++ b/pageserver/compaction/src/interface.rs @@ -3,7 +3,6 @@ //! //! All the heavy lifting is done by the create_image and create_delta //! functions that the implementor provides. -use async_trait::async_trait; use futures::Future; use pageserver_api::{key::Key, keyspace::key_range_size}; use std::ops::Range; @@ -141,18 +140,16 @@ pub trait CompactionLayer { fn is_delta(&self) -> bool; } - -#[async_trait] pub trait CompactionDeltaLayer: CompactionLayer { type DeltaEntry<'a>: CompactionDeltaEntry<'a, E::Key> where Self: 'a; /// Return all keys in this delta layer. - async fn load_keys<'a>( + fn load_keys<'a>( &self, ctx: &E::RequestContext, - ) -> anyhow::Result>>; + ) -> impl Future>>> + Send; } pub trait CompactionImageLayer: CompactionLayer {} diff --git a/pageserver/compaction/src/simulator.rs b/pageserver/compaction/src/simulator.rs index def7983e75..6c00df3a65 100644 --- a/pageserver/compaction/src/simulator.rs +++ b/pageserver/compaction/src/simulator.rs @@ -2,7 +2,6 @@ mod draw; use draw::{LayerTraceEvent, LayerTraceFile, LayerTraceOp}; -use async_trait::async_trait; use futures::StreamExt; use rand::Rng; use tracing::info; @@ -139,7 +138,6 @@ impl interface::CompactionLayer for Arc { } } -#[async_trait] impl interface::CompactionDeltaLayer for Arc { type DeltaEntry<'a> = MockRecord; diff --git a/pageserver/ctl/Cargo.toml b/pageserver/ctl/Cargo.toml index c5cd451e8d..843f5dd862 100644 --- a/pageserver/ctl/Cargo.toml +++ b/pageserver/ctl/Cargo.toml @@ -12,9 +12,14 @@ bytes.workspace = true camino.workspace = true clap = { workspace = true, features = ["string"] } git-version.workspace = true +humantime.workspace = true pageserver = { path = ".." } +pageserver_api.workspace = true +remote_storage = { path = "../../libs/remote_storage" } postgres_ffi.workspace = true tokio.workspace = true +tokio-util.workspace = true +toml_edit.workspace = true utils.workspace = true svg_fmt.workspace = true workspace_hack.workspace = true diff --git a/pageserver/ctl/src/main.rs b/pageserver/ctl/src/main.rs index e73d961e36..1fb75584fc 100644 --- a/pageserver/ctl/src/main.rs +++ b/pageserver/ctl/src/main.rs @@ -9,6 +9,11 @@ mod index_part; mod layer_map_analyzer; mod layers; +use std::{ + str::FromStr, + time::{Duration, SystemTime}, +}; + use camino::{Utf8Path, Utf8PathBuf}; use clap::{Parser, Subcommand}; use index_part::IndexPartCmd; @@ -20,8 +25,16 @@ use pageserver::{ tenant::{dump_layerfile_from_path, metadata::TimelineMetadata}, virtual_file, }; +use pageserver_api::shard::TenantShardId; use postgres_ffi::ControlFileData; -use utils::{lsn::Lsn, project_git_version}; +use remote_storage::{RemotePath, RemoteStorageConfig}; +use tokio_util::sync::CancellationToken; +use utils::{ + id::TimelineId, + logging::{self, LogFormat, TracingErrorLayerEnablement}, + lsn::Lsn, + project_git_version, +}; project_git_version!(GIT_VERSION); @@ -43,6 +56,7 @@ enum Commands { #[command(subcommand)] IndexPart(IndexPartCmd), PrintLayerFile(PrintLayerFileCmd), + TimeTravelRemotePrefix(TimeTravelRemotePrefixCmd), DrawTimeline {}, AnalyzeLayerMap(AnalyzeLayerMapCmd), #[command(subcommand)] @@ -68,6 +82,26 @@ struct PrintLayerFileCmd { path: Utf8PathBuf, } +/// Roll back the time for the specified prefix using S3 history. +/// +/// The command is fairly low level and powerful. Validation is only very light, +/// so it is more powerful, and thus potentially more dangerous. +#[derive(Parser)] +struct TimeTravelRemotePrefixCmd { + /// A configuration string for the remote_storage configuration. + /// + /// Example: `remote_storage = { bucket_name = "aws-storage-bucket-name", bucket_region = "us-east-2" }` + config_toml_str: String, + /// remote prefix to time travel recover. For safety reasons, we require it to contain + /// a timeline or tenant ID in the prefix. + prefix: String, + /// Timestamp to travel to. Given in format like `2024-01-20T10:45:45Z`. Assumes UTC and second accuracy. + travel_to: String, + /// Timestamp of the start of the operation, must be after any changes we want to roll back and after. + /// You can use a few seconds before invoking the command. Same format as `travel_to`. + done_if_after: Option, +} + #[derive(Parser)] struct AnalyzeLayerMapCmd { /// Pageserver data path @@ -78,6 +112,14 @@ struct AnalyzeLayerMapCmd { #[tokio::main] async fn main() -> anyhow::Result<()> { + logging::init( + LogFormat::Plain, + TracingErrorLayerEnablement::EnableWithRustLogFilter, + logging::Output::Stdout, + )?; + + logging::replace_panic_hook_with_tracing_panic_hook().forget(); + let cli = CliOpts::parse(); match cli.command { @@ -105,6 +147,42 @@ async fn main() -> anyhow::Result<()> { print_layerfile(&cmd.path).await?; } } + Commands::TimeTravelRemotePrefix(cmd) => { + let timestamp = humantime::parse_rfc3339(&cmd.travel_to) + .map_err(|_e| anyhow::anyhow!("Invalid time for travel_to: '{}'", cmd.travel_to))?; + + let done_if_after = if let Some(done_if_after) = &cmd.done_if_after { + humantime::parse_rfc3339(done_if_after).map_err(|_e| { + anyhow::anyhow!("Invalid time for done_if_after: '{}'", done_if_after) + })? + } else { + const SAFETY_MARGIN: Duration = Duration::from_secs(3); + tokio::time::sleep(SAFETY_MARGIN).await; + // Convert to string representation and back to get rid of sub-second values + let done_if_after = SystemTime::now(); + tokio::time::sleep(SAFETY_MARGIN).await; + done_if_after + }; + + let timestamp = strip_subsecond(timestamp); + let done_if_after = strip_subsecond(done_if_after); + + let Some(prefix) = validate_prefix(&cmd.prefix) else { + println!("specified prefix '{}' failed validation", cmd.prefix); + return Ok(()); + }; + let toml_document = toml_edit::Document::from_str(&cmd.config_toml_str)?; + let toml_item = toml_document + .get("remote_storage") + .expect("need remote_storage"); + let config = RemoteStorageConfig::from_toml(toml_item)?.expect("incomplete config"); + let storage = remote_storage::GenericRemoteStorage::from_config(&config); + let cancel = CancellationToken::new(); + storage + .unwrap() + .time_travel_recover(Some(&prefix), timestamp, done_if_after, &cancel) + .await?; + } }; Ok(()) } @@ -185,3 +263,89 @@ fn handle_metadata( Ok(()) } + +/// Ensures that the given S3 prefix is sufficiently constrained. +/// The command is very risky already and we don't want to expose something +/// that allows usually unintentional and quite catastrophic time travel of +/// an entire bucket, which would be a major catastrophy and away +/// by only one character change (similar to "rm -r /home /username/foobar"). +fn validate_prefix(prefix: &str) -> Option { + if prefix.is_empty() { + // Empty prefix means we want to specify the *whole* bucket + return None; + } + let components = prefix.split('/').collect::>(); + let (last, components) = { + let last = components.last()?; + if last.is_empty() { + ( + components.iter().nth_back(1)?, + &components[..(components.len() - 1)], + ) + } else { + (last, &components[..]) + } + }; + 'valid: { + if let Ok(_timeline_id) = TimelineId::from_str(last) { + // Ends in either a tenant or timeline ID + break 'valid; + } + if *last == "timelines" { + if let Some(before_last) = components.iter().nth_back(1) { + if let Ok(_tenant_id) = TenantShardId::from_str(before_last) { + // Has a valid tenant id + break 'valid; + } + } + } + + return None; + } + RemotePath::from_string(prefix).ok() +} + +fn strip_subsecond(timestamp: SystemTime) -> SystemTime { + let ts_str = humantime::format_rfc3339_seconds(timestamp).to_string(); + humantime::parse_rfc3339(&ts_str).expect("can't parse just created timestamp") +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_validate_prefix() { + assert_eq!(validate_prefix(""), None); + assert_eq!(validate_prefix("/"), None); + #[track_caller] + fn assert_valid(prefix: &str) { + let remote_path = RemotePath::from_string(prefix).unwrap(); + assert_eq!(validate_prefix(prefix), Some(remote_path)); + } + assert_valid("wal/3aa8fcc61f6d357410b7de754b1d9001/641e5342083b2235ee3deb8066819683/"); + // Path is not relative but absolute + assert_eq!( + validate_prefix( + "/wal/3aa8fcc61f6d357410b7de754b1d9001/641e5342083b2235ee3deb8066819683/" + ), + None + ); + assert_valid("wal/3aa8fcc61f6d357410b7de754b1d9001/"); + // Partial tenant IDs should be invalid, S3 will match all tenants with the specific ID prefix + assert_eq!(validate_prefix("wal/3aa8fcc61f6d357410b7d"), None); + assert_eq!(validate_prefix("wal"), None); + assert_eq!(validate_prefix("/wal/"), None); + assert_valid("pageserver/v1/tenants/3aa8fcc61f6d357410b7de754b1d9001"); + // Partial tenant ID + assert_eq!( + validate_prefix("pageserver/v1/tenants/3aa8fcc61f6d357410b"), + None + ); + assert_valid("pageserver/v1/tenants/3aa8fcc61f6d357410b7de754b1d9001/timelines"); + assert_valid("pageserver/v1/tenants/3aa8fcc61f6d357410b7de754b1d9001-0004/timelines"); + assert_valid("pageserver/v1/tenants/3aa8fcc61f6d357410b7de754b1d9001/timelines/"); + assert_valid("pageserver/v1/tenants/3aa8fcc61f6d357410b7de754b1d9001/timelines/641e5342083b2235ee3deb8066819683"); + assert_eq!(validate_prefix("pageserver/v1/tenants/"), None); + } +} diff --git a/pageserver/pagebench/src/cmd/basebackup.rs b/pageserver/pagebench/src/cmd/basebackup.rs index 55844be041..3ae6d99aa7 100644 --- a/pageserver/pagebench/src/cmd/basebackup.rs +++ b/pageserver/pagebench/src/cmd/basebackup.rs @@ -1,4 +1,5 @@ use anyhow::Context; +use pageserver_api::shard::TenantShardId; use pageserver_client::mgmt_api::ForceAwaitLogicalSize; use pageserver_client::page_service::BasebackupRequest; @@ -95,7 +96,7 @@ async fn main_impl( let timeline = *timeline; let info = mgmt_api_client .timeline_info( - timeline.tenant_id, + TenantShardId::unsharded(timeline.tenant_id), timeline.timeline_id, ForceAwaitLogicalSize::No, ) diff --git a/pageserver/pagebench/src/cmd/getpage_latest_lsn.rs b/pageserver/pagebench/src/cmd/getpage_latest_lsn.rs index 2838511a77..c3d8e61a2c 100644 --- a/pageserver/pagebench/src/cmd/getpage_latest_lsn.rs +++ b/pageserver/pagebench/src/cmd/getpage_latest_lsn.rs @@ -4,6 +4,7 @@ use pageserver_api::key::{is_rel_block_key, key_to_rel_block, Key}; use pageserver_api::keyspace::KeySpaceAccum; use pageserver_api::models::PagestreamGetPageRequest; +use pageserver_api::shard::TenantShardId; use tokio_util::sync::CancellationToken; use utils::id::TenantTimelineId; use utils::lsn::Lsn; @@ -173,7 +174,10 @@ async fn main_impl( let timeline = *timeline; async move { let partitioning = mgmt_api_client - .keyspace(timeline.tenant_id, timeline.timeline_id) + .keyspace( + TenantShardId::unsharded(timeline.tenant_id), + timeline.timeline_id, + ) .await?; let lsn = partitioning.at_lsn; let start = Instant::now(); diff --git a/pageserver/pagebench/src/cmd/trigger_initial_size_calculation.rs b/pageserver/pagebench/src/cmd/trigger_initial_size_calculation.rs index 98938d780a..f07beeecfd 100644 --- a/pageserver/pagebench/src/cmd/trigger_initial_size_calculation.rs +++ b/pageserver/pagebench/src/cmd/trigger_initial_size_calculation.rs @@ -1,6 +1,7 @@ use std::sync::Arc; use humantime::Duration; +use pageserver_api::shard::TenantShardId; use tokio::task::JoinSet; use utils::id::TenantTimelineId; @@ -59,7 +60,11 @@ async fn main_impl(args: Args) -> anyhow::Result<()> { let mgmt_api_client = Arc::clone(&mgmt_api_client); js.spawn(async move { let info = mgmt_api_client - .timeline_info(tl.tenant_id, tl.timeline_id, ForceAwaitLogicalSize::Yes) + .timeline_info( + TenantShardId::unsharded(tl.tenant_id), + tl.timeline_id, + ForceAwaitLogicalSize::Yes, + ) .await .unwrap(); @@ -74,7 +79,11 @@ async fn main_impl(args: Args) -> anyhow::Result<()> { while !info.current_logical_size_is_accurate { ticker.tick().await; info = mgmt_api_client - .timeline_info(tl.tenant_id, tl.timeline_id, ForceAwaitLogicalSize::Yes) + .timeline_info( + TenantShardId::unsharded(tl.tenant_id), + tl.timeline_id, + ForceAwaitLogicalSize::Yes, + ) .await .unwrap(); } diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs index c80230d4d7..41835f9843 100644 --- a/pageserver/src/bin/pageserver.rs +++ b/pageserver/src/bin/pageserver.rs @@ -18,6 +18,7 @@ use pageserver::metrics::{STARTUP_DURATION, STARTUP_IS_LOADING}; use pageserver::task_mgr::WALRECEIVER_RUNTIME; use pageserver::tenant::{secondary, TenantSharedResources}; use remote_storage::GenericRemoteStorage; +use tokio::signal::unix::SignalKind; use tokio::time::Instant; use tracing::*; @@ -284,6 +285,7 @@ fn start_pageserver( )) .unwrap(); pageserver::preinitialize_metrics(); + pageserver::metrics::wal_redo::set_process_kind_metric(conf.walredo_process_kind); // If any failpoints were set from FAILPOINTS environment variable, // print them to the log for debugging purposes @@ -671,42 +673,37 @@ fn start_pageserver( let mut shutdown_pageserver = Some(shutdown_pageserver.drop_guard()); // All started up! Now just sit and wait for shutdown signal. - { - use signal_hook::consts::*; - let signal_handler = BACKGROUND_RUNTIME.spawn_blocking(move || { - let mut signals = - signal_hook::iterator::Signals::new([SIGINT, SIGTERM, SIGQUIT]).unwrap(); - return signals - .forever() - .next() - .expect("forever() never returns None unless explicitly closed"); - }); - let signal = BACKGROUND_RUNTIME - .block_on(signal_handler) - .expect("join error"); - match signal { - SIGQUIT => { - info!("Got signal {signal}. Terminating in immediate shutdown mode",); - std::process::exit(111); - } - SIGINT | SIGTERM => { - info!("Got signal {signal}. Terminating gracefully in fast shutdown mode",); - // This cancels the `shutdown_pageserver` cancellation tree. - // Right now that tree doesn't reach very far, and `task_mgr` is used instead. - // The plan is to change that over time. - shutdown_pageserver.take(); - let bg_remote_storage = remote_storage.clone(); - let bg_deletion_queue = deletion_queue.clone(); - BACKGROUND_RUNTIME.block_on(pageserver::shutdown_pageserver( - &tenant_manager, - bg_remote_storage.map(|_| bg_deletion_queue), - 0, - )); - unreachable!() - } - _ => unreachable!(), - } + { + BACKGROUND_RUNTIME.block_on(async move { + let mut sigint = tokio::signal::unix::signal(SignalKind::interrupt()).unwrap(); + let mut sigterm = tokio::signal::unix::signal(SignalKind::terminate()).unwrap(); + let mut sigquit = tokio::signal::unix::signal(SignalKind::quit()).unwrap(); + let signal = tokio::select! { + _ = sigquit.recv() => { + info!("Got signal SIGQUIT. Terminating in immediate shutdown mode",); + std::process::exit(111); + } + _ = sigint.recv() => { "SIGINT" }, + _ = sigterm.recv() => { "SIGTERM" }, + }; + + info!("Got signal {signal}. Terminating gracefully in fast shutdown mode",); + + // This cancels the `shutdown_pageserver` cancellation tree. + // Right now that tree doesn't reach very far, and `task_mgr` is used instead. + // The plan is to change that over time. + shutdown_pageserver.take(); + let bg_remote_storage = remote_storage.clone(); + let bg_deletion_queue = deletion_queue.clone(); + pageserver::shutdown_pageserver( + &tenant_manager, + bg_remote_storage.map(|_| bg_deletion_queue), + 0, + ) + .await; + unreachable!() + }) } } diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs index 1837da34ce..e10db2b853 100644 --- a/pageserver/src/config.rs +++ b/pageserver/src/config.rs @@ -97,6 +97,8 @@ pub mod defaults { pub const DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB: usize = 0; + pub const DEFAULT_WALREDO_PROCESS_KIND: &str = "sync"; + /// /// Default built-in configuration file. /// @@ -140,6 +142,8 @@ pub mod defaults { #validate_vectored_get = '{DEFAULT_VALIDATE_VECTORED_GET}' +#walredo_process_kind = '{DEFAULT_WALREDO_PROCESS_KIND}' + [tenant_config] #checkpoint_distance = {DEFAULT_CHECKPOINT_DISTANCE} # in bytes #checkpoint_timeout = {DEFAULT_CHECKPOINT_TIMEOUT} @@ -290,6 +294,8 @@ pub struct PageServerConf { /// /// Setting this to zero disables limits on total ephemeral layer size. pub ephemeral_bytes_per_memory_kb: usize, + + pub walredo_process_kind: crate::walredo::ProcessKind, } /// We do not want to store this in a PageServerConf because the latter may be logged @@ -413,6 +419,8 @@ struct PageServerConfigBuilder { validate_vectored_get: BuilderValue, ephemeral_bytes_per_memory_kb: BuilderValue, + + walredo_process_kind: BuilderValue, } impl PageServerConfigBuilder { @@ -500,6 +508,8 @@ impl PageServerConfigBuilder { )), validate_vectored_get: Set(DEFAULT_VALIDATE_VECTORED_GET), ephemeral_bytes_per_memory_kb: Set(DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB), + + walredo_process_kind: Set(DEFAULT_WALREDO_PROCESS_KIND.parse().unwrap()), } } } @@ -683,6 +693,10 @@ impl PageServerConfigBuilder { self.ephemeral_bytes_per_memory_kb = BuilderValue::Set(value); } + pub fn get_walredo_process_kind(&mut self, value: crate::walredo::ProcessKind) { + self.walredo_process_kind = BuilderValue::Set(value); + } + pub fn build(self) -> anyhow::Result { let default = Self::default_values(); @@ -739,6 +753,7 @@ impl PageServerConfigBuilder { max_vectored_read_bytes, validate_vectored_get, ephemeral_bytes_per_memory_kb, + walredo_process_kind, } CUSTOM LOGIC { @@ -1032,6 +1047,9 @@ impl PageServerConf { "ephemeral_bytes_per_memory_kb" => { builder.get_ephemeral_bytes_per_memory_kb(parse_toml_u64("ephemeral_bytes_per_memory_kb", item)? as usize) } + "walredo_process_kind" => { + builder.get_walredo_process_kind(parse_toml_from_str("walredo_process_kind", item)?) + } _ => bail!("unrecognized pageserver option '{key}'"), } } @@ -1114,6 +1132,7 @@ impl PageServerConf { ), validate_vectored_get: defaults::DEFAULT_VALIDATE_VECTORED_GET, ephemeral_bytes_per_memory_kb: defaults::DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB, + walredo_process_kind: defaults::DEFAULT_WALREDO_PROCESS_KIND.parse().unwrap(), } } } @@ -1351,7 +1370,8 @@ background_task_maximum_delay = '334 s' .expect("Invalid default constant") ), validate_vectored_get: defaults::DEFAULT_VALIDATE_VECTORED_GET, - ephemeral_bytes_per_memory_kb: defaults::DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB + ephemeral_bytes_per_memory_kb: defaults::DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB, + walredo_process_kind: defaults::DEFAULT_WALREDO_PROCESS_KIND.parse().unwrap(), }, "Correct defaults should be used when no config values are provided" ); @@ -1423,7 +1443,8 @@ background_task_maximum_delay = '334 s' .expect("Invalid default constant") ), validate_vectored_get: defaults::DEFAULT_VALIDATE_VECTORED_GET, - ephemeral_bytes_per_memory_kb: defaults::DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB + ephemeral_bytes_per_memory_kb: defaults::DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB, + walredo_process_kind: defaults::DEFAULT_WALREDO_PROCESS_KIND.parse().unwrap(), }, "Should be able to parse all basic config values correctly" ); diff --git a/pageserver/src/consumption_metrics.rs b/pageserver/src/consumption_metrics.rs index f5540e896f..62bbde42f4 100644 --- a/pageserver/src/consumption_metrics.rs +++ b/pageserver/src/consumption_metrics.rs @@ -304,7 +304,7 @@ async fn calculate_synthetic_size_worker( continue; } - if !tenant_shard_id.is_zero() { + if !tenant_shard_id.is_shard_zero() { // We only send consumption metrics from shard 0, so don't waste time calculating // synthetic size on other shards. continue; diff --git a/pageserver/src/consumption_metrics/metrics.rs b/pageserver/src/consumption_metrics/metrics.rs index 6740c1360b..7ba2d04c4f 100644 --- a/pageserver/src/consumption_metrics/metrics.rs +++ b/pageserver/src/consumption_metrics/metrics.rs @@ -199,7 +199,7 @@ pub(super) async fn collect_all_metrics( }; let tenants = futures::stream::iter(tenants).filter_map(|(id, state, _)| async move { - if state != TenantState::Active || !id.is_zero() { + if state != TenantState::Active || !id.is_shard_zero() { None } else { tenant_manager diff --git a/pageserver/src/control_plane_client.rs b/pageserver/src/control_plane_client.rs index 42c800822b..f0ed46ce23 100644 --- a/pageserver/src/control_plane_client.rs +++ b/pageserver/src/control_plane_client.rs @@ -12,7 +12,7 @@ use pageserver_api::{ use serde::{de::DeserializeOwned, Serialize}; use tokio_util::sync::CancellationToken; use url::Url; -use utils::{backoff, generation::Generation, id::NodeId}; +use utils::{backoff, failpoint_support, generation::Generation, id::NodeId}; use crate::{ config::{NodeMetadata, PageServerConf}, @@ -210,7 +210,10 @@ impl ControlPlaneGenerationsApi for ControlPlaneClient { .collect(), }; - fail::fail_point!("control-plane-client-validate"); + failpoint_support::sleep_millis_async!("control-plane-client-validate-sleep", &self.cancel); + if self.cancel.is_cancelled() { + return Err(RetryForeverError::ShuttingDown); + } let response: ValidateResponse = self.retry_http_forever(&re_attach_path, request).await?; diff --git a/pageserver/src/http/openapi_spec.yml b/pageserver/src/http/openapi_spec.yml index bb477f89c5..d89f949688 100644 --- a/pageserver/src/http/openapi_spec.yml +++ b/pageserver/src/http/openapi_spec.yml @@ -58,24 +58,6 @@ paths: responses: "200": description: The reload completed successfully. - "401": - description: Unauthorized Error - content: - application/json: - schema: - $ref: "#/components/schemas/UnauthorizedError" - "403": - description: Forbidden Error - content: - application/json: - schema: - $ref: "#/components/schemas/ForbiddenError" - "500": - description: Generic operation error (also hits if no keys were found) - content: - application/json: - schema: - $ref: "#/components/schemas/Error" /v1/tenant/{tenant_id}: parameters: @@ -93,62 +75,14 @@ paths: application/json: schema: $ref: "#/components/schemas/TenantInfo" - "400": - description: Error when no tenant id found in path or no timeline id - content: - application/json: - schema: - $ref: "#/components/schemas/Error" - "401": - description: Unauthorized Error - content: - application/json: - schema: - $ref: "#/components/schemas/UnauthorizedError" - "403": - description: Forbidden Error - content: - application/json: - schema: - $ref: "#/components/schemas/ForbiddenError" - "500": - description: Generic operation error - content: - application/json: - schema: - $ref: "#/components/schemas/Error" - "503": - description: Temporarily unavailable, please retry. - content: - application/json: - schema: - $ref: "#/components/schemas/ServiceUnavailableError" delete: description: | Attempts to delete specified tenant. 500, 503 and 409 errors should be retried until 404 is retrieved. 404 means that deletion successfully finished" responses: - "400": - description: Error when no tenant id found in path - content: - application/json: - schema: - $ref: "#/components/schemas/Error" - "401": - description: Unauthorized Error - content: - application/json: - schema: - $ref: "#/components/schemas/UnauthorizedError" - "403": - description: Forbidden Error - content: - application/json: - schema: - $ref: "#/components/schemas/ForbiddenError" "404": - description: Tenant not found + description: Tenant not found. This is the success path. content: application/json: schema: @@ -165,18 +99,6 @@ paths: application/json: schema: $ref: "#/components/schemas/PreconditionFailedError" - "500": - description: Generic operation error - content: - application/json: - schema: - $ref: "#/components/schemas/Error" - "503": - description: Temporarily unavailable, please retry. - content: - application/json: - schema: - $ref: "#/components/schemas/ServiceUnavailableError" /v1/tenant/{tenant_id}/time_travel_remote_storage: parameters: @@ -206,36 +128,6 @@ paths: application/json: schema: type: string - "400": - description: Error when no tenant id found in path or invalid timestamp - content: - application/json: - schema: - $ref: "#/components/schemas/Error" - "401": - description: Unauthorized Error - content: - application/json: - schema: - $ref: "#/components/schemas/UnauthorizedError" - "403": - description: Forbidden Error - content: - application/json: - schema: - $ref: "#/components/schemas/ForbiddenError" - "500": - description: Generic operation error - content: - application/json: - schema: - $ref: "#/components/schemas/Error" - "503": - description: Temporarily unavailable, please retry. - content: - application/json: - schema: - $ref: "#/components/schemas/ServiceUnavailableError" /v1/tenant/{tenant_id}/timeline: parameters: @@ -255,36 +147,6 @@ paths: type: array items: $ref: "#/components/schemas/TimelineInfo" - "400": - description: Error when no tenant id found in path - content: - application/json: - schema: - $ref: "#/components/schemas/Error" - "401": - description: Unauthorized Error - content: - application/json: - schema: - $ref: "#/components/schemas/UnauthorizedError" - "403": - description: Forbidden Error - content: - application/json: - schema: - $ref: "#/components/schemas/ForbiddenError" - "500": - description: Generic operation error - content: - application/json: - schema: - $ref: "#/components/schemas/Error" - "503": - description: Temporarily unavailable, please retry. - content: - application/json: - schema: - $ref: "#/components/schemas/ServiceUnavailableError" /v1/tenant/{tenant_id}/timeline/{timeline_id}: @@ -309,60 +171,12 @@ paths: application/json: schema: $ref: "#/components/schemas/TimelineInfo" - "400": - description: Error when no tenant id found in path or no timeline id - content: - application/json: - schema: - $ref: "#/components/schemas/Error" - "401": - description: Unauthorized Error - content: - application/json: - schema: - $ref: "#/components/schemas/UnauthorizedError" - "403": - description: Forbidden Error - content: - application/json: - schema: - $ref: "#/components/schemas/ForbiddenError" - "500": - description: Generic operation error - content: - application/json: - schema: - $ref: "#/components/schemas/Error" - "503": - description: Temporarily unavailable, please retry. - content: - application/json: - schema: - $ref: "#/components/schemas/ServiceUnavailableError" delete: description: "Attempts to delete specified timeline. 500 and 409 errors should be retried" responses: - "400": - description: Error when no tenant id found in path or no timeline id - content: - application/json: - schema: - $ref: "#/components/schemas/Error" - "401": - description: Unauthorized Error - content: - application/json: - schema: - $ref: "#/components/schemas/UnauthorizedError" - "403": - description: Forbidden Error - content: - application/json: - schema: - $ref: "#/components/schemas/ForbiddenError" "404": - description: Timeline not found + description: Timeline not found. This is the success path. content: application/json: schema: @@ -379,18 +193,6 @@ paths: application/json: schema: $ref: "#/components/schemas/PreconditionFailedError" - "500": - description: Generic operation error - content: - application/json: - schema: - $ref: "#/components/schemas/Error" - "503": - description: Temporarily unavailable, please retry. - content: - application/json: - schema: - $ref: "#/components/schemas/ServiceUnavailableError" /v1/tenant/{tenant_id}/timeline/{timeline_id}/get_timestamp_of_lsn: parameters: @@ -423,36 +225,6 @@ paths: schema: type: string format: date-time - "400": - description: Error when no tenant id found in path, no timeline id or invalid timestamp - content: - application/json: - schema: - $ref: "#/components/schemas/Error" - "401": - description: Unauthorized Error - content: - application/json: - schema: - $ref: "#/components/schemas/UnauthorizedError" - "403": - description: Forbidden Error - content: - application/json: - schema: - $ref: "#/components/schemas/ForbiddenError" - "404": - description: Timeline not found, or there is no timestamp information for the given lsn - content: - application/json: - schema: - $ref: "#/components/schemas/NotFoundError" - "500": - description: Generic operation error - content: - application/json: - schema: - $ref: "#/components/schemas/Error" /v1/tenant/{tenant_id}/timeline/{timeline_id}/get_lsn_by_timestamp: parameters: @@ -484,36 +256,6 @@ paths: application/json: schema: $ref: "#/components/schemas/LsnByTimestampResponse" - "400": - description: Error when no tenant id found in path, no timeline id or invalid timestamp - content: - application/json: - schema: - $ref: "#/components/schemas/Error" - "401": - description: Unauthorized Error - content: - application/json: - schema: - $ref: "#/components/schemas/UnauthorizedError" - "403": - description: Forbidden Error - content: - application/json: - schema: - $ref: "#/components/schemas/ForbiddenError" - "500": - description: Generic operation error - content: - application/json: - schema: - $ref: "#/components/schemas/Error" - "503": - description: Temporarily unavailable, please retry. - content: - application/json: - schema: - $ref: "#/components/schemas/ServiceUnavailableError" /v1/tenant/{tenant_id}/timeline/{timeline_id}/do_gc: parameters: @@ -537,36 +279,6 @@ paths: application/json: schema: type: string - "400": - description: Error when no tenant id found in path, no timeline id or invalid timestamp - content: - application/json: - schema: - $ref: "#/components/schemas/Error" - "401": - description: Unauthorized Error - content: - application/json: - schema: - $ref: "#/components/schemas/UnauthorizedError" - "403": - description: Forbidden Error - content: - application/json: - schema: - $ref: "#/components/schemas/ForbiddenError" - "500": - description: Generic operation error - content: - application/json: - schema: - $ref: "#/components/schemas/Error" - "503": - description: Temporarily unavailable, please retry. - content: - application/json: - schema: - $ref: "#/components/schemas/ServiceUnavailableError" /v1/tenant/{tenant_shard_id}/location_config: parameters: - name: tenant_shard_id @@ -628,24 +340,6 @@ paths: application/json: schema: $ref: "#/components/schemas/TenantLocationConfigResponse" - "503": - description: Tenant's state cannot be changed right now. Wait a few seconds and retry. - content: - application/json: - schema: - $ref: "#/components/schemas/Error" - "401": - description: Unauthorized Error - content: - application/json: - schema: - $ref: "#/components/schemas/UnauthorizedError" - "403": - description: Forbidden Error - content: - application/json: - schema: - $ref: "#/components/schemas/ForbiddenError" "409": description: | The tenant is already known to Pageserver in some way, @@ -662,12 +356,6 @@ paths: application/json: schema: $ref: "#/components/schemas/ConflictError" - "500": - description: Generic operation error - content: - application/json: - schema: - $ref: "#/components/schemas/Error" /v1/tenant/{tenant_id}/ignore: parameters: - name: tenant_id @@ -684,36 +372,6 @@ paths: responses: "200": description: Tenant ignored - "400": - description: Error when no tenant id found in path parameters - content: - application/json: - schema: - $ref: "#/components/schemas/Error" - "401": - description: Unauthorized Error - content: - application/json: - schema: - $ref: "#/components/schemas/UnauthorizedError" - "403": - description: Forbidden Error - content: - application/json: - schema: - $ref: "#/components/schemas/ForbiddenError" - "500": - description: Generic operation error - content: - application/json: - schema: - $ref: "#/components/schemas/Error" - "503": - description: Temporarily unavailable, please retry. - content: - application/json: - schema: - $ref: "#/components/schemas/ServiceUnavailableError" /v1/tenant/{tenant_id}/load: @@ -740,36 +398,6 @@ paths: responses: "202": description: Tenant scheduled to load successfully - "400": - description: Error when no tenant id found in path parameters - content: - application/json: - schema: - $ref: "#/components/schemas/Error" - "401": - description: Unauthorized Error - content: - application/json: - schema: - $ref: "#/components/schemas/UnauthorizedError" - "403": - description: Forbidden Error - content: - application/json: - schema: - $ref: "#/components/schemas/ForbiddenError" - "500": - description: Generic operation error - content: - application/json: - schema: - $ref: "#/components/schemas/Error" - "503": - description: Temporarily unavailable, please retry. - content: - application/json: - schema: - $ref: "#/components/schemas/ServiceUnavailableError" /v1/tenant/{tenant_id}/{timeline_id}/preserve_initdb_archive: parameters: @@ -790,37 +418,6 @@ paths: responses: "202": description: Tenant scheduled to load successfully - "404": - description: No tenant or timeline found for the specified ids - content: - application/json: - schema: - $ref: "#/components/schemas/Error" - "401": - description: Unauthorized Error - content: - application/json: - schema: - $ref: "#/components/schemas/UnauthorizedError" - "403": - description: Forbidden Error - content: - application/json: - schema: - $ref: "#/components/schemas/ForbiddenError" - "500": - description: Generic operation error - content: - application/json: - schema: - $ref: "#/components/schemas/Error" - "503": - description: Temporarily unavailable, please retry. - content: - application/json: - schema: - $ref: "#/components/schemas/ServiceUnavailableError" - /v1/tenant/{tenant_id}/synthetic_size: parameters: @@ -839,31 +436,8 @@ paths: application/json: schema: $ref: "#/components/schemas/SyntheticSizeResponse" - "401": - description: Unauthorized Error - content: - application/json: - schema: - $ref: "#/components/schemas/UnauthorizedError" - "403": - description: Forbidden Error - content: - application/json: - schema: - $ref: "#/components/schemas/ForbiddenError" - "500": - description: Generic operation error - content: - application/json: - schema: - $ref: "#/components/schemas/Error" - "503": - description: Temporarily unavailable, please retry. - content: - application/json: - schema: - $ref: "#/components/schemas/ServiceUnavailableError" + # This route has no handler. TODO: remove? /v1/tenant/{tenant_id}/size: parameters: - name: tenant_id @@ -945,18 +519,6 @@ paths: responses: "200": description: Success - "500": - description: Generic operation error - content: - application/json: - schema: - $ref: "#/components/schemas/Error" - "503": - description: Temporarily unavailable, please retry. - content: - application/json: - schema: - $ref: "#/components/schemas/ServiceUnavailableError" /v1/tenant/{tenant_shard_id}/secondary/download: parameters: @@ -987,20 +549,6 @@ paths: application/json: schema: $ref: "#/components/schemas/SecondaryProgress" - "500": - description: Generic operation error - content: - application/json: - schema: - $ref: "#/components/schemas/Error" - "503": - description: Temporarily unavailable, please retry. - content: - application/json: - schema: - $ref: "#/components/schemas/ServiceUnavailableError" - - /v1/tenant/{tenant_id}/timeline/: parameters: @@ -1043,24 +591,6 @@ paths: application/json: schema: $ref: "#/components/schemas/TimelineInfo" - "400": - description: Malformed timeline create request - content: - application/json: - schema: - $ref: "#/components/schemas/Error" - "401": - description: Unauthorized Error - content: - application/json: - schema: - $ref: "#/components/schemas/UnauthorizedError" - "403": - description: Forbidden Error - content: - application/json: - schema: - $ref: "#/components/schemas/ForbiddenError" "406": description: Permanently unsatisfiable request, don't retry. content: @@ -1079,18 +609,6 @@ paths: application/json: schema: $ref: "#/components/schemas/Error" - "500": - description: Generic operation error - content: - application/json: - schema: - $ref: "#/components/schemas/Error" - "503": - description: Temporarily unavailable, please retry. - content: - application/json: - schema: - $ref: "#/components/schemas/ServiceUnavailableError" /v1/tenant/: get: @@ -1104,30 +622,6 @@ paths: type: array items: $ref: "#/components/schemas/TenantInfo" - "401": - description: Unauthorized Error - content: - application/json: - schema: - $ref: "#/components/schemas/UnauthorizedError" - "403": - description: Forbidden Error - content: - application/json: - schema: - $ref: "#/components/schemas/ForbiddenError" - "500": - description: Generic operation error - content: - application/json: - schema: - $ref: "#/components/schemas/Error" - "503": - description: Temporarily unavailable, please retry. - content: - application/json: - schema: - $ref: "#/components/schemas/ServiceUnavailableError" post: description: | @@ -1148,43 +642,12 @@ paths: application/json: schema: type: string - "400": - description: Malformed tenant create request - content: - application/json: - schema: - $ref: "#/components/schemas/Error" - "401": - description: Unauthorized Error - content: - application/json: - schema: - $ref: "#/components/schemas/UnauthorizedError" - "403": - description: Forbidden Error - content: - application/json: - schema: - $ref: "#/components/schemas/ForbiddenError" "409": description: Tenant already exists, creation skipped content: application/json: schema: $ref: "#/components/schemas/ConflictError" - "500": - description: Generic operation error - content: - application/json: - schema: - $ref: "#/components/schemas/Error" - "503": - description: Temporarily unavailable, please retry. - content: - application/json: - schema: - $ref: "#/components/schemas/ServiceUnavailableError" - /v1/tenant/config: put: @@ -1206,36 +669,6 @@ paths: type: array items: $ref: "#/components/schemas/TenantInfo" - "400": - description: Malformed tenant config request - content: - application/json: - schema: - $ref: "#/components/schemas/Error" - "401": - description: Unauthorized Error - content: - application/json: - schema: - $ref: "#/components/schemas/UnauthorizedError" - "403": - description: Forbidden Error - content: - application/json: - schema: - $ref: "#/components/schemas/ForbiddenError" - "500": - description: Generic operation error - content: - application/json: - schema: - $ref: "#/components/schemas/Error" - "503": - description: Temporarily unavailable, please retry. - content: - application/json: - schema: - $ref: "#/components/schemas/ServiceUnavailableError" /v1/tenant/{tenant_id}/config/: parameters: @@ -1255,42 +688,6 @@ paths: application/json: schema: $ref: "#/components/schemas/TenantConfigResponse" - "400": - description: Malformed get tenanant config request - content: - application/json: - schema: - $ref: "#/components/schemas/Error" - "401": - description: Unauthorized Error - content: - application/json: - schema: - $ref: "#/components/schemas/UnauthorizedError" - "403": - description: Forbidden Error - content: - application/json: - schema: - $ref: "#/components/schemas/ForbiddenError" - "404": - description: Tenand or timeline were not found - content: - application/json: - schema: - $ref: "#/components/schemas/NotFoundError" - "500": - description: Generic operation error - content: - application/json: - schema: - $ref: "#/components/schemas/Error" - "503": - description: Temporarily unavailable, please retry. - content: - application/json: - schema: - $ref: "#/components/schemas/ServiceUnavailableError" /v1/utilization: get: @@ -1304,12 +701,6 @@ paths: application/json: schema: $ref: "#/components/schemas/PageserverUtilization" - "500": - description: Generic operation error - content: - application/json: - schema: - $ref: "#/components/schemas/Error" components: securitySchemes: @@ -1629,7 +1020,7 @@ components: type: integer format: int64 minimum: 0 - description: The amount of disk space currently utilized by layer files. + description: The amount of disk space currently used. free_space_bytes: type: integer format: int64 diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index 759a1b25ee..20258dd950 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -457,8 +457,12 @@ async fn reload_auth_validation_keys_handler( json_response(StatusCode::OK, ()) } Err(e) => { + let err_msg = "Error reloading public keys"; warn!("Error reloading public keys from {key_path:?}: {e:}"); - json_response(StatusCode::INTERNAL_SERVER_ERROR, ()) + json_response( + StatusCode::INTERNAL_SERVER_ERROR, + HttpErrorBody::from_msg(err_msg.to_string()), + ) } } } @@ -696,7 +700,7 @@ async fn get_lsn_by_timestamp_handler( check_permission(&request, Some(tenant_shard_id.tenant_id))?; let state = get_state(&request); - if !tenant_shard_id.is_zero() { + if !tenant_shard_id.is_shard_zero() { // Requires SLRU contents, which are only stored on shard zero return Err(ApiError::BadRequest(anyhow!( "Size calculations are only available on shard zero" @@ -747,7 +751,7 @@ async fn get_timestamp_of_lsn_handler( check_permission(&request, Some(tenant_shard_id.tenant_id))?; let state = get_state(&request); - if !tenant_shard_id.is_zero() { + if !tenant_shard_id.is_shard_zero() { // Requires SLRU contents, which are only stored on shard zero return Err(ApiError::BadRequest(anyhow!( "Size calculations are only available on shard zero" @@ -772,7 +776,9 @@ async fn get_timestamp_of_lsn_handler( let time = format_rfc3339(postgres_ffi::from_pg_timestamp(time)).to_string(); json_response(StatusCode::OK, time) } - None => json_response(StatusCode::NOT_FOUND, ()), + None => Err(ApiError::NotFound( + anyhow::anyhow!("Timestamp for lsn {} not found", lsn).into(), + )), } } @@ -993,11 +999,26 @@ async fn tenant_status( check_permission(&request, Some(tenant_shard_id.tenant_id))?; let state = get_state(&request); + // In tests, sometimes we want to query the state of a tenant without auto-activating it if it's currently waiting. + let activate = true; + #[cfg(feature = "testing")] + let activate = parse_query_param(&request, "activate")?.unwrap_or(activate); + let tenant_info = async { let tenant = state .tenant_manager .get_attached_tenant_shard(tenant_shard_id)?; + if activate { + // This is advisory: we prefer to let the tenant activate on-demand when this function is + // called, but it is still valid to return 200 and describe the current state of the tenant + // if it doesn't make it into an active state. + tenant + .wait_to_become_active(ACTIVE_TENANT_TIMEOUT) + .await + .ok(); + } + // Calculate total physical size of all timelines let mut current_physical_size = 0; for timeline in tenant.list_timelines().iter() { @@ -1071,7 +1092,7 @@ async fn tenant_size_handler( let headers = request.headers(); let state = get_state(&request); - if !tenant_shard_id.is_zero() { + if !tenant_shard_id.is_shard_zero() { return Err(ApiError::BadRequest(anyhow!( "Size calculations are only available on shard zero" ))); diff --git a/pageserver/src/import_datadir.rs b/pageserver/src/import_datadir.rs index 343dec2ca1..ed409d3130 100644 --- a/pageserver/src/import_datadir.rs +++ b/pageserver/src/import_datadir.rs @@ -8,6 +8,7 @@ use anyhow::{bail, ensure, Context, Result}; use bytes::Bytes; use camino::Utf8Path; use futures::StreamExt; +use pageserver_api::key::rel_block_to_key; use tokio::io::{AsyncRead, AsyncReadExt}; use tokio_tar::Archive; use tracing::*; @@ -170,7 +171,10 @@ async fn import_rel( let r = reader.read_exact(&mut buf).await; match r { Ok(_) => { - modification.put_rel_page_image(rel, blknum, Bytes::copy_from_slice(&buf))?; + let key = rel_block_to_key(rel, blknum); + if modification.tline.get_shard_identity().is_key_local(&key) { + modification.put_rel_page_image(rel, blknum, Bytes::copy_from_slice(&buf))?; + } } // TODO: UnexpectedEof is expected diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs index cc661194e9..be61a755ff 100644 --- a/pageserver/src/metrics.rs +++ b/pageserver/src/metrics.rs @@ -1483,12 +1483,18 @@ pub(crate) static DELETION_QUEUE: Lazy = Lazy::new(|| { }); pub(crate) struct WalIngestMetrics { + pub(crate) bytes_received: IntCounter, pub(crate) records_received: IntCounter, pub(crate) records_committed: IntCounter, pub(crate) records_filtered: IntCounter, } pub(crate) static WAL_INGEST: Lazy = Lazy::new(|| WalIngestMetrics { + bytes_received: register_int_counter!( + "pageserver_wal_ingest_bytes_received", + "Bytes of WAL ingested from safekeepers", + ) + .unwrap(), records_received: register_int_counter!( "pageserver_wal_ingest_records_received", "Number of WAL records received from safekeepers" @@ -1813,6 +1819,29 @@ impl Default for WalRedoProcessCounters { pub(crate) static WAL_REDO_PROCESS_COUNTERS: Lazy = Lazy::new(WalRedoProcessCounters::default); +#[cfg(not(test))] +pub mod wal_redo { + use super::*; + + static PROCESS_KIND: Lazy> = Lazy::new(|| { + std::sync::Mutex::new( + register_uint_gauge_vec!( + "pageserver_wal_redo_process_kind", + "The configured process kind for walredo", + &["kind"], + ) + .unwrap(), + ) + }); + + pub fn set_process_kind_metric(kind: crate::walredo::ProcessKind) { + // use guard to avoid races around the next two steps + let guard = PROCESS_KIND.lock().unwrap(); + guard.reset(); + guard.with_label_values(&[&format!("{kind}")]).set(1); + } +} + /// Similar to `prometheus::HistogramTimer` but does not record on drop. pub(crate) struct StorageTimeMetricsTimer { metrics: StorageTimeMetrics, @@ -2083,7 +2112,7 @@ impl TimelineMetrics { pub(crate) fn remove_tenant_metrics(tenant_shard_id: &TenantShardId) { // Only shard zero deals in synthetic sizes - if tenant_shard_id.is_zero() { + if tenant_shard_id.is_shard_zero() { let tid = tenant_shard_id.tenant_id.to_string(); let _ = TENANT_SYNTHETIC_SIZE_METRIC.remove_label_values(&[&tid]); } @@ -2094,6 +2123,7 @@ pub(crate) fn remove_tenant_metrics(tenant_shard_id: &TenantShardId) { use futures::Future; use pin_project_lite::pin_project; use std::collections::HashMap; +use std::num::NonZeroUsize; use std::pin::Pin; use std::sync::{Arc, Mutex}; use std::task::{Context, Poll}; @@ -2663,6 +2693,26 @@ pub(crate) mod disk_usage_based_eviction { pub(crate) static METRICS: Lazy = Lazy::new(Metrics::default); } +static TOKIO_EXECUTOR_THREAD_COUNT: Lazy = Lazy::new(|| { + register_uint_gauge_vec!( + "pageserver_tokio_executor_thread_configured_count", + "Total number of configued tokio executor threads in the process. + The `setup` label denotes whether we're running with multiple runtimes or a single runtime.", + &["setup"], + ) + .unwrap() +}); + +pub(crate) fn set_tokio_runtime_setup(setup: &str, num_threads: NonZeroUsize) { + static SERIALIZE: std::sync::Mutex<()> = std::sync::Mutex::new(()); + let _guard = SERIALIZE.lock().unwrap(); + TOKIO_EXECUTOR_THREAD_COUNT.reset(); + TOKIO_EXECUTOR_THREAD_COUNT + .get_metric_with_label_values(&[setup]) + .unwrap() + .set(u64::try_from(num_threads.get()).unwrap()); +} + pub fn preinitialize_metrics() { // Python tests need these and on some we do alerting. // diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index 3d622f1871..3b9a30ba4c 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -876,7 +876,13 @@ impl PageServerHandler { if lsn <= last_record_lsn { lsn = last_record_lsn; } else { - timeline.wait_lsn(lsn, ctx).await?; + timeline + .wait_lsn( + lsn, + crate::tenant::timeline::WaitLsnWaiter::PageService, + ctx, + ) + .await?; // Since we waited for 'lsn' to arrive, that is now the last // record LSN. (Or close enough for our purposes; the // last-record LSN can advance immediately after we return @@ -888,7 +894,13 @@ impl PageServerHandler { "invalid LSN(0) in request".into(), )); } - timeline.wait_lsn(lsn, ctx).await?; + timeline + .wait_lsn( + lsn, + crate::tenant::timeline::WaitLsnWaiter::PageService, + ctx, + ) + .await?; } if lsn < **latest_gc_cutoff_lsn { @@ -1215,7 +1227,13 @@ impl PageServerHandler { if let Some(lsn) = lsn { // Backup was requested at a particular LSN. Wait for it to arrive. info!("waiting for {}", lsn); - timeline.wait_lsn(lsn, ctx).await?; + timeline + .wait_lsn( + lsn, + crate::tenant::timeline::WaitLsnWaiter::PageService, + ctx, + ) + .await?; timeline .check_lsn_is_in_scope(lsn, &latest_gc_cutoff_lsn) .context("invalid basebackup lsn")?; diff --git a/pageserver/src/task_mgr.rs b/pageserver/src/task_mgr.rs index 69e163effa..9a1e354ecf 100644 --- a/pageserver/src/task_mgr.rs +++ b/pageserver/src/task_mgr.rs @@ -33,13 +33,14 @@ use std::collections::HashMap; use std::fmt; use std::future::Future; +use std::num::NonZeroUsize; use std::panic::AssertUnwindSafe; +use std::str::FromStr; use std::sync::atomic::{AtomicU64, Ordering}; use std::sync::{Arc, Mutex}; use futures::FutureExt; use pageserver_api::shard::TenantShardId; -use tokio::runtime::Runtime; use tokio::task::JoinHandle; use tokio::task_local; use tokio_util::sync::CancellationToken; @@ -48,8 +49,11 @@ use tracing::{debug, error, info, warn}; use once_cell::sync::Lazy; +use utils::env; use utils::id::TimelineId; +use crate::metrics::set_tokio_runtime_setup; + // // There are four runtimes: // @@ -98,52 +102,119 @@ use utils::id::TimelineId; // other operations, if the upload tasks e.g. get blocked on locks. It shouldn't // happen, but still. // -pub static COMPUTE_REQUEST_RUNTIME: Lazy = Lazy::new(|| { - tokio::runtime::Builder::new_multi_thread() - .thread_name("compute request worker") - .enable_all() - .build() - .expect("Failed to create compute request runtime") -}); -pub static MGMT_REQUEST_RUNTIME: Lazy = Lazy::new(|| { - tokio::runtime::Builder::new_multi_thread() - .thread_name("mgmt request worker") - .enable_all() - .build() - .expect("Failed to create mgmt request runtime") -}); - -pub static WALRECEIVER_RUNTIME: Lazy = Lazy::new(|| { - tokio::runtime::Builder::new_multi_thread() - .thread_name("walreceiver worker") - .enable_all() - .build() - .expect("Failed to create walreceiver runtime") -}); - -pub static BACKGROUND_RUNTIME: Lazy = Lazy::new(|| { - tokio::runtime::Builder::new_multi_thread() - .thread_name("background op worker") - // if you change the number of worker threads please change the constant below - .enable_all() - .build() - .expect("Failed to create background op runtime") -}); - -pub(crate) static BACKGROUND_RUNTIME_WORKER_THREADS: Lazy = Lazy::new(|| { - // force init and thus panics - let _ = BACKGROUND_RUNTIME.handle(); +pub(crate) static TOKIO_WORKER_THREADS: Lazy = Lazy::new(|| { // replicates tokio-1.28.1::loom::sys::num_cpus which is not available publicly // tokio would had already panicked for parsing errors or NotUnicode // // this will be wrong if any of the runtimes gets their worker threads configured to something // else, but that has not been needed in a long time. - std::env::var("TOKIO_WORKER_THREADS") - .map(|s| s.parse::().unwrap()) - .unwrap_or_else(|_e| usize::max(2, num_cpus::get())) + NonZeroUsize::new( + std::env::var("TOKIO_WORKER_THREADS") + .map(|s| s.parse::().unwrap()) + .unwrap_or_else(|_e| usize::max(2, num_cpus::get())), + ) + .expect("the max() ensures that this is not zero") }); +enum TokioRuntimeMode { + SingleThreaded, + MultiThreaded { num_workers: NonZeroUsize }, +} + +impl FromStr for TokioRuntimeMode { + type Err = String; + + fn from_str(s: &str) -> Result { + match s { + "current_thread" => Ok(TokioRuntimeMode::SingleThreaded), + s => match s.strip_prefix("multi_thread:") { + Some("default") => Ok(TokioRuntimeMode::MultiThreaded { + num_workers: *TOKIO_WORKER_THREADS, + }), + Some(suffix) => { + let num_workers = suffix.parse::().map_err(|e| { + format!( + "invalid number of multi-threaded runtime workers ({suffix:?}): {e}", + ) + })?; + Ok(TokioRuntimeMode::MultiThreaded { num_workers }) + } + None => Err(format!("invalid runtime config: {s:?}")), + }, + } + } +} + +static ONE_RUNTIME: Lazy> = Lazy::new(|| { + let thread_name = "pageserver-tokio"; + let Some(mode) = env::var("NEON_PAGESERVER_USE_ONE_RUNTIME") else { + // If the env var is not set, leave this static as None. + set_tokio_runtime_setup( + "multiple-runtimes", + NUM_MULTIPLE_RUNTIMES + .checked_mul(*TOKIO_WORKER_THREADS) + .unwrap(), + ); + return None; + }; + Some(match mode { + TokioRuntimeMode::SingleThreaded => { + set_tokio_runtime_setup("one-runtime-single-threaded", NonZeroUsize::new(1).unwrap()); + tokio::runtime::Builder::new_current_thread() + .thread_name(thread_name) + .enable_all() + .build() + .expect("failed to create one single runtime") + } + TokioRuntimeMode::MultiThreaded { num_workers } => { + set_tokio_runtime_setup("one-runtime-multi-threaded", num_workers); + tokio::runtime::Builder::new_multi_thread() + .thread_name(thread_name) + .enable_all() + .worker_threads(num_workers.get()) + .build() + .expect("failed to create one multi-threaded runtime") + } + }) +}); + +/// Declare a lazy static variable named `$varname` that will resolve +/// to a tokio runtime handle. If the env var `NEON_PAGESERVER_USE_ONE_RUNTIME` +/// is set, this will resolve to `ONE_RUNTIME`. Otherwise, the macro invocation +/// declares a separate runtime and the lazy static variable `$varname` +/// will resolve to that separate runtime. +/// +/// The result is is that `$varname.spawn()` will use `ONE_RUNTIME` if +/// `NEON_PAGESERVER_USE_ONE_RUNTIME` is set, and will use the separate runtime +/// otherwise. +macro_rules! pageserver_runtime { + ($varname:ident, $name:literal) => { + pub static $varname: Lazy<&'static tokio::runtime::Runtime> = Lazy::new(|| { + if let Some(runtime) = &*ONE_RUNTIME { + return runtime; + } + static RUNTIME: Lazy = Lazy::new(|| { + tokio::runtime::Builder::new_multi_thread() + .thread_name($name) + .worker_threads(TOKIO_WORKER_THREADS.get()) + .enable_all() + .build() + .expect(std::concat!("Failed to create runtime ", $name)) + }); + &*RUNTIME + }); + }; +} + +pageserver_runtime!(COMPUTE_REQUEST_RUNTIME, "compute request worker"); +pageserver_runtime!(MGMT_REQUEST_RUNTIME, "mgmt request worker"); +pageserver_runtime!(WALRECEIVER_RUNTIME, "walreceiver worker"); +pageserver_runtime!(BACKGROUND_RUNTIME, "background op worker"); +// Bump this number when adding a new pageserver_runtime! +// SAFETY: it's obviously correct +const NUM_MULTIPLE_RUNTIMES: NonZeroUsize = unsafe { NonZeroUsize::new_unchecked(4) }; + #[derive(Debug, Clone, Copy)] pub struct PageserverTaskId(u64); @@ -214,13 +285,12 @@ pub enum TaskKind { /// Internally, `Client` hands over requests to the `Connection` object. /// The `Connection` object is responsible for speaking the wire protocol. /// - /// Walreceiver uses its own abstraction called `TaskHandle` to represent the activity of establishing and handling a connection. - /// That abstraction doesn't use `task_mgr`. + /// Walreceiver uses a legacy abstraction called `TaskHandle` to represent the activity of establishing and handling a connection. /// The `WalReceiverManager` task ensures that this `TaskHandle` task does not outlive the `WalReceiverManager` task. /// For the `RequestContext` that we hand to the TaskHandle, we use the [`WalReceiverConnectionHandler`] task kind. /// - /// Once the connection is established, the `TaskHandle` task creates a - /// [`WalReceiverConnectionPoller`] task_mgr task that is responsible for polling + /// Once the connection is established, the `TaskHandle` task spawns a + /// [`WalReceiverConnectionPoller`] task that is responsible for polling /// the `Connection` object. /// A `CancellationToken` created by the `TaskHandle` task ensures /// that the [`WalReceiverConnectionPoller`] task will cancel soon after as the `TaskHandle` is dropped. @@ -230,7 +300,6 @@ pub enum TaskKind { WalReceiverManager, /// The `TaskHandle` task that executes `handle_walreceiver_connection`. - /// Not a `task_mgr` task, but we use this `TaskKind` for its `RequestContext`. /// See the comment on [`WalReceiverManager`]. /// /// [`WalReceiverManager`]: Self::WalReceiverManager diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 792d9e548d..35ea037a55 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -12,6 +12,7 @@ //! use anyhow::{bail, Context}; +use arc_swap::ArcSwap; use camino::Utf8Path; use camino::Utf8PathBuf; use enumset::EnumSet; @@ -98,7 +99,7 @@ use std::ops::Bound::Included; use std::sync::atomic::AtomicU64; use std::sync::atomic::Ordering; use std::sync::Arc; -use std::sync::{Mutex, RwLock}; +use std::sync::Mutex; use std::time::{Duration, Instant}; use crate::span; @@ -260,7 +261,7 @@ pub struct Tenant { // We keep TenantConfOpt sturct here to preserve the information // about parameters that are not set. // This is necessary to allow global config updates. - tenant_conf: Arc>, + tenant_conf: Arc>, tenant_shard_id: TenantShardId, @@ -385,7 +386,7 @@ impl WalRedoManager { pub(crate) fn status(&self) -> Option { match self { - WalRedoManager::Prod(m) => m.status(), + WalRedoManager::Prod(m) => Some(m.status()), #[cfg(test)] WalRedoManager::Test(_) => None, } @@ -1515,7 +1516,7 @@ impl Tenant { // sizes etc. and that would get confused if the previous page versions // are not in the repository yet. ancestor_timeline - .wait_lsn(*lsn, ctx) + .wait_lsn(*lsn, timeline::WaitLsnWaiter::Tenant, ctx) .await .map_err(|e| match e { e @ (WaitLsnError::Timeout(_) | WaitLsnError::BadState) => { @@ -1606,7 +1607,7 @@ impl Tenant { ); { - let conf = self.tenant_conf.read().unwrap(); + let conf = self.tenant_conf.load(); if !conf.location.may_delete_layers_hint() { info!("Skipping GC in location state {:?}", conf.location); @@ -1633,7 +1634,7 @@ impl Tenant { } { - let conf = self.tenant_conf.read().unwrap(); + let conf = self.tenant_conf.load(); if !conf.location.may_delete_layers_hint() || !conf.location.may_upload_layers_hint() { info!("Skipping compaction in location state {:?}", conf.location); return Ok(()); @@ -1782,7 +1783,7 @@ impl Tenant { async fn shutdown( &self, shutdown_progress: completion::Barrier, - freeze_and_flush: bool, + shutdown_mode: timeline::ShutdownMode, ) -> Result<(), completion::Barrier> { span::debug_assert_current_span_has_tenant_id(); @@ -1829,16 +1830,8 @@ impl Tenant { timelines.values().for_each(|timeline| { let timeline = Arc::clone(timeline); let timeline_id = timeline.timeline_id; - - let span = - tracing::info_span!("timeline_shutdown", %timeline_id, ?freeze_and_flush); - js.spawn(async move { - if freeze_and_flush { - timeline.flush_and_shutdown().instrument(span).await - } else { - timeline.shutdown().instrument(span).await - } - }); + let span = tracing::info_span!("timeline_shutdown", %timeline_id, ?shutdown_mode); + js.spawn(async move { timeline.shutdown(shutdown_mode).instrument(span).await }); }) }; // test_long_timeline_create_then_tenant_delete is leaning on this message @@ -2082,14 +2075,14 @@ impl Tenant { } pub(crate) fn get_attach_mode(&self) -> AttachmentMode { - self.tenant_conf.read().unwrap().location.attach_mode + self.tenant_conf.load().location.attach_mode } /// For API access: generate a LocationConfig equivalent to the one that would be used to /// create a Tenant in the same state. Do not use this in hot paths: it's for relatively /// rare external API calls, like a reconciliation at startup. pub(crate) fn get_location_conf(&self) -> models::LocationConfig { - let conf = self.tenant_conf.read().unwrap(); + let conf = self.tenant_conf.load(); let location_config_mode = match conf.location.attach_mode { AttachmentMode::Single => models::LocationConfigMode::AttachedSingle, @@ -2236,7 +2229,7 @@ where impl Tenant { pub fn tenant_specific_overrides(&self) -> TenantConfOpt { - self.tenant_conf.read().unwrap().tenant_conf.clone() + self.tenant_conf.load().tenant_conf.clone() } pub fn effective_config(&self) -> TenantConf { @@ -2245,84 +2238,84 @@ impl Tenant { } pub fn get_checkpoint_distance(&self) -> u64 { - let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf.clone(); + let tenant_conf = self.tenant_conf.load().tenant_conf.clone(); tenant_conf .checkpoint_distance .unwrap_or(self.conf.default_tenant_conf.checkpoint_distance) } pub fn get_checkpoint_timeout(&self) -> Duration { - let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf.clone(); + let tenant_conf = self.tenant_conf.load().tenant_conf.clone(); tenant_conf .checkpoint_timeout .unwrap_or(self.conf.default_tenant_conf.checkpoint_timeout) } pub fn get_compaction_target_size(&self) -> u64 { - let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf.clone(); + let tenant_conf = self.tenant_conf.load().tenant_conf.clone(); tenant_conf .compaction_target_size .unwrap_or(self.conf.default_tenant_conf.compaction_target_size) } pub fn get_compaction_period(&self) -> Duration { - let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf.clone(); + let tenant_conf = self.tenant_conf.load().tenant_conf.clone(); tenant_conf .compaction_period .unwrap_or(self.conf.default_tenant_conf.compaction_period) } pub fn get_compaction_threshold(&self) -> usize { - let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf.clone(); + let tenant_conf = self.tenant_conf.load().tenant_conf.clone(); tenant_conf .compaction_threshold .unwrap_or(self.conf.default_tenant_conf.compaction_threshold) } pub fn get_gc_horizon(&self) -> u64 { - let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf.clone(); + let tenant_conf = self.tenant_conf.load().tenant_conf.clone(); tenant_conf .gc_horizon .unwrap_or(self.conf.default_tenant_conf.gc_horizon) } pub fn get_gc_period(&self) -> Duration { - let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf.clone(); + let tenant_conf = self.tenant_conf.load().tenant_conf.clone(); tenant_conf .gc_period .unwrap_or(self.conf.default_tenant_conf.gc_period) } pub fn get_image_creation_threshold(&self) -> usize { - let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf.clone(); + let tenant_conf = self.tenant_conf.load().tenant_conf.clone(); tenant_conf .image_creation_threshold .unwrap_or(self.conf.default_tenant_conf.image_creation_threshold) } pub fn get_pitr_interval(&self) -> Duration { - let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf.clone(); + let tenant_conf = self.tenant_conf.load().tenant_conf.clone(); tenant_conf .pitr_interval .unwrap_or(self.conf.default_tenant_conf.pitr_interval) } pub fn get_trace_read_requests(&self) -> bool { - let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf.clone(); + let tenant_conf = self.tenant_conf.load().tenant_conf.clone(); tenant_conf .trace_read_requests .unwrap_or(self.conf.default_tenant_conf.trace_read_requests) } pub fn get_min_resident_size_override(&self) -> Option { - let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf.clone(); + let tenant_conf = self.tenant_conf.load().tenant_conf.clone(); tenant_conf .min_resident_size_override .or(self.conf.default_tenant_conf.min_resident_size_override) } pub fn get_heatmap_period(&self) -> Option { - let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf.clone(); + let tenant_conf = self.tenant_conf.load().tenant_conf.clone(); let heatmap_period = tenant_conf .heatmap_period .unwrap_or(self.conf.default_tenant_conf.heatmap_period); @@ -2334,26 +2327,40 @@ impl Tenant { } pub fn set_new_tenant_config(&self, new_tenant_conf: TenantConfOpt) { - self.tenant_conf.write().unwrap().tenant_conf = new_tenant_conf; - self.tenant_conf_updated(); + // Use read-copy-update in order to avoid overwriting the location config + // state if this races with [`Tenant::set_new_location_config`]. Note that + // this race is not possible if both request types come from the storage + // controller (as they should!) because an exclusive op lock is required + // on the storage controller side. + self.tenant_conf.rcu(|inner| { + Arc::new(AttachedTenantConf { + tenant_conf: new_tenant_conf.clone(), + location: inner.location, + }) + }); + + self.tenant_conf_updated(&new_tenant_conf); // Don't hold self.timelines.lock() during the notifies. // There's no risk of deadlock right now, but there could be if we consolidate // mutexes in struct Timeline in the future. let timelines = self.list_timelines(); for timeline in timelines { - timeline.tenant_conf_updated(); + timeline.tenant_conf_updated(&new_tenant_conf); } } pub(crate) fn set_new_location_config(&self, new_conf: AttachedTenantConf) { - *self.tenant_conf.write().unwrap() = new_conf; - self.tenant_conf_updated(); + let new_tenant_conf = new_conf.tenant_conf.clone(); + + self.tenant_conf.store(Arc::new(new_conf)); + + self.tenant_conf_updated(&new_tenant_conf); // Don't hold self.timelines.lock() during the notifies. // There's no risk of deadlock right now, but there could be if we consolidate // mutexes in struct Timeline in the future. let timelines = self.list_timelines(); for timeline in timelines { - timeline.tenant_conf_updated(); + timeline.tenant_conf_updated(&new_tenant_conf); } } @@ -2367,11 +2374,8 @@ impl Tenant { .unwrap_or(psconf.default_tenant_conf.timeline_get_throttle.clone()) } - pub(crate) fn tenant_conf_updated(&self) { - let conf = { - let guard = self.tenant_conf.read().unwrap(); - Self::get_timeline_get_throttle_config(self.conf, &guard.tenant_conf) - }; + pub(crate) fn tenant_conf_updated(&self, new_conf: &TenantConfOpt) { + let conf = Self::get_timeline_get_throttle_config(self.conf, new_conf); self.timeline_get_throttle.reconfigure(conf) } @@ -2519,7 +2523,7 @@ impl Tenant { Tenant::get_timeline_get_throttle_config(conf, &attached_conf.tenant_conf), &crate::metrics::tenant_throttling::TIMELINE_GET, )), - tenant_conf: Arc::new(RwLock::new(attached_conf)), + tenant_conf: Arc::new(ArcSwap::from_pointee(attached_conf)), } } @@ -3186,7 +3190,7 @@ impl Tenant { run_initdb(self.conf, &pgdata_path, pg_version, &self.cancel).await?; // Upload the created data dir to S3 - if self.tenant_shard_id().is_zero() { + if self.tenant_shard_id().is_shard_zero() { self.upload_initdb(&timelines_path, &pgdata_path, &timeline_id) .await?; } @@ -3433,7 +3437,7 @@ impl Tenant { .store(size, Ordering::Relaxed); // Only shard zero should be calculating synthetic sizes - debug_assert!(self.shard_identity.is_zero()); + debug_assert!(self.shard_identity.is_shard_zero()); TENANT_SYNTHETIC_SIZE_METRIC .get_metric_with_label_values(&[&self.tenant_shard_id.tenant_id.to_string()]) @@ -3505,7 +3509,7 @@ impl Tenant { } pub(crate) fn get_tenant_conf(&self) -> TenantConfOpt { - self.tenant_conf.read().unwrap().tenant_conf.clone() + self.tenant_conf.load().tenant_conf.clone() } } @@ -3653,6 +3657,9 @@ pub(crate) mod harness { heatmap_period: Some(tenant_conf.heatmap_period), lazy_slru_download: Some(tenant_conf.lazy_slru_download), timeline_get_throttle: Some(tenant_conf.timeline_get_throttle), + image_layer_creation_check_threshold: Some( + tenant_conf.image_layer_creation_check_threshold, + ), } } } @@ -3851,6 +3858,7 @@ mod tests { use hex_literal::hex; use pageserver_api::keyspace::KeySpace; use rand::{thread_rng, Rng}; + use tests::timeline::ShutdownMode; static TEST_KEY: Lazy = Lazy::new(|| Key::from_slice(&hex!("010000000033333333444444445500000001"))); @@ -4296,7 +4304,7 @@ mod tests { make_some_layers(tline.as_ref(), Lsn(0x8000), &ctx).await?; // so that all uploads finish & we can call harness.load() below again tenant - .shutdown(Default::default(), true) + .shutdown(Default::default(), ShutdownMode::FreezeAndFlush) .instrument(harness.span()) .await .ok() @@ -4337,7 +4345,7 @@ mod tests { // so that all uploads finish & we can call harness.load() below again tenant - .shutdown(Default::default(), true) + .shutdown(Default::default(), ShutdownMode::FreezeAndFlush) .instrument(harness.span()) .await .ok() @@ -5118,7 +5126,7 @@ mod tests { // Leave the timeline ID in [`Tenant::timelines_creating`] to exclude attempting to create it again let raw_tline = tline.raw_timeline().unwrap(); raw_tline - .shutdown() + .shutdown(super::timeline::ShutdownMode::Hard) .instrument(info_span!("test_shutdown", tenant_id=%raw_tline.tenant_shard_id, shard_id=%raw_tline.tenant_shard_id.shard_slug(), timeline_id=%TIMELINE_ID)) .await; std::mem::forget(tline); diff --git a/pageserver/src/tenant/config.rs b/pageserver/src/tenant/config.rs index 53a8c97e23..a2bb479f63 100644 --- a/pageserver/src/tenant/config.rs +++ b/pageserver/src/tenant/config.rs @@ -57,6 +57,9 @@ pub mod defaults { // throughputs up to 1GiB/s per timeline. pub const DEFAULT_MAX_WALRECEIVER_LSN_WAL_LAG: u64 = 1024 * 1024 * 1024; pub const DEFAULT_EVICTIONS_LOW_RESIDENCE_DURATION_METRIC_THRESHOLD: &str = "24 hour"; + // By default ingest enough WAL for two new L0 layers before checking if new image + // image layers should be created. + pub const DEFAULT_IMAGE_LAYER_CREATION_CHECK_THRESHOLD: u8 = 2; pub const DEFAULT_INGEST_BATCH_SIZE: u64 = 100; } @@ -362,6 +365,10 @@ pub struct TenantConf { pub lazy_slru_download: bool, pub timeline_get_throttle: pageserver_api::models::ThrottleConfig, + + // How much WAL must be ingested before checking again whether a new image layer is required. + // Expresed in multiples of checkpoint distance. + pub image_layer_creation_check_threshold: u8, } /// Same as TenantConf, but this struct preserves the information about @@ -454,6 +461,9 @@ pub struct TenantConfOpt { #[serde(skip_serializing_if = "Option::is_none")] pub timeline_get_throttle: Option, + + #[serde(skip_serializing_if = "Option::is_none")] + pub image_layer_creation_check_threshold: Option, } impl TenantConfOpt { @@ -508,6 +518,9 @@ impl TenantConfOpt { .timeline_get_throttle .clone() .unwrap_or(global_conf.timeline_get_throttle), + image_layer_creation_check_threshold: self + .image_layer_creation_check_threshold + .unwrap_or(global_conf.image_layer_creation_check_threshold), } } } @@ -548,6 +561,7 @@ impl Default for TenantConf { heatmap_period: Duration::ZERO, lazy_slru_download: false, timeline_get_throttle: crate::tenant::throttle::Config::disabled(), + image_layer_creation_check_threshold: DEFAULT_IMAGE_LAYER_CREATION_CHECK_THRESHOLD, } } } @@ -621,6 +635,7 @@ impl From for models::TenantConfig { heatmap_period: value.heatmap_period.map(humantime), lazy_slru_download: value.lazy_slru_download, timeline_get_throttle: value.timeline_get_throttle.map(ThrottleConfig::from), + image_layer_creation_check_threshold: value.image_layer_creation_check_threshold, } } } diff --git a/pageserver/src/tenant/delete.rs b/pageserver/src/tenant/delete.rs index 7d37873a67..33d0f677e5 100644 --- a/pageserver/src/tenant/delete.rs +++ b/pageserver/src/tenant/delete.rs @@ -14,7 +14,10 @@ use crate::{ config::PageServerConf, context::RequestContext, task_mgr::{self, TaskKind}, - tenant::mgr::{TenantSlot, TenantsMapRemoveResult}, + tenant::{ + mgr::{TenantSlot, TenantsMapRemoveResult}, + timeline::ShutdownMode, + }, }; use super::{ @@ -433,6 +436,11 @@ impl DeleteTenantFlow { .await } + /// Check whether background deletion of this tenant is currently in progress + pub(crate) fn is_in_progress(tenant: &Tenant) -> bool { + tenant.delete_progress.try_lock().is_err() + } + async fn prepare( tenant: &Arc, ) -> Result, DeleteTenantError> { @@ -463,7 +471,7 @@ impl DeleteTenantFlow { // tenant.shutdown // Its also bad that we're holding tenants.read here. // TODO relax set_stopping to be idempotent? - if tenant.shutdown(progress, false).await.is_err() { + if tenant.shutdown(progress, ShutdownMode::Hard).await.is_err() { return Err(DeleteTenantError::Other(anyhow::anyhow!( "tenant shutdown is already in progress" ))); diff --git a/pageserver/src/tenant/ephemeral_file.rs b/pageserver/src/tenant/ephemeral_file.rs index 4d75f9437f..52d02fd01e 100644 --- a/pageserver/src/tenant/ephemeral_file.rs +++ b/pageserver/src/tenant/ephemeral_file.rs @@ -78,6 +78,10 @@ impl EphemeralFile { self.file.bytes_written() } + pub(crate) fn id(&self) -> page_cache::FileId { + self.page_cache_file_id + } + pub(crate) async fn read_blk( &self, blknum: u32, diff --git a/pageserver/src/tenant/layer_map.rs b/pageserver/src/tenant/layer_map.rs index b8ed69052f..4c4cd90c99 100644 --- a/pageserver/src/tenant/layer_map.rs +++ b/pageserver/src/tenant/layer_map.rs @@ -346,35 +346,6 @@ where } } -#[derive(PartialEq, Eq, Hash, Debug, Clone)] -pub enum InMemoryLayerHandle { - Open { - lsn_floor: Lsn, - end_lsn: Lsn, - }, - Frozen { - idx: usize, - lsn_floor: Lsn, - end_lsn: Lsn, - }, -} - -impl InMemoryLayerHandle { - pub fn get_lsn_floor(&self) -> Lsn { - match self { - InMemoryLayerHandle::Open { lsn_floor, .. } => *lsn_floor, - InMemoryLayerHandle::Frozen { lsn_floor, .. } => *lsn_floor, - } - } - - pub fn get_end_lsn(&self) -> Lsn { - match self { - InMemoryLayerHandle::Open { end_lsn, .. } => *end_lsn, - InMemoryLayerHandle::Frozen { end_lsn, .. } => *end_lsn, - } - } -} - impl LayerMap { /// /// Find the latest layer (by lsn.end) that covers the given @@ -576,41 +547,18 @@ impl LayerMap { self.historic.iter() } - /// Get a handle for the first in memory layer that matches the provided predicate. - /// The handle should be used with [`Self::get_in_memory_layer`] to retrieve the actual layer. - /// - /// Note: [`Self::find_in_memory_layer`] and [`Self::get_in_memory_layer`] should be called during - /// the same exclusive region established by holding the layer manager lock. - pub fn find_in_memory_layer(&self, mut pred: Pred) -> Option + /// Get a ref counted pointer for the first in memory layer that matches the provided predicate. + pub fn find_in_memory_layer(&self, mut pred: Pred) -> Option> where Pred: FnMut(&Arc) -> bool, { if let Some(open) = &self.open_layer { if pred(open) { - return Some(InMemoryLayerHandle::Open { - lsn_floor: open.get_lsn_range().start, - end_lsn: open.get_lsn_range().end, - }); + return Some(open.clone()); } } - let pos = self.frozen_layers.iter().rev().position(pred); - pos.map(|rev_idx| { - let idx = self.frozen_layers.len() - 1 - rev_idx; - InMemoryLayerHandle::Frozen { - idx, - lsn_floor: self.frozen_layers[idx].get_lsn_range().start, - end_lsn: self.frozen_layers[idx].get_lsn_range().end, - } - }) - } - - /// Get the layer pointed to by the provided handle. - pub fn get_in_memory_layer(&self, handle: &InMemoryLayerHandle) -> Option> { - match handle { - InMemoryLayerHandle::Open { .. } => self.open_layer.clone(), - InMemoryLayerHandle::Frozen { idx, .. } => self.frozen_layers.get(*idx).cloned(), - } + self.frozen_layers.iter().rfind(|l| pred(l)).cloned() } /// diff --git a/pageserver/src/tenant/mgr.rs b/pageserver/src/tenant/mgr.rs index f01fb9791c..73967f2949 100644 --- a/pageserver/src/tenant/mgr.rs +++ b/pageserver/src/tenant/mgr.rs @@ -44,6 +44,7 @@ use crate::tenant::config::{ use crate::tenant::delete::DeleteTenantFlow; use crate::tenant::span::debug_assert_current_span_has_tenant_id; use crate::tenant::storage_layer::inmemory_layer; +use crate::tenant::timeline::ShutdownMode; use crate::tenant::{AttachedTenantConf, SpawnMode, Tenant, TenantState}; use crate::{InitializationOrder, IGNORED_TENANT_FILE_NAME, METADATA_FILE_NAME, TEMP_FILE_SUFFIX}; @@ -783,11 +784,9 @@ async fn shutdown_all_tenants0(tenants: &std::sync::RwLock) { shutdown_state.insert(tenant_shard_id, TenantSlot::Attached(t.clone())); join_set.spawn( async move { - let freeze_and_flush = true; - let res = { let (_guard, shutdown_progress) = completion::channel(); - t.shutdown(shutdown_progress, freeze_and_flush).await + t.shutdown(shutdown_progress, ShutdownMode::FreezeAndFlush).await }; if let Err(other_progress) = res { @@ -1107,7 +1106,7 @@ impl TenantManager { }; info!("Shutting down attached tenant"); - match tenant.shutdown(progress, false).await { + match tenant.shutdown(progress, ShutdownMode::Hard).await { Ok(()) => {} Err(barrier) => { info!("Shutdown already in progress, waiting for it to complete"); @@ -1223,7 +1222,7 @@ impl TenantManager { TenantSlot::Attached(tenant) => { let (_guard, progress) = utils::completion::channel(); info!("Shutting down just-spawned tenant, because tenant manager is shut down"); - match tenant.shutdown(progress, false).await { + match tenant.shutdown(progress, ShutdownMode::Hard).await { Ok(()) => { info!("Finished shutting down just-spawned tenant"); } @@ -1273,7 +1272,7 @@ impl TenantManager { }; let (_guard, progress) = utils::completion::channel(); - match tenant.shutdown(progress, false).await { + match tenant.shutdown(progress, ShutdownMode::Hard).await { Ok(()) => { slot_guard.drop_old_value()?; } @@ -1411,9 +1410,15 @@ impl TenantManager { match tenant.current_state() { TenantState::Broken { .. } | TenantState::Stopping { .. } => { - // If a tenant is broken or stopping, DeleteTenantFlow can - // handle it: broken tenants proceed to delete, stopping tenants - // are checked for deletion already in progress. + // If deletion is already in progress, return success (the semantics of this + // function are to rerturn success afterr deletion is spawned in background). + // Otherwise fall through and let [`DeleteTenantFlow`] handle this state. + if DeleteTenantFlow::is_in_progress(&tenant) { + // The `delete_progress` lock is held: deletion is already happening + // in the bacckground + slot_guard.revert(); + return Ok(()); + } } _ => { tenant @@ -1649,7 +1654,14 @@ impl TenantManager { fail::fail_point!("shard-split-lsn-wait", |_| Err(anyhow::anyhow!( "failpoint" ))); - if let Err(e) = timeline.wait_lsn(*target_lsn, ctx).await { + if let Err(e) = timeline + .wait_lsn( + *target_lsn, + crate::tenant::timeline::WaitLsnWaiter::Tenant, + ctx, + ) + .await + { // Failure here might mean shutdown, in any case this part is an optimization // and we shouldn't hold up the split operation. tracing::warn!( @@ -1670,7 +1682,7 @@ impl TenantManager { // Phase 5: Shut down the parent shard, and erase it from disk let (_guard, progress) = completion::channel(); - match parent.shutdown(progress, false).await { + match parent.shutdown(progress, ShutdownMode::Hard).await { Ok(()) => {} Err(other) => { other.wait().await; @@ -2657,11 +2669,11 @@ where let attached_tenant = match slot_guard.get_old_value() { Some(TenantSlot::Attached(tenant)) => { // whenever we remove a tenant from memory, we don't want to flush and wait for upload - let freeze_and_flush = false; + let shutdown_mode = ShutdownMode::Hard; // shutdown is sure to transition tenant to stopping, and wait for all tasks to complete, so // that we can continue safely to cleanup. - match tenant.shutdown(progress, freeze_and_flush).await { + match tenant.shutdown(progress, shutdown_mode).await { Ok(()) => {} Err(_other) => { // if pageserver shutdown or other detach/ignore is already ongoing, we don't want to diff --git a/pageserver/src/tenant/remote_timeline_client.rs b/pageserver/src/tenant/remote_timeline_client.rs index cbd942d706..3879135f26 100644 --- a/pageserver/src/tenant/remote_timeline_client.rs +++ b/pageserver/src/tenant/remote_timeline_client.rs @@ -200,6 +200,7 @@ use utils::backoff::{ use std::collections::{HashMap, VecDeque}; use std::sync::atomic::{AtomicU32, Ordering}; use std::sync::{Arc, Mutex}; +use std::time::Duration; use remote_storage::{DownloadError, GenericRemoteStorage, RemotePath, TimeoutOrCancel}; use std::ops::DerefMut; @@ -207,7 +208,7 @@ use tracing::{debug, error, info, instrument, warn}; use tracing::{info_span, Instrument}; use utils::lsn::Lsn; -use crate::deletion_queue::DeletionQueueClient; +use crate::deletion_queue::{DeletionQueueClient, DeletionQueueError}; use crate::metrics::{ MeasureRemoteOp, RemoteOpFileKind, RemoteOpKind, RemoteTimelineClientMetrics, RemoteTimelineClientMetricsCallTrackSize, REMOTE_ONDEMAND_DOWNLOADED_BYTES, @@ -261,6 +262,10 @@ pub(crate) const INITDB_PRESERVED_PATH: &str = "initdb-preserved.tar.zst"; /// Default buffer size when interfacing with [`tokio::fs::File`]. pub(crate) const BUFFER_SIZE: usize = 32 * 1024; +/// Doing non-essential flushes of deletion queue is subject to this timeout, after +/// which we warn and skip. +const DELETION_QUEUE_FLUSH_TIMEOUT: Duration = Duration::from_secs(10); + pub enum MaybeDeletedIndexPart { IndexPart(IndexPart), Deleted(IndexPart), @@ -588,14 +593,14 @@ impl RemoteTimelineClient { upload_queue: &mut UploadQueueInitialized, metadata: TimelineMetadata, ) { + let disk_consistent_lsn = upload_queue.latest_metadata.disk_consistent_lsn(); + info!( - "scheduling metadata upload with {} files ({} changed)", + "scheduling metadata upload up to consistent LSN {disk_consistent_lsn} with {} files ({} changed)", upload_queue.latest_files.len(), upload_queue.latest_files_changes_since_metadata_upload_scheduled, ); - let disk_consistent_lsn = upload_queue.latest_metadata.disk_consistent_lsn(); - let index_part = IndexPart::new( upload_queue.latest_files.clone(), disk_consistent_lsn, @@ -1050,6 +1055,26 @@ impl RemoteTimelineClient { Ok(()) } + async fn flush_deletion_queue(&self) -> Result<(), DeletionQueueError> { + match tokio::time::timeout( + DELETION_QUEUE_FLUSH_TIMEOUT, + self.deletion_queue_client.flush_immediate(), + ) + .await + { + Ok(result) => result, + Err(_timeout) => { + // Flushing remote deletions is not mandatory: we flush here to make the system easier to test, and + // to ensure that _usually_ objects are really gone after a DELETE is acked. However, in case of deletion + // queue issues (https://github.com/neondatabase/neon/issues/6440), we don't want to wait indefinitely here. + tracing::warn!( + "Timed out waiting for deletion queue flush, acking deletion anyway" + ); + Ok(()) + } + } + } + /// Prerequisites: UploadQueue should be in stopped state and deleted_at should be successfuly set. /// The function deletes layer files one by one, then lists the prefix to see if we leaked something /// deletes leaked files if any and proceeds with deletion of index file at the end. @@ -1099,7 +1124,7 @@ impl RemoteTimelineClient { // Execute all pending deletions, so that when we proceed to do a list_prefixes below, we aren't // taking the burden of listing all the layers that we already know we should delete. - self.deletion_queue_client.flush_immediate().await?; + self.flush_deletion_queue().await?; let cancel = shutdown_token(); @@ -1173,7 +1198,7 @@ impl RemoteTimelineClient { // Timeline deletion is rare and we have probably emitted a reasonably number of objects: wait // for a flush to a persistent deletion list so that we may be sure deletion will occur. - self.deletion_queue_client.flush_immediate().await?; + self.flush_deletion_queue().await?; fail::fail_point!("timeline-delete-after-index-delete", |_| { Err(anyhow::anyhow!( @@ -1569,7 +1594,7 @@ impl RemoteTimelineClient { /// Use [`RemoteTimelineClient::shutdown`] for graceful stop. /// /// In-progress operations will still be running after this function returns. - /// Use `task_mgr::shutdown_tasks(None, Some(self.tenant_id), Some(timeline_id))` + /// Use `task_mgr::shutdown_tasks(Some(TaskKind::RemoteUploadTask), Some(self.tenant_shard_id), Some(timeline_id))` /// to wait for them to complete, after calling this function. pub(crate) fn stop(&self) { // Whichever *task* for this RemoteTimelineClient grabs the mutex first will transition the queue diff --git a/pageserver/src/tenant/remote_timeline_client/upload.rs b/pageserver/src/tenant/remote_timeline_client/upload.rs index 137fe48b73..0227331953 100644 --- a/pageserver/src/tenant/remote_timeline_client/upload.rs +++ b/pageserver/src/tenant/remote_timeline_client/upload.rs @@ -167,7 +167,7 @@ pub(crate) async fn time_travel_recover_tenant( let warn_after = 3; let max_attempts = 10; let mut prefixes = Vec::with_capacity(2); - if tenant_shard_id.is_zero() { + if tenant_shard_id.is_shard_zero() { // Also recover the unsharded prefix for a shard of zero: // - if the tenant is totally unsharded, the unsharded prefix contains all the data // - if the tenant is sharded, we still want to recover the initdb data, but we only diff --git a/pageserver/src/tenant/secondary/downloader.rs b/pageserver/src/tenant/secondary/downloader.rs index 8782a9f04e..5b29c126d1 100644 --- a/pageserver/src/tenant/secondary/downloader.rs +++ b/pageserver/src/tenant/secondary/downloader.rs @@ -51,7 +51,7 @@ use tokio_util::sync::CancellationToken; use tracing::{info_span, instrument, warn, Instrument}; use utils::{ backoff, completion::Barrier, crashsafe::path_with_suffix_extension, failpoint_support, fs_ext, - id::TimelineId, + id::TimelineId, serde_system_time, }; use super::{ @@ -591,7 +591,7 @@ impl<'a> TenantDownloader<'a> { let mut progress = SecondaryProgress { layers_total: heatmap_stats.layers, bytes_total: heatmap_stats.bytes, - heatmap_mtime: Some(heatmap_mtime), + heatmap_mtime: Some(serde_system_time::SystemTime(heatmap_mtime)), layers_downloaded: 0, bytes_downloaded: 0, }; @@ -786,6 +786,35 @@ impl<'a> TenantDownloader<'a> { // Existing on-disk layers: just update their access time. if let Some(on_disk) = timeline_state.on_disk_layers.get(&layer.name) { tracing::debug!("Layer {} is already on disk", layer.name); + + if cfg!(debug_assertions) { + // Debug for https://github.com/neondatabase/neon/issues/6966: check that the files we think + // are already present on disk are really there. + let local_path = self + .conf + .timeline_path(tenant_shard_id, &timeline.timeline_id) + .join(layer.name.file_name()); + match tokio::fs::metadata(&local_path).await { + Ok(meta) => { + tracing::debug!( + "Layer {} present at {}, size {}", + layer.name, + local_path, + meta.len(), + ); + } + Err(e) => { + tracing::warn!( + "Layer {} not found at {} ({})", + layer.name, + local_path, + e + ); + debug_assert!(false); + } + } + } + if on_disk.metadata != LayerFileMetadata::from(&layer.metadata) || on_disk.access_time != layer.access_time { diff --git a/pageserver/src/tenant/storage_layer.rs b/pageserver/src/tenant/storage_layer.rs index f44a92a2d7..9a2b086828 100644 --- a/pageserver/src/tenant/storage_layer.rs +++ b/pageserver/src/tenant/storage_layer.rs @@ -25,7 +25,7 @@ use std::cmp::{Ordering, Reverse}; use std::collections::hash_map::Entry; use std::collections::{BinaryHeap, HashMap}; use std::ops::Range; -use std::sync::Mutex; +use std::sync::{Arc, Mutex}; use std::time::{Duration, SystemTime, UNIX_EPOCH}; use tracing::warn; use utils::history_buffer::HistoryBufferWithDropCounter; @@ -41,8 +41,8 @@ pub use layer_desc::{PersistentLayerDesc, PersistentLayerKey}; pub(crate) use layer::{EvictionError, Layer, ResidentLayer}; -use super::layer_map::InMemoryLayerHandle; -use super::timeline::layer_manager::LayerManager; +use self::inmemory_layer::InMemoryLayerFileId; + use super::timeline::GetVectoredError; use super::PageReconstructError; @@ -204,23 +204,30 @@ impl Default for ValuesReconstructState { } } -/// Description of layer to be read - the layer map can turn -/// this description into the actual layer. -#[derive(PartialEq, Eq, Hash, Debug, Clone)] -pub(crate) enum ReadableLayerDesc { - Persistent { - desc: PersistentLayerDesc, - lsn_range: Range, - }, - InMemory { - handle: InMemoryLayerHandle, - lsn_ceil: Lsn, - }, +/// A key that uniquely identifies a layer in a timeline +#[derive(Debug, PartialEq, Eq, Clone, Hash)] +pub(crate) enum LayerId { + PersitentLayerId(PersistentLayerKey), + InMemoryLayerId(InMemoryLayerFileId), } -/// Wraper for 'ReadableLayerDesc' sorted by Lsn +/// Layer wrapper for the read path. Note that it is valid +/// to use these layers even after external operations have +/// been performed on them (compaction, freeze, etc.). #[derive(Debug)] -struct ReadableLayerDescOrdered(ReadableLayerDesc); +pub(crate) enum ReadableLayer { + PersistentLayer(Layer), + InMemoryLayer(Arc), +} + +/// A partial description of a read to be done. +#[derive(Debug, Clone)] +struct ReadDesc { + /// An id used to resolve the readable layer within the fringe + layer_id: LayerId, + /// Lsn range for the read, used for selecting the next read + lsn_range: Range, +} /// Data structure which maintains a fringe of layers for the /// read path. The fringe is the set of layers which intersects @@ -231,41 +238,64 @@ struct ReadableLayerDescOrdered(ReadableLayerDesc); /// a two layer indexing scheme. #[derive(Debug)] pub(crate) struct LayerFringe { - layers_by_lsn: BinaryHeap, - layers: HashMap, + planned_reads_by_lsn: BinaryHeap, + layers: HashMap, +} + +#[derive(Debug)] +struct LayerKeyspace { + layer: ReadableLayer, + target_keyspace: KeySpace, } impl LayerFringe { pub(crate) fn new() -> Self { LayerFringe { - layers_by_lsn: BinaryHeap::new(), + planned_reads_by_lsn: BinaryHeap::new(), layers: HashMap::new(), } } - pub(crate) fn next_layer(&mut self) -> Option<(ReadableLayerDesc, KeySpace)> { - let handle = match self.layers_by_lsn.pop() { - Some(h) => h, + pub(crate) fn next_layer(&mut self) -> Option<(ReadableLayer, KeySpace, Range)> { + let read_desc = match self.planned_reads_by_lsn.pop() { + Some(desc) => desc, None => return None, }; - let removed = self.layers.remove_entry(&handle.0); + let removed = self.layers.remove_entry(&read_desc.layer_id); match removed { - Some((layer, keyspace)) => Some((layer, keyspace)), + Some(( + _, + LayerKeyspace { + layer, + target_keyspace, + }, + )) => Some((layer, target_keyspace, read_desc.lsn_range)), None => unreachable!("fringe internals are always consistent"), } } - pub(crate) fn update(&mut self, layer: ReadableLayerDesc, keyspace: KeySpace) { - let entry = self.layers.entry(layer.clone()); + pub(crate) fn update( + &mut self, + layer: ReadableLayer, + keyspace: KeySpace, + lsn_range: Range, + ) { + let layer_id = layer.id(); + let entry = self.layers.entry(layer_id.clone()); match entry { Entry::Occupied(mut entry) => { - entry.get_mut().merge(&keyspace); + entry.get_mut().target_keyspace.merge(&keyspace); } Entry::Vacant(entry) => { - self.layers_by_lsn - .push(ReadableLayerDescOrdered(entry.key().clone())); - entry.insert(keyspace); + self.planned_reads_by_lsn.push(ReadDesc { + lsn_range, + layer_id: layer_id.clone(), + }); + entry.insert(LayerKeyspace { + layer, + target_keyspace: keyspace, + }); } } } @@ -277,77 +307,55 @@ impl Default for LayerFringe { } } -impl Ord for ReadableLayerDescOrdered { +impl Ord for ReadDesc { fn cmp(&self, other: &Self) -> Ordering { - let ord = self.0.get_lsn_ceil().cmp(&other.0.get_lsn_ceil()); + let ord = self.lsn_range.end.cmp(&other.lsn_range.end); if ord == std::cmp::Ordering::Equal { - self.0 - .get_lsn_floor() - .cmp(&other.0.get_lsn_floor()) - .reverse() + self.lsn_range.start.cmp(&other.lsn_range.start).reverse() } else { ord } } } -impl PartialOrd for ReadableLayerDescOrdered { +impl PartialOrd for ReadDesc { fn partial_cmp(&self, other: &Self) -> Option { Some(self.cmp(other)) } } -impl PartialEq for ReadableLayerDescOrdered { +impl PartialEq for ReadDesc { fn eq(&self, other: &Self) -> bool { - self.0.get_lsn_floor() == other.0.get_lsn_floor() - && self.0.get_lsn_ceil() == other.0.get_lsn_ceil() + self.lsn_range == other.lsn_range } } -impl Eq for ReadableLayerDescOrdered {} +impl Eq for ReadDesc {} -impl ReadableLayerDesc { - pub(crate) fn get_lsn_floor(&self) -> Lsn { +impl ReadableLayer { + pub(crate) fn id(&self) -> LayerId { match self { - ReadableLayerDesc::Persistent { lsn_range, .. } => lsn_range.start, - ReadableLayerDesc::InMemory { handle, .. } => handle.get_lsn_floor(), - } - } - - pub(crate) fn get_lsn_ceil(&self) -> Lsn { - match self { - ReadableLayerDesc::Persistent { lsn_range, .. } => lsn_range.end, - ReadableLayerDesc::InMemory { lsn_ceil, .. } => *lsn_ceil, + Self::PersistentLayer(layer) => LayerId::PersitentLayerId(layer.layer_desc().key()), + Self::InMemoryLayer(layer) => LayerId::InMemoryLayerId(layer.file_id()), } } pub(crate) async fn get_values_reconstruct_data( &self, - layer_manager: &LayerManager, keyspace: KeySpace, + lsn_range: Range, reconstruct_state: &mut ValuesReconstructState, ctx: &RequestContext, ) -> Result<(), GetVectoredError> { match self { - ReadableLayerDesc::Persistent { desc, lsn_range } => { - let layer = layer_manager.get_from_desc(desc); + ReadableLayer::PersistentLayer(layer) => { layer - .get_values_reconstruct_data( - keyspace, - lsn_range.clone(), - reconstruct_state, - ctx, - ) + .get_values_reconstruct_data(keyspace, lsn_range, reconstruct_state, ctx) .await } - ReadableLayerDesc::InMemory { handle, lsn_ceil } => { - let layer = layer_manager - .layer_map() - .get_in_memory_layer(handle) - .unwrap(); - + ReadableLayer::InMemoryLayer(layer) => { layer - .get_values_reconstruct_data(keyspace, *lsn_ceil, reconstruct_state, ctx) + .get_values_reconstruct_data(keyspace, lsn_range.end, reconstruct_state, ctx) .await } } diff --git a/pageserver/src/tenant/storage_layer/delta_layer.rs b/pageserver/src/tenant/storage_layer/delta_layer.rs index b7132ee3bf..255855a246 100644 --- a/pageserver/src/tenant/storage_layer/delta_layer.rs +++ b/pageserver/src/tenant/storage_layer/delta_layer.rs @@ -47,6 +47,7 @@ use anyhow::{anyhow, bail, ensure, Context, Result}; use bytes::BytesMut; use camino::{Utf8Path, Utf8PathBuf}; use futures::StreamExt; +use itertools::Itertools; use pageserver_api::keyspace::KeySpace; use pageserver_api::models::LayerAccessKind; use pageserver_api::shard::TenantShardId; @@ -938,7 +939,7 @@ impl DeltaLayerInner { } if !range_end_handled { - tracing::info!("Handling range end fallback at {}", data_end_offset); + tracing::debug!("Handling range end fallback at {}", data_end_offset); planner.handle_range_end(data_end_offset); } } @@ -946,6 +947,34 @@ impl DeltaLayerInner { Ok(planner.finish()) } + fn get_min_read_buffer_size( + planned_reads: &[VectoredRead], + read_size_soft_max: usize, + ) -> usize { + let Some(largest_read) = planned_reads.iter().max_by_key(|read| read.size()) else { + return read_size_soft_max; + }; + + let largest_read_size = largest_read.size(); + if largest_read_size > read_size_soft_max { + // If the read is oversized, it should only contain one key. + let offenders = largest_read + .blobs_at + .as_slice() + .iter() + .map(|(_, blob_meta)| format!("{}@{}", blob_meta.key, blob_meta.lsn)) + .join(", "); + tracing::warn!( + "Oversized vectored read ({} > {}) for keys {}", + largest_read_size, + read_size_soft_max, + offenders + ); + } + + largest_read_size + } + async fn do_reads_and_update_state( &self, reads: Vec, @@ -959,7 +988,8 @@ impl DeltaLayerInner { .expect("Layer is loaded with max vectored bytes config") .0 .into(); - let mut buf = Some(BytesMut::with_capacity(max_vectored_read_bytes)); + let buf_size = Self::get_min_read_buffer_size(&reads, max_vectored_read_bytes); + let mut buf = Some(BytesMut::with_capacity(buf_size)); // Note that reads are processed in reverse order (from highest key+lsn). // This is the order that `ReconstructState` requires such that it can @@ -986,7 +1016,7 @@ impl DeltaLayerInner { // We have "lost" the buffer since the lower level IO api // doesn't return the buffer on error. Allocate a new one. - buf = Some(BytesMut::with_capacity(max_vectored_read_bytes)); + buf = Some(BytesMut::with_capacity(buf_size)); continue; } @@ -1210,9 +1240,16 @@ impl<'a> pageserver_compaction::interface::CompactionDeltaEntry<'a, Key> for Del mod test { use std::collections::BTreeMap; + use itertools::MinMaxResult; + use rand::prelude::{SeedableRng, SliceRandom, StdRng}; + use rand::RngCore; + use super::*; use crate::{ - context::DownloadBehavior, task_mgr::TaskKind, tenant::disk_btree::tests::TestDisk, + context::DownloadBehavior, + task_mgr::TaskKind, + tenant::{disk_btree::tests::TestDisk, harness::TenantHarness}, + DEFAULT_PG_VERSION, }; /// Construct an index for a fictional delta layer and and then @@ -1332,4 +1369,229 @@ mod test { assert_eq!(planned_blobs, expected_blobs); } + + mod constants { + use utils::lsn::Lsn; + + /// Offset used by all lsns in this test + pub(super) const LSN_OFFSET: Lsn = Lsn(0x08); + /// Number of unique keys including in the test data + pub(super) const KEY_COUNT: u8 = 60; + /// Max number of different lsns for each key + pub(super) const MAX_ENTRIES_PER_KEY: u8 = 20; + /// Possible value sizes for each key along with a probability weight + pub(super) const VALUE_SIZES: [(usize, u8); 3] = [(100, 2), (1024, 2), (1024 * 1024, 1)]; + /// Probability that there will be a gap between the current key and the next one (33.3%) + pub(super) const KEY_GAP_CHANGES: [(bool, u8); 2] = [(true, 1), (false, 2)]; + /// The minimum size of a key range in all the generated reads + pub(super) const MIN_RANGE_SIZE: i128 = 10; + /// The number of ranges included in each vectored read + pub(super) const RANGES_COUNT: u8 = 2; + /// The number of vectored reads performed + pub(super) const READS_COUNT: u8 = 100; + /// Soft max size of a vectored read. Will be violated if we have to read keys + /// with values larger than the limit + pub(super) const MAX_VECTORED_READ_BYTES: usize = 64 * 1024; + } + + struct Entry { + key: Key, + lsn: Lsn, + value: Vec, + } + + fn generate_entries(rng: &mut StdRng) -> Vec { + let mut current_key = Key::MIN; + + let mut entries = Vec::new(); + for _ in 0..constants::KEY_COUNT { + let count = rng.gen_range(1..constants::MAX_ENTRIES_PER_KEY); + let mut lsns_iter = + std::iter::successors(Some(Lsn(constants::LSN_OFFSET.0 + 0x08)), |lsn| { + Some(Lsn(lsn.0 + 0x08)) + }); + let mut lsns = Vec::new(); + while lsns.len() < count as usize { + let take = rng.gen_bool(0.5); + let lsn = lsns_iter.next().unwrap(); + if take { + lsns.push(lsn); + } + } + + for lsn in lsns { + let size = constants::VALUE_SIZES + .choose_weighted(rng, |item| item.1) + .unwrap() + .0; + let mut buf = vec![0; size]; + rng.fill_bytes(&mut buf); + + entries.push(Entry { + key: current_key, + lsn, + value: buf, + }) + } + + let gap = constants::KEY_GAP_CHANGES + .choose_weighted(rng, |item| item.1) + .unwrap() + .0; + if gap { + current_key = current_key.add(2); + } else { + current_key = current_key.add(1); + } + } + + entries + } + + struct EntriesMeta { + key_range: Range, + lsn_range: Range, + index: BTreeMap<(Key, Lsn), Vec>, + } + + fn get_entries_meta(entries: &[Entry]) -> EntriesMeta { + let key_range = match entries.iter().minmax_by_key(|e| e.key) { + MinMaxResult::MinMax(min, max) => min.key..max.key.next(), + _ => panic!("More than one entry is always expected"), + }; + + let lsn_range = match entries.iter().minmax_by_key(|e| e.lsn) { + MinMaxResult::MinMax(min, max) => min.lsn..Lsn(max.lsn.0 + 1), + _ => panic!("More than one entry is always expected"), + }; + + let mut index = BTreeMap::new(); + for entry in entries.iter() { + index.insert((entry.key, entry.lsn), entry.value.clone()); + } + + EntriesMeta { + key_range, + lsn_range, + index, + } + } + + fn pick_random_keyspace(rng: &mut StdRng, key_range: &Range) -> KeySpace { + let start = key_range.start.to_i128(); + let end = key_range.end.to_i128(); + + let mut keyspace = KeySpace::default(); + + for _ in 0..constants::RANGES_COUNT { + let mut range: Option> = Option::default(); + while range.is_none() || keyspace.overlaps(range.as_ref().unwrap()) { + let range_start = rng.gen_range(start..end); + let range_end_offset = range_start + constants::MIN_RANGE_SIZE; + if range_end_offset >= end { + range = Some(Key::from_i128(range_start)..Key::from_i128(end)); + } else { + let range_end = rng.gen_range((range_start + constants::MIN_RANGE_SIZE)..end); + range = Some(Key::from_i128(range_start)..Key::from_i128(range_end)); + } + } + keyspace.ranges.push(range.unwrap()); + } + + keyspace + } + + #[tokio::test] + async fn test_delta_layer_vectored_read_end_to_end() -> anyhow::Result<()> { + let harness = TenantHarness::create("test_delta_layer_oversized_vectored_read")?; + let (tenant, ctx) = harness.load().await; + + let timeline_id = TimelineId::generate(); + let timeline = tenant + .create_test_timeline(timeline_id, constants::LSN_OFFSET, DEFAULT_PG_VERSION, &ctx) + .await?; + + tracing::info!("Generating test data ..."); + + let rng = &mut StdRng::seed_from_u64(0); + let entries = generate_entries(rng); + let entries_meta = get_entries_meta(&entries); + + tracing::info!("Done generating {} entries", entries.len()); + + tracing::info!("Writing test data to delta layer ..."); + let mut writer = DeltaLayerWriter::new( + harness.conf, + timeline_id, + harness.tenant_shard_id, + entries_meta.key_range.start, + entries_meta.lsn_range.clone(), + ) + .await?; + + for entry in entries { + let (_, res) = writer + .put_value_bytes(entry.key, entry.lsn, entry.value, false) + .await; + res?; + } + + let resident = writer.finish(entries_meta.key_range.end, &timeline).await?; + + let inner = resident.get_inner_delta(&ctx).await?; + + let file_size = inner.file.metadata().await?.len(); + tracing::info!( + "Done writing test data to delta layer. Resulting file size is: {}", + file_size + ); + + for i in 0..constants::READS_COUNT { + tracing::info!("Doing vectored read {}/{}", i + 1, constants::READS_COUNT); + + let block_reader = FileBlockReader::new(&inner.file, inner.file_id); + let index_reader = DiskBtreeReader::<_, DELTA_KEY_SIZE>::new( + inner.index_start_blk, + inner.index_root_blk, + block_reader, + ); + + let planner = VectoredReadPlanner::new(constants::MAX_VECTORED_READ_BYTES); + let mut reconstruct_state = ValuesReconstructState::new(); + let keyspace = pick_random_keyspace(rng, &entries_meta.key_range); + let data_end_offset = inner.index_start_blk as u64 * PAGE_SZ as u64; + + let vectored_reads = DeltaLayerInner::plan_reads( + keyspace.clone(), + entries_meta.lsn_range.clone(), + data_end_offset, + index_reader, + planner, + &mut reconstruct_state, + &ctx, + ) + .await?; + + let vectored_blob_reader = VectoredBlobReader::new(&inner.file); + let buf_size = DeltaLayerInner::get_min_read_buffer_size( + &vectored_reads, + constants::MAX_VECTORED_READ_BYTES, + ); + let mut buf = Some(BytesMut::with_capacity(buf_size)); + + for read in vectored_reads { + let blobs_buf = vectored_blob_reader + .read_blobs(&read, buf.take().expect("Should have a buffer")) + .await?; + for meta in blobs_buf.blobs.iter() { + let value = &blobs_buf.buf[meta.start..meta.end]; + assert_eq!(value, entries_meta.index[&(meta.meta.key, meta.meta.lsn)]); + } + + buf = Some(blobs_buf.buf); + } + } + + Ok(()) + } } diff --git a/pageserver/src/tenant/storage_layer/image_layer.rs b/pageserver/src/tenant/storage_layer/image_layer.rs index 14c79e413c..5b44d2bc2c 100644 --- a/pageserver/src/tenant/storage_layer/image_layer.rs +++ b/pageserver/src/tenant/storage_layer/image_layer.rs @@ -44,6 +44,7 @@ use anyhow::{anyhow, bail, ensure, Context, Result}; use bytes::{Bytes, BytesMut}; use camino::{Utf8Path, Utf8PathBuf}; use hex; +use itertools::Itertools; use pageserver_api::keyspace::KeySpace; use pageserver_api::models::LayerAccessKind; use pageserver_api::shard::TenantShardId; @@ -540,7 +541,25 @@ impl ImageLayerInner { let vectored_blob_reader = VectoredBlobReader::new(&self.file); for read in reads.into_iter() { - let buf = BytesMut::with_capacity(max_vectored_read_bytes); + let buf_size = read.size(); + + if buf_size > max_vectored_read_bytes { + // If the read is oversized, it should only contain one key. + let offenders = read + .blobs_at + .as_slice() + .iter() + .map(|(_, blob_meta)| format!("{}@{}", blob_meta.key, blob_meta.lsn)) + .join(", "); + tracing::warn!( + "Oversized vectored read ({} > {}) for keys {}", + buf_size, + max_vectored_read_bytes, + offenders + ); + } + + let buf = BytesMut::with_capacity(buf_size); let res = vectored_blob_reader.read_blobs(&read, buf).await; match res { diff --git a/pageserver/src/tenant/storage_layer/inmemory_layer.rs b/pageserver/src/tenant/storage_layer/inmemory_layer.rs index 628f12065f..29751641b4 100644 --- a/pageserver/src/tenant/storage_layer/inmemory_layer.rs +++ b/pageserver/src/tenant/storage_layer/inmemory_layer.rs @@ -12,13 +12,14 @@ use crate::tenant::ephemeral_file::EphemeralFile; use crate::tenant::storage_layer::ValueReconstructResult; use crate::tenant::timeline::GetVectoredError; use crate::tenant::{PageReconstructError, Timeline}; -use crate::walrecord; +use crate::{page_cache, walrecord}; use anyhow::{anyhow, ensure, Result}; use pageserver_api::keyspace::KeySpace; use pageserver_api::models::InMemoryLayerInfo; use pageserver_api::shard::TenantShardId; use std::collections::{BinaryHeap, HashMap, HashSet}; use std::sync::{Arc, OnceLock}; +use std::time::Instant; use tracing::*; use utils::{bin_ser::BeSer, id::TimelineId, lsn::Lsn, vec_map::VecMap}; // avoid binding to Write (conflicts with std::io::Write) @@ -36,10 +37,14 @@ use super::{ ValuesReconstructState, }; +#[derive(Debug, PartialEq, Eq, Clone, Copy, Hash)] +pub(crate) struct InMemoryLayerFileId(page_cache::FileId); + pub struct InMemoryLayer { conf: &'static PageServerConf, tenant_shard_id: TenantShardId, timeline_id: TimelineId, + file_id: InMemoryLayerFileId, /// This layer contains all the changes from 'start_lsn'. The /// start is inclusive. @@ -49,6 +54,8 @@ pub struct InMemoryLayer { /// Writes are only allowed when this is `None`. end_lsn: OnceLock, + opened_at: Instant, + /// The above fields never change, except for `end_lsn`, which is only set once. /// All other changing parts are in `inner`, and protected by a mutex. inner: RwLock, @@ -200,6 +207,10 @@ pub(crate) static GLOBAL_RESOURCES: GlobalResources = GlobalResources { }; impl InMemoryLayer { + pub(crate) fn file_id(&self) -> InMemoryLayerFileId { + self.file_id + } + pub(crate) fn get_timeline_id(&self) -> TimelineId { self.timeline_id } @@ -443,13 +454,16 @@ impl InMemoryLayer { trace!("initializing new empty InMemoryLayer for writing on timeline {timeline_id} at {start_lsn}"); let file = EphemeralFile::create(conf, tenant_shard_id, timeline_id).await?; + let key = InMemoryLayerFileId(file.id()); Ok(InMemoryLayer { + file_id: key, conf, timeline_id, tenant_shard_id, start_lsn, end_lsn: OnceLock::new(), + opened_at: Instant::now(), inner: RwLock::new(InMemoryLayerInner { index: HashMap::new(), file, @@ -510,6 +524,10 @@ impl InMemoryLayer { Ok(()) } + pub(crate) fn get_opened_at(&self) -> Instant { + self.opened_at + } + pub(crate) async fn tick(&self) -> Option { let mut inner = self.inner.write().await; let size = inner.file.len(); diff --git a/pageserver/src/tenant/storage_layer/layer.rs b/pageserver/src/tenant/storage_layer/layer.rs index 8ba37b5a86..27e60f783c 100644 --- a/pageserver/src/tenant/storage_layer/layer.rs +++ b/pageserver/src/tenant/storage_layer/layer.rs @@ -1759,6 +1759,18 @@ impl ResidentLayer { pub(crate) fn metadata(&self) -> LayerFileMetadata { self.owner.metadata() } + + #[cfg(test)] + pub(crate) async fn get_inner_delta<'a>( + &'a self, + ctx: &RequestContext, + ) -> anyhow::Result<&'a delta_layer::DeltaLayerInner> { + let owner = &self.owner.0; + match self.downloaded.get(owner, ctx).await? { + LayerKind::Delta(d) => Ok(d), + LayerKind::Image(_) => Err(anyhow::anyhow!("Expected a delta layer")), + } + } } impl AsLayerDesc for ResidentLayer { diff --git a/pageserver/src/tenant/tasks.rs b/pageserver/src/tenant/tasks.rs index e4f5f75132..74ed677ffe 100644 --- a/pageserver/src/tenant/tasks.rs +++ b/pageserver/src/tenant/tasks.rs @@ -18,7 +18,7 @@ use utils::{backoff, completion}; static CONCURRENT_BACKGROUND_TASKS: once_cell::sync::Lazy = once_cell::sync::Lazy::new(|| { - let total_threads = *task_mgr::BACKGROUND_RUNTIME_WORKER_THREADS; + let total_threads = task_mgr::TOKIO_WORKER_THREADS.get(); let permits = usize::max( 1, // while a lot of the work is done on spawn_blocking, we still do @@ -72,6 +72,7 @@ pub(crate) async fn concurrent_background_tasks_rate_limit_permit( loop_kind == BackgroundLoopKind::InitialLogicalSizeCalculation ); + // TODO: assert that we run on BACKGROUND_RUNTIME; requires tokio_unstable Handle::id(); match CONCURRENT_BACKGROUND_TASKS.acquire().await { Ok(permit) => permit, Err(_closed) => unreachable!("we never close the semaphore"), diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index bc3fc1df1f..46b3d41e2b 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -9,6 +9,7 @@ pub mod uninit; mod walreceiver; use anyhow::{anyhow, bail, ensure, Context, Result}; +use arc_swap::ArcSwap; use bytes::Bytes; use camino::Utf8Path; use enumset::EnumSet; @@ -118,11 +119,11 @@ use self::layer_manager::LayerManager; use self::logical_size::LogicalSize; use self::walreceiver::{WalReceiver, WalReceiverConf}; -use super::remote_timeline_client::RemoteTimelineClient; +use super::config::TenantConf; use super::secondary::heatmap::{HeatMapLayer, HeatMapTimeline}; -use super::{config::TenantConf, storage_layer::ReadableLayerDesc}; use super::{debug_assert_current_span_has_tenant_and_timeline_id, AttachedTenantConf}; use super::{remote_timeline_client::index::IndexPart, storage_layer::LayerFringe}; +use super::{remote_timeline_client::RemoteTimelineClient, storage_layer::ReadableLayer}; #[derive(Debug, PartialEq, Eq, Clone, Copy)] pub(super) enum FlushLoopState { @@ -183,7 +184,7 @@ pub(crate) struct AuxFilesState { pub struct Timeline { conf: &'static PageServerConf, - tenant_conf: Arc>, + tenant_conf: Arc>, myself: Weak, @@ -281,10 +282,12 @@ pub struct Timeline { pub(super) flush_loop_state: Mutex, /// layer_flush_start_tx can be used to wake up the layer-flushing task. - /// The value is a counter, incremented every time a new flush cycle is requested. - /// The flush cycle counter is sent back on the layer_flush_done channel when - /// the flush finishes. You can use that to wait for the flush to finish. - layer_flush_start_tx: tokio::sync::watch::Sender, + /// - The u64 value is a counter, incremented every time a new flush cycle is requested. + /// The flush cycle counter is sent back on the layer_flush_done channel when + /// the flush finishes. You can use that to wait for the flush to finish. + /// - The LSN is updated to max() of its current value and the latest disk_consistent_lsn + /// read by whoever sends an update + layer_flush_start_tx: tokio::sync::watch::Sender<(u64, Lsn)>, /// to be notified when layer flushing has finished, subscribe to the layer_flush_done channel layer_flush_done_tx: tokio::sync::watch::Sender<(u64, Result<(), FlushLayerError>)>, @@ -309,6 +312,8 @@ pub struct Timeline { /// Configuration: how often should the partitioning be recalculated. repartition_threshold: u64, + last_image_layer_creation_check_at: AtomicLsn, + /// Current logical size of the "datadir", at the last LSN. current_logical_size: LogicalSize, @@ -610,6 +615,25 @@ pub enum GetVectoredImpl { Vectored, } +pub(crate) enum WaitLsnWaiter<'a> { + Timeline(&'a Timeline), + Tenant, + PageService, +} + +/// Argument to [`Timeline::shutdown`]. +#[derive(Debug, Clone, Copy)] +pub(crate) enum ShutdownMode { + /// Graceful shutdown, may do a lot of I/O as we flush any open layers to disk and then + /// also to remote storage. This method can easily take multiple seconds for a busy timeline. + /// + /// While we are flushing, we continue to accept read I/O for LSNs ingested before + /// the call to [`Timeline::shutdown`]. + FreezeAndFlush, + /// Shut down immediately, without waiting for any open layers to flush. + Hard, +} + /// Public interface functions impl Timeline { /// Get the LSN where this branch was created @@ -1058,7 +1082,8 @@ impl Timeline { pub(crate) async fn wait_lsn( &self, lsn: Lsn, - _ctx: &RequestContext, /* Prepare for use by cancellation */ + who_is_waiting: WaitLsnWaiter<'_>, + ctx: &RequestContext, /* Prepare for use by cancellation */ ) -> Result<(), WaitLsnError> { if self.cancel.is_cancelled() { return Err(WaitLsnError::Shutdown); @@ -1066,20 +1091,28 @@ impl Timeline { return Err(WaitLsnError::BadState); } - // This should never be called from the WAL receiver, because that could lead - // to a deadlock. - debug_assert!( - task_mgr::current_task_kind() != Some(TaskKind::WalReceiverManager), - "wait_lsn cannot be called in WAL receiver" - ); - debug_assert!( - task_mgr::current_task_kind() != Some(TaskKind::WalReceiverConnectionHandler), - "wait_lsn cannot be called in WAL receiver" - ); - debug_assert!( - task_mgr::current_task_kind() != Some(TaskKind::WalReceiverConnectionPoller), - "wait_lsn cannot be called in WAL receiver" - ); + if cfg!(debug_assertions) { + match ctx.task_kind() { + TaskKind::WalReceiverManager + | TaskKind::WalReceiverConnectionHandler + | TaskKind::WalReceiverConnectionPoller => { + let is_myself = match who_is_waiting { + WaitLsnWaiter::Timeline(waiter) => Weak::ptr_eq(&waiter.myself, &self.myself), + WaitLsnWaiter::Tenant | WaitLsnWaiter::PageService => unreachable!("tenant or page_service context are not expected to have task kind {:?}", ctx.task_kind()), + }; + if is_myself { + if let Err(current) = self.last_record_lsn.would_wait_for(lsn) { + // walingest is the only one that can advance last_record_lsn; it should make sure to never reach here + panic!("this timeline's walingest task is calling wait_lsn({lsn}) but we only have last_record_lsn={current}; would deadlock"); + } + } else { + // if another timeline's is waiting for us, there's no deadlock risk because + // our walreceiver task can make progress independent of theirs + } + } + _ => {} + } + } let _timer = crate::metrics::WAIT_LSN_TIME.start_timer(); @@ -1138,8 +1171,8 @@ impl Timeline { /// Flush to disk all data that was written with the put_* functions #[instrument(skip(self), fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug(), timeline_id=%self.timeline_id))] pub(crate) async fn freeze_and_flush(&self) -> anyhow::Result<()> { - self.freeze_inmem_layer(false).await; - self.flush_frozen_layers_and_wait().await + let to_lsn = self.freeze_inmem_layer(false).await; + self.flush_frozen_layers_and_wait(to_lsn).await } /// If there is no writer, and conditions for rolling the latest layer are met, then freeze it. @@ -1159,7 +1192,39 @@ impl Timeline { }; let Some(open_layer) = &layers_guard.layer_map().open_layer else { - // No open layer, no work to do. + // If there is no open layer, we have no layer freezing to do. However, we might need to generate + // some updates to disk_consistent_lsn and remote_consistent_lsn, in case we ingested some WAL regions + // that didn't result in writes to this shard. + + // Must not hold the layers lock while waiting for a flush. + drop(layers_guard); + + let last_record_lsn = self.get_last_record_lsn(); + let disk_consistent_lsn = self.get_disk_consistent_lsn(); + if last_record_lsn > disk_consistent_lsn { + // We have no open layer, but disk_consistent_lsn is behind the last record: this indicates + // we are a sharded tenant and have skipped some WAL + let last_freeze_ts = *self.last_freeze_ts.read().unwrap(); + if last_freeze_ts.elapsed() >= self.get_checkpoint_timeout() { + // This should be somewhat rare, so we log it at INFO level. + // + // We checked for checkpoint timeout so that a shard without any + // data ingested (yet) doesn't write a remote index as soon as it + // sees its LSN advance: we only do this if we've been layer-less + // for some time. + tracing::info!( + "Advancing disk_consistent_lsn past WAL ingest gap {} -> {}", + disk_consistent_lsn, + last_record_lsn + ); + + // The flush loop will update remote consistent LSN as well as disk consistent LSN. + self.flush_frozen_layers_and_wait(last_record_lsn) + .await + .ok(); + } + } + return; }; @@ -1192,7 +1257,7 @@ impl Timeline { checkpoint_distance, self.get_last_record_lsn(), self.last_freeze_at.load(), - *self.last_freeze_ts.read().unwrap(), + open_layer.get_opened_at(), ) { match open_layer.info() { InMemoryLayerInfo::Frozen { lsn_start, lsn_end } => { @@ -1279,7 +1344,7 @@ impl Timeline { background_jobs_can_start: Option<&completion::Barrier>, ctx: &RequestContext, ) { - if self.tenant_shard_id.is_zero() { + if self.tenant_shard_id.is_shard_zero() { // Logical size is only maintained accurately on shard zero. self.spawn_initial_logical_size_computation_task(ctx); } @@ -1288,83 +1353,119 @@ impl Timeline { self.launch_eviction_task(parent, background_jobs_can_start); } - /// Graceful shutdown, may do a lot of I/O as we flush any open layers to disk and then - /// also to remote storage. This method can easily take multiple seconds for a busy timeline. + /// After this function returns, there are no timeline-scoped tasks are left running. /// - /// While we are flushing, we continue to accept read I/O. - pub(crate) async fn flush_and_shutdown(&self) { + /// The preferred pattern for is: + /// - in any spawned tasks, keep Timeline::guard open + Timeline::cancel / child token + /// - if early shutdown (not just cancellation) of a sub-tree of tasks is required, + /// go the extra mile and keep track of JoinHandles + /// - Keep track of JoinHandles using a passed-down `Arc>>` or similar, + /// instead of spawning directly on a runtime. It is a more composable / testable pattern. + /// + /// For legacy reasons, we still have multiple tasks spawned using + /// `task_mgr::spawn(X, Some(tenant_id), Some(timeline_id))`. + /// We refer to these as "timeline-scoped task_mgr tasks". + /// Some of these tasks are already sensitive to Timeline::cancel while others are + /// not sensitive to Timeline::cancel and instead respect [`task_mgr::shutdown_token`] + /// or [`task_mgr::shutdown_watcher`]. + /// We want to gradually convert the code base away from these. + /// + /// Here is an inventory of timeline-scoped task_mgr tasks that are still sensitive to + /// `task_mgr::shutdown_{token,watcher}` (there are also tenant-scoped and global-scoped + /// ones that aren't mentioned here): + /// - [`TaskKind::TimelineDeletionWorker`] + /// - NB: also used for tenant deletion + /// - [`TaskKind::RemoteUploadTask`]` + /// - [`TaskKind::InitialLogicalSizeCalculation`] + /// - [`TaskKind::DownloadAllRemoteLayers`] (can we get rid of it?) + // Inventory of timeline-scoped task_mgr tasks that use spawn but aren't sensitive: + /// - [`TaskKind::Eviction`] + /// - [`TaskKind::LayerFlushTask`] + /// - [`TaskKind::OndemandLogicalSizeCalculation`] + /// - [`TaskKind::GarbageCollector`] (immediate_gc is timeline-scoped) + pub(crate) async fn shutdown(&self, mode: ShutdownMode) { debug_assert_current_span_has_tenant_and_timeline_id(); - // Stop ingesting data, so that we are not still writing to an InMemoryLayer while - // trying to flush - tracing::debug!("Waiting for WalReceiverManager..."); - task_mgr::shutdown_tasks( - Some(TaskKind::WalReceiverManager), - Some(self.tenant_shard_id), - Some(self.timeline_id), - ) - .await; + let try_freeze_and_flush = match mode { + ShutdownMode::FreezeAndFlush => true, + ShutdownMode::Hard => false, + }; - // Since we have shut down WAL ingest, we should not let anyone start waiting for the LSN to advance + // Regardless of whether we're going to try_freeze_and_flush + // or not, stop ingesting any more data. Walreceiver only provides + // cancellation but no "wait until gone", because it uses the Timeline::gate. + // So, only after the self.gate.close() below will we know for sure that + // no walreceiver tasks are left. + // For `try_freeze_and_flush=true`, this means that we might still be ingesting + // data during the call to `self.freeze_and_flush()` below. + // That's not ideal, but, we don't have the concept of a ChildGuard, + // which is what we'd need to properly model early shutdown of the walreceiver + // task sub-tree before the other Timeline task sub-trees. + let walreceiver = self.walreceiver.lock().unwrap().take(); + tracing::debug!( + is_some = walreceiver.is_some(), + "Waiting for WalReceiverManager..." + ); + if let Some(walreceiver) = walreceiver { + walreceiver.cancel(); + } + // ... and inform any waiters for newer LSNs that there won't be any. self.last_record_lsn.shutdown(); - // now all writers to InMemory layer are gone, do the final flush if requested - match self.freeze_and_flush().await { - Ok(_) => { - // drain the upload queue - if let Some(client) = self.remote_client.as_ref() { - // if we did not wait for completion here, it might be our shutdown process - // didn't wait for remote uploads to complete at all, as new tasks can forever - // be spawned. - // - // what is problematic is the shutting down of RemoteTimelineClient, because - // obviously it does not make sense to stop while we wait for it, but what - // about corner cases like s3 suddenly hanging up? - client.shutdown().await; + if try_freeze_and_flush { + // we shut down walreceiver above, so, we won't add anything more + // to the InMemoryLayer; freeze it and wait for all frozen layers + // to reach the disk & upload queue, then shut the upload queue and + // wait for it to drain. + match self.freeze_and_flush().await { + Ok(_) => { + // drain the upload queue + if let Some(client) = self.remote_client.as_ref() { + // if we did not wait for completion here, it might be our shutdown process + // didn't wait for remote uploads to complete at all, as new tasks can forever + // be spawned. + // + // what is problematic is the shutting down of RemoteTimelineClient, because + // obviously it does not make sense to stop while we wait for it, but what + // about corner cases like s3 suddenly hanging up? + client.shutdown().await; + } + } + Err(e) => { + // Non-fatal. Shutdown is infallible. Failures to flush just mean that + // we have some extra WAL replay to do next time the timeline starts. + warn!("failed to freeze and flush: {e:#}"); } } - Err(e) => { - // Non-fatal. Shutdown is infallible. Failures to flush just mean that - // we have some extra WAL replay to do next time the timeline starts. - warn!("failed to freeze and flush: {e:#}"); - } } - self.shutdown().await; - } - - /// Shut down immediately, without waiting for any open layers to flush to disk. This is a subset of - /// the graceful [`Timeline::flush_and_shutdown`] function. - pub(crate) async fn shutdown(&self) { - debug_assert_current_span_has_tenant_and_timeline_id(); - // Signal any subscribers to our cancellation token to drop out tracing::debug!("Cancelling CancellationToken"); self.cancel.cancel(); - // Page request handlers might be waiting for LSN to advance: they do not respect Timeline::cancel - // while doing so. - self.last_record_lsn.shutdown(); - - // Shut down the layer flush task before the remote client, as one depends on the other - task_mgr::shutdown_tasks( - Some(TaskKind::LayerFlushTask), - Some(self.tenant_shard_id), - Some(self.timeline_id), - ) - .await; - - // Shut down remote timeline client: this gracefully moves its metadata into its Stopping state in - // case our caller wants to use that for a deletion + // Transition the remote_client into a state where it's only useful for timeline deletion. + // (The deletion use case is why we can't just hook up remote_client to Self::cancel).) if let Some(remote_client) = self.remote_client.as_ref() { remote_client.stop(); + // As documented in remote_client.stop()'s doc comment, it's our responsibility + // to shut down the upload queue tasks. + // TODO: fix that, task management should be encapsulated inside remote_client. + task_mgr::shutdown_tasks( + Some(TaskKind::RemoteUploadTask), + Some(self.tenant_shard_id), + Some(self.timeline_id), + ) + .await; } + // TODO: work toward making this a no-op. See this funciton's doc comment for more context. tracing::debug!("Waiting for tasks..."); - task_mgr::shutdown_tasks(None, Some(self.tenant_shard_id), Some(self.timeline_id)).await; - // Finally wait until any gate-holders are complete + // Finally wait until any gate-holders are complete. + // + // TODO: once above shutdown_tasks is a no-op, we can close the gate before calling shutdown_tasks + // and use a TBD variant of shutdown_tasks that asserts that there were no tasks left. self.gate.close().await; self.metrics.shutdown(); @@ -1521,7 +1622,7 @@ impl Timeline { checkpoint_distance: u64, projected_lsn: Lsn, last_freeze_at: Lsn, - last_freeze_ts: Instant, + opened_at: Instant, ) -> bool { let distance = projected_lsn.widening_sub(last_freeze_at); @@ -1547,13 +1648,13 @@ impl Timeline { ); true - } else if distance > 0 && last_freeze_ts.elapsed() >= self.get_checkpoint_timeout() { + } else if distance > 0 && opened_at.elapsed() >= self.get_checkpoint_timeout() { info!( - "Will roll layer at {} with layer size {} due to time since last flush ({:?})", - projected_lsn, - layer_size, - last_freeze_ts.elapsed() - ); + "Will roll layer at {} with layer size {} due to time since first write to the layer ({:?})", + projected_lsn, + layer_size, + opened_at.elapsed() + ); true } else { @@ -1568,57 +1669,65 @@ const REPARTITION_FREQ_IN_CHECKPOINT_DISTANCE: u64 = 10; // Private functions impl Timeline { pub(crate) fn get_lazy_slru_download(&self) -> bool { - let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf.clone(); + let tenant_conf = self.tenant_conf.load(); tenant_conf + .tenant_conf .lazy_slru_download .unwrap_or(self.conf.default_tenant_conf.lazy_slru_download) } fn get_checkpoint_distance(&self) -> u64 { - let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf.clone(); + let tenant_conf = self.tenant_conf.load(); tenant_conf + .tenant_conf .checkpoint_distance .unwrap_or(self.conf.default_tenant_conf.checkpoint_distance) } fn get_checkpoint_timeout(&self) -> Duration { - let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf.clone(); + let tenant_conf = self.tenant_conf.load(); tenant_conf + .tenant_conf .checkpoint_timeout .unwrap_or(self.conf.default_tenant_conf.checkpoint_timeout) } fn get_compaction_target_size(&self) -> u64 { - let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf.clone(); + let tenant_conf = self.tenant_conf.load(); tenant_conf + .tenant_conf .compaction_target_size .unwrap_or(self.conf.default_tenant_conf.compaction_target_size) } fn get_compaction_threshold(&self) -> usize { - let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf.clone(); + let tenant_conf = self.tenant_conf.load(); tenant_conf + .tenant_conf .compaction_threshold .unwrap_or(self.conf.default_tenant_conf.compaction_threshold) } fn get_image_creation_threshold(&self) -> usize { - let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf.clone(); + let tenant_conf = self.tenant_conf.load(); tenant_conf + .tenant_conf .image_creation_threshold .unwrap_or(self.conf.default_tenant_conf.image_creation_threshold) } fn get_compaction_algorithm(&self) -> CompactionAlgorithm { - let tenant_conf = &self.tenant_conf.read().unwrap().tenant_conf; + let tenant_conf = &self.tenant_conf.load(); tenant_conf + .tenant_conf .compaction_algorithm .unwrap_or(self.conf.default_tenant_conf.compaction_algorithm) } fn get_eviction_policy(&self) -> EvictionPolicy { - let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf.clone(); + let tenant_conf = self.tenant_conf.load(); tenant_conf + .tenant_conf .eviction_policy .unwrap_or(self.conf.default_tenant_conf.eviction_policy) } @@ -1632,14 +1741,26 @@ impl Timeline { .unwrap_or(default_tenant_conf.evictions_low_residence_duration_metric_threshold) } - pub(super) fn tenant_conf_updated(&self) { + fn get_image_layer_creation_check_threshold(&self) -> u8 { + let tenant_conf = self.tenant_conf.load(); + tenant_conf + .tenant_conf + .image_layer_creation_check_threshold + .unwrap_or( + self.conf + .default_tenant_conf + .image_layer_creation_check_threshold, + ) + } + + pub(super) fn tenant_conf_updated(&self, new_conf: &TenantConfOpt) { // NB: Most tenant conf options are read by background loops, so, // changes will automatically be picked up. // The threshold is embedded in the metric. So, we need to update it. { let new_threshold = Self::get_evictions_low_residence_duration_metric_threshold( - &self.tenant_conf.read().unwrap().tenant_conf, + new_conf, &self.conf.default_tenant_conf, ); @@ -1666,7 +1787,7 @@ impl Timeline { #[allow(clippy::too_many_arguments)] pub(super) fn new( conf: &'static PageServerConf, - tenant_conf: Arc>, + tenant_conf: Arc>, metadata: &TimelineMetadata, ancestor: Option>, timeline_id: TimelineId, @@ -1682,17 +1803,16 @@ impl Timeline { let disk_consistent_lsn = metadata.disk_consistent_lsn(); let (state, _) = watch::channel(state); - let (layer_flush_start_tx, _) = tokio::sync::watch::channel(0); + let (layer_flush_start_tx, _) = tokio::sync::watch::channel((0, disk_consistent_lsn)); let (layer_flush_done_tx, _) = tokio::sync::watch::channel((0, Ok(()))); - let tenant_conf_guard = tenant_conf.read().unwrap(); - - let evictions_low_residence_duration_metric_threshold = + let evictions_low_residence_duration_metric_threshold = { + let loaded_tenant_conf = tenant_conf.load(); Self::get_evictions_low_residence_duration_metric_threshold( - &tenant_conf_guard.tenant_conf, + &loaded_tenant_conf.tenant_conf, &conf.default_tenant_conf, - ); - drop(tenant_conf_guard); + ) + }; Arc::new_cyclic(|myself| { let mut result = Timeline { @@ -1769,6 +1889,7 @@ impl Timeline { }, partitioning: tokio::sync::Mutex::new((KeyPartitioning::new(), Lsn(0))), repartition_threshold: 0, + last_image_layer_creation_check_at: AtomicLsn::new(0), last_received_wal: Mutex::new(None), rel_size_cache: RwLock::new(HashMap::new()), @@ -1797,6 +1918,7 @@ impl Timeline { }; result.repartition_threshold = result.get_checkpoint_distance() / REPARTITION_FREQ_IN_CHECKPOINT_DISTANCE; + result .metrics .last_record_gauge @@ -1873,20 +1995,19 @@ impl Timeline { self.timeline_id, self.tenant_shard_id ); - let tenant_conf_guard = self.tenant_conf.read().unwrap(); - let wal_connect_timeout = tenant_conf_guard + let tenant_conf = self.tenant_conf.load(); + let wal_connect_timeout = tenant_conf .tenant_conf .walreceiver_connect_timeout .unwrap_or(self.conf.default_tenant_conf.walreceiver_connect_timeout); - let lagging_wal_timeout = tenant_conf_guard + let lagging_wal_timeout = tenant_conf .tenant_conf .lagging_wal_timeout .unwrap_or(self.conf.default_tenant_conf.lagging_wal_timeout); - let max_lsn_wal_lag = tenant_conf_guard + let max_lsn_wal_lag = tenant_conf .tenant_conf .max_lsn_wal_lag .unwrap_or(self.conf.default_tenant_conf.max_lsn_wal_lag); - drop(tenant_conf_guard); let mut guard = self.walreceiver.lock().unwrap(); assert!( @@ -2116,7 +2237,7 @@ impl Timeline { priority: GetLogicalSizePriority, ctx: &RequestContext, ) -> logical_size::CurrentLogicalSize { - if !self.tenant_shard_id.is_zero() { + if !self.tenant_shard_id.is_shard_zero() { // Logical size is only accurately maintained on shard zero: when called elsewhere, for example // when HTTP API is serving a GET for timeline zero, return zero return logical_size::CurrentLogicalSize::Approximate(logical_size::Approximate::zero()); @@ -2412,7 +2533,7 @@ impl Timeline { crate::span::debug_assert_current_span_has_tenant_and_timeline_id(); // We should never be calculating logical sizes on shard !=0, because these shards do not have // accurate relation sizes, and they do not emit consumption metrics. - debug_assert!(self.tenant_shard_id.is_zero()); + debug_assert!(self.tenant_shard_id.is_shard_zero()); let guard = self .gate @@ -2434,10 +2555,6 @@ impl Timeline { debug!("cancelling logical size calculation for timeline shutdown"); calculation.await } - _ = task_mgr::shutdown_watcher() => { - debug!("cancelling logical size calculation for task shutdown"); - calculation.await - } } } @@ -2892,16 +3009,6 @@ impl Timeline { let mut completed_keyspace = KeySpace::default(); - // Hold the layer map whilst visiting the timeline to prevent - // compaction, eviction and flushes from rendering the layers unreadable. - // - // TODO: Do we actually need to do this? In theory holding on - // to [`tenant::storage_layer::Layer`] should be enough. However, - // [`Timeline::get`] also holds the lock during IO, so more investigation - // is needed. - let guard = timeline.layers.read().await; - let layers = guard.layer_map(); - loop { if cancel.is_cancelled() { return Err(GetVectoredError::Cancelled); @@ -2911,6 +3018,9 @@ impl Timeline { unmapped_keyspace.remove_overlapping_with(&keys_done_last_step); completed_keyspace.merge(&keys_done_last_step); + let guard = timeline.layers.read().await; + let layers = guard.layer_map(); + let in_memory_layer = layers.find_in_memory_layer(|l| { let start_lsn = l.get_lsn_range().start; cont_lsn > start_lsn @@ -2918,12 +3028,11 @@ impl Timeline { match in_memory_layer { Some(l) => { + let lsn_range = l.get_lsn_range().start..cont_lsn; fringe.update( - ReadableLayerDesc::InMemory { - handle: l, - lsn_ceil: cont_lsn, - }, + ReadableLayer::InMemoryLayer(l), unmapped_keyspace.clone(), + lsn_range, ); } None => { @@ -2935,30 +3044,43 @@ impl Timeline { .into_iter() .map(|(SearchResult { layer, lsn_floor }, keyspace_accum)| { ( - ReadableLayerDesc::Persistent { - desc: (*layer).clone(), - lsn_range: lsn_floor..cont_lsn, - }, + ReadableLayer::PersistentLayer(guard.get_from_desc(&layer)), keyspace_accum.to_keyspace(), + lsn_floor..cont_lsn, ) }) - .for_each(|(layer, keyspace)| fringe.update(layer, keyspace)); + .for_each(|(layer, keyspace, lsn_range)| { + fringe.update(layer, keyspace, lsn_range) + }); } } } - if let Some((layer_to_read, keyspace_to_read)) = fringe.next_layer() { + // It's safe to drop the layer map lock after planning the next round of reads. + // The fringe keeps readable handles for the layers which are safe to read even + // if layers were compacted or flushed. + // + // The more interesting consideration is: "Why is the read algorithm still correct + // if the layer map changes while it is operating?". Doing a vectored read on a + // timeline boils down to pushing an imaginary lsn boundary downwards for each range + // covered by the read. The layer map tells us how to move the lsn downwards for a + // range at *a particular point in time*. It is fine for the answer to be different + // at two different time points. + drop(guard); + + if let Some((layer_to_read, keyspace_to_read, lsn_range)) = fringe.next_layer() { + let next_cont_lsn = lsn_range.start; layer_to_read .get_values_reconstruct_data( - &guard, keyspace_to_read.clone(), + lsn_range, reconstruct_state, ctx, ) .await?; unmapped_keyspace = keyspace_to_read; - cont_lsn = layer_to_read.get_lsn_floor(); + cont_lsn = next_cont_lsn; } else { break; } @@ -3036,7 +3158,7 @@ impl Timeline { } } ancestor - .wait_lsn(self.ancestor_lsn, ctx) + .wait_lsn(self.ancestor_lsn, WaitLsnWaiter::Timeline(self), ctx) .await .map_err(|e| match e { e @ WaitLsnError::Timeout(_) => GetReadyAncestorError::AncestorLsnTimeout(e), @@ -3086,7 +3208,9 @@ impl Timeline { self.last_record_lsn.advance(new_lsn); } - async fn freeze_inmem_layer(&self, write_lock_held: bool) { + /// Whether there was a layer to freeze or not, return the value of get_last_record_lsn + /// before we attempted the freeze: this guarantees that ingested data is frozen up to this lsn (inclusive). + async fn freeze_inmem_layer(&self, write_lock_held: bool) -> Lsn { // Freeze the current open in-memory layer. It will be written to disk on next // iteration. @@ -3096,7 +3220,9 @@ impl Timeline { Some(self.write_lock.lock().await) }; - self.freeze_inmem_layer_at(self.get_last_record_lsn()).await; + let to_lsn = self.get_last_record_lsn(); + self.freeze_inmem_layer_at(to_lsn).await; + to_lsn } async fn freeze_inmem_layer_at(&self, at: Lsn) { @@ -3109,25 +3235,24 @@ impl Timeline { /// Layer flusher task's main loop. async fn flush_loop( self: &Arc, - mut layer_flush_start_rx: tokio::sync::watch::Receiver, + mut layer_flush_start_rx: tokio::sync::watch::Receiver<(u64, Lsn)>, ctx: &RequestContext, ) { info!("started flush loop"); loop { tokio::select! { _ = self.cancel.cancelled() => { - info!("shutting down layer flush task"); - break; - }, - _ = task_mgr::shutdown_watcher() => { - info!("shutting down layer flush task"); + info!("shutting down layer flush task due to Timeline::cancel"); break; }, _ = layer_flush_start_rx.changed() => {} } - trace!("waking up"); - let flush_counter = *layer_flush_start_rx.borrow(); + let (flush_counter, frozen_to_lsn) = *layer_flush_start_rx.borrow(); + + // The highest LSN to which we flushed in the loop over frozen layers + let mut flushed_to_lsn = Lsn(0); + let result = loop { if self.cancel.is_cancelled() { info!("dropping out of flush loop for timeline shutdown"); @@ -3148,7 +3273,9 @@ impl Timeline { break Ok(()); }; match self.flush_frozen_layer(layer_to_flush, ctx).await { - Ok(()) => {} + Ok(this_layer_to_lsn) => { + flushed_to_lsn = std::cmp::max(flushed_to_lsn, this_layer_to_lsn); + } Err(FlushLayerError::Cancelled) => { info!("dropping out of flush loop for timeline shutdown"); return; @@ -3157,11 +3284,36 @@ impl Timeline { FlushLayerError::Other(_) | FlushLayerError::CreateImageLayersError(_), ) => { error!("could not flush frozen layer: {err:?}"); - break err; + break err.map(|_| ()); } } timer.stop_and_record(); }; + + // Unsharded tenants should never advance their LSN beyond the end of the + // highest layer they write: such gaps between layer data and the frozen LSN + // are only legal on sharded tenants. + debug_assert!( + self.shard_identity.count.count() > 1 + || flushed_to_lsn >= frozen_to_lsn + || !flushed_to_lsn.is_valid() + ); + + if flushed_to_lsn < frozen_to_lsn && self.shard_identity.count.count() > 1 { + // If our layer flushes didn't carry disk_consistent_lsn up to the `to_lsn` advertised + // to us via layer_flush_start_rx, then advance it here. + // + // This path is only taken for tenants with multiple shards: single sharded tenants should + // never encounter a gap in the wal. + let old_disk_consistent_lsn = self.disk_consistent_lsn.load(); + tracing::debug!("Advancing disk_consistent_lsn across layer gap {old_disk_consistent_lsn}->{frozen_to_lsn}"); + if self.set_disk_consistent_lsn(frozen_to_lsn) { + if let Err(e) = self.schedule_uploads(frozen_to_lsn, vec![]) { + tracing::warn!("Failed to schedule metadata upload after updating disk_consistent_lsn: {e}"); + } + } + } + // Notify any listeners that we're done let _ = self .layer_flush_done_tx @@ -3169,7 +3321,13 @@ impl Timeline { } } - async fn flush_frozen_layers_and_wait(&self) -> anyhow::Result<()> { + /// Request the flush loop to write out all frozen layers up to `to_lsn` as Delta L0 files to disk. + /// The caller is responsible for the freezing, e.g., [`Self::freeze_inmem_layer`]. + /// + /// `last_record_lsn` may be higher than the highest LSN of a frozen layer: if this is the case, + /// it means no data will be written between the top of the highest frozen layer and to_lsn, + /// e.g. because this tenant shard has ingested up to to_lsn and not written any data locally for that part of the WAL. + async fn flush_frozen_layers_and_wait(&self, last_record_lsn: Lsn) -> anyhow::Result<()> { let mut rx = self.layer_flush_done_tx.subscribe(); // Increment the flush cycle counter and wake up the flush task. @@ -3183,9 +3341,10 @@ impl Timeline { anyhow::bail!("cannot flush frozen layers when flush_loop is not running, state is {flush_loop_state:?}") } - self.layer_flush_start_tx.send_modify(|counter| { + self.layer_flush_start_tx.send_modify(|(counter, lsn)| { my_flush_request = *counter + 1; *counter = my_flush_request; + *lsn = std::cmp::max(last_record_lsn, *lsn); }); loop { @@ -3222,16 +3381,22 @@ impl Timeline { } fn flush_frozen_layers(&self) { - self.layer_flush_start_tx.send_modify(|val| *val += 1); + self.layer_flush_start_tx.send_modify(|(counter, lsn)| { + *counter += 1; + + *lsn = std::cmp::max(*lsn, Lsn(self.last_freeze_at.load().0 - 1)); + }); } /// Flush one frozen in-memory layer to disk, as a new delta layer. + /// + /// Return value is the last lsn (inclusive) of the layer that was frozen. #[instrument(skip_all, fields(layer=%frozen_layer))] async fn flush_frozen_layer( self: &Arc, frozen_layer: Arc, ctx: &RequestContext, - ) -> Result<(), FlushLayerError> { + ) -> Result { debug_assert_current_span_has_tenant_and_timeline_id(); // As a special case, when we have just imported an image into the repository, @@ -3306,7 +3471,6 @@ impl Timeline { } let disk_consistent_lsn = Lsn(lsn_range.end.0 - 1); - let old_disk_consistent_lsn = self.disk_consistent_lsn.load(); // The new on-disk layers are now in the layer map. We can remove the // in-memory layer from the map now. The flushed layer is stored in @@ -3320,10 +3484,7 @@ impl Timeline { guard.finish_flush_l0_layer(delta_layer_to_add.as_ref(), &frozen_layer, &self.metrics); - if disk_consistent_lsn != old_disk_consistent_lsn { - assert!(disk_consistent_lsn > old_disk_consistent_lsn); - self.disk_consistent_lsn.store(disk_consistent_lsn); - + if self.set_disk_consistent_lsn(disk_consistent_lsn) { // Schedule remote uploads that will reflect our new disk_consistent_lsn self.schedule_uploads(disk_consistent_lsn, layers_to_upload)?; } @@ -3340,7 +3501,22 @@ impl Timeline { // This failpoint is used by another test case `test_pageserver_recovery`. fail_point!("flush-frozen-exit"); - Ok(()) + Ok(Lsn(lsn_range.end.0 - 1)) + } + + /// Return true if the value changed + /// + /// This function must only be used from the layer flush task, and may not be called concurrently. + fn set_disk_consistent_lsn(&self, new_value: Lsn) -> bool { + // We do a simple load/store cycle: that's why this function isn't safe for concurrent use. + let old_value = self.disk_consistent_lsn.load(); + if new_value != old_value { + assert!(new_value >= old_value); + self.disk_consistent_lsn.store(new_value); + true + } else { + false + } } /// Update metadata file @@ -3501,6 +3677,24 @@ impl Timeline { // Is it time to create a new image layer for the given partition? async fn time_for_new_image_layer(&self, partition: &KeySpace, lsn: Lsn) -> bool { + let last = self.last_image_layer_creation_check_at.load(); + if lsn != Lsn(0) { + let distance = lsn + .checked_sub(last) + .expect("Attempt to compact with LSN going backwards"); + + let min_distance = self.get_image_layer_creation_check_threshold() as u64 + * self.get_checkpoint_distance(); + + // Skip the expensive delta layer counting below if we've not ingested + // sufficient WAL since the last check. + if distance.0 < min_distance { + return false; + } + } + + self.last_image_layer_creation_check_at.store(lsn); + let threshold = self.get_image_creation_threshold(); let guard = self.layers.read().await; @@ -3842,6 +4036,24 @@ impl Timeline { Ok(()) } + /// Schedules the uploads of the given image layers + fn upload_new_image_layers( + self: &Arc, + new_images: impl IntoIterator, + ) -> anyhow::Result<()> { + let Some(remote_client) = &self.remote_client else { + return Ok(()); + }; + for layer in new_images { + remote_client.schedule_layer_file_upload(layer)?; + } + // should any new image layer been created, not uploading index_part will + // result in a mismatch between remote_physical_size and layermap calculated + // size, which will fail some tests, but should not be an issue otherwise. + remote_client.schedule_index_upload_for_file_changes()?; + Ok(()) + } + /// Update information about which layer files need to be retained on /// garbage collection. This is separate from actually performing the GC, /// and is updated more frequently, so that compaction can remove obsolete @@ -4491,23 +4703,16 @@ struct TimelineWriterState { max_lsn: Option, // Cached details of the last freeze. Avoids going trough the atomic/lock on every put. cached_last_freeze_at: Lsn, - cached_last_freeze_ts: Instant, } impl TimelineWriterState { - fn new( - open_layer: Arc, - current_size: u64, - last_freeze_at: Lsn, - last_freeze_ts: Instant, - ) -> Self { + fn new(open_layer: Arc, current_size: u64, last_freeze_at: Lsn) -> Self { Self { open_layer, current_size, prev_lsn: None, max_lsn: None, cached_last_freeze_at: last_freeze_at, - cached_last_freeze_ts: last_freeze_ts, } } } @@ -4606,12 +4811,10 @@ impl<'a> TimelineWriter<'a> { let initial_size = layer.size().await?; let last_freeze_at = self.last_freeze_at.load(); - let last_freeze_ts = *self.last_freeze_ts.read().unwrap(); self.write_guard.replace(TimelineWriterState::new( layer, initial_size, last_freeze_at, - last_freeze_ts, )); Ok(()) @@ -4658,7 +4861,7 @@ impl<'a> TimelineWriter<'a> { self.get_checkpoint_distance(), lsn, state.cached_last_freeze_at, - state.cached_last_freeze_ts, + state.open_layer.get_opened_at(), ) { OpenLayerAction::Roll } else { diff --git a/pageserver/src/tenant/timeline/compaction.rs b/pageserver/src/tenant/timeline/compaction.rs index 74b75dabf0..8075775bbc 100644 --- a/pageserver/src/tenant/timeline/compaction.rs +++ b/pageserver/src/tenant/timeline/compaction.rs @@ -12,7 +12,6 @@ use super::layer_manager::LayerManager; use super::{CompactFlags, DurationRecorder, RecordedDuration, Timeline}; use anyhow::{anyhow, Context}; -use async_trait::async_trait; use enumset::EnumSet; use fail::fail_point; use itertools::Itertools; @@ -125,18 +124,8 @@ impl Timeline { ) .await .map_err(anyhow::Error::from)?; - if let Some(remote_client) = &self.remote_client { - for layer in layers { - remote_client.schedule_layer_file_upload(layer)?; - } - } - if let Some(remote_client) = &self.remote_client { - // should any new image layer been created, not uploading index_part will - // result in a mismatch between remote_physical_size and layermap calculated - // size, which will fail some tests, but should not be an issue otherwise. - remote_client.schedule_index_upload_for_file_changes()?; - } + self.upload_new_image_layers(layers)?; } Err(err) => { // no partitioning? This is normal, if the timeline was just created @@ -818,7 +807,10 @@ impl TimelineAdaptor { self.timeline .finish_compact_batch(&self.new_deltas, &self.new_images, &layers_to_delete) .await?; - self.new_images.clear(); + + self.timeline + .upload_new_image_layers(std::mem::take(&mut self.new_images))?; + self.new_deltas.clear(); self.layers_to_delete.clear(); Ok(()) @@ -1129,7 +1121,6 @@ impl CompactionLayer for ResidentDeltaLayer { } } -#[async_trait] impl CompactionDeltaLayer for ResidentDeltaLayer { type DeltaEntry<'a> = DeltaEntry<'a>; diff --git a/pageserver/src/tenant/timeline/delete.rs b/pageserver/src/tenant/timeline/delete.rs index ab0a88c764..af10c1c84b 100644 --- a/pageserver/src/tenant/timeline/delete.rs +++ b/pageserver/src/tenant/timeline/delete.rs @@ -6,7 +6,7 @@ use std::{ use anyhow::Context; use pageserver_api::{models::TimelineState, shard::TenantShardId}; use tokio::sync::OwnedMutexGuard; -use tracing::{debug, error, info, instrument, Instrument}; +use tracing::{error, info, instrument, Instrument}; use utils::{crashsafe, fs_ext, id::TimelineId}; use crate::{ @@ -14,7 +14,6 @@ use crate::{ deletion_queue::DeletionQueueClient, task_mgr::{self, TaskKind}, tenant::{ - debug_assert_current_span_has_tenant_and_timeline_id, metadata::TimelineMetadata, remote_timeline_client::{PersistIndexPartWithDeletedFlagError, RemoteTimelineClient}, CreateTimelineCause, DeleteTimelineError, Tenant, @@ -23,58 +22,6 @@ use crate::{ use super::{Timeline, TimelineResources}; -/// Now that the Timeline is in Stopping state, request all the related tasks to shut down. -async fn stop_tasks(timeline: &Timeline) -> Result<(), DeleteTimelineError> { - debug_assert_current_span_has_tenant_and_timeline_id(); - // Notify any timeline work to drop out of loops/requests - tracing::debug!("Cancelling CancellationToken"); - timeline.cancel.cancel(); - - // Stop the walreceiver first. - debug!("waiting for wal receiver to shutdown"); - let maybe_started_walreceiver = { timeline.walreceiver.lock().unwrap().take() }; - if let Some(walreceiver) = maybe_started_walreceiver { - walreceiver.stop().await; - } - debug!("wal receiver shutdown confirmed"); - - // Shut down the layer flush task before the remote client, as one depends on the other - task_mgr::shutdown_tasks( - Some(TaskKind::LayerFlushTask), - Some(timeline.tenant_shard_id), - Some(timeline.timeline_id), - ) - .await; - - // Prevent new uploads from starting. - if let Some(remote_client) = timeline.remote_client.as_ref() { - remote_client.stop(); - } - - // Stop & wait for the remaining timeline tasks, including upload tasks. - // NB: This and other delete_timeline calls do not run as a task_mgr task, - // so, they are not affected by this shutdown_tasks() call. - info!("waiting for timeline tasks to shutdown"); - task_mgr::shutdown_tasks( - None, - Some(timeline.tenant_shard_id), - Some(timeline.timeline_id), - ) - .await; - - fail::fail_point!("timeline-delete-before-index-deleted-at", |_| { - Err(anyhow::anyhow!( - "failpoint: timeline-delete-before-index-deleted-at" - ))? - }); - - tracing::debug!("Waiting for gate..."); - timeline.gate.close().await; - tracing::debug!("Shutdown complete"); - - Ok(()) -} - /// Mark timeline as deleted in S3 so we won't pick it up next time /// during attach or pageserver restart. /// See comment in persist_index_part_with_deleted_flag. @@ -268,7 +215,14 @@ impl DeleteTimelineFlow { guard.mark_in_progress()?; - stop_tasks(&timeline).await?; + // Now that the Timeline is in Stopping state, request all the related tasks to shut down. + timeline.shutdown(super::ShutdownMode::Hard).await; + + fail::fail_point!("timeline-delete-before-index-deleted-at", |_| { + Err(anyhow::anyhow!( + "failpoint: timeline-delete-before-index-deleted-at" + ))? + }); set_deleted_in_remote_index(&timeline).await?; diff --git a/pageserver/src/tenant/timeline/eviction_task.rs b/pageserver/src/tenant/timeline/eviction_task.rs index ebcd70bd39..304d0d60ee 100644 --- a/pageserver/src/tenant/timeline/eviction_task.rs +++ b/pageserver/src/tenant/timeline/eviction_task.rs @@ -67,20 +67,19 @@ impl Timeline { ), false, async move { - let cancel = task_mgr::shutdown_token(); tokio::select! { - _ = cancel.cancelled() => { return Ok(()); } + _ = self_clone.cancel.cancelled() => { return Ok(()); } _ = completion::Barrier::maybe_wait(background_tasks_can_start) => {} }; - self_clone.eviction_task(parent, cancel).await; + self_clone.eviction_task(parent).await; Ok(()) }, ); } #[instrument(skip_all, fields(tenant_id = %self.tenant_shard_id.tenant_id, shard_id = %self.tenant_shard_id.shard_slug(), timeline_id = %self.timeline_id))] - async fn eviction_task(self: Arc, tenant: Arc, cancel: CancellationToken) { + async fn eviction_task(self: Arc, tenant: Arc) { use crate::tenant::tasks::random_init_delay; // acquire the gate guard only once within a useful span @@ -95,7 +94,7 @@ impl Timeline { EvictionPolicy::OnlyImitiate(lat) => lat.period, EvictionPolicy::NoEviction => Duration::from_secs(10), }; - if random_init_delay(period, &cancel).await.is_err() { + if random_init_delay(period, &self.cancel).await.is_err() { return; } } @@ -104,13 +103,13 @@ impl Timeline { loop { let policy = self.get_eviction_policy(); let cf = self - .eviction_iteration(&tenant, &policy, &cancel, &guard, &ctx) + .eviction_iteration(&tenant, &policy, &self.cancel, &guard, &ctx) .await; match cf { ControlFlow::Break(()) => break, ControlFlow::Continue(sleep_until) => { - if tokio::time::timeout_at(sleep_until, cancel.cancelled()) + if tokio::time::timeout_at(sleep_until, self.cancel.cancelled()) .await .is_ok() { @@ -379,7 +378,7 @@ impl Timeline { gate: &GateGuard, ctx: &RequestContext, ) -> ControlFlow<()> { - if !self.tenant_shard_id.is_zero() { + if !self.tenant_shard_id.is_shard_zero() { // Shards !=0 do not maintain accurate relation sizes, and do not need to calculate logical size // for consumption metrics (consumption metrics are only sent from shard 0). We may therefore // skip imitating logical size accesses for eviction purposes. diff --git a/pageserver/src/tenant/timeline/layer_manager.rs b/pageserver/src/tenant/timeline/layer_manager.rs index d54dc1642c..64edcc5e40 100644 --- a/pageserver/src/tenant/timeline/layer_manager.rs +++ b/pageserver/src/tenant/timeline/layer_manager.rs @@ -120,9 +120,10 @@ impl LayerManager { /// Called from `freeze_inmem_layer`, returns true if successfully frozen. pub(crate) async fn try_freeze_in_memory_layer( &mut self, - Lsn(last_record_lsn): Lsn, + lsn: Lsn, last_freeze_at: &AtomicLsn, ) { + let Lsn(last_record_lsn) = lsn; let end_lsn = Lsn(last_record_lsn + 1); if let Some(open_layer) = &self.layer_map.open_layer { @@ -135,8 +136,11 @@ impl LayerManager { self.layer_map.frozen_layers.push_back(open_layer_rc); self.layer_map.open_layer = None; self.layer_map.next_open_layer_at = Some(end_lsn); - last_freeze_at.store(end_lsn); } + + // Even if there was no layer to freeze, advance last_freeze_at to last_record_lsn+1: this + // accounts for regions in the LSN range where we might have ingested no data due to sharding. + last_freeze_at.store(end_lsn); } /// Add image layers to the layer map, called from `create_image_layers`. diff --git a/pageserver/src/tenant/timeline/walreceiver.rs b/pageserver/src/tenant/timeline/walreceiver.rs index f1b62067f9..a085154a5a 100644 --- a/pageserver/src/tenant/timeline/walreceiver.rs +++ b/pageserver/src/tenant/timeline/walreceiver.rs @@ -24,13 +24,12 @@ mod connection_manager; mod walreceiver_connection; use crate::context::{DownloadBehavior, RequestContext}; -use crate::task_mgr::{self, TaskKind, WALRECEIVER_RUNTIME}; +use crate::task_mgr::{TaskKind, WALRECEIVER_RUNTIME}; use crate::tenant::debug_assert_current_span_has_tenant_and_timeline_id; use crate::tenant::timeline::walreceiver::connection_manager::{ connection_manager_loop_step, ConnectionManagerState, }; -use pageserver_api::shard::TenantShardId; use std::future::Future; use std::num::NonZeroU64; use std::sync::Arc; @@ -40,8 +39,6 @@ use tokio::sync::watch; use tokio_util::sync::CancellationToken; use tracing::*; -use utils::id::TimelineId; - use self::connection_manager::ConnectionManagerStatus; use super::Timeline; @@ -60,9 +57,10 @@ pub struct WalReceiverConf { } pub struct WalReceiver { - tenant_shard_id: TenantShardId, - timeline_id: TimelineId, manager_status: Arc>>, + /// All task spawned by [`WalReceiver::start`] and its children are sensitive to this token. + /// It's a child token of [`Timeline`] so that timeline shutdown can cancel WalReceiver tasks early for `freeze_and_flush=true`. + cancel: CancellationToken, } impl WalReceiver { @@ -76,23 +74,23 @@ impl WalReceiver { let timeline_id = timeline.timeline_id; let walreceiver_ctx = ctx.detached_child(TaskKind::WalReceiverManager, DownloadBehavior::Error); - let loop_status = Arc::new(std::sync::RwLock::new(None)); let manager_status = Arc::clone(&loop_status); - task_mgr::spawn( - WALRECEIVER_RUNTIME.handle(), - TaskKind::WalReceiverManager, - Some(timeline.tenant_shard_id), - Some(timeline_id), - &format!("walreceiver for timeline {tenant_shard_id}/{timeline_id}"), - false, + let cancel = timeline.cancel.child_token(); + WALRECEIVER_RUNTIME.spawn({ + let cancel = cancel.clone(); async move { debug_assert_current_span_has_tenant_and_timeline_id(); + // acquire timeline gate so we know the task doesn't outlive the Timeline + let Ok(_guard) = timeline.gate.enter() else { + debug!("WAL receiver manager could not enter the gate timeline gate, it's closed already"); + return; + }; debug!("WAL receiver manager started, connecting to broker"); - let cancel = task_mgr::shutdown_token(); let mut connection_manager_state = ConnectionManagerState::new( timeline, conf, + cancel.clone(), ); while !cancel.is_cancelled() { let loop_step_result = connection_manager_loop_step( @@ -112,25 +110,22 @@ impl WalReceiver { } connection_manager_state.shutdown().await; *loop_status.write().unwrap() = None; - Ok(()) + debug!("task exits"); } .instrument(info_span!(parent: None, "wal_connection_manager", tenant_id = %tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug(), timeline_id = %timeline_id)) - ); + }); Self { - tenant_shard_id, - timeline_id, manager_status, + cancel, } } - pub async fn stop(self) { - task_mgr::shutdown_tasks( - Some(TaskKind::WalReceiverManager), - Some(self.tenant_shard_id), - Some(self.timeline_id), - ) - .await; + #[instrument(skip_all, level = tracing::Level::DEBUG)] + pub fn cancel(&self) { + debug_assert_current_span_has_tenant_and_timeline_id(); + debug!("cancelling walreceiver tasks"); + self.cancel.cancel(); } pub(crate) fn status(&self) -> Option { @@ -164,14 +159,18 @@ enum TaskStateUpdate { impl TaskHandle { /// Initializes the task, starting it immediately after the creation. + /// + /// The second argument to `task` is a child token of `cancel_parent` ([`CancellationToken::child_token`]). + /// It being a child token enables us to provide a [`Self::shutdown`] method. fn spawn( + cancel_parent: &CancellationToken, task: impl FnOnce(watch::Sender>, CancellationToken) -> Fut + Send + 'static, ) -> Self where Fut: Future> + Send, E: Send + Sync + 'static, { - let cancellation = CancellationToken::new(); + let cancellation = cancel_parent.child_token(); let (events_sender, events_receiver) = watch::channel(TaskStateUpdate::Started); let cancellation_clone = cancellation.clone(); diff --git a/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs b/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs index 030d24a017..dae31934ad 100644 --- a/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs +++ b/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs @@ -280,6 +280,8 @@ pub(super) struct ConnectionManagerState { id: TenantTimelineId, /// Use pageserver data about the timeline to filter out some of the safekeepers. timeline: Arc, + /// Child token of [`super::WalReceiver::cancel`], inherited to all tasks we spawn. + cancel: CancellationToken, conf: WalReceiverConf, /// Current connection to safekeeper for WAL streaming. wal_connection: Option, @@ -402,7 +404,11 @@ struct BrokerSkTimeline { } impl ConnectionManagerState { - pub(super) fn new(timeline: Arc, conf: WalReceiverConf) -> Self { + pub(super) fn new( + timeline: Arc, + conf: WalReceiverConf, + cancel: CancellationToken, + ) -> Self { let id = TenantTimelineId { tenant_id: timeline.tenant_shard_id.tenant_id, timeline_id: timeline.timeline_id, @@ -410,6 +416,7 @@ impl ConnectionManagerState { Self { id, timeline, + cancel, conf, wal_connection: None, wal_stream_candidates: HashMap::new(), @@ -417,6 +424,22 @@ impl ConnectionManagerState { } } + fn spawn( + &self, + task: impl FnOnce( + tokio::sync::watch::Sender>, + CancellationToken, + ) -> Fut + + Send + + 'static, + ) -> TaskHandle + where + Fut: std::future::Future> + Send, + { + // TODO: get rid of TaskHandle + super::TaskHandle::spawn(&self.cancel, task) + } + /// Shuts down the current connection (if any) and immediately starts another one with the given connection string. async fn change_connection(&mut self, new_sk: NewWalConnectionCandidate, ctx: &RequestContext) { WALRECEIVER_SWITCHES @@ -435,7 +458,7 @@ impl ConnectionManagerState { ); let span = info_span!("connection", %node_id); - let connection_handle = TaskHandle::spawn(move |events_sender, cancellation| { + let connection_handle = self.spawn(move |events_sender, cancellation| { async move { debug_assert_current_span_has_tenant_and_timeline_id(); @@ -463,6 +486,12 @@ impl ConnectionManagerState { info!("walreceiver connection handling ended: {e}"); Ok(()) } + WalReceiverError::ClosedGate => { + info!( + "walreceiver connection handling ended because of closed gate" + ); + Ok(()) + } WalReceiverError::Other(e) => { // give out an error to have task_mgr give it a really verbose logging if cancellation.is_cancelled() { @@ -1016,7 +1045,7 @@ mod tests { sk_id: connected_sk_id, availability_zone: None, status: connection_status, - connection_task: TaskHandle::spawn(move |sender, _| async move { + connection_task: state.spawn(move |sender, _| async move { sender .send(TaskStateUpdate::Progress(connection_status)) .ok(); @@ -1184,7 +1213,7 @@ mod tests { sk_id: connected_sk_id, availability_zone: None, status: connection_status, - connection_task: TaskHandle::spawn(move |sender, _| async move { + connection_task: state.spawn(move |sender, _| async move { sender .send(TaskStateUpdate::Progress(connection_status)) .ok(); @@ -1251,7 +1280,7 @@ mod tests { sk_id: NodeId(1), availability_zone: None, status: connection_status, - connection_task: TaskHandle::spawn(move |sender, _| async move { + connection_task: state.spawn(move |sender, _| async move { sender .send(TaskStateUpdate::Progress(connection_status)) .ok(); @@ -1315,7 +1344,7 @@ mod tests { sk_id: NodeId(1), availability_zone: None, status: connection_status, - connection_task: TaskHandle::spawn(move |_, _| async move { Ok(()) }), + connection_task: state.spawn(move |_, _| async move { Ok(()) }), discovered_new_wal: Some(NewCommittedWAL { discovered_at: time_over_threshold, lsn: new_lsn, @@ -1371,6 +1400,7 @@ mod tests { timeline_id: TIMELINE_ID, }, timeline, + cancel: CancellationToken::new(), conf: WalReceiverConf { wal_connect_timeout: Duration::from_secs(1), lagging_wal_timeout: Duration::from_secs(1), @@ -1414,7 +1444,7 @@ mod tests { sk_id: connected_sk_id, availability_zone: None, status: connection_status, - connection_task: TaskHandle::spawn(move |sender, _| async move { + connection_task: state.spawn(move |sender, _| async move { sender .send(TaskStateUpdate::Progress(connection_status)) .ok(); diff --git a/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs b/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs index 00a9dbd760..c6ee6b90c4 100644 --- a/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs +++ b/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs @@ -27,7 +27,6 @@ use super::TaskStateUpdate; use crate::{ context::RequestContext, metrics::{LIVE_CONNECTIONS_COUNT, WALRECEIVER_STARTED_CONNECTIONS, WAL_INGEST}, - task_mgr, task_mgr::TaskKind, task_mgr::WALRECEIVER_RUNTIME, tenant::{debug_assert_current_span_has_tenant_and_timeline_id, Timeline, WalReceiverInfo}, @@ -37,8 +36,8 @@ use crate::{ use postgres_backend::is_expected_io_error; use postgres_connection::PgConnectionConfig; use postgres_ffi::waldecoder::WalStreamDecoder; -use utils::pageserver_feedback::PageserverFeedback; use utils::{id::NodeId, lsn::Lsn}; +use utils::{pageserver_feedback::PageserverFeedback, sync::gate::GateError}; /// Status of the connection. #[derive(Debug, Clone, Copy)] @@ -68,6 +67,7 @@ pub(super) enum WalReceiverError { SuccessfulCompletion(String), /// Generic error Other(anyhow::Error), + ClosedGate, } impl From for WalReceiverError { @@ -119,6 +119,16 @@ pub(super) async fn handle_walreceiver_connection( ) -> Result<(), WalReceiverError> { debug_assert_current_span_has_tenant_and_timeline_id(); + // prevent timeline shutdown from finishing until we have exited + let _guard = timeline.gate.enter().map_err(|e| match e { + GateError::GateClosed => WalReceiverError::ClosedGate, + })?; + // This function spawns a side-car task (WalReceiverConnectionPoller). + // Get its gate guard now as well. + let poller_guard = timeline.gate.enter().map_err(|e| match e { + GateError::GateClosed => WalReceiverError::ClosedGate, + })?; + WALRECEIVER_STARTED_CONNECTIONS.inc(); // Connect to the database in replication mode. @@ -156,22 +166,19 @@ pub(super) async fn handle_walreceiver_connection( } // The connection object performs the actual communication with the database, - // so spawn it off to run on its own. + // so spawn it off to run on its own. It shouldn't outlive this function, but, + // due to lack of async drop, we can't enforce that. However, we ensure that + // 1. it is sensitive to `cancellation` and + // 2. holds the Timeline gate open so that after timeline shutdown, + // we know this task is gone. let _connection_ctx = ctx.detached_child( TaskKind::WalReceiverConnectionPoller, ctx.download_behavior(), ); let connection_cancellation = cancellation.clone(); - task_mgr::spawn( - WALRECEIVER_RUNTIME.handle(), - TaskKind::WalReceiverConnectionPoller, - Some(timeline.tenant_shard_id), - Some(timeline.timeline_id), - "walreceiver connection", - false, + WALRECEIVER_RUNTIME.spawn( async move { debug_assert_current_span_has_tenant_and_timeline_id(); - select! { connection_result = connection => match connection_result { Ok(()) => debug!("Walreceiver db connection closed"), @@ -182,6 +189,9 @@ pub(super) async fn handle_walreceiver_connection( // with a similar error. }, WalReceiverError::SuccessfulCompletion(_) => {} + WalReceiverError::ClosedGate => { + // doesn't happen at runtime + } WalReceiverError::Other(err) => { warn!("Connection aborted: {err:#}") } @@ -190,7 +200,7 @@ pub(super) async fn handle_walreceiver_connection( }, _ = connection_cancellation.cancelled() => debug!("Connection cancelled"), } - Ok(()) + drop(poller_guard); } // Enrich the log lines emitted by this closure with meaningful context. // TODO: technically, this task outlives the surrounding function, so, the @@ -303,6 +313,7 @@ pub(super) async fn handle_walreceiver_connection( trace!("received XLogData between {startlsn} and {endlsn}"); + WAL_INGEST.bytes_received.inc_by(data.len() as u64); waldecoder.feed_bytes(data); { @@ -416,7 +427,7 @@ pub(super) async fn handle_walreceiver_connection( // Send the replication feedback message. // Regular standby_status_update fields are put into this message. - let current_timeline_size = if timeline.tenant_shard_id.is_zero() { + let current_timeline_size = if timeline.tenant_shard_id.is_shard_zero() { timeline .get_current_logical_size( crate::tenant::timeline::GetLogicalSizePriority::User, diff --git a/pageserver/src/tenant/vectored_blob_io.rs b/pageserver/src/tenant/vectored_blob_io.rs index 805f70b23b..3a6950cf88 100644 --- a/pageserver/src/tenant/vectored_blob_io.rs +++ b/pageserver/src/tenant/vectored_blob_io.rs @@ -61,7 +61,7 @@ pub struct VectoredRead { } impl VectoredRead { - fn size(&self) -> usize { + pub fn size(&self) -> usize { (self.end - self.start) as usize } } diff --git a/pageserver/src/utilization.rs b/pageserver/src/utilization.rs index 830c9897ca..e6c835aa75 100644 --- a/pageserver/src/utilization.rs +++ b/pageserver/src/utilization.rs @@ -15,11 +15,23 @@ pub(crate) fn regenerate(tenants_path: &Path) -> anyhow::Result 0 { + statvfs.fragment_size() + } else { + statvfs.block_size() + }; #[cfg_attr(not(target_os = "macos"), allow(clippy::unnecessary_cast))] let free = statvfs.blocks_available() as u64 * blocksz; - let used = crate::metrics::RESIDENT_PHYSICAL_SIZE_GLOBAL.get(); + + #[cfg_attr(not(target_os = "macos"), allow(clippy::unnecessary_cast))] + let used = statvfs + .blocks() + // use blocks_free instead of available here to match df in case someone compares + .saturating_sub(statvfs.blocks_free()) as u64 + * blocksz; + let captured_at = std::time::SystemTime::now(); let doc = PageserverUtilization { @@ -29,7 +41,7 @@ pub(crate) fn regenerate(tenants_path: &Path) -> anyhow::Result>, - redo_process: RwLock>>, + /// The current [`process::Process`] that is used by new redo requests. + /// We use [`heavier_once_cell`] for coalescing the spawning, but the redo + /// requests don't use the [`heavier_once_cell::Guard`] to keep ahold of the + /// their process object; we use [`Arc::clone`] for that. + /// This is primarily because earlier implementations that didn't use [`heavier_once_cell`] + /// had that behavior; it's probably unnecessary. + /// The only merit of it is that if one walredo process encounters an error, + /// it can take it out of rotation (= using [`heavier_once_cell::Guard::take_and_deinit`]. + /// and retry redo, thereby starting the new process, while other redo tasks might + /// still be using the old redo process. But, those other tasks will most likely + /// encounter an error as well, and errors are an unexpected condition anyway. + /// So, probably we could get rid of the `Arc` in the future. + redo_process: heavier_once_cell::OnceCell>, } /// @@ -101,6 +115,7 @@ impl PostgresRedoManager { self.conf.wal_redo_timeout, pg_version, ) + .await }; img = Some(result?); @@ -121,11 +136,12 @@ impl PostgresRedoManager { self.conf.wal_redo_timeout, pg_version, ) + .await } } - pub(crate) fn status(&self) -> Option { - Some(WalRedoManagerStatus { + pub fn status(&self) -> WalRedoManagerStatus { + WalRedoManagerStatus { last_redo_at: { let at = *self.last_redo_at.lock().unwrap(); at.and_then(|at| { @@ -134,8 +150,14 @@ impl PostgresRedoManager { chrono::Utc::now().checked_sub_signed(chrono::Duration::from_std(age).ok()?) }) }, - pid: self.redo_process.read().unwrap().as_ref().map(|p| p.id()), - }) + process: self + .redo_process + .get() + .map(|p| WalRedoManagerProcessStatus { + pid: p.id(), + kind: std::borrow::Cow::Borrowed(p.kind().into()), + }), + } } } @@ -152,7 +174,7 @@ impl PostgresRedoManager { tenant_shard_id, conf, last_redo_at: std::sync::Mutex::default(), - redo_process: RwLock::new(None), + redo_process: heavier_once_cell::OnceCell::default(), } } @@ -164,8 +186,7 @@ impl PostgresRedoManager { if let Some(last_redo_at) = *g { if last_redo_at.elapsed() >= idle_timeout { drop(g); - let mut guard = self.redo_process.write().unwrap(); - *guard = None; + drop(self.redo_process.get().map(|guard| guard.take_and_deinit())); } } } @@ -174,8 +195,11 @@ impl PostgresRedoManager { /// /// Process one request for WAL redo using wal-redo postgres /// + /// # Cancel-Safety + /// + /// Cancellation safe. #[allow(clippy::too_many_arguments)] - fn apply_batch_postgres( + async fn apply_batch_postgres( &self, key: Key, lsn: Lsn, @@ -191,40 +215,24 @@ impl PostgresRedoManager { const MAX_RETRY_ATTEMPTS: u32 = 1; let mut n_attempts = 0u32; loop { - // launch the WAL redo process on first use - let proc: Arc = { - let proc_guard = self.redo_process.read().unwrap(); - match &*proc_guard { - None => { - // "upgrade" to write lock to launch the process - drop(proc_guard); - let mut proc_guard = self.redo_process.write().unwrap(); - match &*proc_guard { - None => { - let start = Instant::now(); - let proc = Arc::new( - process::WalRedoProcess::launch( - self.conf, - self.tenant_shard_id, - pg_version, - ) - .context("launch walredo process")?, - ); - let duration = start.elapsed(); - WAL_REDO_PROCESS_LAUNCH_DURATION_HISTOGRAM - .observe(duration.as_secs_f64()); - info!( - duration_ms = duration.as_millis(), - pid = proc.id(), - "launched walredo process" - ); - *proc_guard = Some(Arc::clone(&proc)); - proc - } - Some(proc) => Arc::clone(proc), - } - } - Some(proc) => Arc::clone(proc), + let proc: Arc = match self.redo_process.get_or_init_detached().await { + Ok(guard) => Arc::clone(&guard), + Err(permit) => { + // don't hold poison_guard, the launch code can bail + let start = Instant::now(); + let proc = Arc::new( + process::Process::launch(self.conf, self.tenant_shard_id, pg_version) + .context("launch walredo process")?, + ); + let duration = start.elapsed(); + WAL_REDO_PROCESS_LAUNCH_DURATION_HISTOGRAM.observe(duration.as_secs_f64()); + info!( + duration_ms = duration.as_millis(), + pid = proc.id(), + "launched walredo process" + ); + self.redo_process.set(Arc::clone(&proc), permit); + proc } }; @@ -233,6 +241,7 @@ impl PostgresRedoManager { // Relational WAL records are applied using wal-redo-postgres let result = proc .apply_wal_records(rel, blknum, &base_img, records, wal_redo_timeout) + .await .context("apply_wal_records"); let duration = started_at.elapsed(); @@ -272,34 +281,34 @@ impl PostgresRedoManager { n_attempts, e, ); - // Avoid concurrent callers hitting the same issue. - // We can't prevent it from happening because we want to enable parallelism. - { - let mut guard = self.redo_process.write().unwrap(); - match &*guard { - Some(current_field_value) => { - if Arc::ptr_eq(current_field_value, &proc) { - // We're the first to observe an error from `proc`, it's our job to take it out of rotation. - *guard = None; - } - } - None => { - // Another thread was faster to observe the error, and already took the process out of rotation. - } - } - } + // Avoid concurrent callers hitting the same issue by taking `proc` out of the rotation. + // Note that there may be other tasks concurrent with us that also hold `proc`. + // We have to deal with that here. + // Also read the doc comment on field `self.redo_process`. + // // NB: there may still be other concurrent threads using `proc`. // The last one will send SIGKILL when the underlying Arc reaches refcount 0. - // NB: it's important to drop(proc) after drop(guard). Otherwise we'd keep - // holding the lock while waiting for the process to exit. - // NB: the drop impl blocks the current threads with a wait() system call for - // the child process. We dropped the `guard` above so that other threads aren't - // affected. But, it's good that the current thread _does_ block to wait. - // If we instead deferred the waiting into the background / to tokio, it could - // happen that if walredo always fails immediately, we spawn processes faster + // + // NB: the drop impl blocks the dropping thread with a wait() system call for + // the child process. In some ways the blocking is actually good: if we + // deferred the waiting into the background / to tokio if we used `tokio::process`, + // it could happen that if walredo always fails immediately, we spawn processes faster // than we can SIGKILL & `wait` for them to exit. By doing it the way we do here, // we limit this risk of run-away to at most $num_runtimes * $num_executor_threads. // This probably needs revisiting at some later point. + match self.redo_process.get() { + None => (), + Some(guard) => { + if Arc::ptr_eq(&proc, &*guard) { + // We're the first to observe an error from `proc`, it's our job to take it out of rotation. + guard.take_and_deinit(); + } else { + // Another task already spawned another redo process (further up in this method) + // and put it into `redo_process`. Do nothing, our view of the world is behind. + } + } + } + // The last task that does this `drop()` of `proc` will do a blocking `wait()` syscall. drop(proc); } else if n_attempts != 0 { info!(n_attempts, "retried walredo succeeded"); diff --git a/pageserver/src/walredo/process.rs b/pageserver/src/walredo/process.rs index bcbb263663..ad6b4e5fe9 100644 --- a/pageserver/src/walredo/process.rs +++ b/pageserver/src/walredo/process.rs @@ -1,186 +1,67 @@ -use self::no_leak_child::NoLeakChild; -use crate::{ - config::PageServerConf, - metrics::{WalRedoKillCause, WAL_REDO_PROCESS_COUNTERS, WAL_REDO_RECORD_COUNTER}, - walrecord::NeonWalRecord, -}; -use anyhow::Context; +use std::time::Duration; + use bytes::Bytes; -use nix::poll::{PollFd, PollFlags}; use pageserver_api::{reltag::RelTag, shard::TenantShardId}; -use postgres_ffi::BLCKSZ; -use std::os::fd::AsRawFd; -#[cfg(feature = "testing")] -use std::sync::atomic::AtomicUsize; -use std::{ - collections::VecDeque, - io::{Read, Write}, - process::{ChildStdin, ChildStdout, Command, Stdio}, - sync::{Mutex, MutexGuard}, - time::Duration, -}; -use tracing::{debug, error, instrument, Instrument}; -use utils::{lsn::Lsn, nonblock::set_nonblock}; +use utils::lsn::Lsn; + +use crate::{config::PageServerConf, walrecord::NeonWalRecord}; mod no_leak_child; /// The IPC protocol that pageserver and walredo process speak over their shared pipe. mod protocol; -pub struct WalRedoProcess { - #[allow(dead_code)] - conf: &'static PageServerConf, - tenant_shard_id: TenantShardId, - // Some() on construction, only becomes None on Drop. - child: Option, - stdout: Mutex, - stdin: Mutex, - /// Counter to separate same sized walredo inputs failing at the same millisecond. - #[cfg(feature = "testing")] - dump_sequence: AtomicUsize, +mod process_impl { + pub(super) mod process_async; + pub(super) mod process_std; } -struct ProcessInput { - stdin: ChildStdin, - n_requests: usize, +#[derive( + Clone, + Copy, + Debug, + PartialEq, + Eq, + strum_macros::EnumString, + strum_macros::Display, + strum_macros::IntoStaticStr, + serde_with::DeserializeFromStr, + serde_with::SerializeDisplay, +)] +#[strum(serialize_all = "kebab-case")] +#[repr(u8)] +pub enum Kind { + Sync, + Async, } -struct ProcessOutput { - stdout: ChildStdout, - pending_responses: VecDeque>, - n_processed_responses: usize, +pub(crate) enum Process { + Sync(process_impl::process_std::WalRedoProcess), + Async(process_impl::process_async::WalRedoProcess), } -impl WalRedoProcess { - // - // Start postgres binary in special WAL redo mode. - // - #[instrument(skip_all,fields(pg_version=pg_version))] - pub(crate) fn launch( +impl Process { + #[inline(always)] + pub fn launch( conf: &'static PageServerConf, tenant_shard_id: TenantShardId, pg_version: u32, ) -> anyhow::Result { - crate::span::debug_assert_current_span_has_tenant_id(); - - let pg_bin_dir_path = conf.pg_bin_dir(pg_version).context("pg_bin_dir")?; // TODO these should be infallible. - let pg_lib_dir_path = conf.pg_lib_dir(pg_version).context("pg_lib_dir")?; - - use no_leak_child::NoLeakChildCommandExt; - // Start postgres itself - let child = Command::new(pg_bin_dir_path.join("postgres")) - // the first arg must be --wal-redo so the child process enters into walredo mode - .arg("--wal-redo") - // the child doesn't process this arg, but, having it in the argv helps indentify the - // walredo process for a particular tenant when debugging a pagserver - .args(["--tenant-shard-id", &format!("{tenant_shard_id}")]) - .stdin(Stdio::piped()) - .stderr(Stdio::piped()) - .stdout(Stdio::piped()) - .env_clear() - .env("LD_LIBRARY_PATH", &pg_lib_dir_path) - .env("DYLD_LIBRARY_PATH", &pg_lib_dir_path) - // NB: The redo process is not trusted after we sent it the first - // walredo work. Before that, it is trusted. Specifically, we trust - // it to - // 1. close all file descriptors except stdin, stdout, stderr because - // pageserver might not be 100% diligent in setting FD_CLOEXEC on all - // the files it opens, and - // 2. to use seccomp to sandbox itself before processing the first - // walredo request. - .spawn_no_leak_child(tenant_shard_id) - .context("spawn process")?; - WAL_REDO_PROCESS_COUNTERS.started.inc(); - let mut child = scopeguard::guard(child, |child| { - error!("killing wal-redo-postgres process due to a problem during launch"); - child.kill_and_wait(WalRedoKillCause::Startup); - }); - - let stdin = child.stdin.take().unwrap(); - let stdout = child.stdout.take().unwrap(); - let stderr = child.stderr.take().unwrap(); - let stderr = tokio::process::ChildStderr::from_std(stderr) - .context("convert to tokio::ChildStderr")?; - macro_rules! set_nonblock_or_log_err { - ($file:ident) => {{ - let res = set_nonblock($file.as_raw_fd()); - if let Err(e) = &res { - error!(error = %e, file = stringify!($file), pid = child.id(), "set_nonblock failed"); - } - res - }}; - } - set_nonblock_or_log_err!(stdin)?; - set_nonblock_or_log_err!(stdout)?; - - // all fallible operations post-spawn are complete, so get rid of the guard - let child = scopeguard::ScopeGuard::into_inner(child); - - tokio::spawn( - async move { - scopeguard::defer! { - debug!("wal-redo-postgres stderr_logger_task finished"); - crate::metrics::WAL_REDO_PROCESS_COUNTERS.active_stderr_logger_tasks_finished.inc(); - } - debug!("wal-redo-postgres stderr_logger_task started"); - crate::metrics::WAL_REDO_PROCESS_COUNTERS.active_stderr_logger_tasks_started.inc(); - - use tokio::io::AsyncBufReadExt; - let mut stderr_lines = tokio::io::BufReader::new(stderr); - let mut buf = Vec::new(); - let res = loop { - buf.clear(); - // TODO we don't trust the process to cap its stderr length. - // Currently it can do unbounded Vec allocation. - match stderr_lines.read_until(b'\n', &mut buf).await { - Ok(0) => break Ok(()), // eof - Ok(num_bytes) => { - let output = String::from_utf8_lossy(&buf[..num_bytes]); - error!(%output, "received output"); - } - Err(e) => { - break Err(e); - } - } - }; - match res { - Ok(()) => (), - Err(e) => { - error!(error=?e, "failed to read from walredo stderr"); - } - } - }.instrument(tracing::info_span!(parent: None, "wal-redo-postgres-stderr", pid = child.id(), tenant_id = %tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug(), %pg_version)) - ); - - Ok(Self { - conf, - tenant_shard_id, - child: Some(child), - stdin: Mutex::new(ProcessInput { - stdin, - n_requests: 0, - }), - stdout: Mutex::new(ProcessOutput { - stdout, - pending_responses: VecDeque::new(), - n_processed_responses: 0, - }), - #[cfg(feature = "testing")] - dump_sequence: AtomicUsize::default(), + Ok(match conf.walredo_process_kind { + Kind::Sync => Self::Sync(process_impl::process_std::WalRedoProcess::launch( + conf, + tenant_shard_id, + pg_version, + )?), + Kind::Async => Self::Async(process_impl::process_async::WalRedoProcess::launch( + conf, + tenant_shard_id, + pg_version, + )?), }) } - pub(crate) fn id(&self) -> u32 { - self.child - .as_ref() - .expect("must not call this during Drop") - .id() - } - - // Apply given WAL records ('records') over an old page image. Returns - // new page image. - // - #[instrument(skip_all, fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug(), pid=%self.id()))] - pub(crate) fn apply_wal_records( + #[inline(always)] + pub(crate) async fn apply_wal_records( &self, rel: RelTag, blknum: u32, @@ -188,221 +69,29 @@ impl WalRedoProcess { records: &[(Lsn, NeonWalRecord)], wal_redo_timeout: Duration, ) -> anyhow::Result { - let tag = protocol::BufferTag { rel, blknum }; - let input = self.stdin.lock().unwrap(); - - // Serialize all the messages to send the WAL redo process first. - // - // This could be problematic if there are millions of records to replay, - // but in practice the number of records is usually so small that it doesn't - // matter, and it's better to keep this code simple. - // - // Most requests start with a before-image with BLCKSZ bytes, followed by - // by some other WAL records. Start with a buffer that can hold that - // comfortably. - let mut writebuf: Vec = Vec::with_capacity((BLCKSZ as usize) * 3); - protocol::build_begin_redo_for_block_msg(tag, &mut writebuf); - if let Some(img) = base_img { - protocol::build_push_page_msg(tag, img, &mut writebuf); - } - for (lsn, rec) in records.iter() { - if let NeonWalRecord::Postgres { - will_init: _, - rec: postgres_rec, - } = rec - { - protocol::build_apply_record_msg(*lsn, postgres_rec, &mut writebuf); - } else { - anyhow::bail!("tried to pass neon wal record to postgres WAL redo"); + match self { + Process::Sync(p) => { + p.apply_wal_records(rel, blknum, base_img, records, wal_redo_timeout) + .await } - } - protocol::build_get_page_msg(tag, &mut writebuf); - WAL_REDO_RECORD_COUNTER.inc_by(records.len() as u64); - - let res = self.apply_wal_records0(&writebuf, input, wal_redo_timeout); - - if res.is_err() { - // not all of these can be caused by this particular input, however these are so rare - // in tests so capture all. - self.record_and_log(&writebuf); - } - - res - } - - fn apply_wal_records0( - &self, - writebuf: &[u8], - input: MutexGuard, - wal_redo_timeout: Duration, - ) -> anyhow::Result { - let mut proc = { input }; // TODO: remove this legacy rename, but this keep the patch small. - let mut nwrite = 0usize; - - while nwrite < writebuf.len() { - let mut stdin_pollfds = [PollFd::new(&proc.stdin, PollFlags::POLLOUT)]; - let n = loop { - match nix::poll::poll(&mut stdin_pollfds[..], wal_redo_timeout.as_millis() as i32) { - Err(nix::errno::Errno::EINTR) => continue, - res => break res, - } - }?; - - if n == 0 { - anyhow::bail!("WAL redo timed out"); + Process::Async(p) => { + p.apply_wal_records(rel, blknum, base_img, records, wal_redo_timeout) + .await } - - // If 'stdin' is writeable, do write. - let in_revents = stdin_pollfds[0].revents().unwrap(); - if in_revents & (PollFlags::POLLERR | PollFlags::POLLOUT) != PollFlags::empty() { - nwrite += proc.stdin.write(&writebuf[nwrite..])?; - } - if in_revents.contains(PollFlags::POLLHUP) { - // We still have more data to write, but the process closed the pipe. - anyhow::bail!("WAL redo process closed its stdin unexpectedly"); - } - } - let request_no = proc.n_requests; - proc.n_requests += 1; - drop(proc); - - // To improve walredo performance we separate sending requests and receiving - // responses. Them are protected by different mutexes (output and input). - // If thread T1, T2, T3 send requests D1, D2, D3 to walredo process - // then there is not warranty that T1 will first granted output mutex lock. - // To address this issue we maintain number of sent requests, number of processed - // responses and ring buffer with pending responses. After sending response - // (under input mutex), threads remembers request number. Then it releases - // input mutex, locks output mutex and fetch in ring buffer all responses until - // its stored request number. The it takes correspondent element from - // pending responses ring buffer and truncate all empty elements from the front, - // advancing processed responses number. - - let mut output = self.stdout.lock().unwrap(); - let n_processed_responses = output.n_processed_responses; - while n_processed_responses + output.pending_responses.len() <= request_no { - // We expect the WAL redo process to respond with an 8k page image. We read it - // into this buffer. - let mut resultbuf = vec![0; BLCKSZ.into()]; - let mut nresult: usize = 0; // # of bytes read into 'resultbuf' so far - while nresult < BLCKSZ.into() { - let mut stdout_pollfds = [PollFd::new(&output.stdout, PollFlags::POLLIN)]; - // We do two things simultaneously: reading response from stdout - // and forward any logging information that the child writes to its stderr to the page server's log. - let n = loop { - match nix::poll::poll( - &mut stdout_pollfds[..], - wal_redo_timeout.as_millis() as i32, - ) { - Err(nix::errno::Errno::EINTR) => continue, - res => break res, - } - }?; - - if n == 0 { - anyhow::bail!("WAL redo timed out"); - } - - // If we have some data in stdout, read it to the result buffer. - let out_revents = stdout_pollfds[0].revents().unwrap(); - if out_revents & (PollFlags::POLLERR | PollFlags::POLLIN) != PollFlags::empty() { - nresult += output.stdout.read(&mut resultbuf[nresult..])?; - } - if out_revents.contains(PollFlags::POLLHUP) { - anyhow::bail!("WAL redo process closed its stdout unexpectedly"); - } - } - output - .pending_responses - .push_back(Some(Bytes::from(resultbuf))); - } - // Replace our request's response with None in `pending_responses`. - // Then make space in the ring buffer by clearing out any seqence of contiguous - // `None`'s from the front of `pending_responses`. - // NB: We can't pop_front() because other requests' responses because another - // requester might have grabbed the output mutex before us: - // T1: grab input mutex - // T1: send request_no 23 - // T1: release input mutex - // T2: grab input mutex - // T2: send request_no 24 - // T2: release input mutex - // T2: grab output mutex - // T2: n_processed_responses + output.pending_responses.len() <= request_no - // 23 0 24 - // T2: enters poll loop that reads stdout - // T2: put response for 23 into pending_responses - // T2: put response for 24 into pending_resposnes - // pending_responses now looks like this: Front Some(response_23) Some(response_24) Back - // T2: takes its response_24 - // pending_responses now looks like this: Front Some(response_23) None Back - // T2: does the while loop below - // pending_responses now looks like this: Front Some(response_23) None Back - // T2: releases output mutex - // T1: grabs output mutex - // T1: n_processed_responses + output.pending_responses.len() > request_no - // 23 2 23 - // T1: skips poll loop that reads stdout - // T1: takes its response_23 - // pending_responses now looks like this: Front None None Back - // T2: does the while loop below - // pending_responses now looks like this: Front Back - // n_processed_responses now has value 25 - let res = output.pending_responses[request_no - n_processed_responses] - .take() - .expect("we own this request_no, nobody else is supposed to take it"); - while let Some(front) = output.pending_responses.front() { - if front.is_none() { - output.pending_responses.pop_front(); - output.n_processed_responses += 1; - } else { - break; - } - } - Ok(res) - } - - #[cfg(feature = "testing")] - fn record_and_log(&self, writebuf: &[u8]) { - use std::sync::atomic::Ordering; - - let millis = std::time::SystemTime::now() - .duration_since(std::time::SystemTime::UNIX_EPOCH) - .unwrap() - .as_millis(); - - let seq = self.dump_sequence.fetch_add(1, Ordering::Relaxed); - - // these files will be collected to an allure report - let filename = format!("walredo-{millis}-{}-{seq}.walredo", writebuf.len()); - - let path = self.conf.tenant_path(&self.tenant_shard_id).join(&filename); - - let res = std::fs::OpenOptions::new() - .write(true) - .create_new(true) - .read(true) - .open(path) - .and_then(|mut f| f.write_all(writebuf)); - - // trip up allowed_errors - if let Err(e) = res { - tracing::error!(target=%filename, length=writebuf.len(), "failed to write out the walredo errored input: {e}"); - } else { - tracing::error!(filename, "erroring walredo input saved"); } } - #[cfg(not(feature = "testing"))] - fn record_and_log(&self, _: &[u8]) {} -} + pub(crate) fn id(&self) -> u32 { + match self { + Process::Sync(p) => p.id(), + Process::Async(p) => p.id(), + } + } -impl Drop for WalRedoProcess { - fn drop(&mut self) { - self.child - .take() - .expect("we only do this once") - .kill_and_wait(WalRedoKillCause::WalRedoProcessDrop); - // no way to wait for stderr_logger_task from Drop because that is async only + pub(crate) fn kind(&self) -> Kind { + match self { + Process::Sync(_) => Kind::Sync, + Process::Async(_) => Kind::Async, + } } } diff --git a/pageserver/src/walredo/process/process_impl/process_async.rs b/pageserver/src/walredo/process/process_impl/process_async.rs new file mode 100644 index 0000000000..262858b033 --- /dev/null +++ b/pageserver/src/walredo/process/process_impl/process_async.rs @@ -0,0 +1,374 @@ +use self::no_leak_child::NoLeakChild; +use crate::{ + config::PageServerConf, + metrics::{WalRedoKillCause, WAL_REDO_PROCESS_COUNTERS, WAL_REDO_RECORD_COUNTER}, + walrecord::NeonWalRecord, + walredo::process::{no_leak_child, protocol}, +}; +use anyhow::Context; +use bytes::Bytes; +use pageserver_api::{reltag::RelTag, shard::TenantShardId}; +use postgres_ffi::BLCKSZ; +#[cfg(feature = "testing")] +use std::sync::atomic::AtomicUsize; +use std::{ + collections::VecDeque, + process::{Command, Stdio}, + time::Duration, +}; +use tokio::io::{AsyncReadExt, AsyncWriteExt}; +use tracing::{debug, error, instrument, Instrument}; +use utils::{lsn::Lsn, poison::Poison}; + +pub struct WalRedoProcess { + #[allow(dead_code)] + conf: &'static PageServerConf, + tenant_shard_id: TenantShardId, + // Some() on construction, only becomes None on Drop. + child: Option, + stdout: tokio::sync::Mutex>, + stdin: tokio::sync::Mutex>, + /// Counter to separate same sized walredo inputs failing at the same millisecond. + #[cfg(feature = "testing")] + dump_sequence: AtomicUsize, +} + +struct ProcessInput { + stdin: tokio::process::ChildStdin, + n_requests: usize, +} + +struct ProcessOutput { + stdout: tokio::process::ChildStdout, + pending_responses: VecDeque>, + n_processed_responses: usize, +} + +impl WalRedoProcess { + // + // Start postgres binary in special WAL redo mode. + // + #[instrument(skip_all,fields(pg_version=pg_version))] + pub(crate) fn launch( + conf: &'static PageServerConf, + tenant_shard_id: TenantShardId, + pg_version: u32, + ) -> anyhow::Result { + crate::span::debug_assert_current_span_has_tenant_id(); + + let pg_bin_dir_path = conf.pg_bin_dir(pg_version).context("pg_bin_dir")?; // TODO these should be infallible. + let pg_lib_dir_path = conf.pg_lib_dir(pg_version).context("pg_lib_dir")?; + + use no_leak_child::NoLeakChildCommandExt; + // Start postgres itself + let child = Command::new(pg_bin_dir_path.join("postgres")) + // the first arg must be --wal-redo so the child process enters into walredo mode + .arg("--wal-redo") + // the child doesn't process this arg, but, having it in the argv helps indentify the + // walredo process for a particular tenant when debugging a pagserver + .args(["--tenant-shard-id", &format!("{tenant_shard_id}")]) + .stdin(Stdio::piped()) + .stderr(Stdio::piped()) + .stdout(Stdio::piped()) + .env_clear() + .env("LD_LIBRARY_PATH", &pg_lib_dir_path) + .env("DYLD_LIBRARY_PATH", &pg_lib_dir_path) + // NB: The redo process is not trusted after we sent it the first + // walredo work. Before that, it is trusted. Specifically, we trust + // it to + // 1. close all file descriptors except stdin, stdout, stderr because + // pageserver might not be 100% diligent in setting FD_CLOEXEC on all + // the files it opens, and + // 2. to use seccomp to sandbox itself before processing the first + // walredo request. + .spawn_no_leak_child(tenant_shard_id) + .context("spawn process")?; + WAL_REDO_PROCESS_COUNTERS.started.inc(); + let mut child = scopeguard::guard(child, |child| { + error!("killing wal-redo-postgres process due to a problem during launch"); + child.kill_and_wait(WalRedoKillCause::Startup); + }); + + let stdin = child.stdin.take().unwrap(); + let stdout = child.stdout.take().unwrap(); + let stderr = child.stderr.take().unwrap(); + let stderr = tokio::process::ChildStderr::from_std(stderr) + .context("convert to tokio::ChildStderr")?; + let stdin = + tokio::process::ChildStdin::from_std(stdin).context("convert to tokio::ChildStdin")?; + let stdout = tokio::process::ChildStdout::from_std(stdout) + .context("convert to tokio::ChildStdout")?; + + // all fallible operations post-spawn are complete, so get rid of the guard + let child = scopeguard::ScopeGuard::into_inner(child); + + tokio::spawn( + async move { + scopeguard::defer! { + debug!("wal-redo-postgres stderr_logger_task finished"); + crate::metrics::WAL_REDO_PROCESS_COUNTERS.active_stderr_logger_tasks_finished.inc(); + } + debug!("wal-redo-postgres stderr_logger_task started"); + crate::metrics::WAL_REDO_PROCESS_COUNTERS.active_stderr_logger_tasks_started.inc(); + + use tokio::io::AsyncBufReadExt; + let mut stderr_lines = tokio::io::BufReader::new(stderr); + let mut buf = Vec::new(); + let res = loop { + buf.clear(); + // TODO we don't trust the process to cap its stderr length. + // Currently it can do unbounded Vec allocation. + match stderr_lines.read_until(b'\n', &mut buf).await { + Ok(0) => break Ok(()), // eof + Ok(num_bytes) => { + let output = String::from_utf8_lossy(&buf[..num_bytes]); + error!(%output, "received output"); + } + Err(e) => { + break Err(e); + } + } + }; + match res { + Ok(()) => (), + Err(e) => { + error!(error=?e, "failed to read from walredo stderr"); + } + } + }.instrument(tracing::info_span!(parent: None, "wal-redo-postgres-stderr", pid = child.id(), tenant_id = %tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug(), %pg_version)) + ); + + Ok(Self { + conf, + tenant_shard_id, + child: Some(child), + stdin: tokio::sync::Mutex::new(Poison::new( + "stdin", + ProcessInput { + stdin, + n_requests: 0, + }, + )), + stdout: tokio::sync::Mutex::new(Poison::new( + "stdout", + ProcessOutput { + stdout, + pending_responses: VecDeque::new(), + n_processed_responses: 0, + }, + )), + #[cfg(feature = "testing")] + dump_sequence: AtomicUsize::default(), + }) + } + + pub(crate) fn id(&self) -> u32 { + self.child + .as_ref() + .expect("must not call this during Drop") + .id() + } + + /// Apply given WAL records ('records') over an old page image. Returns + /// new page image. + /// + /// # Cancel-Safety + /// + /// Cancellation safe. + #[instrument(skip_all, fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug(), pid=%self.id()))] + pub(crate) async fn apply_wal_records( + &self, + rel: RelTag, + blknum: u32, + base_img: &Option, + records: &[(Lsn, NeonWalRecord)], + wal_redo_timeout: Duration, + ) -> anyhow::Result { + let tag = protocol::BufferTag { rel, blknum }; + + // Serialize all the messages to send the WAL redo process first. + // + // This could be problematic if there are millions of records to replay, + // but in practice the number of records is usually so small that it doesn't + // matter, and it's better to keep this code simple. + // + // Most requests start with a before-image with BLCKSZ bytes, followed by + // by some other WAL records. Start with a buffer that can hold that + // comfortably. + let mut writebuf: Vec = Vec::with_capacity((BLCKSZ as usize) * 3); + protocol::build_begin_redo_for_block_msg(tag, &mut writebuf); + if let Some(img) = base_img { + protocol::build_push_page_msg(tag, img, &mut writebuf); + } + for (lsn, rec) in records.iter() { + if let NeonWalRecord::Postgres { + will_init: _, + rec: postgres_rec, + } = rec + { + protocol::build_apply_record_msg(*lsn, postgres_rec, &mut writebuf); + } else { + anyhow::bail!("tried to pass neon wal record to postgres WAL redo"); + } + } + protocol::build_get_page_msg(tag, &mut writebuf); + WAL_REDO_RECORD_COUNTER.inc_by(records.len() as u64); + + let Ok(res) = + tokio::time::timeout(wal_redo_timeout, self.apply_wal_records0(&writebuf)).await + else { + anyhow::bail!("WAL redo timed out"); + }; + + if res.is_err() { + // not all of these can be caused by this particular input, however these are so rare + // in tests so capture all. + self.record_and_log(&writebuf); + } + + res + } + + /// # Cancel-Safety + /// + /// When not polled to completion (e.g. because in `tokio::select!` another + /// branch becomes ready before this future), concurrent and subsequent + /// calls may fail due to [`utils::poison::Poison::check_and_arm`] calls. + /// Dispose of this process instance and create a new one. + async fn apply_wal_records0(&self, writebuf: &[u8]) -> anyhow::Result { + let request_no = { + let mut lock_guard = self.stdin.lock().await; + let mut poison_guard = lock_guard.check_and_arm()?; + let input = poison_guard.data_mut(); + input + .stdin + .write_all(writebuf) + .await + .context("write to walredo stdin")?; + let request_no = input.n_requests; + input.n_requests += 1; + poison_guard.disarm(); + request_no + }; + + // To improve walredo performance we separate sending requests and receiving + // responses. Them are protected by different mutexes (output and input). + // If thread T1, T2, T3 send requests D1, D2, D3 to walredo process + // then there is not warranty that T1 will first granted output mutex lock. + // To address this issue we maintain number of sent requests, number of processed + // responses and ring buffer with pending responses. After sending response + // (under input mutex), threads remembers request number. Then it releases + // input mutex, locks output mutex and fetch in ring buffer all responses until + // its stored request number. The it takes correspondent element from + // pending responses ring buffer and truncate all empty elements from the front, + // advancing processed responses number. + + let mut lock_guard = self.stdout.lock().await; + let mut poison_guard = lock_guard.check_and_arm()?; + let output = poison_guard.data_mut(); + let n_processed_responses = output.n_processed_responses; + while n_processed_responses + output.pending_responses.len() <= request_no { + // We expect the WAL redo process to respond with an 8k page image. We read it + // into this buffer. + let mut resultbuf = vec![0; BLCKSZ.into()]; + output + .stdout + .read_exact(&mut resultbuf) + .await + .context("read walredo stdout")?; + output + .pending_responses + .push_back(Some(Bytes::from(resultbuf))); + } + // Replace our request's response with None in `pending_responses`. + // Then make space in the ring buffer by clearing out any seqence of contiguous + // `None`'s from the front of `pending_responses`. + // NB: We can't pop_front() because other requests' responses because another + // requester might have grabbed the output mutex before us: + // T1: grab input mutex + // T1: send request_no 23 + // T1: release input mutex + // T2: grab input mutex + // T2: send request_no 24 + // T2: release input mutex + // T2: grab output mutex + // T2: n_processed_responses + output.pending_responses.len() <= request_no + // 23 0 24 + // T2: enters poll loop that reads stdout + // T2: put response for 23 into pending_responses + // T2: put response for 24 into pending_resposnes + // pending_responses now looks like this: Front Some(response_23) Some(response_24) Back + // T2: takes its response_24 + // pending_responses now looks like this: Front Some(response_23) None Back + // T2: does the while loop below + // pending_responses now looks like this: Front Some(response_23) None Back + // T2: releases output mutex + // T1: grabs output mutex + // T1: n_processed_responses + output.pending_responses.len() > request_no + // 23 2 23 + // T1: skips poll loop that reads stdout + // T1: takes its response_23 + // pending_responses now looks like this: Front None None Back + // T2: does the while loop below + // pending_responses now looks like this: Front Back + // n_processed_responses now has value 25 + let res = output.pending_responses[request_no - n_processed_responses] + .take() + .expect("we own this request_no, nobody else is supposed to take it"); + while let Some(front) = output.pending_responses.front() { + if front.is_none() { + output.pending_responses.pop_front(); + output.n_processed_responses += 1; + } else { + break; + } + } + poison_guard.disarm(); + Ok(res) + } + + #[cfg(feature = "testing")] + fn record_and_log(&self, writebuf: &[u8]) { + use std::sync::atomic::Ordering; + + let millis = std::time::SystemTime::now() + .duration_since(std::time::SystemTime::UNIX_EPOCH) + .unwrap() + .as_millis(); + + let seq = self.dump_sequence.fetch_add(1, Ordering::Relaxed); + + // these files will be collected to an allure report + let filename = format!("walredo-{millis}-{}-{seq}.walredo", writebuf.len()); + + let path = self.conf.tenant_path(&self.tenant_shard_id).join(&filename); + + use std::io::Write; + let res = std::fs::OpenOptions::new() + .write(true) + .create_new(true) + .read(true) + .open(path) + .and_then(|mut f| f.write_all(writebuf)); + + // trip up allowed_errors + if let Err(e) = res { + tracing::error!(target=%filename, length=writebuf.len(), "failed to write out the walredo errored input: {e}"); + } else { + tracing::error!(filename, "erroring walredo input saved"); + } + } + + #[cfg(not(feature = "testing"))] + fn record_and_log(&self, _: &[u8]) {} +} + +impl Drop for WalRedoProcess { + fn drop(&mut self) { + self.child + .take() + .expect("we only do this once") + .kill_and_wait(WalRedoKillCause::WalRedoProcessDrop); + // no way to wait for stderr_logger_task from Drop because that is async only + } +} diff --git a/pageserver/src/walredo/process/process_impl/process_std.rs b/pageserver/src/walredo/process/process_impl/process_std.rs new file mode 100644 index 0000000000..e7a6c263c9 --- /dev/null +++ b/pageserver/src/walredo/process/process_impl/process_std.rs @@ -0,0 +1,405 @@ +use self::no_leak_child::NoLeakChild; +use crate::{ + config::PageServerConf, + metrics::{WalRedoKillCause, WAL_REDO_PROCESS_COUNTERS, WAL_REDO_RECORD_COUNTER}, + walrecord::NeonWalRecord, + walredo::process::{no_leak_child, protocol}, +}; +use anyhow::Context; +use bytes::Bytes; +use nix::poll::{PollFd, PollFlags}; +use pageserver_api::{reltag::RelTag, shard::TenantShardId}; +use postgres_ffi::BLCKSZ; +use std::os::fd::AsRawFd; +#[cfg(feature = "testing")] +use std::sync::atomic::AtomicUsize; +use std::{ + collections::VecDeque, + io::{Read, Write}, + process::{ChildStdin, ChildStdout, Command, Stdio}, + sync::{Mutex, MutexGuard}, + time::Duration, +}; +use tracing::{debug, error, instrument, Instrument}; +use utils::{lsn::Lsn, nonblock::set_nonblock}; + +pub struct WalRedoProcess { + #[allow(dead_code)] + conf: &'static PageServerConf, + tenant_shard_id: TenantShardId, + // Some() on construction, only becomes None on Drop. + child: Option, + stdout: Mutex, + stdin: Mutex, + /// Counter to separate same sized walredo inputs failing at the same millisecond. + #[cfg(feature = "testing")] + dump_sequence: AtomicUsize, +} + +struct ProcessInput { + stdin: ChildStdin, + n_requests: usize, +} + +struct ProcessOutput { + stdout: ChildStdout, + pending_responses: VecDeque>, + n_processed_responses: usize, +} + +impl WalRedoProcess { + // + // Start postgres binary in special WAL redo mode. + // + #[instrument(skip_all,fields(pg_version=pg_version))] + pub(crate) fn launch( + conf: &'static PageServerConf, + tenant_shard_id: TenantShardId, + pg_version: u32, + ) -> anyhow::Result { + crate::span::debug_assert_current_span_has_tenant_id(); + + let pg_bin_dir_path = conf.pg_bin_dir(pg_version).context("pg_bin_dir")?; // TODO these should be infallible. + let pg_lib_dir_path = conf.pg_lib_dir(pg_version).context("pg_lib_dir")?; + + use no_leak_child::NoLeakChildCommandExt; + // Start postgres itself + let child = Command::new(pg_bin_dir_path.join("postgres")) + // the first arg must be --wal-redo so the child process enters into walredo mode + .arg("--wal-redo") + // the child doesn't process this arg, but, having it in the argv helps indentify the + // walredo process for a particular tenant when debugging a pagserver + .args(["--tenant-shard-id", &format!("{tenant_shard_id}")]) + .stdin(Stdio::piped()) + .stderr(Stdio::piped()) + .stdout(Stdio::piped()) + .env_clear() + .env("LD_LIBRARY_PATH", &pg_lib_dir_path) + .env("DYLD_LIBRARY_PATH", &pg_lib_dir_path) + // NB: The redo process is not trusted after we sent it the first + // walredo work. Before that, it is trusted. Specifically, we trust + // it to + // 1. close all file descriptors except stdin, stdout, stderr because + // pageserver might not be 100% diligent in setting FD_CLOEXEC on all + // the files it opens, and + // 2. to use seccomp to sandbox itself before processing the first + // walredo request. + .spawn_no_leak_child(tenant_shard_id) + .context("spawn process")?; + WAL_REDO_PROCESS_COUNTERS.started.inc(); + let mut child = scopeguard::guard(child, |child| { + error!("killing wal-redo-postgres process due to a problem during launch"); + child.kill_and_wait(WalRedoKillCause::Startup); + }); + + let stdin = child.stdin.take().unwrap(); + let stdout = child.stdout.take().unwrap(); + let stderr = child.stderr.take().unwrap(); + let stderr = tokio::process::ChildStderr::from_std(stderr) + .context("convert to tokio::ChildStderr")?; + macro_rules! set_nonblock_or_log_err { + ($file:ident) => {{ + let res = set_nonblock($file.as_raw_fd()); + if let Err(e) = &res { + error!(error = %e, file = stringify!($file), pid = child.id(), "set_nonblock failed"); + } + res + }}; + } + set_nonblock_or_log_err!(stdin)?; + set_nonblock_or_log_err!(stdout)?; + + // all fallible operations post-spawn are complete, so get rid of the guard + let child = scopeguard::ScopeGuard::into_inner(child); + + tokio::spawn( + async move { + scopeguard::defer! { + debug!("wal-redo-postgres stderr_logger_task finished"); + crate::metrics::WAL_REDO_PROCESS_COUNTERS.active_stderr_logger_tasks_finished.inc(); + } + debug!("wal-redo-postgres stderr_logger_task started"); + crate::metrics::WAL_REDO_PROCESS_COUNTERS.active_stderr_logger_tasks_started.inc(); + + use tokio::io::AsyncBufReadExt; + let mut stderr_lines = tokio::io::BufReader::new(stderr); + let mut buf = Vec::new(); + let res = loop { + buf.clear(); + // TODO we don't trust the process to cap its stderr length. + // Currently it can do unbounded Vec allocation. + match stderr_lines.read_until(b'\n', &mut buf).await { + Ok(0) => break Ok(()), // eof + Ok(num_bytes) => { + let output = String::from_utf8_lossy(&buf[..num_bytes]); + error!(%output, "received output"); + } + Err(e) => { + break Err(e); + } + } + }; + match res { + Ok(()) => (), + Err(e) => { + error!(error=?e, "failed to read from walredo stderr"); + } + } + }.instrument(tracing::info_span!(parent: None, "wal-redo-postgres-stderr", pid = child.id(), tenant_id = %tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug(), %pg_version)) + ); + + Ok(Self { + conf, + tenant_shard_id, + child: Some(child), + stdin: Mutex::new(ProcessInput { + stdin, + n_requests: 0, + }), + stdout: Mutex::new(ProcessOutput { + stdout, + pending_responses: VecDeque::new(), + n_processed_responses: 0, + }), + #[cfg(feature = "testing")] + dump_sequence: AtomicUsize::default(), + }) + } + + pub(crate) fn id(&self) -> u32 { + self.child + .as_ref() + .expect("must not call this during Drop") + .id() + } + + // Apply given WAL records ('records') over an old page image. Returns + // new page image. + // + #[instrument(skip_all, fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug(), pid=%self.id()))] + pub(crate) async fn apply_wal_records( + &self, + rel: RelTag, + blknum: u32, + base_img: &Option, + records: &[(Lsn, NeonWalRecord)], + wal_redo_timeout: Duration, + ) -> anyhow::Result { + let tag = protocol::BufferTag { rel, blknum }; + let input = self.stdin.lock().unwrap(); + + // Serialize all the messages to send the WAL redo process first. + // + // This could be problematic if there are millions of records to replay, + // but in practice the number of records is usually so small that it doesn't + // matter, and it's better to keep this code simple. + // + // Most requests start with a before-image with BLCKSZ bytes, followed by + // by some other WAL records. Start with a buffer that can hold that + // comfortably. + let mut writebuf: Vec = Vec::with_capacity((BLCKSZ as usize) * 3); + protocol::build_begin_redo_for_block_msg(tag, &mut writebuf); + if let Some(img) = base_img { + protocol::build_push_page_msg(tag, img, &mut writebuf); + } + for (lsn, rec) in records.iter() { + if let NeonWalRecord::Postgres { + will_init: _, + rec: postgres_rec, + } = rec + { + protocol::build_apply_record_msg(*lsn, postgres_rec, &mut writebuf); + } else { + anyhow::bail!("tried to pass neon wal record to postgres WAL redo"); + } + } + protocol::build_get_page_msg(tag, &mut writebuf); + WAL_REDO_RECORD_COUNTER.inc_by(records.len() as u64); + + let res = self.apply_wal_records0(&writebuf, input, wal_redo_timeout); + + if res.is_err() { + // not all of these can be caused by this particular input, however these are so rare + // in tests so capture all. + self.record_and_log(&writebuf); + } + + res + } + + fn apply_wal_records0( + &self, + writebuf: &[u8], + input: MutexGuard, + wal_redo_timeout: Duration, + ) -> anyhow::Result { + let mut proc = { input }; // TODO: remove this legacy rename, but this keep the patch small. + let mut nwrite = 0usize; + + while nwrite < writebuf.len() { + let mut stdin_pollfds = [PollFd::new(&proc.stdin, PollFlags::POLLOUT)]; + let n = loop { + match nix::poll::poll(&mut stdin_pollfds[..], wal_redo_timeout.as_millis() as i32) { + Err(nix::errno::Errno::EINTR) => continue, + res => break res, + } + }?; + + if n == 0 { + anyhow::bail!("WAL redo timed out"); + } + + // If 'stdin' is writeable, do write. + let in_revents = stdin_pollfds[0].revents().unwrap(); + if in_revents & (PollFlags::POLLERR | PollFlags::POLLOUT) != PollFlags::empty() { + nwrite += proc.stdin.write(&writebuf[nwrite..])?; + } + if in_revents.contains(PollFlags::POLLHUP) { + // We still have more data to write, but the process closed the pipe. + anyhow::bail!("WAL redo process closed its stdin unexpectedly"); + } + } + let request_no = proc.n_requests; + proc.n_requests += 1; + drop(proc); + + // To improve walredo performance we separate sending requests and receiving + // responses. Them are protected by different mutexes (output and input). + // If thread T1, T2, T3 send requests D1, D2, D3 to walredo process + // then there is not warranty that T1 will first granted output mutex lock. + // To address this issue we maintain number of sent requests, number of processed + // responses and ring buffer with pending responses. After sending response + // (under input mutex), threads remembers request number. Then it releases + // input mutex, locks output mutex and fetch in ring buffer all responses until + // its stored request number. The it takes correspondent element from + // pending responses ring buffer and truncate all empty elements from the front, + // advancing processed responses number. + + let mut output = self.stdout.lock().unwrap(); + let n_processed_responses = output.n_processed_responses; + while n_processed_responses + output.pending_responses.len() <= request_no { + // We expect the WAL redo process to respond with an 8k page image. We read it + // into this buffer. + let mut resultbuf = vec![0; BLCKSZ.into()]; + let mut nresult: usize = 0; // # of bytes read into 'resultbuf' so far + while nresult < BLCKSZ.into() { + let mut stdout_pollfds = [PollFd::new(&output.stdout, PollFlags::POLLIN)]; + // We do two things simultaneously: reading response from stdout + // and forward any logging information that the child writes to its stderr to the page server's log. + let n = loop { + match nix::poll::poll( + &mut stdout_pollfds[..], + wal_redo_timeout.as_millis() as i32, + ) { + Err(nix::errno::Errno::EINTR) => continue, + res => break res, + } + }?; + + if n == 0 { + anyhow::bail!("WAL redo timed out"); + } + + // If we have some data in stdout, read it to the result buffer. + let out_revents = stdout_pollfds[0].revents().unwrap(); + if out_revents & (PollFlags::POLLERR | PollFlags::POLLIN) != PollFlags::empty() { + nresult += output.stdout.read(&mut resultbuf[nresult..])?; + } + if out_revents.contains(PollFlags::POLLHUP) { + anyhow::bail!("WAL redo process closed its stdout unexpectedly"); + } + } + output + .pending_responses + .push_back(Some(Bytes::from(resultbuf))); + } + // Replace our request's response with None in `pending_responses`. + // Then make space in the ring buffer by clearing out any seqence of contiguous + // `None`'s from the front of `pending_responses`. + // NB: We can't pop_front() because other requests' responses because another + // requester might have grabbed the output mutex before us: + // T1: grab input mutex + // T1: send request_no 23 + // T1: release input mutex + // T2: grab input mutex + // T2: send request_no 24 + // T2: release input mutex + // T2: grab output mutex + // T2: n_processed_responses + output.pending_responses.len() <= request_no + // 23 0 24 + // T2: enters poll loop that reads stdout + // T2: put response for 23 into pending_responses + // T2: put response for 24 into pending_resposnes + // pending_responses now looks like this: Front Some(response_23) Some(response_24) Back + // T2: takes its response_24 + // pending_responses now looks like this: Front Some(response_23) None Back + // T2: does the while loop below + // pending_responses now looks like this: Front Some(response_23) None Back + // T2: releases output mutex + // T1: grabs output mutex + // T1: n_processed_responses + output.pending_responses.len() > request_no + // 23 2 23 + // T1: skips poll loop that reads stdout + // T1: takes its response_23 + // pending_responses now looks like this: Front None None Back + // T2: does the while loop below + // pending_responses now looks like this: Front Back + // n_processed_responses now has value 25 + let res = output.pending_responses[request_no - n_processed_responses] + .take() + .expect("we own this request_no, nobody else is supposed to take it"); + while let Some(front) = output.pending_responses.front() { + if front.is_none() { + output.pending_responses.pop_front(); + output.n_processed_responses += 1; + } else { + break; + } + } + Ok(res) + } + + #[cfg(feature = "testing")] + fn record_and_log(&self, writebuf: &[u8]) { + use std::sync::atomic::Ordering; + + let millis = std::time::SystemTime::now() + .duration_since(std::time::SystemTime::UNIX_EPOCH) + .unwrap() + .as_millis(); + + let seq = self.dump_sequence.fetch_add(1, Ordering::Relaxed); + + // these files will be collected to an allure report + let filename = format!("walredo-{millis}-{}-{seq}.walredo", writebuf.len()); + + let path = self.conf.tenant_path(&self.tenant_shard_id).join(&filename); + + let res = std::fs::OpenOptions::new() + .write(true) + .create_new(true) + .read(true) + .open(path) + .and_then(|mut f| f.write_all(writebuf)); + + // trip up allowed_errors + if let Err(e) = res { + tracing::error!(target=%filename, length=writebuf.len(), "failed to write out the walredo errored input: {e}"); + } else { + tracing::error!(filename, "erroring walredo input saved"); + } + } + + #[cfg(not(feature = "testing"))] + fn record_and_log(&self, _: &[u8]) {} +} + +impl Drop for WalRedoProcess { + fn drop(&mut self) { + self.child + .take() + .expect("we only do this once") + .kill_and_wait(WalRedoKillCause::WalRedoProcessDrop); + // no way to wait for stderr_logger_task from Drop because that is async only + } +} diff --git a/pgxn/neon/libpagestore.c b/pgxn/neon/libpagestore.c index 1bc8a2e87c..2276b4e807 100644 --- a/pgxn/neon/libpagestore.c +++ b/pgxn/neon/libpagestore.c @@ -495,16 +495,17 @@ retry: static void pageserver_disconnect(shardno_t shard_no) { - if (page_servers[shard_no].conn) - { - /* - * If the connection to any pageserver is lost, we throw away the - * whole prefetch queue, even for other pageservers. It should not - * cause big problems, because connection loss is supposed to be a - * rare event. - */ - prefetch_on_ps_disconnect(); - } + /* + * If the connection to any pageserver is lost, we throw away the + * whole prefetch queue, even for other pageservers. It should not + * cause big problems, because connection loss is supposed to be a + * rare event. + * + * Prefetch state should be reset even if page_servers[shard_no].conn == NULL, + * because prefetch request may be registered before connection is established. + */ + prefetch_on_ps_disconnect(); + pageserver_disconnect_shard(shard_no); } diff --git a/pgxn/neon/pagestore_smgr.c b/pgxn/neon/pagestore_smgr.c index ecc8ddb384..57a16e00ca 100644 --- a/pgxn/neon/pagestore_smgr.c +++ b/pgxn/neon/pagestore_smgr.c @@ -641,13 +641,12 @@ prefetch_on_ps_disconnect(void) static inline void prefetch_set_unused(uint64 ring_index) { - PrefetchRequest *slot = GetPrfSlot(ring_index); + PrefetchRequest *slot; if (ring_index < MyPState->ring_last) return; /* Should already be unused */ - Assert(MyPState->ring_unused > ring_index); - + slot = GetPrfSlot(ring_index); if (slot->status == PRFS_UNUSED) return; @@ -806,7 +805,8 @@ Retry: { if (*force_lsn > slot->effective_request_lsn) { - prefetch_wait_for(ring_index); + if (!prefetch_wait_for(ring_index)) + goto Retry; prefetch_set_unused(ring_index); entry = NULL; } @@ -821,7 +821,8 @@ Retry: { if (*force_lsn != slot->effective_request_lsn) { - prefetch_wait_for(ring_index); + if (!prefetch_wait_for(ring_index)) + goto Retry; prefetch_set_unused(ring_index); entry = NULL; } @@ -887,7 +888,8 @@ Retry: { case PRFS_REQUESTED: Assert(MyPState->ring_receive == cleanup_index); - prefetch_wait_for(cleanup_index); + if (!prefetch_wait_for(cleanup_index)) + goto Retry; prefetch_set_unused(cleanup_index); break; case PRFS_RECEIVED: @@ -1688,7 +1690,7 @@ neon_exists(SMgrRelation reln, ForkNumber forkNum) break; default: - neon_log(ERROR, "unexpected response from page server with tag 0x%02x", resp->tag); + neon_log(ERROR, "unexpected response from page server with tag 0x%02x in neon_exists", resp->tag); } pfree(resp); return exists; @@ -2140,6 +2142,7 @@ neon_read_at_lsn(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, /* * Try to find prefetched page in the list of received pages. */ + Retry: entry = prfh_lookup(MyPState->prf_hash, (PrefetchRequest *) &buftag); if (entry != NULL) @@ -2161,7 +2164,8 @@ neon_read_at_lsn(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, */ if (slot->status == PRFS_REQUESTED) { - prefetch_wait_for(slot->my_ring_index); + if (!prefetch_wait_for(slot->my_ring_index)) + goto Retry; } /* drop caches */ prefetch_set_unused(slot->my_ring_index); @@ -2224,7 +2228,7 @@ neon_read_at_lsn(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, ((NeonErrorResponse *) resp)->message))); break; default: - neon_log(ERROR, "unexpected response from page server with tag 0x%02x", resp->tag); + neon_log(ERROR, "unexpected response from page server with tag 0x%02x in neon_read_at_lsn", resp->tag); } /* buffer was used, clean up for later reuse */ @@ -2497,7 +2501,7 @@ neon_nblocks(SMgrRelation reln, ForkNumber forknum) break; default: - neon_log(ERROR, "unexpected response from page server with tag 0x%02x", resp->tag); + neon_log(ERROR, "unexpected response from page server with tag 0x%02x in neon_nblocks", resp->tag); } update_cached_relsize(InfoFromSMgrRel(reln), forknum, n_blocks); @@ -2552,7 +2556,7 @@ neon_dbsize(Oid dbNode) break; default: - neon_log(ERROR, "unexpected response from page server with tag 0x%02x", resp->tag); + neon_log(ERROR, "unexpected response from page server with tag 0x%02x in neon_dbsize", resp->tag); } neon_log(SmgrTrace, "neon_dbsize: db %u (request LSN %X/%08X): %ld bytes", @@ -2857,7 +2861,7 @@ neon_read_slru_segment(SMgrRelation reln, const char* path, int segno, void* buf break; default: - neon_log(ERROR, "unexpected response from page server with tag 0x%02x", resp->tag); + neon_log(ERROR, "unexpected response from page server with tag 0x%02x in neon_read_slru_segment", resp->tag); } pfree(resp); diff --git a/poetry.lock b/poetry.lock index 7b49daf42a..aca88073a8 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1,4 +1,4 @@ -# This file is automatically @generated by Poetry 1.7.1 and should not be changed by hand. +# This file is automatically @generated by Poetry 1.8.2 and should not be changed by hand. [[package]] name = "aiohttp" @@ -1191,13 +1191,13 @@ files = [ [[package]] name = "idna" -version = "3.3" +version = "3.7" description = "Internationalized Domain Names in Applications (IDNA)" optional = false python-versions = ">=3.5" files = [ - {file = "idna-3.3-py3-none-any.whl", hash = "sha256:84d9dd047ffa80596e0f246e2eab0b391788b0503584e8945f2368256d2735ff"}, - {file = "idna-3.3.tar.gz", hash = "sha256:9d643ff0a55b762d5cdb124b8eaa99c66322e2157b69160bc32796e824360e6d"}, + {file = "idna-3.7-py3-none-any.whl", hash = "sha256:82fee1fc78add43492d3a1898bfa6d8a904cc97d8427f683ed8e798d07761aa0"}, + {file = "idna-3.7.tar.gz", hash = "sha256:028ff3aadf0609c1fd278d8ea3089299412a7a8b9bd005dd08b9f8285bcb5cfc"}, ] [[package]] @@ -2182,6 +2182,7 @@ files = [ {file = "PyYAML-6.0.1-cp311-cp311-win_amd64.whl", hash = "sha256:bf07ee2fef7014951eeb99f56f39c9bb4af143d8aa3c21b1677805985307da34"}, {file = "PyYAML-6.0.1-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:855fb52b0dc35af121542a76b9a84f8d1cd886ea97c84703eaa6d88e37a2ad28"}, {file = "PyYAML-6.0.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:40df9b996c2b73138957fe23a16a4f0ba614f4c0efce1e9406a184b6d07fa3a9"}, + {file = "PyYAML-6.0.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a08c6f0fe150303c1c6b71ebcd7213c2858041a7e01975da3a99aed1e7a378ef"}, {file = "PyYAML-6.0.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6c22bec3fbe2524cde73d7ada88f6566758a8f7227bfbf93a408a9d86bcc12a0"}, {file = "PyYAML-6.0.1-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:8d4e9c88387b0f5c7d5f281e55304de64cf7f9c0021a3525bd3b1c542da3b0e4"}, {file = "PyYAML-6.0.1-cp312-cp312-win32.whl", hash = "sha256:d483d2cdf104e7c9fa60c544d92981f12ad66a457afae824d146093b8c294c54"}, @@ -2652,6 +2653,16 @@ files = [ {file = "wrapt-1.14.1-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:8ad85f7f4e20964db4daadcab70b47ab05c7c1cf2a7c1e51087bfaa83831854c"}, {file = "wrapt-1.14.1-cp310-cp310-win32.whl", hash = "sha256:a9a52172be0b5aae932bef82a79ec0a0ce87288c7d132946d645eba03f0ad8a8"}, {file = "wrapt-1.14.1-cp310-cp310-win_amd64.whl", hash = "sha256:6d323e1554b3d22cfc03cd3243b5bb815a51f5249fdcbb86fda4bf62bab9e164"}, + {file = "wrapt-1.14.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:ecee4132c6cd2ce5308e21672015ddfed1ff975ad0ac8d27168ea82e71413f55"}, + {file = "wrapt-1.14.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:2020f391008ef874c6d9e208b24f28e31bcb85ccff4f335f15a3251d222b92d9"}, + {file = "wrapt-1.14.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2feecf86e1f7a86517cab34ae6c2f081fd2d0dac860cb0c0ded96d799d20b335"}, + {file = "wrapt-1.14.1-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:240b1686f38ae665d1b15475966fe0472f78e71b1b4903c143a842659c8e4cb9"}, + {file = "wrapt-1.14.1-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a9008dad07d71f68487c91e96579c8567c98ca4c3881b9b113bc7b33e9fd78b8"}, + {file = "wrapt-1.14.1-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:6447e9f3ba72f8e2b985a1da758767698efa72723d5b59accefd716e9e8272bf"}, + {file = "wrapt-1.14.1-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:acae32e13a4153809db37405f5eba5bac5fbe2e2ba61ab227926a22901051c0a"}, + {file = "wrapt-1.14.1-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:49ef582b7a1152ae2766557f0550a9fcbf7bbd76f43fbdc94dd3bf07cc7168be"}, + {file = "wrapt-1.14.1-cp311-cp311-win32.whl", hash = "sha256:358fe87cc899c6bb0ddc185bf3dbfa4ba646f05b1b0b9b5a27c2cb92c2cea204"}, + {file = "wrapt-1.14.1-cp311-cp311-win_amd64.whl", hash = "sha256:26046cd03936ae745a502abf44dac702a5e6880b2b01c29aea8ddf3353b68224"}, {file = "wrapt-1.14.1-cp35-cp35m-manylinux1_i686.whl", hash = "sha256:43ca3bbbe97af00f49efb06e352eae40434ca9d915906f77def219b88e85d907"}, {file = "wrapt-1.14.1-cp35-cp35m-manylinux1_x86_64.whl", hash = "sha256:6b1a564e6cb69922c7fe3a678b9f9a3c54e72b469875aa8018f18b4d1dd1adf3"}, {file = "wrapt-1.14.1-cp35-cp35m-manylinux2010_i686.whl", hash = "sha256:00b6d4ea20a906c0ca56d84f93065b398ab74b927a7a3dbd470f6fc503f95dc3"}, diff --git a/proxy/Cargo.toml b/proxy/Cargo.toml index 57a2736d5b..6b8f2ecbf4 100644 --- a/proxy/Cargo.toml +++ b/proxy/Cargo.toml @@ -10,7 +10,9 @@ testing = [] [dependencies] anyhow.workspace = true +async-compression.workspace = true async-trait.workspace = true +atomic-take.workspace = true aws-config.workspace = true aws-sdk-iam.workspace = true aws-sigv4.workspace = true @@ -35,10 +37,14 @@ http.workspace = true humantime.workspace = true hyper-tungstenite.workspace = true hyper.workspace = true +hyper1 = { package = "hyper", version = "1.2", features = ["server"] } +hyper-util = { version = "0.1", features = ["server", "http1", "http2", "tokio"] } +http-body-util = { version = "0.1" } ipnet.workspace = true itertools.workspace = true lasso = { workspace = true, features = ["multi-threaded"] } md5.workspace = true +measured = { workspace = true, features = ["lasso"] } metrics.workspace = true once_cell.workspace = true opentelemetry.workspace = true diff --git a/proxy/src/auth/backend.rs b/proxy/src/auth/backend.rs index e421798067..3795e3b608 100644 --- a/proxy/src/auth/backend.rs +++ b/proxy/src/auth/backend.rs @@ -2,8 +2,15 @@ mod classic; mod hacks; mod link; +use std::net::IpAddr; +use std::sync::Arc; +use std::time::Duration; + +use ipnet::{Ipv4Net, Ipv6Net}; pub use link::LinkAuthError; +use tokio::io::{AsyncRead, AsyncWrite}; use tokio_postgres::config::AuthKeys; +use tracing::{info, warn}; use crate::auth::credentials::check_peer_addr_is_in_list; use crate::auth::validate_password_and_exchange; @@ -13,9 +20,10 @@ use crate::console::provider::{CachedRoleSecret, ConsoleBackend}; use crate::console::{AuthSecret, NodeInfo}; use crate::context::RequestMonitoring; use crate::intern::EndpointIdInt; -use crate::metrics::{AUTH_RATE_LIMIT_HITS, ENDPOINTS_AUTH_RATE_LIMITED}; +use crate::metrics::Metrics; use crate::proxy::connect_compute::ComputeConnectBackend; use crate::proxy::NeonOptions; +use crate::rate_limiter::{BucketRateLimiter, RateBucketInfo}; use crate::stream::Stream; use crate::{ auth::{self, ComputeUserInfoMaybeEndpoint}, @@ -27,10 +35,7 @@ use crate::{ }, stream, url, }; -use crate::{scram, EndpointCacheKey, EndpointId, RoleName}; -use std::sync::Arc; -use tokio::io::{AsyncRead, AsyncWrite}; -use tracing::{info, warn}; +use crate::{scram, EndpointCacheKey, EndpointId, Normalize, RoleName}; /// Alternative to [`std::borrow::Cow`] but doesn't need `T: ToOwned` as we don't need that functionality pub enum MaybeOwned<'a, T> { @@ -176,17 +181,51 @@ impl TryFrom for ComputeUserInfo { } } +#[derive(PartialEq, PartialOrd, Hash, Eq, Ord, Debug, Copy, Clone)] +pub struct MaskedIp(IpAddr); + +impl MaskedIp { + fn new(value: IpAddr, prefix: u8) -> Self { + match value { + IpAddr::V4(v4) => Self(IpAddr::V4( + Ipv4Net::new(v4, prefix).map_or(v4, |x| x.trunc().addr()), + )), + IpAddr::V6(v6) => Self(IpAddr::V6( + Ipv6Net::new(v6, prefix).map_or(v6, |x| x.trunc().addr()), + )), + } + } +} + +// This can't be just per IP because that would limit some PaaS that share IP addresses +pub type AuthRateLimiter = BucketRateLimiter<(EndpointIdInt, MaskedIp)>; + +impl RateBucketInfo { + /// All of these are per endpoint-maskedip pair. + /// Context: 4096 rounds of pbkdf2 take about 1ms of cpu time to execute (1 milli-cpu-second or 1mcpus). + /// + /// First bucket: 1000mcpus total per endpoint-ip pair + /// * 4096000 requests per second with 1 hash rounds. + /// * 1000 requests per second with 4096 hash rounds. + /// * 6.8 requests per second with 600000 hash rounds. + pub const DEFAULT_AUTH_SET: [Self; 3] = [ + Self::new(1000 * 4096, Duration::from_secs(1)), + Self::new(600 * 4096, Duration::from_secs(60)), + Self::new(300 * 4096, Duration::from_secs(600)), + ]; +} + impl AuthenticationConfig { pub fn check_rate_limit( &self, - ctx: &mut RequestMonitoring, + config: &AuthenticationConfig, secret: AuthSecret, endpoint: &EndpointId, is_cleartext: bool, ) -> auth::Result { // we have validated the endpoint exists, so let's intern it. - let endpoint_int = EndpointIdInt::from(endpoint); + let endpoint_int = EndpointIdInt::from(endpoint.normalize()); // only count the full hash count if password hack or websocket flow. // in other words, if proxy needs to run the hashing @@ -201,17 +240,25 @@ impl AuthenticationConfig { 1 }; - let limit_not_exceeded = self - .rate_limiter - .check((endpoint_int, ctx.peer_addr), password_weight); + let limit_not_exceeded = self.rate_limiter.check( + ( + endpoint_int, + MaskedIp::new(ctx.peer_addr, config.rate_limit_ip_subnet), + ), + password_weight, + ); if !limit_not_exceeded { warn!( enabled = self.rate_limiter_enabled, "rate limiting authentication" ); - AUTH_RATE_LIMIT_HITS.inc(); - ENDPOINTS_AUTH_RATE_LIMITED.measure(endpoint); + Metrics::get().proxy.requests_auth_rate_limits_total.inc(); + Metrics::get() + .proxy + .endpoints_auth_rate_limits + .get_metric() + .measure(endpoint); if self.rate_limiter_enabled { return Err(auth::AuthError::too_many_connections()); @@ -267,6 +314,7 @@ async fn auth_quirks( let secret = match secret { Some(secret) => config.check_rate_limit( ctx, + config, secret, &info.endpoint, unauthenticated_password.is_some() || allow_cleartext, @@ -469,7 +517,7 @@ impl ComputeConnectBackend for BackendType<'_, ComputeCredentials, &()> { #[cfg(test)] mod tests { - use std::sync::Arc; + use std::{net::IpAddr, sync::Arc, time::Duration}; use bytes::BytesMut; use fallible_iterator::FallibleIterator; @@ -482,7 +530,7 @@ mod tests { use tokio::io::{AsyncRead, AsyncReadExt, AsyncWriteExt}; use crate::{ - auth::{ComputeUserInfoMaybeEndpoint, IpPattern}, + auth::{backend::MaskedIp, ComputeUserInfoMaybeEndpoint, IpPattern}, config::AuthenticationConfig, console::{ self, @@ -491,12 +539,12 @@ mod tests { }, context::RequestMonitoring, proxy::NeonOptions, - rate_limiter::{AuthRateLimiter, RateBucketInfo}, + rate_limiter::RateBucketInfo, scram::ServerSecret, stream::{PqStream, Stream}, }; - use super::auth_quirks; + use super::{auth_quirks, AuthRateLimiter}; struct Auth { ips: Vec, @@ -537,6 +585,7 @@ mod tests { scram_protocol_timeout: std::time::Duration::from_secs(5), rate_limiter_enabled: true, rate_limiter: AuthRateLimiter::new(&RateBucketInfo::DEFAULT_AUTH_SET), + rate_limit_ip_subnet: 64, }); async fn read_message(r: &mut (impl AsyncRead + Unpin), b: &mut BytesMut) -> PgMessage { @@ -548,6 +597,51 @@ mod tests { } } + #[test] + fn masked_ip() { + let ip_a = IpAddr::V4([127, 0, 0, 1].into()); + let ip_b = IpAddr::V4([127, 0, 0, 2].into()); + let ip_c = IpAddr::V4([192, 168, 1, 101].into()); + let ip_d = IpAddr::V4([192, 168, 1, 102].into()); + let ip_e = IpAddr::V6("abcd:abcd:abcd:abcd:abcd:abcd:abcd:abcd".parse().unwrap()); + let ip_f = IpAddr::V6("abcd:abcd:abcd:abcd:1234:abcd:abcd:abcd".parse().unwrap()); + + assert_ne!(MaskedIp::new(ip_a, 64), MaskedIp::new(ip_b, 64)); + assert_ne!(MaskedIp::new(ip_a, 32), MaskedIp::new(ip_b, 32)); + assert_eq!(MaskedIp::new(ip_a, 30), MaskedIp::new(ip_b, 30)); + assert_eq!(MaskedIp::new(ip_c, 30), MaskedIp::new(ip_d, 30)); + + assert_ne!(MaskedIp::new(ip_e, 128), MaskedIp::new(ip_f, 128)); + assert_eq!(MaskedIp::new(ip_e, 64), MaskedIp::new(ip_f, 64)); + } + + #[test] + fn test_default_auth_rate_limit_set() { + // these values used to exceed u32::MAX + assert_eq!( + RateBucketInfo::DEFAULT_AUTH_SET, + [ + RateBucketInfo { + interval: Duration::from_secs(1), + max_rpi: 1000 * 4096, + }, + RateBucketInfo { + interval: Duration::from_secs(60), + max_rpi: 600 * 4096 * 60, + }, + RateBucketInfo { + interval: Duration::from_secs(600), + max_rpi: 300 * 4096 * 600, + } + ] + ); + + for x in RateBucketInfo::DEFAULT_AUTH_SET { + let y = x.to_string().parse().unwrap(); + assert_eq!(x, y); + } + } + #[tokio::test] async fn auth_quirks_scram() { let (mut client, server) = tokio::io::duplex(1024); diff --git a/proxy/src/auth/backend/link.rs b/proxy/src/auth/backend/link.rs index 7db76f3d9e..415a4b7d85 100644 --- a/proxy/src/auth/backend/link.rs +++ b/proxy/src/auth/backend/link.rs @@ -102,8 +102,7 @@ pub(super) async fn authenticate( ctx.set_user(db_info.user.into()); ctx.set_project(db_info.aux.clone()); - let cold_start_info = db_info.aux.cold_start_info.clone().unwrap_or_default(); - info!(?cold_start_info, "woken up a compute node"); + info!("woken up a compute node"); // Backwards compatibility. pg_sni_proxy uses "--" in domain names // while direct connections do not. Once we migrate to pg_sni_proxy diff --git a/proxy/src/auth/credentials.rs b/proxy/src/auth/credentials.rs index 89773aa1ff..783a1a5a21 100644 --- a/proxy/src/auth/credentials.rs +++ b/proxy/src/auth/credentials.rs @@ -4,7 +4,7 @@ use crate::{ auth::password_hack::parse_endpoint_param, context::RequestMonitoring, error::{ReportableError, UserFacingError}, - metrics::NUM_CONNECTION_ACCEPTED_BY_SNI, + metrics::{Metrics, SniKind}, proxy::NeonOptions, serverless::SERVERLESS_DRIVER_SNI, EndpointId, RoleName, @@ -144,21 +144,22 @@ impl ComputeUserInfoMaybeEndpoint { ctx.set_endpoint_id(ep.clone()); } + let metrics = Metrics::get(); info!(%user, "credentials"); if sni.is_some() { info!("Connection with sni"); - NUM_CONNECTION_ACCEPTED_BY_SNI - .with_label_values(&["sni"]) - .inc(); + metrics.proxy.accepted_connections_by_sni.inc(SniKind::Sni); } else if endpoint.is_some() { - NUM_CONNECTION_ACCEPTED_BY_SNI - .with_label_values(&["no_sni"]) - .inc(); + metrics + .proxy + .accepted_connections_by_sni + .inc(SniKind::NoSni); info!("Connection without sni"); } else { - NUM_CONNECTION_ACCEPTED_BY_SNI - .with_label_values(&["password_hack"]) - .inc(); + metrics + .proxy + .accepted_connections_by_sni + .inc(SniKind::PasswordHack); info!("Connection with password hack"); } diff --git a/proxy/src/bin/pg_sni_router.rs b/proxy/src/bin/pg_sni_router.rs index 385f7820cb..7a693002a8 100644 --- a/proxy/src/bin/pg_sni_router.rs +++ b/proxy/src/bin/pg_sni_router.rs @@ -9,14 +9,13 @@ use futures::future::Either; use itertools::Itertools; use proxy::config::TlsServerEndPoint; use proxy::context::RequestMonitoring; -use proxy::proxy::run_until_cancelled; +use proxy::proxy::{copy_bidirectional_client_compute, run_until_cancelled}; use rustls::pki_types::PrivateKeyDer; use tokio::net::TcpListener; use anyhow::{anyhow, bail, ensure, Context}; use clap::Arg; use futures::TryFutureExt; -use proxy::console::messages::MetricsAuxInfo; use proxy::stream::{PqStream, Stream}; use tokio::io::{AsyncRead, AsyncWrite}; @@ -175,7 +174,12 @@ async fn task_main( .context("failed to set socket option")?; info!(%peer_addr, "serving"); - let ctx = RequestMonitoring::new(session_id, peer_addr.ip(), "sni_router", "sni"); + let ctx = RequestMonitoring::new( + session_id, + peer_addr.ip(), + proxy::metrics::Protocol::SniRouter, + "sni", + ); handle_client(ctx, dest_suffix, tls_config, tls_server_end_point, socket).await } .unwrap_or_else(|e| { @@ -198,6 +202,7 @@ async fn task_main( const ERR_INSECURE_CONNECTION: &str = "connection is insecure (try using `sslmode=require`)"; async fn ssl_handshake( + ctx: &mut RequestMonitoring, raw_stream: S, tls_config: Arc, tls_server_end_point: TlsServerEndPoint, @@ -227,7 +232,10 @@ async fn ssl_handshake( } Ok(Stream::Tls { - tls: Box::new(raw.upgrade(tls_config).await?), + tls: Box::new( + raw.upgrade(tls_config, !ctx.has_private_peer_addr()) + .await?, + ), tls_server_end_point, }) } @@ -250,7 +258,7 @@ async fn handle_client( tls_server_end_point: TlsServerEndPoint, stream: impl AsyncRead + AsyncWrite + Unpin, ) -> anyhow::Result<()> { - let tls_stream = ssl_handshake(stream, tls_config, tls_server_end_point).await?; + let mut tls_stream = ssl_handshake(&mut ctx, stream, tls_config, tls_server_end_point).await?; // Cut off first part of the SNI domain // We receive required destination details in the format of @@ -267,13 +275,15 @@ async fn handle_client( info!("destination: {}", destination); - let client = tokio::net::TcpStream::connect(destination).await?; - - let metrics_aux: MetricsAuxInfo = Default::default(); + let mut client = tokio::net::TcpStream::connect(destination).await?; // doesn't yet matter as pg-sni-router doesn't report analytics logs ctx.set_success(); ctx.log(); - proxy::proxy::passthrough::proxy_pass(tls_stream, client, metrics_aux).await + // Starting from here we only proxy the client's traffic. + info!("performing the proxy pass..."); + let _ = copy_bidirectional_client_compute(&mut tls_stream, &mut client).await?; + + Ok(()) } diff --git a/proxy/src/bin/proxy.rs b/proxy/src/bin/proxy.rs index 88b847f5f1..71283dd606 100644 --- a/proxy/src/bin/proxy.rs +++ b/proxy/src/bin/proxy.rs @@ -7,9 +7,11 @@ use aws_config::provider_config::ProviderConfig; use aws_config::web_identity_token::WebIdentityTokenCredentialsProvider; use futures::future::Either; use proxy::auth; +use proxy::auth::backend::AuthRateLimiter; use proxy::auth::backend::MaybeOwned; use proxy::cancellation::CancelMap; use proxy::cancellation::CancellationHandler; +use proxy::config::remote_storage_from_toml; use proxy::config::AuthenticationConfig; use proxy::config::CacheOptions; use proxy::config::HttpConfig; @@ -17,11 +19,10 @@ use proxy::config::ProjectInfoCacheOptions; use proxy::console; use proxy::context::parquet::ParquetUploadArgs; use proxy::http; -use proxy::metrics::NUM_CANCELLATION_REQUESTS_SOURCE_FROM_CLIENT; -use proxy::rate_limiter::AuthRateLimiter; +use proxy::http::health_server::AppMetrics; +use proxy::metrics::Metrics; use proxy::rate_limiter::EndpointRateLimiter; use proxy::rate_limiter::RateBucketInfo; -use proxy::rate_limiter::RateLimiterConfig; use proxy::redis::cancellation_publisher::RedisPublisherClient; use proxy::redis::connection_with_credentials_provider::ConnectionWithCredentialsProvider; use proxy::redis::elasticache; @@ -41,6 +42,7 @@ use tokio::task::JoinSet; use tokio_util::sync::CancellationToken; use tracing::info; use tracing::warn; +use tracing::Instrument; use utils::{project_build_tag, project_git_version, sentry_init::init_sentry}; project_git_version!(GIT_VERSION); @@ -130,14 +132,8 @@ struct ProxyCliArgs { #[clap(long, default_value_t = false, value_parser = clap::builder::BoolishValueParser::new(), action = clap::ArgAction::Set)] require_client_ip: bool, /// Disable dynamic rate limiter and store the metrics to ensure its production behaviour. - #[clap(long, default_value_t = false, value_parser = clap::builder::BoolishValueParser::new(), action = clap::ArgAction::Set)] + #[clap(long, default_value_t = true, value_parser = clap::builder::BoolishValueParser::new(), action = clap::ArgAction::Set)] disable_dynamic_rate_limiter: bool, - /// Rate limit algorithm. Makes sense only if `disable_rate_limiter` is `false`. - #[clap(value_enum, long, default_value_t = proxy::rate_limiter::RateLimitAlgorithm::Aimd)] - rate_limit_algorithm: proxy::rate_limiter::RateLimitAlgorithm, - /// Timeout for rate limiter. If it didn't manage to aquire a permit in this time, it will return an error. - #[clap(long, default_value = "15s", value_parser = humantime::parse_duration)] - rate_limiter_timeout: tokio::time::Duration, /// Endpoint rate limiter max number of requests per second. /// /// Provided in the form '@'. @@ -150,14 +146,12 @@ struct ProxyCliArgs { /// Authentication rate limiter max number of hashes per second. #[clap(long, default_values_t = RateBucketInfo::DEFAULT_AUTH_SET)] auth_rate_limit: Vec, + /// The IP subnet to use when considering whether two IP addresses are considered the same. + #[clap(long, default_value_t = 64)] + auth_rate_limit_ip_subnet: u8, /// Redis rate limiter max number of requests per second. #[clap(long, default_values_t = RateBucketInfo::DEFAULT_ENDPOINT_SET)] redis_rps_limit: Vec, - /// Initial limit for dynamic rate limiter. Makes sense only if `rate_limit_algorithm` is *not* `None`. - #[clap(long, default_value_t = 100)] - initial_limit: usize, - #[clap(flatten)] - aimd_config: proxy::rate_limiter::AimdConfig, /// cache for `allowed_ips` (use `size=0` to disable) #[clap(long, default_value = config::CacheOptions::CACHE_DEFAULT_OPTIONS)] allowed_ips_cache: String, @@ -188,9 +182,24 @@ struct ProxyCliArgs { /// cache for `project_info` (use `size=0` to disable) #[clap(long, default_value = config::ProjectInfoCacheOptions::CACHE_DEFAULT_OPTIONS)] project_info_cache: String, - + /// cache for all valid endpoints + #[clap(long, default_value = config::EndpointCacheConfig::CACHE_DEFAULT_OPTIONS)] + endpoint_cache_config: String, #[clap(flatten)] parquet_upload: ParquetUploadArgs, + + /// interval for backup metric collection + #[clap(long, default_value = "10m", value_parser = humantime::parse_duration)] + metric_backup_collection_interval: std::time::Duration, + /// remote storage configuration for backup metric collection + /// Encoded as toml (same format as pageservers), eg + /// `{bucket_name='the-bucket',bucket_region='us-east-1',prefix_in_bucket='proxy',endpoint='http://minio:9000'}` + #[clap(long, default_value = "{}")] + metric_backup_collection_remote_storage: String, + /// chunk size for backup metric collection + /// Size of each event is no more than 400 bytes, so 2**22 is about 200MB before the compression. + #[clap(long, default_value = "4194304")] + metric_backup_collection_chunk_size: usize, } #[derive(clap::Args, Clone, Copy, Debug)] @@ -235,14 +244,18 @@ async fn main() -> anyhow::Result<()> { info!("Version: {GIT_VERSION}"); info!("Build_tag: {BUILD_TAG}"); - ::metrics::set_build_info_metric(GIT_VERSION, BUILD_TAG); + let neon_metrics = ::metrics::NeonMetrics::new(::metrics::BuildInfo { + revision: GIT_VERSION, + build_tag: BUILD_TAG, + }); - match proxy::jemalloc::MetricRecorder::new(prometheus::default_registry()) { - Ok(t) => { - t.start(); + let jemalloc = match proxy::jemalloc::MetricRecorder::new() { + Ok(t) => Some(t), + Err(e) => { + tracing::error!(error = ?e, "could not start jemalloc metrics loop"); + None } - Err(e) => tracing::error!(error = ?e, "could not start jemalloc metrics loop"), - } + }; let args = ProxyCliArgs::parse(); let config = build_config(&args)?; @@ -282,27 +295,27 @@ async fn main() -> anyhow::Result<()> { ), aws_credentials_provider, )); - let redis_notifications_client = - match (args.redis_notifications, (args.redis_host, args.redis_port)) { - (Some(url), _) => { - info!("Starting redis notifications listener ({url})"); - Some(ConnectionWithCredentialsProvider::new_with_static_credentials(url)) - } - (None, (Some(host), Some(port))) => Some( - ConnectionWithCredentialsProvider::new_with_credentials_provider( - host, - port, - elasticache_credentials_provider.clone(), - ), + let regional_redis_client = match (args.redis_host, args.redis_port) { + (Some(host), Some(port)) => Some( + ConnectionWithCredentialsProvider::new_with_credentials_provider( + host, + port, + elasticache_credentials_provider.clone(), ), - (None, (None, None)) => { - warn!("Redis is disabled"); - None - } - _ => { - bail!("redis-host and redis-port must be specified together"); - } - }; + ), + (None, None) => { + warn!("Redis events from console are disabled"); + None + } + _ => { + bail!("redis-host and redis-port must be specified together"); + } + }; + let redis_notifications_client = if let Some(url) = args.redis_notifications { + Some(ConnectionWithCredentialsProvider::new_with_static_credentials(url)) + } else { + regional_redis_client.clone() + }; // Check that we can bind to address before further initialization let http_address: SocketAddr = args.http.parse()?; @@ -321,8 +334,7 @@ async fn main() -> anyhow::Result<()> { let endpoint_rate_limiter = Arc::new(EndpointRateLimiter::new(&config.endpoint_rps_limit)); let cancel_map = CancelMap::default(); - // let redis_notifications_client = redis_notifications_client.map(|x| Box::leak(Box::new(x))); - let redis_publisher = match &redis_notifications_client { + let redis_publisher = match ®ional_redis_client { Some(redis_publisher) => Some(Arc::new(Mutex::new(RedisPublisherClient::new( redis_publisher.clone(), args.region.clone(), @@ -335,7 +347,7 @@ async fn main() -> anyhow::Result<()> { >::new( cancel_map.clone(), redis_publisher, - NUM_CANCELLATION_REQUESTS_SOURCE_FROM_CLIENT, + proxy::metrics::CancellationSource::FromClient, )); // client facing tasks. these will exit on error or on cancellation @@ -372,12 +384,24 @@ async fn main() -> anyhow::Result<()> { // maintenance tasks. these never return unless there's an error let mut maintenance_tasks = JoinSet::new(); - maintenance_tasks.spawn(proxy::handle_signals(cancellation_token)); - maintenance_tasks.spawn(http::health_server::task_main(http_listener)); + maintenance_tasks.spawn(proxy::handle_signals(cancellation_token.clone())); + maintenance_tasks.spawn(http::health_server::task_main( + http_listener, + AppMetrics { + jemalloc, + neon_metrics, + proxy: proxy::metrics::Metrics::get(), + }, + )); maintenance_tasks.spawn(console::mgmt::task_main(mgmt_listener)); if let Some(metrics_config) = &config.metric_collection { + // TODO: Add gc regardles of the metric collection being enabled. maintenance_tasks.spawn(usage_metrics::task_main(metrics_config)); + client_tasks.spawn(usage_metrics::task_backup( + &metrics_config.backup_metric_collection_config, + cancellation_token, + )); } if let auth::BackendType::Console(api, _) = &config.auth_backend { @@ -385,13 +409,19 @@ async fn main() -> anyhow::Result<()> { if let Some(redis_notifications_client) = redis_notifications_client { let cache = api.caches.project_info.clone(); maintenance_tasks.spawn(notifications::task_main( - redis_notifications_client.clone(), + redis_notifications_client, cache.clone(), cancel_map.clone(), args.region.clone(), )); maintenance_tasks.spawn(async move { cache.clone().gc_worker().await }); } + if let Some(regional_redis_client) = regional_redis_client { + let cache = api.caches.endpoints_cache.clone(); + let con = regional_redis_client; + let span = tracing::info_span!("endpoints_cache"); + maintenance_tasks.spawn(async move { cache.do_read(con).await }.instrument(span)); + } } } @@ -434,6 +464,13 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> { if args.allow_self_signed_compute { warn!("allowing self-signed compute certificates"); } + let backup_metric_collection_config = config::MetricBackupCollectionConfig { + interval: args.metric_backup_collection_interval, + remote_storage_config: remote_storage_from_toml( + &args.metric_backup_collection_remote_storage, + )?, + chunk_size: args.metric_backup_collection_chunk_size, + }; let metric_collection = match ( &args.metric_collection_endpoint, @@ -442,6 +479,7 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> { (Some(endpoint), Some(interval)) => Some(config::MetricCollectionConfig { endpoint: endpoint.parse()?, interval: humantime::parse_duration(interval)?, + backup_metric_collection_config, }), (None, None) => None, _ => bail!( @@ -449,27 +487,27 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> { and metric-collection-interval must be specified" ), }; - let rate_limiter_config = RateLimiterConfig { - disable: args.disable_dynamic_rate_limiter, - algorithm: args.rate_limit_algorithm, - timeout: args.rate_limiter_timeout, - initial_limit: args.initial_limit, - aimd_config: Some(args.aimd_config), - }; + if !args.disable_dynamic_rate_limiter { + bail!("dynamic rate limiter should be disabled"); + } let auth_backend = match &args.auth_backend { AuthBackend::Console => { let wake_compute_cache_config: CacheOptions = args.wake_compute_cache.parse()?; let project_info_cache_config: ProjectInfoCacheOptions = args.project_info_cache.parse()?; + let endpoint_cache_config: config::EndpointCacheConfig = + args.endpoint_cache_config.parse()?; info!("Using NodeInfoCache (wake_compute) with options={wake_compute_cache_config:?}"); info!( "Using AllowedIpsCache (wake_compute) with options={project_info_cache_config:?}" ); + info!("Using EndpointCacheConfig with options={endpoint_cache_config:?}"); let caches = Box::leak(Box::new(console::caches::ApiCaches::new( wake_compute_cache_config, project_info_cache_config, + endpoint_cache_config, ))); let config::WakeComputeLockOptions { @@ -480,13 +518,20 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> { } = args.wake_compute_lock.parse()?; info!(permits, shards, ?epoch, "Using NodeLocks (wake_compute)"); let locks = Box::leak(Box::new( - console::locks::ApiLocks::new("wake_compute_lock", permits, shards, timeout) - .unwrap(), + console::locks::ApiLocks::new( + "wake_compute_lock", + permits, + shards, + timeout, + epoch, + &Metrics::get().wake_compute_lock, + ) + .unwrap(), )); - tokio::spawn(locks.garbage_collect_worker(epoch)); + tokio::spawn(locks.garbage_collect_worker()); let url = args.auth_endpoint.parse()?; - let endpoint = http::Endpoint::new(url, http::new_client(rate_limiter_config)); + let endpoint = http::Endpoint::new(url, http::new_client()); let api = console::provider::neon::Api::new(endpoint, caches, locks); let api = console::provider::ConsoleBackend::Console(api); @@ -519,6 +564,7 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> { scram_protocol_timeout: args.scram_protocol_timeout, rate_limiter_enabled: args.auth_rate_limit_enabled, rate_limiter: AuthRateLimiter::new(args.auth_rate_limit.clone()), + rate_limit_ip_subnet: args.auth_rate_limit_ip_subnet, }; let mut endpoint_rps_limit = args.endpoint_rps_limit.clone(); diff --git a/proxy/src/cache.rs b/proxy/src/cache.rs index fc5f416395..d1d4087241 100644 --- a/proxy/src/cache.rs +++ b/proxy/src/cache.rs @@ -1,4 +1,5 @@ pub mod common; +pub mod endpoints; pub mod project_info; mod timed_lru; diff --git a/proxy/src/cache/endpoints.rs b/proxy/src/cache/endpoints.rs new file mode 100644 index 0000000000..72543c6408 --- /dev/null +++ b/proxy/src/cache/endpoints.rs @@ -0,0 +1,233 @@ +use std::{ + convert::Infallible, + sync::{ + atomic::{AtomicBool, Ordering}, + Arc, + }, +}; + +use dashmap::DashSet; +use redis::{ + streams::{StreamReadOptions, StreamReadReply}, + AsyncCommands, FromRedisValue, Value, +}; +use serde::Deserialize; +use tokio::sync::Mutex; +use tracing::info; + +use crate::{ + config::EndpointCacheConfig, + context::RequestMonitoring, + intern::{BranchIdInt, EndpointIdInt, ProjectIdInt}, + metrics::{Metrics, RedisErrors}, + rate_limiter::GlobalRateLimiter, + redis::connection_with_credentials_provider::ConnectionWithCredentialsProvider, + EndpointId, +}; + +#[derive(Deserialize, Debug, Clone)] +pub struct ControlPlaneEventKey { + endpoint_created: Option, + branch_created: Option, + project_created: Option, +} +#[derive(Deserialize, Debug, Clone)] +struct EndpointCreated { + endpoint_id: String, +} +#[derive(Deserialize, Debug, Clone)] +struct BranchCreated { + branch_id: String, +} +#[derive(Deserialize, Debug, Clone)] +struct ProjectCreated { + project_id: String, +} + +pub struct EndpointsCache { + config: EndpointCacheConfig, + endpoints: DashSet, + branches: DashSet, + projects: DashSet, + ready: AtomicBool, + limiter: Arc>, +} + +impl EndpointsCache { + pub fn new(config: EndpointCacheConfig) -> Self { + Self { + limiter: Arc::new(Mutex::new(GlobalRateLimiter::new( + config.limiter_info.clone(), + ))), + config, + endpoints: DashSet::new(), + branches: DashSet::new(), + projects: DashSet::new(), + ready: AtomicBool::new(false), + } + } + pub async fn is_valid(&self, ctx: &mut RequestMonitoring, endpoint: &EndpointId) -> bool { + if !self.ready.load(Ordering::Acquire) { + return true; + } + // If cache is disabled, just collect the metrics and return. + if self.config.disable_cache { + let rejected = self.should_reject(endpoint); + ctx.set_rejected(rejected); + info!(?rejected, "check endpoint is valid, disabled cache"); + return true; + } + // If the limiter allows, we don't need to check the cache. + if self.limiter.lock().await.check() { + return true; + } + let rejected = self.should_reject(endpoint); + info!(?rejected, "check endpoint is valid, enabled cache"); + ctx.set_rejected(rejected); + !rejected + } + fn should_reject(&self, endpoint: &EndpointId) -> bool { + if endpoint.is_endpoint() { + !self.endpoints.contains(&EndpointIdInt::from(endpoint)) + } else if endpoint.is_branch() { + !self + .branches + .contains(&BranchIdInt::from(&endpoint.as_branch())) + } else { + !self + .projects + .contains(&ProjectIdInt::from(&endpoint.as_project())) + } + } + fn insert_event(&self, key: ControlPlaneEventKey) { + // Do not do normalization here, we expect the events to be normalized. + if let Some(endpoint_created) = key.endpoint_created { + self.endpoints + .insert(EndpointIdInt::from(&endpoint_created.endpoint_id.into())); + } + if let Some(branch_created) = key.branch_created { + self.branches + .insert(BranchIdInt::from(&branch_created.branch_id.into())); + } + if let Some(project_created) = key.project_created { + self.projects + .insert(ProjectIdInt::from(&project_created.project_id.into())); + } + } + pub async fn do_read( + &self, + mut con: ConnectionWithCredentialsProvider, + ) -> anyhow::Result { + let mut last_id = "0-0".to_string(); + loop { + self.ready.store(false, Ordering::Release); + if let Err(e) = con.connect().await { + tracing::error!("error connecting to redis: {:?}", e); + continue; + } + if let Err(e) = self.read_from_stream(&mut con, &mut last_id).await { + tracing::error!("error reading from redis: {:?}", e); + } + tokio::time::sleep(self.config.retry_interval).await; + } + } + async fn read_from_stream( + &self, + con: &mut ConnectionWithCredentialsProvider, + last_id: &mut String, + ) -> anyhow::Result<()> { + tracing::info!("reading endpoints/branches/projects from redis"); + self.batch_read( + con, + StreamReadOptions::default().count(self.config.initial_batch_size), + last_id, + true, + ) + .await?; + tracing::info!("ready to filter user requests"); + self.ready.store(true, Ordering::Release); + self.batch_read( + con, + StreamReadOptions::default() + .count(self.config.default_batch_size) + .block(self.config.xread_timeout.as_millis() as usize), + last_id, + false, + ) + .await + } + fn parse_key_value(value: &Value) -> anyhow::Result { + let s: String = FromRedisValue::from_redis_value(value)?; + Ok(serde_json::from_str(&s)?) + } + async fn batch_read( + &self, + conn: &mut ConnectionWithCredentialsProvider, + opts: StreamReadOptions, + last_id: &mut String, + return_when_finish: bool, + ) -> anyhow::Result<()> { + let mut total: usize = 0; + loop { + let mut res: StreamReadReply = conn + .xread_options(&[&self.config.stream_name], &[last_id.as_str()], &opts) + .await?; + + if res.keys.is_empty() { + if return_when_finish { + if total != 0 { + break; + } + anyhow::bail!( + "Redis stream {} is empty, cannot be used to filter endpoints", + self.config.stream_name + ); + } + // If we are not returning when finish, we should wait for more data. + continue; + } + if res.keys.len() != 1 { + anyhow::bail!("Cannot read from redis stream {}", self.config.stream_name); + } + + let res = res.keys.pop().expect("Checked length above"); + let len = res.ids.len(); + for x in res.ids { + total += 1; + for (_, v) in x.map { + let key = match Self::parse_key_value(&v) { + Ok(x) => x, + Err(e) => { + Metrics::get().proxy.redis_errors_total.inc(RedisErrors { + channel: &self.config.stream_name, + }); + tracing::error!("error parsing value {v:?}: {e:?}"); + continue; + } + }; + self.insert_event(key); + } + if total.is_power_of_two() { + tracing::debug!("endpoints read {}", total); + } + *last_id = x.id; + } + if return_when_finish && len <= self.config.default_batch_size { + break; + } + } + tracing::info!("read {} endpoints/branches/projects from redis", total); + Ok(()) + } +} + +#[cfg(test)] +mod tests { + use super::ControlPlaneEventKey; + + #[test] + fn test() { + let s = "{\"branch_created\":null,\"endpoint_created\":{\"endpoint_id\":\"ep-rapid-thunder-w0qqw2q9\"},\"project_created\":null,\"type\":\"endpoint_created\"}"; + let _: ControlPlaneEventKey = serde_json::from_str(s).unwrap(); + } +} diff --git a/proxy/src/cache/project_info.rs b/proxy/src/cache/project_info.rs index 5a3660520b..d8a1d261ce 100644 --- a/proxy/src/cache/project_info.rs +++ b/proxy/src/cache/project_info.rs @@ -16,7 +16,7 @@ use crate::{ config::ProjectInfoCacheOptions, console::AuthSecret, intern::{EndpointIdInt, ProjectIdInt, RoleNameInt}, - EndpointId, ProjectId, RoleName, + EndpointId, RoleName, }; use super::{Cache, Cached}; @@ -214,14 +214,11 @@ impl ProjectInfoCacheImpl { } pub fn insert_role_secret( &self, - project_id: &ProjectId, - endpoint_id: &EndpointId, - role_name: &RoleName, + project_id: ProjectIdInt, + endpoint_id: EndpointIdInt, + role_name: RoleNameInt, secret: Option, ) { - let project_id = ProjectIdInt::from(project_id); - let endpoint_id = EndpointIdInt::from(endpoint_id); - let role_name = RoleNameInt::from(role_name); if self.cache.len() >= self.config.size { // If there are too many entries, wait until the next gc cycle. return; @@ -234,12 +231,10 @@ impl ProjectInfoCacheImpl { } pub fn insert_allowed_ips( &self, - project_id: &ProjectId, - endpoint_id: &EndpointId, + project_id: ProjectIdInt, + endpoint_id: EndpointIdInt, allowed_ips: Arc>, ) { - let project_id = ProjectIdInt::from(project_id); - let endpoint_id = EndpointIdInt::from(endpoint_id); if self.cache.len() >= self.config.size { // If there are too many entries, wait until the next gc cycle. return; @@ -358,7 +353,7 @@ impl Cache for ProjectInfoCacheImpl { #[cfg(test)] mod tests { use super::*; - use crate::scram::ServerSecret; + use crate::{scram::ServerSecret, ProjectId}; #[tokio::test] async fn test_project_info_cache_settings() { @@ -369,8 +364,8 @@ mod tests { ttl: Duration::from_secs(1), gc_interval: Duration::from_secs(600), }); - let project_id = "project".into(); - let endpoint_id = "endpoint".into(); + let project_id: ProjectId = "project".into(); + let endpoint_id: EndpointId = "endpoint".into(); let user1: RoleName = "user1".into(); let user2: RoleName = "user2".into(); let secret1 = Some(AuthSecret::Scram(ServerSecret::mock([1; 32]))); @@ -379,9 +374,23 @@ mod tests { "127.0.0.1".parse().unwrap(), "127.0.0.2".parse().unwrap(), ]); - cache.insert_role_secret(&project_id, &endpoint_id, &user1, secret1.clone()); - cache.insert_role_secret(&project_id, &endpoint_id, &user2, secret2.clone()); - cache.insert_allowed_ips(&project_id, &endpoint_id, allowed_ips.clone()); + cache.insert_role_secret( + (&project_id).into(), + (&endpoint_id).into(), + (&user1).into(), + secret1.clone(), + ); + cache.insert_role_secret( + (&project_id).into(), + (&endpoint_id).into(), + (&user2).into(), + secret2.clone(), + ); + cache.insert_allowed_ips( + (&project_id).into(), + (&endpoint_id).into(), + allowed_ips.clone(), + ); let cached = cache.get_role_secret(&endpoint_id, &user1).unwrap(); assert!(cached.cached()); @@ -393,7 +402,12 @@ mod tests { // Shouldn't add more than 2 roles. let user3: RoleName = "user3".into(); let secret3 = Some(AuthSecret::Scram(ServerSecret::mock([3; 32]))); - cache.insert_role_secret(&project_id, &endpoint_id, &user3, secret3.clone()); + cache.insert_role_secret( + (&project_id).into(), + (&endpoint_id).into(), + (&user3).into(), + secret3.clone(), + ); assert!(cache.get_role_secret(&endpoint_id, &user3).is_none()); let cached = cache.get_allowed_ips(&endpoint_id).unwrap(); @@ -421,8 +435,8 @@ mod tests { cache.clone().disable_ttl(); tokio::time::advance(Duration::from_secs(2)).await; - let project_id = "project".into(); - let endpoint_id = "endpoint".into(); + let project_id: ProjectId = "project".into(); + let endpoint_id: EndpointId = "endpoint".into(); let user1: RoleName = "user1".into(); let user2: RoleName = "user2".into(); let secret1 = Some(AuthSecret::Scram(ServerSecret::mock([1; 32]))); @@ -431,9 +445,23 @@ mod tests { "127.0.0.1".parse().unwrap(), "127.0.0.2".parse().unwrap(), ]); - cache.insert_role_secret(&project_id, &endpoint_id, &user1, secret1.clone()); - cache.insert_role_secret(&project_id, &endpoint_id, &user2, secret2.clone()); - cache.insert_allowed_ips(&project_id, &endpoint_id, allowed_ips.clone()); + cache.insert_role_secret( + (&project_id).into(), + (&endpoint_id).into(), + (&user1).into(), + secret1.clone(), + ); + cache.insert_role_secret( + (&project_id).into(), + (&endpoint_id).into(), + (&user2).into(), + secret2.clone(), + ); + cache.insert_allowed_ips( + (&project_id).into(), + (&endpoint_id).into(), + allowed_ips.clone(), + ); tokio::time::advance(Duration::from_secs(2)).await; // Nothing should be invalidated. @@ -470,8 +498,8 @@ mod tests { gc_interval: Duration::from_secs(600), })); - let project_id = "project".into(); - let endpoint_id = "endpoint".into(); + let project_id: ProjectId = "project".into(); + let endpoint_id: EndpointId = "endpoint".into(); let user1: RoleName = "user1".into(); let user2: RoleName = "user2".into(); let secret1 = Some(AuthSecret::Scram(ServerSecret::mock([1; 32]))); @@ -480,10 +508,20 @@ mod tests { "127.0.0.1".parse().unwrap(), "127.0.0.2".parse().unwrap(), ]); - cache.insert_role_secret(&project_id, &endpoint_id, &user1, secret1.clone()); + cache.insert_role_secret( + (&project_id).into(), + (&endpoint_id).into(), + (&user1).into(), + secret1.clone(), + ); cache.clone().disable_ttl(); tokio::time::advance(Duration::from_millis(100)).await; - cache.insert_role_secret(&project_id, &endpoint_id, &user2, secret2.clone()); + cache.insert_role_secret( + (&project_id).into(), + (&endpoint_id).into(), + (&user2).into(), + secret2.clone(), + ); // Added before ttl was disabled + ttl should be still cached. let cached = cache.get_role_secret(&endpoint_id, &user1).unwrap(); @@ -497,7 +535,11 @@ mod tests { assert!(cache.get_role_secret(&endpoint_id, &user2).is_none()); // Added after ttl was disabled + ttl should not be cached. - cache.insert_allowed_ips(&project_id, &endpoint_id, allowed_ips.clone()); + cache.insert_allowed_ips( + (&project_id).into(), + (&endpoint_id).into(), + allowed_ips.clone(), + ); let cached = cache.get_allowed_ips(&endpoint_id).unwrap(); assert!(!cached.cached()); diff --git a/proxy/src/cancellation.rs b/proxy/src/cancellation.rs index 6151513614..34512e9f5b 100644 --- a/proxy/src/cancellation.rs +++ b/proxy/src/cancellation.rs @@ -10,7 +10,7 @@ use uuid::Uuid; use crate::{ error::ReportableError, - metrics::NUM_CANCELLATION_REQUESTS, + metrics::{CancellationRequest, CancellationSource, Metrics}, redis::cancellation_publisher::{ CancellationPublisher, CancellationPublisherMut, RedisPublisherClient, }, @@ -28,7 +28,7 @@ pub struct CancellationHandler

{ client: P, /// This field used for the monitoring purposes. /// Represents the source of the cancellation request. - from: &'static str, + from: CancellationSource, } #[derive(Debug, Error)] @@ -89,9 +89,13 @@ impl CancellationHandler

{ // NB: we should immediately release the lock after cloning the token. let Some(cancel_closure) = self.map.get(&key).and_then(|x| x.clone()) else { tracing::warn!("query cancellation key not found: {key}"); - NUM_CANCELLATION_REQUESTS - .with_label_values(&[self.from, "not_found"]) - .inc(); + Metrics::get() + .proxy + .cancellation_requests_total + .inc(CancellationRequest { + source: self.from, + kind: crate::metrics::CancellationOutcome::NotFound, + }); match self.client.try_publish(key, session_id).await { Ok(()) => {} // do nothing Err(e) => { @@ -103,9 +107,13 @@ impl CancellationHandler

{ } return Ok(()); }; - NUM_CANCELLATION_REQUESTS - .with_label_values(&[self.from, "found"]) - .inc(); + Metrics::get() + .proxy + .cancellation_requests_total + .inc(CancellationRequest { + source: self.from, + kind: crate::metrics::CancellationOutcome::Found, + }); info!("cancelling query per user's request using key {key}"); cancel_closure.try_cancel_query().await } @@ -122,7 +130,7 @@ impl CancellationHandler

{ } impl CancellationHandler<()> { - pub fn new(map: CancelMap, from: &'static str) -> Self { + pub fn new(map: CancelMap, from: CancellationSource) -> Self { Self { map, client: (), @@ -132,7 +140,7 @@ impl CancellationHandler<()> { } impl CancellationHandler>>> { - pub fn new(map: CancelMap, client: Option>>, from: &'static str) -> Self { + pub fn new(map: CancelMap, client: Option>>, from: CancellationSource) -> Self { Self { map, client, from } } } @@ -192,15 +200,13 @@ impl

Drop for Session

{ #[cfg(test)] mod tests { - use crate::metrics::NUM_CANCELLATION_REQUESTS_SOURCE_FROM_REDIS; - use super::*; #[tokio::test] async fn check_session_drop() -> anyhow::Result<()> { let cancellation_handler = Arc::new(CancellationHandler::<()>::new( CancelMap::default(), - NUM_CANCELLATION_REQUESTS_SOURCE_FROM_REDIS, + CancellationSource::FromRedis, )); let session = cancellation_handler.clone().get_session(); @@ -214,7 +220,7 @@ mod tests { #[tokio::test] async fn cancel_session_noop_regression() { - let handler = CancellationHandler::<()>::new(Default::default(), "local"); + let handler = CancellationHandler::<()>::new(Default::default(), CancellationSource::Local); handler .cancel_session( CancelKeyData { diff --git a/proxy/src/compute.rs b/proxy/src/compute.rs index 65153babcb..149a619316 100644 --- a/proxy/src/compute.rs +++ b/proxy/src/compute.rs @@ -4,12 +4,11 @@ use crate::{ console::{errors::WakeComputeError, messages::MetricsAuxInfo}, context::RequestMonitoring, error::{ReportableError, UserFacingError}, - metrics::NUM_DB_CONNECTIONS_GAUGE, + metrics::{Metrics, NumDbConnectionsGuard}, proxy::neon_option, }; use futures::{FutureExt, TryFutureExt}; use itertools::Itertools; -use metrics::IntCounterPairGuard; use pq_proto::StartupMessageParams; use std::{io, net::SocketAddr, time::Duration}; use thiserror::Error; @@ -249,7 +248,7 @@ pub struct PostgresConnection { /// Labels for proxy's metrics. pub aux: MetricsAuxInfo, - _guage: IntCounterPairGuard, + _guage: NumDbConnectionsGuard<'static>, } impl ConnCfg { @@ -276,6 +275,7 @@ impl ConnCfg { let stream = connection.stream.into_inner(); info!( + cold_start_info = ctx.cold_start_info.as_str(), "connected to compute node at {host} ({socket_addr}) sslmode={:?}", self.0.get_ssl_mode() ); @@ -294,9 +294,7 @@ impl ConnCfg { params, cancel_closure, aux, - _guage: NUM_DB_CONNECTIONS_GAUGE - .with_label_values(&[ctx.protocol]) - .guard(), + _guage: Metrics::get().proxy.db_connections.guard(ctx.protocol), }; Ok(connection) diff --git a/proxy/src/config.rs b/proxy/src/config.rs index 361c3ef519..7b4c02393b 100644 --- a/proxy/src/config.rs +++ b/proxy/src/config.rs @@ -1,10 +1,11 @@ use crate::{ - auth, - rate_limiter::{AuthRateLimiter, RateBucketInfo}, + auth::{self, backend::AuthRateLimiter}, + rate_limiter::RateBucketInfo, serverless::GlobalConnPoolOptions, }; use anyhow::{bail, ensure, Context, Ok}; use itertools::Itertools; +use remote_storage::RemoteStorageConfig; use rustls::{ crypto::ring::sign, pki_types::{CertificateDer, PrivateKeyDer}, @@ -39,6 +40,7 @@ pub struct ProxyConfig { pub struct MetricCollectionConfig { pub endpoint: reqwest::Url, pub interval: Duration, + pub backup_metric_collection_config: MetricBackupCollectionConfig, } pub struct TlsConfig { @@ -56,6 +58,7 @@ pub struct AuthenticationConfig { pub scram_protocol_timeout: tokio::time::Duration, pub rate_limiter_enabled: bool, pub rate_limiter: AuthRateLimiter, + pub rate_limit_ip_subnet: u8, } impl TlsConfig { @@ -311,6 +314,95 @@ impl CertResolver { } } +#[derive(Debug)] +pub struct EndpointCacheConfig { + /// Batch size to receive all endpoints on the startup. + pub initial_batch_size: usize, + /// Batch size to receive endpoints. + pub default_batch_size: usize, + /// Timeouts for the stream read operation. + pub xread_timeout: Duration, + /// Stream name to read from. + pub stream_name: String, + /// Limiter info (to distinguish when to enable cache). + pub limiter_info: Vec, + /// Disable cache. + /// If true, cache is ignored, but reports all statistics. + pub disable_cache: bool, + /// Retry interval for the stream read operation. + pub retry_interval: Duration, +} + +impl EndpointCacheConfig { + /// Default options for [`crate::console::provider::NodeInfoCache`]. + /// Notice that by default the limiter is empty, which means that cache is disabled. + pub const CACHE_DEFAULT_OPTIONS: &'static str = + "initial_batch_size=1000,default_batch_size=10,xread_timeout=5m,stream_name=controlPlane,disable_cache=true,limiter_info=1000@1s,retry_interval=1s"; + + /// Parse cache options passed via cmdline. + /// Example: [`Self::CACHE_DEFAULT_OPTIONS`]. + fn parse(options: &str) -> anyhow::Result { + let mut initial_batch_size = None; + let mut default_batch_size = None; + let mut xread_timeout = None; + let mut stream_name = None; + let mut limiter_info = vec![]; + let mut disable_cache = false; + let mut retry_interval = None; + + for option in options.split(',') { + let (key, value) = option + .split_once('=') + .with_context(|| format!("bad key-value pair: {option}"))?; + + match key { + "initial_batch_size" => initial_batch_size = Some(value.parse()?), + "default_batch_size" => default_batch_size = Some(value.parse()?), + "xread_timeout" => xread_timeout = Some(humantime::parse_duration(value)?), + "stream_name" => stream_name = Some(value.to_string()), + "limiter_info" => limiter_info.push(RateBucketInfo::from_str(value)?), + "disable_cache" => disable_cache = value.parse()?, + "retry_interval" => retry_interval = Some(humantime::parse_duration(value)?), + unknown => bail!("unknown key: {unknown}"), + } + } + RateBucketInfo::validate(&mut limiter_info)?; + + Ok(Self { + initial_batch_size: initial_batch_size.context("missing `initial_batch_size`")?, + default_batch_size: default_batch_size.context("missing `default_batch_size`")?, + xread_timeout: xread_timeout.context("missing `xread_timeout`")?, + stream_name: stream_name.context("missing `stream_name`")?, + disable_cache, + limiter_info, + retry_interval: retry_interval.context("missing `retry_interval`")?, + }) + } +} + +impl FromStr for EndpointCacheConfig { + type Err = anyhow::Error; + + fn from_str(options: &str) -> Result { + let error = || format!("failed to parse endpoint cache options '{options}'"); + Self::parse(options).with_context(error) + } +} +#[derive(Debug)] +pub struct MetricBackupCollectionConfig { + pub interval: Duration, + pub remote_storage_config: OptRemoteStorageConfig, + pub chunk_size: usize, +} + +/// Hack to avoid clap being smarter. If you don't use this type alias, clap assumes more about the optional state and you get +/// runtime type errors from the value parser we use. +pub type OptRemoteStorageConfig = Option; + +pub fn remote_storage_from_toml(s: &str) -> anyhow::Result { + RemoteStorageConfig::from_toml(&s.parse()?) +} + /// Helper for cmdline cache options parsing. #[derive(Debug)] pub struct CacheOptions { diff --git a/proxy/src/console/messages.rs b/proxy/src/console/messages.rs index 102076f2c6..9869b95768 100644 --- a/proxy/src/console/messages.rs +++ b/proxy/src/console/messages.rs @@ -1,9 +1,10 @@ +use measured::FixedCardinalityLabel; use serde::{Deserialize, Serialize}; use std::fmt; use crate::auth::IpPattern; -use crate::{BranchId, EndpointId, ProjectId}; +use crate::intern::{BranchIdInt, EndpointIdInt, ProjectIdInt}; /// Generic error response with human-readable description. /// Note that we can't always present it to user as is. @@ -18,7 +19,7 @@ pub struct ConsoleError { pub struct GetRoleSecret { pub role_secret: Box, pub allowed_ips: Option>, - pub project_id: Option, + pub project_id: Option, } // Manually implement debug to omit sensitive info. @@ -93,22 +94,49 @@ impl fmt::Debug for DatabaseInfo { /// Various labels for prometheus metrics. /// Also known as `ProxyMetricsAuxInfo` in the console. -#[derive(Debug, Deserialize, Clone, Default)] +#[derive(Debug, Deserialize, Clone)] pub struct MetricsAuxInfo { - pub endpoint_id: EndpointId, - pub project_id: ProjectId, - pub branch_id: BranchId, - pub cold_start_info: Option, + pub endpoint_id: EndpointIdInt, + pub project_id: ProjectIdInt, + pub branch_id: BranchIdInt, + #[serde(default)] + pub cold_start_info: ColdStartInfo, } -#[derive(Debug, Default, Serialize, Deserialize, Clone)] +#[derive(Debug, Default, Serialize, Deserialize, Clone, Copy, FixedCardinalityLabel)] #[serde(rename_all = "snake_case")] pub enum ColdStartInfo { #[default] - Unknown = 0, - Warm = 1, - PoolHit = 2, - PoolMiss = 3, + Unknown, + /// Compute was already running + Warm, + #[serde(rename = "pool_hit")] + #[label(rename = "pool_hit")] + /// Compute was not running but there was an available VM + VmPoolHit, + #[serde(rename = "pool_miss")] + #[label(rename = "pool_miss")] + /// Compute was not running and there were no VMs available + VmPoolMiss, + + // not provided by control plane + /// Connection available from HTTP pool + HttpPoolHit, + /// Cached connection info + WarmCached, +} + +impl ColdStartInfo { + pub fn as_str(&self) -> &'static str { + match self { + ColdStartInfo::Unknown => "unknown", + ColdStartInfo::Warm => "warm", + ColdStartInfo::VmPoolHit => "pool_hit", + ColdStartInfo::VmPoolMiss => "pool_miss", + ColdStartInfo::HttpPoolHit => "http_pool_hit", + ColdStartInfo::WarmCached => "warm_cached", + } + } } #[cfg(test)] diff --git a/proxy/src/console/provider.rs b/proxy/src/console/provider.rs index 69bfd6b045..3fa7221f98 100644 --- a/proxy/src/console/provider.rs +++ b/proxy/src/console/provider.rs @@ -8,11 +8,13 @@ use crate::{ backend::{ComputeCredentialKeys, ComputeUserInfo}, IpPattern, }, - cache::{project_info::ProjectInfoCacheImpl, Cached, TimedLru}, + cache::{endpoints::EndpointsCache, project_info::ProjectInfoCacheImpl, Cached, TimedLru}, compute, - config::{CacheOptions, ProjectInfoCacheOptions}, + config::{CacheOptions, EndpointCacheConfig, ProjectInfoCacheOptions}, context::RequestMonitoring, - scram, EndpointCacheKey, ProjectId, + intern::ProjectIdInt, + metrics::ApiLockMetrics, + scram, EndpointCacheKey, }; use dashmap::DashMap; use std::{sync::Arc, time::Duration}; @@ -271,7 +273,7 @@ pub struct AuthInfo { /// List of IP addresses allowed for the autorization. pub allowed_ips: Vec, /// Project ID. This is used for cache invalidation. - pub project_id: Option, + pub project_id: Option, } /// Info for establishing a connection to a compute node. @@ -415,12 +417,15 @@ pub struct ApiCaches { pub node_info: NodeInfoCache, /// Cache which stores project_id -> endpoint_ids mapping. pub project_info: Arc, + /// List of all valid endpoints. + pub endpoints_cache: Arc, } impl ApiCaches { pub fn new( wake_compute_cache_config: CacheOptions, project_info_cache_config: ProjectInfoCacheOptions, + endpoint_cache_config: EndpointCacheConfig, ) -> Self { Self { node_info: NodeInfoCache::new( @@ -430,6 +435,7 @@ impl ApiCaches { true, ), project_info: Arc::new(ProjectInfoCacheImpl::new(project_info_cache_config)), + endpoints_cache: Arc::new(EndpointsCache::new(endpoint_cache_config)), } } } @@ -440,10 +446,8 @@ pub struct ApiLocks { node_locks: DashMap>, permits: usize, timeout: Duration, - registered: prometheus::IntCounter, - unregistered: prometheus::IntCounter, - reclamation_lag: prometheus::Histogram, - lock_acquire_lag: prometheus::Histogram, + epoch: std::time::Duration, + metrics: &'static ApiLockMetrics, } impl ApiLocks { @@ -452,54 +456,16 @@ impl ApiLocks { permits: usize, shards: usize, timeout: Duration, + epoch: std::time::Duration, + metrics: &'static ApiLockMetrics, ) -> prometheus::Result { - let registered = prometheus::IntCounter::with_opts( - prometheus::Opts::new( - "semaphores_registered", - "Number of semaphores registered in this api lock", - ) - .namespace(name), - )?; - prometheus::register(Box::new(registered.clone()))?; - let unregistered = prometheus::IntCounter::with_opts( - prometheus::Opts::new( - "semaphores_unregistered", - "Number of semaphores unregistered in this api lock", - ) - .namespace(name), - )?; - prometheus::register(Box::new(unregistered.clone()))?; - let reclamation_lag = prometheus::Histogram::with_opts( - prometheus::HistogramOpts::new( - "reclamation_lag_seconds", - "Time it takes to reclaim unused semaphores in the api lock", - ) - .namespace(name) - // 1us -> 65ms - // benchmarks on my mac indicate it's usually in the range of 256us and 512us - .buckets(prometheus::exponential_buckets(1e-6, 2.0, 16)?), - )?; - prometheus::register(Box::new(reclamation_lag.clone()))?; - let lock_acquire_lag = prometheus::Histogram::with_opts( - prometheus::HistogramOpts::new( - "semaphore_acquire_seconds", - "Time it takes to reclaim unused semaphores in the api lock", - ) - .namespace(name) - // 0.1ms -> 6s - .buckets(prometheus::exponential_buckets(1e-4, 2.0, 16)?), - )?; - prometheus::register(Box::new(lock_acquire_lag.clone()))?; - Ok(Self { name, node_locks: DashMap::with_shard_amount(shards), permits, timeout, - lock_acquire_lag, - registered, - unregistered, - reclamation_lag, + epoch, + metrics, }) } @@ -519,7 +485,7 @@ impl ApiLocks { self.node_locks .entry(key.clone()) .or_insert_with(|| { - self.registered.inc(); + self.metrics.semaphores_registered.inc(); Arc::new(Semaphore::new(self.permits)) }) .clone() @@ -527,20 +493,21 @@ impl ApiLocks { }; let permit = tokio::time::timeout_at(now + self.timeout, semaphore.acquire_owned()).await; - self.lock_acquire_lag - .observe((Instant::now() - now).as_secs_f64()); + self.metrics + .semaphore_acquire_seconds + .observe(now.elapsed().as_secs_f64()); Ok(WakeComputePermit { permit: Some(permit??), }) } - pub async fn garbage_collect_worker(&self, epoch: std::time::Duration) { + pub async fn garbage_collect_worker(&self) { if self.permits == 0 { return; } - - let mut interval = tokio::time::interval(epoch / (self.node_locks.shards().len()) as u32); + let mut interval = + tokio::time::interval(self.epoch / (self.node_locks.shards().len()) as u32); loop { for (i, shard) in self.node_locks.shards().iter().enumerate() { interval.tick().await; @@ -553,13 +520,13 @@ impl ApiLocks { "performing epoch reclamation on api lock" ); let mut lock = shard.write(); - let timer = self.reclamation_lag.start_timer(); + let timer = self.metrics.reclamation_lag_seconds.start_timer(); let count = lock .extract_if(|_, semaphore| Arc::strong_count(semaphore.get_mut()) == 1) .count(); drop(lock); - self.unregistered.inc_by(count as u64); - timer.observe_duration() + self.metrics.semaphores_unregistered.inc_by(count as u64); + timer.observe(); } } } diff --git a/proxy/src/console/provider/mock.rs b/proxy/src/console/provider/mock.rs index b759c81373..cfe491f2aa 100644 --- a/proxy/src/console/provider/mock.rs +++ b/proxy/src/console/provider/mock.rs @@ -4,10 +4,16 @@ use super::{ errors::{ApiError, GetAuthInfoError, WakeComputeError}, AuthInfo, AuthSecret, CachedNodeInfo, NodeInfo, }; -use crate::console::provider::{CachedAllowedIps, CachedRoleSecret}; use crate::context::RequestMonitoring; use crate::{auth::backend::ComputeUserInfo, compute, error::io_error, scram, url::ApiUrl}; use crate::{auth::IpPattern, cache::Cached}; +use crate::{ + console::{ + messages::MetricsAuxInfo, + provider::{CachedAllowedIps, CachedRoleSecret}, + }, + BranchId, EndpointId, ProjectId, +}; use futures::TryFutureExt; use std::{str::FromStr, sync::Arc}; use thiserror::Error; @@ -114,7 +120,12 @@ impl Api { let node = NodeInfo { config, - aux: Default::default(), + aux: MetricsAuxInfo { + endpoint_id: (&EndpointId::from("endpoint")).into(), + project_id: (&ProjectId::from("project")).into(), + branch_id: (&BranchId::from("branch")).into(), + cold_start_info: crate::console::messages::ColdStartInfo::Warm, + }, allow_self_signed_compute: false, }; diff --git a/proxy/src/console/provider/neon.rs b/proxy/src/console/provider/neon.rs index 289b0c08f7..138acdf578 100644 --- a/proxy/src/console/provider/neon.rs +++ b/proxy/src/console/provider/neon.rs @@ -7,13 +7,14 @@ use super::{ NodeInfo, }; use crate::{ - auth::backend::ComputeUserInfo, compute, console::messages::ColdStartInfo, http, scram, -}; -use crate::{ - cache::Cached, - context::RequestMonitoring, - metrics::{ALLOWED_IPS_BY_CACHE_OUTCOME, ALLOWED_IPS_NUMBER}, + auth::backend::ComputeUserInfo, + compute, + console::messages::ColdStartInfo, + http, + metrics::{CacheOutcome, Metrics}, + scram, Normalize, }; +use crate::{cache::Cached, context::RequestMonitoring}; use futures::TryFutureExt; use std::sync::Arc; use tokio::time::Instant; @@ -23,7 +24,7 @@ use tracing::{error, info, info_span, warn, Instrument}; pub struct Api { endpoint: http::Endpoint, pub caches: &'static ApiCaches, - locks: &'static ApiLocks, + pub locks: &'static ApiLocks, jwt: String, } @@ -55,6 +56,15 @@ impl Api { ctx: &mut RequestMonitoring, user_info: &ComputeUserInfo, ) -> Result { + if !self + .caches + .endpoints_cache + .is_valid(ctx, &user_info.endpoint.normalize()) + .await + { + info!("endpoint is not valid, skipping the request"); + return Ok(AuthInfo::default()); + } let request_id = ctx.session_id.to_string(); let application_name = ctx.console_application_name(); async { @@ -81,7 +91,9 @@ impl Api { Ok(body) => body, // Error 404 is special: it's ok not to have a secret. Err(e) => match e.http_status_code() { - Some(http::StatusCode::NOT_FOUND) => return Ok(AuthInfo::default()), + Some(http::StatusCode::NOT_FOUND) => { + return Ok(AuthInfo::default()); + } _otherwise => return Err(e.into()), }, }; @@ -95,7 +107,10 @@ impl Api { Some(secret) }; let allowed_ips = body.allowed_ips.unwrap_or_default(); - ALLOWED_IPS_NUMBER.observe(allowed_ips.len() as f64); + Metrics::get() + .proxy + .allowed_ips_number + .observe(allowed_ips.len() as f64); Ok(AuthInfo { secret, allowed_ips, @@ -174,22 +189,27 @@ impl super::Api for Api { ctx: &mut RequestMonitoring, user_info: &ComputeUserInfo, ) -> Result { - let ep = &user_info.endpoint; + let normalized_ep = &user_info.endpoint.normalize(); let user = &user_info.user; - if let Some(role_secret) = self.caches.project_info.get_role_secret(ep, user) { + if let Some(role_secret) = self + .caches + .project_info + .get_role_secret(normalized_ep, user) + { return Ok(role_secret); } let auth_info = self.do_get_auth_info(ctx, user_info).await?; if let Some(project_id) = auth_info.project_id { + let normalized_ep_int = normalized_ep.into(); self.caches.project_info.insert_role_secret( - &project_id, - ep, - user, + project_id, + normalized_ep_int, + user.into(), auth_info.secret.clone(), ); self.caches.project_info.insert_allowed_ips( - &project_id, - ep, + project_id, + normalized_ep_int, Arc::new(auth_info.allowed_ips), ); ctx.set_project_id(project_id); @@ -203,29 +223,34 @@ impl super::Api for Api { ctx: &mut RequestMonitoring, user_info: &ComputeUserInfo, ) -> Result<(CachedAllowedIps, Option), GetAuthInfoError> { - let ep = &user_info.endpoint; - if let Some(allowed_ips) = self.caches.project_info.get_allowed_ips(ep) { - ALLOWED_IPS_BY_CACHE_OUTCOME - .with_label_values(&["hit"]) - .inc(); + let normalized_ep = &user_info.endpoint.normalize(); + if let Some(allowed_ips) = self.caches.project_info.get_allowed_ips(normalized_ep) { + Metrics::get() + .proxy + .allowed_ips_cache_misses + .inc(CacheOutcome::Hit); return Ok((allowed_ips, None)); } - ALLOWED_IPS_BY_CACHE_OUTCOME - .with_label_values(&["miss"]) - .inc(); + Metrics::get() + .proxy + .allowed_ips_cache_misses + .inc(CacheOutcome::Miss); let auth_info = self.do_get_auth_info(ctx, user_info).await?; let allowed_ips = Arc::new(auth_info.allowed_ips); let user = &user_info.user; if let Some(project_id) = auth_info.project_id { + let normalized_ep_int = normalized_ep.into(); self.caches.project_info.insert_role_secret( - &project_id, - ep, - user, + project_id, + normalized_ep_int, + user.into(), auth_info.secret.clone(), ); - self.caches - .project_info - .insert_allowed_ips(&project_id, ep, allowed_ips.clone()); + self.caches.project_info.insert_allowed_ips( + project_id, + normalized_ep_int, + allowed_ips.clone(), + ); ctx.set_project_id(project_id); } Ok(( @@ -248,8 +273,7 @@ impl super::Api for Api { // which means that we might cache it to reduce the load and latency. if let Some(cached) = self.caches.node_info.get(&key) { info!(key = &*key, "found cached compute node info"); - info!("cold_start_info=warm"); - ctx.set_cold_start_info(ColdStartInfo::Warm); + ctx.set_project(cached.aux.clone()); return Ok(cached); } @@ -260,17 +284,21 @@ impl super::Api for Api { if permit.should_check_cache() { if let Some(cached) = self.caches.node_info.get(&key) { info!(key = &*key, "found cached compute node info"); - info!("cold_start_info=warm"); - ctx.set_cold_start_info(ColdStartInfo::Warm); + ctx.set_project(cached.aux.clone()); return Ok(cached); } } - let node = self.do_wake_compute(ctx, user_info).await?; + let mut node = self.do_wake_compute(ctx, user_info).await?; ctx.set_project(node.aux.clone()); - let cold_start_info = node.aux.cold_start_info.clone().unwrap_or_default(); - info!(?cold_start_info, "woken up a compute node"); - let (_, cached) = self.caches.node_info.insert(key.clone(), node); + let cold_start_info = node.aux.cold_start_info; + info!("woken up a compute node"); + + // store the cached node as 'warm' + node.aux.cold_start_info = ColdStartInfo::WarmCached; + let (_, mut cached) = self.caches.node_info.insert(key.clone(), node); + cached.aux.cold_start_info = cold_start_info; + info!(key = &*key, "created a cache entry for compute node info"); Ok(cached) diff --git a/proxy/src/context.rs b/proxy/src/context.rs index 7ca830cdb4..8cd3024fcf 100644 --- a/proxy/src/context.rs +++ b/proxy/src/context.rs @@ -5,14 +5,15 @@ use once_cell::sync::OnceCell; use smol_str::SmolStr; use std::net::IpAddr; use tokio::sync::mpsc; -use tracing::{field::display, info_span, Span}; +use tracing::{field::display, info, info_span, Span}; use uuid::Uuid; use crate::{ console::messages::{ColdStartInfo, MetricsAuxInfo}, error::ErrorKind, - metrics::{LatencyTimer, ENDPOINT_ERRORS_BY_KIND, ERROR_BY_KIND}, - BranchId, DbName, EndpointId, ProjectId, RoleName, + intern::{BranchIdInt, ProjectIdInt}, + metrics::{ConnectOutcome, InvalidEndpointsGroup, LatencyTimer, Metrics, Protocol}, + DbName, EndpointId, RoleName, }; use self::parquet::RequestData; @@ -28,14 +29,14 @@ static LOG_CHAN: OnceCell> = OnceCell::ne pub struct RequestMonitoring { pub peer_addr: IpAddr, pub session_id: Uuid, - pub protocol: &'static str, + pub protocol: Protocol, first_packet: chrono::DateTime, region: &'static str, pub span: Span, // filled in as they are discovered - project: Option, - branch: Option, + project: Option, + branch: Option, endpoint_id: Option, dbname: Option, user: Option, @@ -43,12 +44,14 @@ pub struct RequestMonitoring { error_kind: Option, pub(crate) auth_method: Option, success: bool, - cold_start_info: Option, + pub(crate) cold_start_info: ColdStartInfo, // extra // This sender is here to keep the request monitoring channel open while requests are taking place. sender: Option>, pub latency_timer: LatencyTimer, + // Whether proxy decided that it's not a valid endpoint end rejected it before going to cplane. + rejected: bool, } #[derive(Clone, Debug)] @@ -64,7 +67,7 @@ impl RequestMonitoring { pub fn new( session_id: Uuid, peer_addr: IpAddr, - protocol: &'static str, + protocol: Protocol, region: &'static str, ) -> Self { let span = info_span!( @@ -73,6 +76,7 @@ impl RequestMonitoring { ?session_id, %peer_addr, ep = tracing::field::Empty, + role = tracing::field::Empty, ); Self { @@ -92,7 +96,8 @@ impl RequestMonitoring { error_kind: None, auth_method: None, success: false, - cold_start_info: None, + rejected: false, + cold_start_info: ColdStartInfo::Unknown, sender: LOG_CHAN.get().and_then(|tx| tx.upgrade()), latency_timer: LatencyTimer::new(protocol), @@ -101,7 +106,7 @@ impl RequestMonitoring { #[cfg(test)] pub fn test() -> Self { - RequestMonitoring::new(Uuid::now_v7(), [127, 0, 0, 1].into(), "test", "test") + RequestMonitoring::new(Uuid::now_v7(), [127, 0, 0, 1].into(), Protocol::Tcp, "test") } pub fn console_application_name(&self) -> String { @@ -112,27 +117,36 @@ impl RequestMonitoring { ) } + pub fn set_rejected(&mut self, rejected: bool) { + self.rejected = rejected; + } + pub fn set_cold_start_info(&mut self, info: ColdStartInfo) { - self.cold_start_info = Some(info); + self.cold_start_info = info; + self.latency_timer.cold_start_info(info); } pub fn set_project(&mut self, x: MetricsAuxInfo) { - self.set_endpoint_id(x.endpoint_id); + if self.endpoint_id.is_none() { + self.set_endpoint_id(x.endpoint_id.as_str().into()) + } self.branch = Some(x.branch_id); self.project = Some(x.project_id); - self.cold_start_info = x.cold_start_info; + self.set_cold_start_info(x.cold_start_info); } - pub fn set_project_id(&mut self, project_id: ProjectId) { + pub fn set_project_id(&mut self, project_id: ProjectIdInt) { self.project = Some(project_id); } pub fn set_endpoint_id(&mut self, endpoint_id: EndpointId) { - self.span.record("ep", display(&endpoint_id)); - crate::metrics::CONNECTING_ENDPOINTS - .with_label_values(&[self.protocol]) - .measure(&endpoint_id); - self.endpoint_id = Some(endpoint_id); + if self.endpoint_id.is_none() { + self.span.record("ep", display(&endpoint_id)); + let metric = &Metrics::get().proxy.connecting_endpoints; + let label = metric.with_labels(self.protocol); + metric.get_metric(label).measure(&endpoint_id); + self.endpoint_id = Some(endpoint_id); + } } pub fn set_application(&mut self, app: Option) { @@ -144,6 +158,7 @@ impl RequestMonitoring { } pub fn set_user(&mut self, user: RoleName) { + self.span.record("role", display(&user)); self.user = Some(user); } @@ -151,14 +166,22 @@ impl RequestMonitoring { self.auth_method = Some(auth_method); } + pub fn has_private_peer_addr(&self) -> bool { + match self.peer_addr { + IpAddr::V4(ip) => ip.is_private(), + _ => false, + } + } + pub fn set_error_kind(&mut self, kind: ErrorKind) { - ERROR_BY_KIND - .with_label_values(&[kind.to_metric_label()]) - .inc(); + // Do not record errors from the private address to metrics. + if !self.has_private_peer_addr() { + Metrics::get().proxy.errors_total.inc(kind); + } if let Some(ep) = &self.endpoint_id { - ENDPOINT_ERRORS_BY_KIND - .with_label_values(&[kind.to_metric_label()]) - .measure(ep); + let metric = &Metrics::get().proxy.endpoints_affected_by_errors; + let label = metric.with_labels(kind); + metric.get_metric(label).measure(ep); } self.error_kind = Some(kind); } @@ -172,6 +195,32 @@ impl RequestMonitoring { impl Drop for RequestMonitoring { fn drop(&mut self) { + let outcome = if self.success { + ConnectOutcome::Success + } else { + ConnectOutcome::Failed + }; + let rejected = self.rejected; + let ep = self + .endpoint_id + .as_ref() + .map(|x| x.as_str()) + .unwrap_or_default(); + // This makes sense only if cache is disabled + info!( + ?ep, + ?outcome, + ?rejected, + "check endpoint is valid with outcome" + ); + Metrics::get() + .proxy + .invalid_endpoints_total + .inc(InvalidEndpointsGroup { + protocol: self.protocol, + rejected: rejected.into(), + outcome, + }); if let Some(tx) = self.sender.take() { let _: Result<(), _> = tx.send(RequestData::from(&*self)); } diff --git a/proxy/src/context/parquet.rs b/proxy/src/context/parquet.rs index a2be1c4186..e061216d15 100644 --- a/proxy/src/context/parquet.rs +++ b/proxy/src/context/parquet.rs @@ -13,12 +13,14 @@ use parquet::{ }, record::RecordWriter, }; -use remote_storage::{GenericRemoteStorage, RemotePath, RemoteStorageConfig, TimeoutOrCancel}; +use remote_storage::{GenericRemoteStorage, RemotePath, TimeoutOrCancel}; use tokio::{sync::mpsc, time}; use tokio_util::sync::CancellationToken; use tracing::{debug, info, Span}; use utils::backoff; +use crate::config::{remote_storage_from_toml, OptRemoteStorageConfig}; + use super::{RequestMonitoring, LOG_CHAN}; #[derive(clap::Args, Clone, Debug)] @@ -50,21 +52,13 @@ pub struct ParquetUploadArgs { parquet_upload_compression: Compression, } -/// Hack to avoid clap being smarter. If you don't use this type alias, clap assumes more about the optional state and you get -/// runtime type errors from the value parser we use. -type OptRemoteStorageConfig = Option; - -fn remote_storage_from_toml(s: &str) -> anyhow::Result { - RemoteStorageConfig::from_toml(&s.parse()?) -} - // Occasional network issues and such can cause remote operations to fail, and // that's expected. If a upload fails, we log it at info-level, and retry. // But after FAILED_UPLOAD_WARN_THRESHOLD retries, we start to log it at WARN // level instead, as repeated failures can mean a more serious problem. If it // fails more than FAILED_UPLOAD_RETRIES times, we give up -pub(crate) const FAILED_UPLOAD_WARN_THRESHOLD: u32 = 3; -pub(crate) const FAILED_UPLOAD_MAX_RETRIES: u32 = 10; +pub const FAILED_UPLOAD_WARN_THRESHOLD: u32 = 3; +pub const FAILED_UPLOAD_MAX_RETRIES: u32 = 10; // the parquet crate leaves a lot to be desired... // what follows is an attempt to write parquet files with minimal allocs. @@ -93,7 +87,7 @@ pub struct RequestData { /// Or if we make it to proxy_pass success: bool, /// Indicates if the cplane started the new compute node for this request. - cold_start_info: Option<&'static str>, + cold_start_info: &'static str, /// Tracks time from session start (HTTP request/libpq TCP handshake) /// Through to success/failure duration_us: u64, @@ -117,16 +111,11 @@ impl From<&RequestMonitoring> for RequestData { super::AuthMethod::ScramSha256Plus => "scram_sha_256_plus", super::AuthMethod::Cleartext => "cleartext", }), - protocol: value.protocol, + protocol: value.protocol.as_str(), region: value.region, error: value.error_kind.as_ref().map(|e| e.to_metric_label()), success: value.success, - cold_start_info: value.cold_start_info.as_ref().map(|x| match x { - crate::console::messages::ColdStartInfo::Unknown => "unknown", - crate::console::messages::ColdStartInfo::Warm => "warm", - crate::console::messages::ColdStartInfo::PoolHit => "pool_hit", - crate::console::messages::ColdStartInfo::PoolMiss => "pool_miss", - }), + cold_start_info: value.cold_start_info.as_str(), duration_us: SystemTime::from(value.first_packet) .elapsed() .unwrap_or_default() @@ -460,7 +449,7 @@ mod tests { region: "us-east-1", error: None, success: rng.gen(), - cold_start_info: Some("no"), + cold_start_info: "no", duration_us: rng.gen_range(0..30_000_000), } } @@ -530,15 +519,15 @@ mod tests { assert_eq!( file_stats, [ - (1314406, 3, 6000), - (1314399, 3, 6000), - (1314459, 3, 6000), - (1314416, 3, 6000), - (1314546, 3, 6000), - (1314388, 3, 6000), - (1314180, 3, 6000), - (1314416, 3, 6000), - (438359, 1, 2000) + (1314385, 3, 6000), + (1314378, 3, 6000), + (1314438, 3, 6000), + (1314395, 3, 6000), + (1314525, 3, 6000), + (1314367, 3, 6000), + (1314159, 3, 6000), + (1314395, 3, 6000), + (438352, 1, 2000) ] ); @@ -568,11 +557,11 @@ mod tests { assert_eq!( file_stats, [ - (1220668, 5, 10000), - (1226818, 5, 10000), - (1228612, 5, 10000), - (1227974, 5, 10000), - (1219252, 5, 10000) + (1220633, 5, 10000), + (1226783, 5, 10000), + (1228577, 5, 10000), + (1227939, 5, 10000), + (1219217, 5, 10000) ] ); @@ -604,11 +593,11 @@ mod tests { assert_eq!( file_stats, [ - (1206315, 5, 10000), - (1206046, 5, 10000), - (1206339, 5, 10000), - (1206327, 5, 10000), - (1206582, 5, 10000) + (1206280, 5, 10000), + (1206011, 5, 10000), + (1206304, 5, 10000), + (1206292, 5, 10000), + (1206547, 5, 10000) ] ); @@ -633,15 +622,15 @@ mod tests { assert_eq!( file_stats, [ - (1314406, 3, 6000), - (1314399, 3, 6000), - (1314459, 3, 6000), - (1314416, 3, 6000), - (1314546, 3, 6000), - (1314388, 3, 6000), - (1314180, 3, 6000), - (1314416, 3, 6000), - (438359, 1, 2000) + (1314385, 3, 6000), + (1314378, 3, 6000), + (1314438, 3, 6000), + (1314395, 3, 6000), + (1314525, 3, 6000), + (1314367, 3, 6000), + (1314159, 3, 6000), + (1314395, 3, 6000), + (438352, 1, 2000) ] ); @@ -678,7 +667,7 @@ mod tests { // files are smaller than the size threshold, but they took too long to fill so were flushed early assert_eq!( file_stats, - [(658837, 2, 3001), (658551, 2, 3000), (658347, 2, 2999)] + [(658823, 2, 3001), (658537, 2, 3000), (658333, 2, 2999)] ); tmpdir.close().unwrap(); diff --git a/proxy/src/error.rs b/proxy/src/error.rs index 4614f3913d..fdfe50a494 100644 --- a/proxy/src/error.rs +++ b/proxy/src/error.rs @@ -1,5 +1,7 @@ use std::{error::Error as StdError, fmt, io}; +use measured::FixedCardinalityLabel; + /// Upcast (almost) any error into an opaque [`io::Error`]. pub fn io_error(e: impl Into>) -> io::Error { io::Error::new(io::ErrorKind::Other, e) @@ -29,24 +31,29 @@ pub trait UserFacingError: ReportableError { } } -#[derive(Copy, Clone, Debug, Eq, PartialEq)] +#[derive(Copy, Clone, Debug, Eq, PartialEq, FixedCardinalityLabel)] +#[label(singleton = "type")] pub enum ErrorKind { /// Wrong password, unknown endpoint, protocol violation, etc... User, /// Network error between user and proxy. Not necessarily user error + #[label(rename = "clientdisconnect")] ClientDisconnect, /// Proxy self-imposed user rate limits + #[label(rename = "ratelimit")] RateLimit, /// Proxy self-imposed service-wise rate limits + #[label(rename = "serviceratelimit")] ServiceRateLimit, /// internal errors Service, /// Error communicating with control plane + #[label(rename = "controlplane")] ControlPlane, /// Postgres error diff --git a/proxy/src/http.rs b/proxy/src/http.rs index 59e1492ed4..e20488e23c 100644 --- a/proxy/src/http.rs +++ b/proxy/src/http.rs @@ -13,13 +13,16 @@ pub use reqwest_retry::{policies::ExponentialBackoff, RetryTransientMiddleware}; use tokio::time::Instant; use tracing::trace; -use crate::{metrics::CONSOLE_REQUEST_LATENCY, rate_limiter, url::ApiUrl}; +use crate::{ + metrics::{ConsoleRequest, Metrics}, + url::ApiUrl, +}; use reqwest_middleware::RequestBuilder; /// This is the preferred way to create new http clients, /// because it takes care of observability (OpenTelemetry). /// We deliberately don't want to replace this with a public static. -pub fn new_client(rate_limiter_config: rate_limiter::RateLimiterConfig) -> ClientWithMiddleware { +pub fn new_client() -> ClientWithMiddleware { let client = reqwest::ClientBuilder::new() .dns_resolver(Arc::new(GaiResolver::default())) .connection_verbose(true) @@ -28,7 +31,6 @@ pub fn new_client(rate_limiter_config: rate_limiter::RateLimiterConfig) -> Clien reqwest_middleware::ClientBuilder::new(client) .with(reqwest_tracing::TracingMiddleware::default()) - .with(rate_limiter::Limiter::new(rate_limiter_config)) .build() } @@ -90,13 +92,14 @@ impl Endpoint { /// Execute a [request](reqwest::Request). pub async fn execute(&self, request: Request) -> Result { - let path = request.url().path().to_string(); - let start = Instant::now(); - let res = self.client.execute(request).await; - CONSOLE_REQUEST_LATENCY - .with_label_values(&[&path]) - .observe(start.elapsed().as_secs_f64()); - res + let _timer = Metrics::get() + .proxy + .console_request_latency + .start_timer(ConsoleRequest { + request: request.url().path(), + }); + + self.client.execute(request).await } } diff --git a/proxy/src/http/health_server.rs b/proxy/src/http/health_server.rs index cbb17ebcb7..cae9eb5b97 100644 --- a/proxy/src/http/health_server.rs +++ b/proxy/src/http/health_server.rs @@ -1,30 +1,49 @@ use anyhow::{anyhow, bail}; -use hyper::{Body, Request, Response, StatusCode}; -use std::{convert::Infallible, net::TcpListener}; -use tracing::info; +use hyper::{header::CONTENT_TYPE, Body, Request, Response, StatusCode}; +use measured::{text::BufferedTextEncoder, MetricGroup}; +use metrics::NeonMetrics; +use std::{ + convert::Infallible, + net::TcpListener, + sync::{Arc, Mutex}, +}; +use tracing::{info, info_span}; use utils::http::{ - endpoint::{self, prometheus_metrics_handler, request_span}, + endpoint::{self, request_span}, error::ApiError, json::json_response, RouterBuilder, RouterService, }; +use crate::jemalloc; + async fn status_handler(_: Request) -> Result, ApiError> { json_response(StatusCode::OK, "") } -fn make_router() -> RouterBuilder { +fn make_router(metrics: AppMetrics) -> RouterBuilder { + let state = Arc::new(Mutex::new(PrometheusHandler { + encoder: BufferedTextEncoder::new(), + metrics, + })); + endpoint::make_router() - .get("/metrics", |r| request_span(r, prometheus_metrics_handler)) + .get("/metrics", move |r| { + let state = state.clone(); + request_span(r, move |b| prometheus_metrics_handler(b, state)) + }) .get("/v1/status", status_handler) } -pub async fn task_main(http_listener: TcpListener) -> anyhow::Result { +pub async fn task_main( + http_listener: TcpListener, + metrics: AppMetrics, +) -> anyhow::Result { scopeguard::defer! { info!("http has shut down"); } - let service = || RouterService::new(make_router().build()?); + let service = || RouterService::new(make_router(metrics).build()?); hyper::Server::from_tcp(http_listener)? .serve(service().map_err(|e| anyhow!(e))?) @@ -32,3 +51,57 @@ pub async fn task_main(http_listener: TcpListener) -> anyhow::Result bail!("hyper server without shutdown handling cannot shutdown successfully"); } + +struct PrometheusHandler { + encoder: BufferedTextEncoder, + metrics: AppMetrics, +} + +#[derive(MetricGroup)] +pub struct AppMetrics { + #[metric(namespace = "jemalloc")] + pub jemalloc: Option, + #[metric(flatten)] + pub neon_metrics: NeonMetrics, + #[metric(flatten)] + pub proxy: &'static crate::metrics::Metrics, +} + +async fn prometheus_metrics_handler( + _req: Request, + state: Arc>, +) -> Result, ApiError> { + let started_at = std::time::Instant::now(); + + let span = info_span!("blocking"); + let body = tokio::task::spawn_blocking(move || { + let _span = span.entered(); + + let mut state = state.lock().unwrap(); + let PrometheusHandler { encoder, metrics } = &mut *state; + + metrics + .collect_group_into(&mut *encoder) + .unwrap_or_else(|infallible| match infallible {}); + + let body = encoder.finish(); + + tracing::info!( + bytes = body.len(), + elapsed_ms = started_at.elapsed().as_millis(), + "responded /metrics" + ); + + body + }) + .await + .unwrap(); + + let response = Response::builder() + .status(200) + .header(CONTENT_TYPE, "text/plain; version=0.0.4") + .body(Body::from(body)) + .unwrap(); + + Ok(response) +} diff --git a/proxy/src/intern.rs b/proxy/src/intern.rs index a6519bdff9..e38135dd22 100644 --- a/proxy/src/intern.rs +++ b/proxy/src/intern.rs @@ -160,6 +160,11 @@ impl From<&EndpointId> for EndpointIdInt { EndpointIdTag::get_interner().get_or_intern(value) } } +impl From for EndpointIdInt { + fn from(value: EndpointId) -> Self { + EndpointIdTag::get_interner().get_or_intern(&value) + } +} #[derive(Copy, Clone, Debug, PartialEq, Eq, Hash)] pub struct BranchIdTag; @@ -175,6 +180,11 @@ impl From<&BranchId> for BranchIdInt { BranchIdTag::get_interner().get_or_intern(value) } } +impl From for BranchIdInt { + fn from(value: BranchId) -> Self { + BranchIdTag::get_interner().get_or_intern(&value) + } +} #[derive(Copy, Clone, Debug, PartialEq, Eq, Hash)] pub struct ProjectIdTag; @@ -190,6 +200,11 @@ impl From<&ProjectId> for ProjectIdInt { ProjectIdTag::get_interner().get_or_intern(value) } } +impl From for ProjectIdInt { + fn from(value: ProjectId) -> Self { + ProjectIdTag::get_interner().get_or_intern(&value) + } +} #[cfg(test)] mod tests { diff --git a/proxy/src/jemalloc.rs b/proxy/src/jemalloc.rs index ed20798d56..3243e6a140 100644 --- a/proxy/src/jemalloc.rs +++ b/proxy/src/jemalloc.rs @@ -1,27 +1,45 @@ -use std::time::Duration; +use std::marker::PhantomData; -use metrics::IntGauge; -use prometheus::{register_int_gauge_with_registry, Registry}; +use measured::{ + label::NoLabels, + metric::{ + gauge::GaugeState, group::Encoding, group::MetricValue, name::MetricNameEncoder, + MetricEncoding, MetricFamilyEncoding, MetricType, + }, + text::TextEncoder, + LabelGroup, MetricGroup, +}; use tikv_jemalloc_ctl::{config, epoch, epoch_mib, stats, version}; pub struct MetricRecorder { epoch: epoch_mib, - active: stats::active_mib, - active_gauge: IntGauge, - allocated: stats::allocated_mib, - allocated_gauge: IntGauge, - mapped: stats::mapped_mib, - mapped_gauge: IntGauge, - metadata: stats::metadata_mib, - metadata_gauge: IntGauge, - resident: stats::resident_mib, - resident_gauge: IntGauge, - retained: stats::retained_mib, - retained_gauge: IntGauge, + inner: Metrics, +} + +#[derive(MetricGroup)] +struct Metrics { + active_bytes: JemallocGaugeFamily, + allocated_bytes: JemallocGaugeFamily, + mapped_bytes: JemallocGaugeFamily, + metadata_bytes: JemallocGaugeFamily, + resident_bytes: JemallocGaugeFamily, + retained_bytes: JemallocGaugeFamily, +} + +impl MetricGroup for MetricRecorder +where + Metrics: MetricGroup, +{ + fn collect_group_into(&self, enc: &mut Enc) -> Result<(), Enc::Err> { + if self.epoch.advance().is_ok() { + self.inner.collect_group_into(enc)?; + } + Ok(()) + } } impl MetricRecorder { - pub fn new(registry: &Registry) -> Result { + pub fn new() -> Result { tracing::info!( config = config::malloc_conf::read()?, version = version::read()?, @@ -30,71 +48,69 @@ impl MetricRecorder { Ok(Self { epoch: epoch::mib()?, - active: stats::active::mib()?, - active_gauge: register_int_gauge_with_registry!( - "jemalloc_active_bytes", - "Total number of bytes in active pages allocated by the process", - registry - )?, - allocated: stats::allocated::mib()?, - allocated_gauge: register_int_gauge_with_registry!( - "jemalloc_allocated_bytes", - "Total number of bytes allocated by the process", - registry - )?, - mapped: stats::mapped::mib()?, - mapped_gauge: register_int_gauge_with_registry!( - "jemalloc_mapped_bytes", - "Total number of bytes in active extents mapped by the allocator", - registry - )?, - metadata: stats::metadata::mib()?, - metadata_gauge: register_int_gauge_with_registry!( - "jemalloc_metadata_bytes", - "Total number of bytes dedicated to jemalloc metadata", - registry - )?, - resident: stats::resident::mib()?, - resident_gauge: register_int_gauge_with_registry!( - "jemalloc_resident_bytes", - "Total number of bytes in physically resident data pages mapped by the allocator", - registry - )?, - retained: stats::retained::mib()?, - retained_gauge: register_int_gauge_with_registry!( - "jemalloc_retained_bytes", - "Total number of bytes in virtual memory mappings that were retained rather than being returned to the operating system", - registry - )?, - }) - } - - fn _poll(&self) -> Result<(), anyhow::Error> { - self.epoch.advance()?; - self.active_gauge.set(self.active.read()? as i64); - self.allocated_gauge.set(self.allocated.read()? as i64); - self.mapped_gauge.set(self.mapped.read()? as i64); - self.metadata_gauge.set(self.metadata.read()? as i64); - self.resident_gauge.set(self.resident.read()? as i64); - self.retained_gauge.set(self.retained.read()? as i64); - Ok(()) - } - - #[inline] - pub fn poll(&self) { - if let Err(error) = self._poll() { - tracing::warn!(%error, "Failed to poll jemalloc stats"); - } - } - - pub fn start(self) -> tokio::task::JoinHandle<()> { - tokio::task::spawn(async move { - let mut interval = tokio::time::interval(Duration::from_secs(15)); - interval.set_missed_tick_behavior(tokio::time::MissedTickBehavior::Skip); - loop { - self.poll(); - interval.tick().await; - } + inner: Metrics { + active_bytes: JemallocGaugeFamily(stats::active::mib()?), + allocated_bytes: JemallocGaugeFamily(stats::allocated::mib()?), + mapped_bytes: JemallocGaugeFamily(stats::mapped::mib()?), + metadata_bytes: JemallocGaugeFamily(stats::metadata::mib()?), + resident_bytes: JemallocGaugeFamily(stats::resident::mib()?), + retained_bytes: JemallocGaugeFamily(stats::retained::mib()?), + }, }) } } + +struct JemallocGauge(PhantomData); + +impl Default for JemallocGauge { + fn default() -> Self { + JemallocGauge(PhantomData) + } +} +impl MetricType for JemallocGauge { + type Metadata = T; +} + +struct JemallocGaugeFamily(T); +impl MetricFamilyEncoding for JemallocGaugeFamily +where + JemallocGauge: MetricEncoding, +{ + fn collect_family_into(&self, name: impl MetricNameEncoder, enc: &mut T) -> Result<(), T::Err> { + JemallocGauge::write_type(&name, enc)?; + JemallocGauge(PhantomData).collect_into(&self.0, NoLabels, name, enc) + } +} + +macro_rules! jemalloc_gauge { + ($stat:ident, $mib:ident) => { + impl MetricEncoding> for JemallocGauge { + fn write_type( + name: impl MetricNameEncoder, + enc: &mut TextEncoder, + ) -> Result<(), std::io::Error> { + GaugeState::write_type(name, enc) + } + + fn collect_into( + &self, + mib: &stats::$mib, + labels: impl LabelGroup, + name: impl MetricNameEncoder, + enc: &mut TextEncoder, + ) -> Result<(), std::io::Error> { + if let Ok(v) = mib.read() { + enc.write_metric_value(name, labels, MetricValue::Int(v as i64))?; + } + Ok(()) + } + } + }; +} + +jemalloc_gauge!(active, active_mib); +jemalloc_gauge!(allocated, allocated_mib); +jemalloc_gauge!(mapped, mapped_mib); +jemalloc_gauge!(metadata, metadata_mib); +jemalloc_gauge!(resident, resident_mib); +jemalloc_gauge!(retained, retained_mib); diff --git a/proxy/src/lib.rs b/proxy/src/lib.rs index da7c7f3ed2..3f6d985fe8 100644 --- a/proxy/src/lib.rs +++ b/proxy/src/lib.rs @@ -127,6 +127,24 @@ macro_rules! smol_str_wrapper { }; } +const POOLER_SUFFIX: &str = "-pooler"; + +pub trait Normalize { + fn normalize(&self) -> Self; +} + +impl + From> Normalize for S { + fn normalize(&self) -> Self { + if self.as_ref().ends_with(POOLER_SUFFIX) { + let mut s = self.as_ref().to_string(); + s.truncate(s.len() - POOLER_SUFFIX.len()); + s.into() + } else { + self.clone() + } + } +} + // 90% of role name strings are 20 characters or less. smol_str_wrapper!(RoleName); // 50% of endpoint strings are 23 characters or less. @@ -140,3 +158,22 @@ smol_str_wrapper!(ProjectId); smol_str_wrapper!(EndpointCacheKey); smol_str_wrapper!(DbName); + +// Endpoints are a bit tricky. Rare they might be branches or projects. +impl EndpointId { + pub fn is_endpoint(&self) -> bool { + self.0.starts_with("ep-") + } + pub fn is_branch(&self) -> bool { + self.0.starts_with("br-") + } + pub fn is_project(&self) -> bool { + !self.is_endpoint() && !self.is_branch() + } + pub fn as_branch(&self) -> BranchId { + BranchId(self.0.clone()) + } + pub fn as_project(&self) -> ProjectId { + ProjectId(self.0.clone()) + } +} diff --git a/proxy/src/metrics.rs b/proxy/src/metrics.rs index 4172dc19da..3a4e54aea0 100644 --- a/proxy/src/metrics.rs +++ b/proxy/src/metrics.rs @@ -1,171 +1,348 @@ -use ::metrics::{ - exponential_buckets, register_histogram, register_histogram_vec, register_hll_vec, - register_int_counter_pair_vec, register_int_counter_vec, register_int_gauge, - register_int_gauge_vec, Histogram, HistogramVec, HyperLogLogVec, IntCounterPairVec, - IntCounterVec, IntGauge, IntGaugeVec, -}; -use metrics::{ - register_hll, register_int_counter, register_int_counter_pair, HyperLogLog, IntCounter, - IntCounterPair, -}; +use std::sync::OnceLock; + +use lasso::ThreadedRodeo; +use measured::{ + label::StaticLabelSet, + metric::{histogram::Thresholds, name::MetricName}, + Counter, CounterVec, FixedCardinalityLabel, Gauge, Histogram, HistogramVec, LabelGroup, + MetricGroup, +}; +use metrics::{CounterPairAssoc, CounterPairVec, HyperLogLog, HyperLogLogVec}; -use once_cell::sync::Lazy; use tokio::time::{self, Instant}; -pub static NUM_DB_CONNECTIONS_GAUGE: Lazy = Lazy::new(|| { - register_int_counter_pair_vec!( - "proxy_opened_db_connections_total", - "Number of opened connections to a database.", - "proxy_closed_db_connections_total", - "Number of closed connections to a database.", - &["protocol"], - ) - .unwrap() -}); +use crate::console::messages::ColdStartInfo; -pub static NUM_CLIENT_CONNECTION_GAUGE: Lazy = Lazy::new(|| { - register_int_counter_pair_vec!( - "proxy_opened_client_connections_total", - "Number of opened connections from a client.", - "proxy_closed_client_connections_total", - "Number of closed connections from a client.", - &["protocol"], - ) - .unwrap() -}); +#[derive(MetricGroup)] +pub struct Metrics { + #[metric(namespace = "proxy")] + pub proxy: ProxyMetrics, -pub static NUM_CONNECTION_REQUESTS_GAUGE: Lazy = Lazy::new(|| { - register_int_counter_pair_vec!( - "proxy_accepted_connections_total", - "Number of client connections accepted.", - "proxy_closed_connections_total", - "Number of client connections closed.", - &["protocol"], - ) - .unwrap() -}); + #[metric(namespace = "wake_compute_lock")] + pub wake_compute_lock: ApiLockMetrics, +} -pub static COMPUTE_CONNECTION_LATENCY: Lazy = Lazy::new(|| { - register_histogram_vec!( - "proxy_compute_connection_latency_seconds", - "Time it took for proxy to establish a connection to the compute endpoint", - // http/ws/tcp, true/false, true/false, success/failure, client/client_and_cplane - // 3 * 2 * 2 * 2 * 2 = 48 counters - &["protocol", "cache_miss", "pool_miss", "outcome", "excluded"], - // largest bucket = 2^16 * 0.5ms = 32s - exponential_buckets(0.0005, 2.0, 16).unwrap(), - ) - .unwrap() -}); +impl Metrics { + pub fn get() -> &'static Self { + static SELF: OnceLock = OnceLock::new(); + SELF.get_or_init(|| Metrics { + proxy: ProxyMetrics::default(), + wake_compute_lock: ApiLockMetrics::new(), + }) + } +} -pub static CONSOLE_REQUEST_LATENCY: Lazy = Lazy::new(|| { - register_histogram_vec!( - "proxy_console_request_latency", - "Time it took for proxy to establish a connection to the compute endpoint", - // proxy_wake_compute/proxy_get_role_info - &["request"], +#[derive(MetricGroup)] +#[metric(new())] +pub struct ProxyMetrics { + #[metric(flatten)] + pub db_connections: CounterPairVec, + #[metric(flatten)] + pub client_connections: CounterPairVec, + #[metric(flatten)] + pub connection_requests: CounterPairVec, + #[metric(flatten)] + pub http_endpoint_pools: HttpEndpointPools, + + /// Time it took for proxy to establish a connection to the compute endpoint. + // largest bucket = 2^16 * 0.5ms = 32s + #[metric(metadata = Thresholds::exponential_buckets(0.0005, 2.0))] + pub compute_connection_latency_seconds: HistogramVec, + + /// Time it took for proxy to receive a response from control plane. + #[metric( // largest bucket = 2^16 * 0.2ms = 13s - exponential_buckets(0.0002, 2.0, 16).unwrap(), - ) - .unwrap() -}); + metadata = Thresholds::exponential_buckets(0.0002, 2.0), + )] + pub console_request_latency: HistogramVec, -pub static ALLOWED_IPS_BY_CACHE_OUTCOME: Lazy = Lazy::new(|| { - register_int_counter_vec!( - "proxy_allowed_ips_cache_misses", - "Number of cache hits/misses for allowed ips", - // hit/miss - &["outcome"], - ) - .unwrap() -}); + /// Time it takes to acquire a token to call console plane. + // largest bucket = 3^16 * 0.05ms = 2.15s + #[metric(metadata = Thresholds::exponential_buckets(0.00005, 3.0))] + pub control_plane_token_acquire_seconds: Histogram<16>, -pub static RATE_LIMITER_ACQUIRE_LATENCY: Lazy = Lazy::new(|| { - register_histogram!( - "proxy_control_plane_token_acquire_seconds", - "Time it took for proxy to establish a connection to the compute endpoint", - // largest bucket = 3^16 * 0.05ms = 2.15s - exponential_buckets(0.00005, 3.0, 16).unwrap(), - ) - .unwrap() -}); + /// Size of the HTTP request body lengths. + // smallest bucket = 16 bytes + // largest bucket = 4^12 * 16 bytes = 256MB + #[metric(metadata = Thresholds::exponential_buckets(16.0, 4.0))] + pub http_conn_content_length_bytes: HistogramVec, 12>, -pub static RATE_LIMITER_LIMIT: Lazy = Lazy::new(|| { - register_int_gauge_vec!( - "semaphore_control_plane_limit", - "Current limit of the semaphore control plane", - &["limit"], // 2 counters - ) - .unwrap() -}); + /// Time it takes to reclaim unused connection pools. + #[metric(metadata = Thresholds::exponential_buckets(1e-6, 2.0))] + pub http_pool_reclaimation_lag_seconds: Histogram<16>, -pub static NUM_CONNECTION_ACCEPTED_BY_SNI: Lazy = Lazy::new(|| { - register_int_counter_vec!( - "proxy_accepted_connections_by_sni", - "Number of connections (per sni).", - &["kind"], - ) - .unwrap() -}); + /// Number of opened connections to a database. + pub http_pool_opened_connections: Gauge, -pub static ALLOWED_IPS_NUMBER: Lazy = Lazy::new(|| { - register_histogram!( - "proxy_allowed_ips_number", - "Number of allowed ips", - vec![0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 10.0, 20.0, 50.0, 100.0], - ) - .unwrap() -}); + /// Number of cache hits/misses for allowed ips. + pub allowed_ips_cache_misses: CounterVec>, -pub static HTTP_CONTENT_LENGTH: Lazy = Lazy::new(|| { - register_histogram!( - "proxy_http_conn_content_length_bytes", - "Time it took for proxy to establish a connection to the compute endpoint", - // largest bucket = 3^16 * 0.05ms = 2.15s - exponential_buckets(8.0, 2.0, 20).unwrap() - ) - .unwrap() -}); + /// Number of allowed ips + #[metric(metadata = Thresholds::with_buckets([0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 10.0, 20.0, 50.0, 100.0]))] + pub allowed_ips_number: Histogram<10>, -pub static GC_LATENCY: Lazy = Lazy::new(|| { - register_histogram!( - "proxy_http_pool_reclaimation_lag_seconds", - "Time it takes to reclaim unused connection pools", - // 1us -> 65ms - exponential_buckets(1e-6, 2.0, 16).unwrap(), - ) - .unwrap() -}); + /// Number of connections (per sni). + pub accepted_connections_by_sni: CounterVec>, -pub static ENDPOINT_POOLS: Lazy = Lazy::new(|| { - register_int_counter_pair!( - "proxy_http_pool_endpoints_registered_total", - "Number of endpoints we have registered pools for", - "proxy_http_pool_endpoints_unregistered_total", - "Number of endpoints we have unregistered pools for", - ) - .unwrap() -}); + /// Number of connection failures (per kind). + pub connection_failures_total: CounterVec>, -pub static NUM_OPEN_CLIENTS_IN_HTTP_POOL: Lazy = Lazy::new(|| { - register_int_gauge!( - "proxy_http_pool_opened_connections", - "Number of opened connections to a database.", - ) - .unwrap() -}); + /// Number of wake-up failures (per kind). + pub connection_failures_breakdown: CounterVec, -pub static NUM_CANCELLATION_REQUESTS: Lazy = Lazy::new(|| { - register_int_counter_vec!( - "proxy_cancellation_requests_total", - "Number of cancellation requests (per found/not_found).", - &["source", "kind"], - ) - .unwrap() -}); + /// Number of bytes sent/received between all clients and backends. + pub io_bytes: CounterVec>, -pub const NUM_CANCELLATION_REQUESTS_SOURCE_FROM_CLIENT: &str = "from_client"; -pub const NUM_CANCELLATION_REQUESTS_SOURCE_FROM_REDIS: &str = "from_redis"; + /// Number of errors by a given classification. + pub errors_total: CounterVec>, + + /// Number of cancellation requests (per found/not_found). + pub cancellation_requests_total: CounterVec, + + /// Number of errors by a given classification + pub redis_errors_total: CounterVec, + + /// Number of TLS handshake failures + pub tls_handshake_failures: Counter, + + /// Number of connection requests affected by authentication rate limits + pub requests_auth_rate_limits_total: Counter, + + /// HLL approximate cardinality of endpoints that are connecting + pub connecting_endpoints: HyperLogLogVec, 32>, + + /// Number of endpoints affected by errors of a given classification + pub endpoints_affected_by_errors: HyperLogLogVec, 32>, + + /// Number of endpoints affected by authentication rate limits + pub endpoints_auth_rate_limits: HyperLogLog<32>, + + /// Number of invalid endpoints (per protocol, per rejected). + pub invalid_endpoints_total: CounterVec, +} + +#[derive(MetricGroup)] +#[metric(new())] +pub struct ApiLockMetrics { + /// Number of semaphores registered in this api lock + pub semaphores_registered: Counter, + /// Number of semaphores unregistered in this api lock + pub semaphores_unregistered: Counter, + /// Time it takes to reclaim unused semaphores in the api lock + #[metric(metadata = Thresholds::exponential_buckets(1e-6, 2.0))] + pub reclamation_lag_seconds: Histogram<16>, + /// Time it takes to acquire a semaphore lock + #[metric(metadata = Thresholds::exponential_buckets(1e-4, 2.0))] + pub semaphore_acquire_seconds: Histogram<16>, +} + +impl Default for ProxyMetrics { + fn default() -> Self { + Self::new() + } +} + +#[derive(FixedCardinalityLabel, Copy, Clone)] +#[label(singleton = "direction")] +pub enum HttpDirection { + Request, + Response, +} + +#[derive(FixedCardinalityLabel, Copy, Clone)] +#[label(singleton = "direction")] +pub enum Direction { + Tx, + Rx, +} + +#[derive(FixedCardinalityLabel, Clone, Copy, Debug)] +#[label(singleton = "protocol")] +pub enum Protocol { + Http, + Ws, + Tcp, + SniRouter, +} + +impl Protocol { + pub fn as_str(&self) -> &'static str { + match self { + Protocol::Http => "http", + Protocol::Ws => "ws", + Protocol::Tcp => "tcp", + Protocol::SniRouter => "sni_router", + } + } +} + +impl std::fmt::Display for Protocol { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.write_str(self.as_str()) + } +} + +#[derive(FixedCardinalityLabel, Copy, Clone)] +pub enum Bool { + True, + False, +} + +#[derive(FixedCardinalityLabel, Copy, Clone)] +#[label(singleton = "outcome")] +pub enum Outcome { + Success, + Failed, +} + +#[derive(FixedCardinalityLabel, Copy, Clone)] +#[label(singleton = "outcome")] +pub enum CacheOutcome { + Hit, + Miss, +} + +#[derive(LabelGroup)] +#[label(set = ConsoleRequestSet)] +pub struct ConsoleRequest<'a> { + #[label(dynamic_with = ThreadedRodeo, default)] + pub request: &'a str, +} + +#[derive(MetricGroup, Default)] +pub struct HttpEndpointPools { + /// Number of endpoints we have registered pools for + pub http_pool_endpoints_registered_total: Counter, + /// Number of endpoints we have unregistered pools for + pub http_pool_endpoints_unregistered_total: Counter, +} + +pub struct HttpEndpointPoolsGuard<'a> { + dec: &'a Counter, +} + +impl Drop for HttpEndpointPoolsGuard<'_> { + fn drop(&mut self) { + self.dec.inc(); + } +} + +impl HttpEndpointPools { + pub fn guard(&self) -> HttpEndpointPoolsGuard { + self.http_pool_endpoints_registered_total.inc(); + HttpEndpointPoolsGuard { + dec: &self.http_pool_endpoints_unregistered_total, + } + } +} +pub struct NumDbConnectionsGauge; +impl CounterPairAssoc for NumDbConnectionsGauge { + const INC_NAME: &'static MetricName = MetricName::from_str("opened_db_connections_total"); + const DEC_NAME: &'static MetricName = MetricName::from_str("closed_db_connections_total"); + const INC_HELP: &'static str = "Number of opened connections to a database."; + const DEC_HELP: &'static str = "Number of closed connections to a database."; + type LabelGroupSet = StaticLabelSet; +} +pub type NumDbConnectionsGuard<'a> = metrics::MeasuredCounterPairGuard<'a, NumDbConnectionsGauge>; + +pub struct NumClientConnectionsGauge; +impl CounterPairAssoc for NumClientConnectionsGauge { + const INC_NAME: &'static MetricName = MetricName::from_str("opened_client_connections_total"); + const DEC_NAME: &'static MetricName = MetricName::from_str("closed_client_connections_total"); + const INC_HELP: &'static str = "Number of opened connections from a client."; + const DEC_HELP: &'static str = "Number of closed connections from a client."; + type LabelGroupSet = StaticLabelSet; +} +pub type NumClientConnectionsGuard<'a> = + metrics::MeasuredCounterPairGuard<'a, NumClientConnectionsGauge>; + +pub struct NumConnectionRequestsGauge; +impl CounterPairAssoc for NumConnectionRequestsGauge { + const INC_NAME: &'static MetricName = MetricName::from_str("accepted_connections_total"); + const DEC_NAME: &'static MetricName = MetricName::from_str("closed_connections_total"); + const INC_HELP: &'static str = "Number of client connections accepted."; + const DEC_HELP: &'static str = "Number of client connections closed."; + type LabelGroupSet = StaticLabelSet; +} +pub type NumConnectionRequestsGuard<'a> = + metrics::MeasuredCounterPairGuard<'a, NumConnectionRequestsGauge>; + +#[derive(LabelGroup)] +#[label(set = ComputeConnectionLatencySet)] +pub struct ComputeConnectionLatencyGroup { + protocol: Protocol, + cold_start_info: ColdStartInfo, + outcome: ConnectOutcome, + excluded: LatencyExclusions, +} + +#[derive(FixedCardinalityLabel, Copy, Clone)] +pub enum LatencyExclusions { + Client, + ClientAndCplane, +} + +#[derive(FixedCardinalityLabel, Copy, Clone)] +#[label(singleton = "kind")] +pub enum SniKind { + Sni, + NoSni, + PasswordHack, +} + +#[derive(FixedCardinalityLabel, Copy, Clone)] +#[label(singleton = "kind")] +pub enum ConnectionFailureKind { + ComputeCached, + ComputeUncached, +} + +#[derive(FixedCardinalityLabel, Copy, Clone)] +#[label(singleton = "kind")] +pub enum WakeupFailureKind { + BadComputeAddress, + ApiTransportError, + QuotaExceeded, + ApiConsoleLocked, + ApiConsoleBadRequest, + ApiConsoleOtherServerError, + ApiConsoleOtherError, + TimeoutError, +} + +#[derive(LabelGroup)] +#[label(set = ConnectionFailuresBreakdownSet)] +pub struct ConnectionFailuresBreakdownGroup { + pub kind: WakeupFailureKind, + pub retry: Bool, +} + +#[derive(LabelGroup, Copy, Clone)] +#[label(set = RedisErrorsSet)] +pub struct RedisErrors<'a> { + #[label(dynamic_with = ThreadedRodeo, default)] + pub channel: &'a str, +} + +#[derive(FixedCardinalityLabel, Copy, Clone)] +pub enum CancellationSource { + FromClient, + FromRedis, + Local, +} + +#[derive(FixedCardinalityLabel, Copy, Clone)] +pub enum CancellationOutcome { + NotFound, + Found, +} + +#[derive(LabelGroup)] +#[label(set = CancellationRequestSet)] +pub struct CancellationRequest { + pub source: CancellationSource, + pub kind: CancellationOutcome, +} pub enum Waiting { Cplane, @@ -188,10 +365,9 @@ pub struct LatencyTimer { // accumulated time on the stopwatch accumulated: Accumulated, // label data - protocol: &'static str, - cache_miss: bool, - pool_miss: bool, - outcome: &'static str, + protocol: Protocol, + cold_start_info: ColdStartInfo, + outcome: ConnectOutcome, } pub struct LatencyTimerPause<'a> { @@ -201,17 +377,15 @@ pub struct LatencyTimerPause<'a> { } impl LatencyTimer { - pub fn new(protocol: &'static str) -> Self { + pub fn new(protocol: Protocol) -> Self { Self { start: time::Instant::now(), stop: None, accumulated: Accumulated::default(), protocol, - cache_miss: false, - // by default we don't do pooling - pool_miss: true, + cold_start_info: ColdStartInfo::Unknown, // assume failed unless otherwise specified - outcome: "failed", + outcome: ConnectOutcome::Failed, } } @@ -223,12 +397,8 @@ impl LatencyTimer { } } - pub fn cache_miss(&mut self) { - self.cache_miss = true; - } - - pub fn pool_hit(&mut self) { - self.pool_miss = false; + pub fn cold_start_info(&mut self, cold_start_info: ColdStartInfo) { + self.cold_start_info = cold_start_info; } pub fn success(&mut self) { @@ -236,7 +406,7 @@ impl LatencyTimer { self.stop = Some(time::Instant::now()); // success - self.outcome = "success"; + self.outcome = ConnectOutcome::Success; } } @@ -251,130 +421,62 @@ impl Drop for LatencyTimerPause<'_> { } } +#[derive(FixedCardinalityLabel, Clone, Copy, Debug)] +pub enum ConnectOutcome { + Success, + Failed, +} + impl Drop for LatencyTimer { fn drop(&mut self) { let duration = self .stop .unwrap_or_else(time::Instant::now) .duration_since(self.start); - // Excluding cplane communication from the accumulated time. - COMPUTE_CONNECTION_LATENCY - .with_label_values(&[ - self.protocol, - bool_to_str(self.cache_miss), - bool_to_str(self.pool_miss), - self.outcome, - "client", - ]) - .observe((duration.saturating_sub(self.accumulated.client)).as_secs_f64()); + + let metric = &Metrics::get().proxy.compute_connection_latency_seconds; + + // Excluding client communication from the accumulated time. + metric.observe( + ComputeConnectionLatencyGroup { + protocol: self.protocol, + cold_start_info: self.cold_start_info, + outcome: self.outcome, + excluded: LatencyExclusions::Client, + }, + duration + .saturating_sub(self.accumulated.client) + .as_secs_f64(), + ); + // Exclude client and cplane communication from the accumulated time. let accumulated_total = self.accumulated.client + self.accumulated.cplane; - COMPUTE_CONNECTION_LATENCY - .with_label_values(&[ - self.protocol, - bool_to_str(self.cache_miss), - bool_to_str(self.pool_miss), - self.outcome, - "client_and_cplane", - ]) - .observe((duration.saturating_sub(accumulated_total)).as_secs_f64()); + metric.observe( + ComputeConnectionLatencyGroup { + protocol: self.protocol, + cold_start_info: self.cold_start_info, + outcome: self.outcome, + excluded: LatencyExclusions::ClientAndCplane, + }, + duration.saturating_sub(accumulated_total).as_secs_f64(), + ); } } -pub static NUM_CONNECTION_FAILURES: Lazy = Lazy::new(|| { - register_int_counter_vec!( - "proxy_connection_failures_total", - "Number of connection failures (per kind).", - &["kind"], - ) - .unwrap() -}); - -pub static NUM_WAKEUP_FAILURES: Lazy = Lazy::new(|| { - register_int_counter_vec!( - "proxy_connection_failures_breakdown", - "Number of wake-up failures (per kind).", - &["retry", "kind"], - ) - .unwrap() -}); - -pub static NUM_BYTES_PROXIED_COUNTER: Lazy = Lazy::new(|| { - register_int_counter_vec!( - "proxy_io_bytes", - "Number of bytes sent/received between all clients and backends.", - &["direction"], - ) - .unwrap() -}); - -pub const fn bool_to_str(x: bool) -> &'static str { - if x { - "true" - } else { - "false" +impl From for Bool { + fn from(value: bool) -> Self { + if value { + Bool::True + } else { + Bool::False + } } } -pub static CONNECTING_ENDPOINTS: Lazy> = Lazy::new(|| { - register_hll_vec!( - 32, - "proxy_connecting_endpoints", - "HLL approximate cardinality of endpoints that are connecting", - &["protocol"], - ) - .unwrap() -}); - -pub static ERROR_BY_KIND: Lazy = Lazy::new(|| { - register_int_counter_vec!( - "proxy_errors_total", - "Number of errors by a given classification", - &["type"], - ) - .unwrap() -}); - -pub static ENDPOINT_ERRORS_BY_KIND: Lazy> = Lazy::new(|| { - register_hll_vec!( - 32, - "proxy_endpoints_affected_by_errors", - "Number of endpoints affected by errors of a given classification", - &["type"], - ) - .unwrap() -}); - -pub static REDIS_BROKEN_MESSAGES: Lazy = Lazy::new(|| { - register_int_counter_vec!( - "proxy_redis_errors_total", - "Number of errors by a given classification", - &["channel"], - ) - .unwrap() -}); - -pub static TLS_HANDSHAKE_FAILURES: Lazy = Lazy::new(|| { - register_int_counter!( - "proxy_tls_handshake_failures", - "Number of TLS handshake failures", - ) - .unwrap() -}); - -pub static ENDPOINTS_AUTH_RATE_LIMITED: Lazy> = Lazy::new(|| { - register_hll!( - 32, - "proxy_endpoints_auth_rate_limits", - "Number of endpoints affected by authentication rate limits", - ) - .unwrap() -}); - -pub static AUTH_RATE_LIMIT_HITS: Lazy = Lazy::new(|| { - register_int_counter!( - "proxy_requests_auth_rate_limits_total", - "Number of connection requests affected by authentication rate limits", - ) - .unwrap() -}); +#[derive(LabelGroup)] +#[label(set = InvalidEndpointsSet)] +pub struct InvalidEndpointsGroup { + pub protocol: Protocol, + pub rejected: Bool, + pub outcome: ConnectOutcome, +} diff --git a/proxy/src/protocol2.rs b/proxy/src/protocol2.rs index 700c8c8681..70f9b4bfab 100644 --- a/proxy/src/protocol2.rs +++ b/proxy/src/protocol2.rs @@ -5,19 +5,13 @@ use std::{ io, net::SocketAddr, pin::{pin, Pin}, - sync::Mutex, task::{ready, Context, Poll}, }; use bytes::{Buf, BytesMut}; -use hyper::server::accept::Accept; -use hyper::server::conn::{AddrIncoming, AddrStream}; -use metrics::IntCounterPairGuard; +use hyper::server::conn::AddrIncoming; use pin_project_lite::pin_project; use tokio::io::{AsyncRead, AsyncReadExt, AsyncWrite, ReadBuf}; -use uuid::Uuid; - -use crate::metrics::NUM_CLIENT_CONNECTION_GAUGE; pub struct ProxyProtocolAccept { pub incoming: AddrIncoming, @@ -331,103 +325,6 @@ impl AsyncRead for WithClientIp { } } -impl Accept for ProxyProtocolAccept { - type Conn = WithConnectionGuard>; - - type Error = io::Error; - - fn poll_accept( - mut self: Pin<&mut Self>, - cx: &mut Context<'_>, - ) -> Poll>> { - let conn = ready!(Pin::new(&mut self.incoming).poll_accept(cx)?); - - let conn_id = uuid::Uuid::new_v4(); - let span = tracing::info_span!("http_conn", ?conn_id); - { - let _enter = span.enter(); - tracing::info!("accepted new TCP connection"); - } - - let Some(conn) = conn else { - return Poll::Ready(None); - }; - - Poll::Ready(Some(Ok(WithConnectionGuard { - inner: WithClientIp::new(conn), - connection_id: Uuid::new_v4(), - gauge: Mutex::new(Some( - NUM_CLIENT_CONNECTION_GAUGE - .with_label_values(&[self.protocol]) - .guard(), - )), - span, - }))) - } -} - -pin_project! { - pub struct WithConnectionGuard { - #[pin] - pub inner: T, - pub connection_id: Uuid, - pub gauge: Mutex>, - pub span: tracing::Span, - } - - impl PinnedDrop for WithConnectionGuard { - fn drop(this: Pin<&mut Self>) { - let _enter = this.span.enter(); - tracing::info!("HTTP connection closed") - } - } -} - -impl AsyncWrite for WithConnectionGuard { - #[inline] - fn poll_write( - self: Pin<&mut Self>, - cx: &mut Context<'_>, - buf: &[u8], - ) -> Poll> { - self.project().inner.poll_write(cx, buf) - } - - #[inline] - fn poll_flush(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll> { - self.project().inner.poll_flush(cx) - } - - #[inline] - fn poll_shutdown(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll> { - self.project().inner.poll_shutdown(cx) - } - - #[inline] - fn poll_write_vectored( - self: Pin<&mut Self>, - cx: &mut Context<'_>, - bufs: &[io::IoSlice<'_>], - ) -> Poll> { - self.project().inner.poll_write_vectored(cx, bufs) - } - - #[inline] - fn is_write_vectored(&self) -> bool { - self.inner.is_write_vectored() - } -} - -impl AsyncRead for WithConnectionGuard { - fn poll_read( - self: Pin<&mut Self>, - cx: &mut Context<'_>, - buf: &mut ReadBuf<'_>, - ) -> Poll> { - self.project().inner.poll_read(cx, buf) - } -} - #[cfg(test)] mod tests { use std::pin::pin; diff --git a/proxy/src/proxy.rs b/proxy/src/proxy.rs index 6051c0a812..f80ced91c8 100644 --- a/proxy/src/proxy.rs +++ b/proxy/src/proxy.rs @@ -7,6 +7,7 @@ pub mod handshake; pub mod passthrough; pub mod retry; pub mod wake_compute; +pub use copy_bidirectional::copy_bidirectional_client_compute; use crate::{ auth, @@ -15,16 +16,15 @@ use crate::{ config::{ProxyConfig, TlsConfig}, context::RequestMonitoring, error::ReportableError, - metrics::{NUM_CLIENT_CONNECTION_GAUGE, NUM_CONNECTION_REQUESTS_GAUGE}, + metrics::{Metrics, NumClientConnectionsGuard}, protocol2::WithClientIp, proxy::handshake::{handshake, HandshakeData}, rate_limiter::EndpointRateLimiter, stream::{PqStream, Stream}, - EndpointCacheKey, + EndpointCacheKey, Normalize, }; use futures::TryFutureExt; use itertools::Itertools; -use metrics::IntCounterPairGuard; use once_cell::sync::OnceCell; use pq_proto::{BeMessage as Be, StartupMessageParams}; use regex::Regex; @@ -79,9 +79,10 @@ pub async fn task_main( { let (socket, peer_addr) = accept_result?; - let conn_gauge = NUM_CLIENT_CONNECTION_GAUGE - .with_label_values(&["tcp"]) - .guard(); + let conn_gauge = Metrics::get() + .proxy + .client_connections + .guard(crate::metrics::Protocol::Tcp); let session_id = uuid::Uuid::new_v4(); let cancellation_handler = Arc::clone(&cancellation_handler); @@ -113,7 +114,12 @@ pub async fn task_main( }, }; - let mut ctx = RequestMonitoring::new(session_id, peer_addr, "tcp", &config.region); + let mut ctx = RequestMonitoring::new( + session_id, + peer_addr, + crate::metrics::Protocol::Tcp, + &config.region, + ); let span = ctx.span.clone(); let res = handle_client( @@ -237,19 +243,23 @@ pub async fn handle_client( stream: S, mode: ClientMode, endpoint_rate_limiter: Arc, - conn_gauge: IntCounterPairGuard, + conn_gauge: NumClientConnectionsGuard<'static>, ) -> Result>, ClientRequestError> { - info!("handling interactive connection from client"); + info!( + protocol = %ctx.protocol, + "handling interactive connection from client" + ); + let metrics = &Metrics::get().proxy; let proto = ctx.protocol; - let _request_gauge = NUM_CONNECTION_REQUESTS_GAUGE - .with_label_values(&[proto]) - .guard(); + // let _client_gauge = metrics.client_connections.guard(proto); + let _request_gauge = metrics.connection_requests.guard(proto); let tls = config.tls_config.as_ref(); + let record_handshake_error = !ctx.has_private_peer_addr(); let pause = ctx.latency_timer.pause(crate::metrics::Waiting::Client); - let do_handshake = handshake(stream, mode.handshake_tls(tls)); + let do_handshake = handshake(stream, mode.handshake_tls(tls), record_handshake_error); let (mut stream, params) = match tokio::time::timeout(config.handshake_timeout, do_handshake).await?? { HandshakeData::Startup(stream, params) => (stream, params), @@ -280,7 +290,7 @@ pub async fn handle_client( // check rate limit if let Some(ep) = user_info.get_endpoint() { - if !endpoint_rate_limiter.check(ep, 1) { + if !endpoint_rate_limiter.check(ep.normalize(), 1) { return stream .throw_error(auth::AuthError::too_many_connections()) .await?; diff --git a/proxy/src/proxy/connect_compute.rs b/proxy/src/proxy/connect_compute.rs index c76e2ff6d9..33f394c550 100644 --- a/proxy/src/proxy/connect_compute.rs +++ b/proxy/src/proxy/connect_compute.rs @@ -4,7 +4,7 @@ use crate::{ console::{self, errors::WakeComputeError, CachedNodeInfo, NodeInfo}, context::RequestMonitoring, error::ReportableError, - metrics::NUM_CONNECTION_FAILURES, + metrics::{ConnectionFailureKind, Metrics}, proxy::{ retry::{retry_after, ShouldRetry}, wake_compute::wake_compute, @@ -27,10 +27,10 @@ pub fn invalidate_cache(node_info: console::CachedNodeInfo) -> NodeInfo { warn!("invalidating stalled compute node info cache entry"); } let label = match is_cached { - true => "compute_cached", - false => "compute_uncached", + true => ConnectionFailureKind::ComputeCached, + false => ConnectionFailureKind::ComputeUncached, }; - NUM_CONNECTION_FAILURES.with_label_values(&[label]).inc(); + Metrics::get().proxy.connection_failures_total.inc(label); node_info.invalidate() } @@ -87,7 +87,6 @@ impl ConnectMechanism for TcpMechanism<'_> { } /// Try to connect to the compute node, retrying if necessary. -/// This function might update `node_info`, so we take it by `&mut`. #[tracing::instrument(skip_all)] pub async fn connect_to_compute( ctx: &mut RequestMonitoring, @@ -132,7 +131,6 @@ where } else { // if we failed to connect, it's likely that the compute node was suspended, wake a new compute node info!("compute node's state has likely changed; requesting a wake-up"); - ctx.latency_timer.cache_miss(); let old_node_info = invalidate_cache(node_info); let mut node_info = wake_compute(&mut num_retries, ctx, user_info).await?; node_info.reuse_settings(old_node_info); diff --git a/proxy/src/proxy/copy_bidirectional.rs b/proxy/src/proxy/copy_bidirectional.rs index 684be74f9a..4b09ebd8dc 100644 --- a/proxy/src/proxy/copy_bidirectional.rs +++ b/proxy/src/proxy/copy_bidirectional.rs @@ -41,7 +41,7 @@ where } #[tracing::instrument(skip_all)] -pub(super) async fn copy_bidirectional_client_compute( +pub async fn copy_bidirectional_client_compute( client: &mut Client, compute: &mut Compute, ) -> Result<(u64, u64), std::io::Error> diff --git a/proxy/src/proxy/handshake.rs b/proxy/src/proxy/handshake.rs index 4665e07d23..dd935cc245 100644 --- a/proxy/src/proxy/handshake.rs +++ b/proxy/src/proxy/handshake.rs @@ -63,6 +63,7 @@ pub enum HandshakeData { pub async fn handshake( stream: S, mut tls: Option<&TlsConfig>, + record_handshake_error: bool, ) -> Result, HandshakeError> { // Client may try upgrading to each protocol only once let (mut tried_ssl, mut tried_gss) = (false, false); @@ -95,7 +96,9 @@ pub async fn handshake( if !read_buf.is_empty() { return Err(HandshakeError::EarlyData); } - let tls_stream = raw.upgrade(tls.to_server_config()).await?; + let tls_stream = raw + .upgrade(tls.to_server_config(), record_handshake_error) + .await?; let (_, tls_server_end_point) = tls .cert_resolver diff --git a/proxy/src/proxy/passthrough.rs b/proxy/src/proxy/passthrough.rs index f6d4314391..62de79946f 100644 --- a/proxy/src/proxy/passthrough.rs +++ b/proxy/src/proxy/passthrough.rs @@ -2,11 +2,10 @@ use crate::{ cancellation, compute::PostgresConnection, console::messages::MetricsAuxInfo, - metrics::NUM_BYTES_PROXIED_COUNTER, + metrics::{Direction, Metrics, NumClientConnectionsGuard, NumConnectionRequestsGuard}, stream::Stream, - usage_metrics::{Ids, USAGE_METRICS}, + usage_metrics::{Ids, MetricCounterRecorder, USAGE_METRICS}, }; -use metrics::IntCounterPairGuard; use tokio::io::{AsyncRead, AsyncWrite}; use tracing::info; use utils::measured_stream::MeasuredStream; @@ -19,28 +18,29 @@ pub async fn proxy_pass( aux: MetricsAuxInfo, ) -> anyhow::Result<()> { let usage = USAGE_METRICS.register(Ids { - endpoint_id: aux.endpoint_id.clone(), - branch_id: aux.branch_id.clone(), + endpoint_id: aux.endpoint_id, + branch_id: aux.branch_id, }); - let m_sent = NUM_BYTES_PROXIED_COUNTER.with_label_values(&["tx"]); + let metrics = &Metrics::get().proxy.io_bytes; + let m_sent = metrics.with_labels(Direction::Tx); let mut client = MeasuredStream::new( client, |_| {}, |cnt| { // Number of bytes we sent to the client (outbound). - m_sent.inc_by(cnt as u64); + metrics.get_metric(m_sent).inc_by(cnt as u64); usage.record_egress(cnt as u64); }, ); - let m_recv = NUM_BYTES_PROXIED_COUNTER.with_label_values(&["rx"]); + let m_recv = metrics.with_labels(Direction::Rx); let mut compute = MeasuredStream::new( compute, |_| {}, |cnt| { // Number of bytes the client sent to the compute node (inbound). - m_recv.inc_by(cnt as u64); + metrics.get_metric(m_recv).inc_by(cnt as u64); }, ); @@ -60,8 +60,8 @@ pub struct ProxyPassthrough { pub compute: PostgresConnection, pub aux: MetricsAuxInfo, - pub req: IntCounterPairGuard, - pub conn: IntCounterPairGuard, + pub req: NumConnectionRequestsGuard<'static>, + pub conn: NumClientConnectionsGuard<'static>, pub cancel: cancellation::Session

, } diff --git a/proxy/src/proxy/tests.rs b/proxy/src/proxy/tests.rs index a4051447c1..849e9bd33c 100644 --- a/proxy/src/proxy/tests.rs +++ b/proxy/src/proxy/tests.rs @@ -12,11 +12,12 @@ use crate::auth::backend::{ }; use crate::config::CertResolver; use crate::console::caches::NodeInfoCache; +use crate::console::messages::MetricsAuxInfo; use crate::console::provider::{CachedAllowedIps, CachedRoleSecret, ConsoleBackend}; use crate::console::{self, CachedNodeInfo, NodeInfo}; use crate::error::ErrorKind; use crate::proxy::retry::{retry_after, NUM_RETRIES_CONNECT}; -use crate::{http, sasl, scram}; +use crate::{http, sasl, scram, BranchId, EndpointId, ProjectId}; use anyhow::{bail, Context}; use async_trait::async_trait; use rstest::rstest; @@ -174,7 +175,7 @@ async fn dummy_proxy( auth: impl TestAuth + Send, ) -> anyhow::Result<()> { let client = WithClientIp::new(client); - let mut stream = match handshake(client, tls.as_ref()).await? { + let mut stream = match handshake(client, tls.as_ref(), false).await? { HandshakeData::Startup(stream, _) => stream, HandshakeData::Cancel(_) => bail!("cancellation not supported"), }; @@ -512,7 +513,12 @@ impl TestBackend for TestConnectMechanism { fn helper_create_cached_node_info(cache: &'static NodeInfoCache) -> CachedNodeInfo { let node = NodeInfo { config: compute::ConnCfg::new(), - aux: Default::default(), + aux: MetricsAuxInfo { + endpoint_id: (&EndpointId::from("endpoint")).into(), + project_id: (&ProjectId::from("project")).into(), + branch_id: (&BranchId::from("branch")).into(), + cold_start_info: crate::console::messages::ColdStartInfo::Warm, + }, allow_self_signed_compute: false, }; let (_, node) = cache.insert("key".into(), node); diff --git a/proxy/src/proxy/tests/mitm.rs b/proxy/src/proxy/tests/mitm.rs index 3b760e5dab..cbfc9f1358 100644 --- a/proxy/src/proxy/tests/mitm.rs +++ b/proxy/src/proxy/tests/mitm.rs @@ -34,7 +34,10 @@ async fn proxy_mitm( tokio::spawn(async move { // begin handshake with end_server let end_server = connect_tls(server2, client_config2.make_tls_connect().unwrap()).await; - let (end_client, startup) = match handshake(client1, Some(&server_config1)).await.unwrap() { + let (end_client, startup) = match handshake(client1, Some(&server_config1), false) + .await + .unwrap() + { HandshakeData::Startup(stream, params) => (stream, params), HandshakeData::Cancel(_) => panic!("cancellation not supported"), }; diff --git a/proxy/src/proxy/wake_compute.rs b/proxy/src/proxy/wake_compute.rs index bfe4b7ec3a..f8154b1a94 100644 --- a/proxy/src/proxy/wake_compute.rs +++ b/proxy/src/proxy/wake_compute.rs @@ -1,6 +1,6 @@ use crate::console::{errors::WakeComputeError, provider::CachedNodeInfo}; use crate::context::RequestMonitoring; -use crate::metrics::{bool_to_str, NUM_WAKEUP_FAILURES}; +use crate::metrics::{ConnectionFailuresBreakdownGroup, Metrics, WakeupFailureKind}; use crate::proxy::retry::retry_after; use hyper::StatusCode; use std::ops::ControlFlow; @@ -57,39 +57,46 @@ pub fn handle_try_wake( fn report_error(e: &WakeComputeError, retry: bool) { use crate::console::errors::ApiError; - let retry = bool_to_str(retry); let kind = match e { - WakeComputeError::BadComputeAddress(_) => "bad_compute_address", - WakeComputeError::ApiError(ApiError::Transport(_)) => "api_transport_error", + WakeComputeError::BadComputeAddress(_) => WakeupFailureKind::BadComputeAddress, + WakeComputeError::ApiError(ApiError::Transport(_)) => WakeupFailureKind::ApiTransportError, WakeComputeError::ApiError(ApiError::Console { status: StatusCode::LOCKED, ref text, }) if text.contains("written data quota exceeded") || text.contains("the limit for current plan reached") => { - "quota_exceeded" + WakeupFailureKind::QuotaExceeded } WakeComputeError::ApiError(ApiError::Console { status: StatusCode::UNPROCESSABLE_ENTITY, ref text, }) if text.contains("compute time quota of non-primary branches is exceeded") => { - "quota_exceeded" + WakeupFailureKind::QuotaExceeded } WakeComputeError::ApiError(ApiError::Console { status: StatusCode::LOCKED, .. - }) => "api_console_locked", + }) => WakeupFailureKind::ApiConsoleLocked, WakeComputeError::ApiError(ApiError::Console { status: StatusCode::BAD_REQUEST, .. - }) => "api_console_bad_request", + }) => WakeupFailureKind::ApiConsoleBadRequest, WakeComputeError::ApiError(ApiError::Console { status, .. }) if status.is_server_error() => { - "api_console_other_server_error" + WakeupFailureKind::ApiConsoleOtherServerError } - WakeComputeError::ApiError(ApiError::Console { .. }) => "api_console_other_error", - WakeComputeError::TimeoutError => "timeout_error", + WakeComputeError::ApiError(ApiError::Console { .. }) => { + WakeupFailureKind::ApiConsoleOtherError + } + WakeComputeError::TimeoutError => WakeupFailureKind::TimeoutError, }; - NUM_WAKEUP_FAILURES.with_label_values(&[retry, kind]).inc(); + Metrics::get() + .proxy + .connection_failures_breakdown + .inc(ConnectionFailuresBreakdownGroup { + kind, + retry: retry.into(), + }); } diff --git a/proxy/src/rate_limiter.rs b/proxy/src/rate_limiter.rs index 13dffffca0..c542267547 100644 --- a/proxy/src/rate_limiter.rs +++ b/proxy/src/rate_limiter.rs @@ -1,7 +1,2 @@ -mod aimd; -mod limit_algorithm; mod limiter; -pub use aimd::Aimd; -pub use limit_algorithm::{AimdConfig, Fixed, RateLimitAlgorithm, RateLimiterConfig}; -pub use limiter::Limiter; -pub use limiter::{AuthRateLimiter, EndpointRateLimiter, RateBucketInfo, RedisRateLimiter}; +pub use limiter::{BucketRateLimiter, EndpointRateLimiter, GlobalRateLimiter, RateBucketInfo}; diff --git a/proxy/src/rate_limiter/aimd.rs b/proxy/src/rate_limiter/aimd.rs deleted file mode 100644 index 2c14a54a6c..0000000000 --- a/proxy/src/rate_limiter/aimd.rs +++ /dev/null @@ -1,166 +0,0 @@ -use std::usize; - -use async_trait::async_trait; - -use super::limit_algorithm::{AimdConfig, LimitAlgorithm, Sample}; - -use super::limiter::Outcome; - -/// Loss-based congestion avoidance. -/// -/// Additive-increase, multiplicative decrease. -/// -/// Adds available currency when: -/// 1. no load-based errors are observed, and -/// 2. the utilisation of the current limit is high. -/// -/// Reduces available concurrency by a factor when load-based errors are detected. -pub struct Aimd { - min_limit: usize, - max_limit: usize, - decrease_factor: f32, - increase_by: usize, - min_utilisation_threshold: f32, -} - -impl Aimd { - pub fn new(config: AimdConfig) -> Self { - Self { - min_limit: config.aimd_min_limit, - max_limit: config.aimd_max_limit, - decrease_factor: config.aimd_decrease_factor, - increase_by: config.aimd_increase_by, - min_utilisation_threshold: config.aimd_min_utilisation_threshold, - } - } -} - -#[async_trait] -impl LimitAlgorithm for Aimd { - async fn update(&mut self, old_limit: usize, sample: Sample) -> usize { - use Outcome::*; - match sample.outcome { - Success => { - let utilisation = sample.in_flight as f32 / old_limit as f32; - - if utilisation > self.min_utilisation_threshold { - let limit = old_limit + self.increase_by; - limit.clamp(self.min_limit, self.max_limit) - } else { - old_limit - } - } - Overload => { - let limit = old_limit as f32 * self.decrease_factor; - - // Floor instead of round, so the limit reduces even with small numbers. - // E.g. round(2 * 0.9) = 2, but floor(2 * 0.9) = 1 - let limit = limit.floor() as usize; - - limit.clamp(self.min_limit, self.max_limit) - } - } - } -} - -#[cfg(test)] -mod tests { - use std::sync::Arc; - - use tokio::sync::Notify; - - use super::*; - - use crate::rate_limiter::{Limiter, RateLimiterConfig}; - - #[tokio::test] - async fn should_decrease_limit_on_overload() { - let config = RateLimiterConfig { - initial_limit: 10, - aimd_config: Some(AimdConfig { - aimd_decrease_factor: 0.5, - ..Default::default() - }), - disable: false, - ..Default::default() - }; - - let release_notifier = Arc::new(Notify::new()); - - let limiter = Limiter::new(config).with_release_notifier(release_notifier.clone()); - - let token = limiter.try_acquire().unwrap(); - limiter.release(token, Some(Outcome::Overload)).await; - release_notifier.notified().await; - assert_eq!(limiter.state().limit(), 5, "overload: decrease"); - } - - #[tokio::test] - async fn should_increase_limit_on_success_when_using_gt_util_threshold() { - let config = RateLimiterConfig { - initial_limit: 4, - aimd_config: Some(AimdConfig { - aimd_decrease_factor: 0.5, - aimd_min_utilisation_threshold: 0.5, - aimd_increase_by: 1, - ..Default::default() - }), - disable: false, - ..Default::default() - }; - - let limiter = Limiter::new(config); - - let token = limiter.try_acquire().unwrap(); - let _token = limiter.try_acquire().unwrap(); - let _token = limiter.try_acquire().unwrap(); - - limiter.release(token, Some(Outcome::Success)).await; - assert_eq!(limiter.state().limit(), 5, "success: increase"); - } - - #[tokio::test] - async fn should_not_change_limit_on_success_when_using_lt_util_threshold() { - let config = RateLimiterConfig { - initial_limit: 4, - aimd_config: Some(AimdConfig { - aimd_decrease_factor: 0.5, - aimd_min_utilisation_threshold: 0.5, - ..Default::default() - }), - disable: false, - ..Default::default() - }; - - let limiter = Limiter::new(config); - - let token = limiter.try_acquire().unwrap(); - - limiter.release(token, Some(Outcome::Success)).await; - assert_eq!( - limiter.state().limit(), - 4, - "success: ignore when < half limit" - ); - } - - #[tokio::test] - async fn should_not_change_limit_when_no_outcome() { - let config = RateLimiterConfig { - initial_limit: 10, - aimd_config: Some(AimdConfig { - aimd_decrease_factor: 0.5, - aimd_min_utilisation_threshold: 0.5, - ..Default::default() - }), - disable: false, - ..Default::default() - }; - - let limiter = Limiter::new(config); - - let token = limiter.try_acquire().unwrap(); - limiter.release(token, None).await; - assert_eq!(limiter.state().limit(), 10, "ignore"); - } -} diff --git a/proxy/src/rate_limiter/limit_algorithm.rs b/proxy/src/rate_limiter/limit_algorithm.rs deleted file mode 100644 index 5cd2d5ebb7..0000000000 --- a/proxy/src/rate_limiter/limit_algorithm.rs +++ /dev/null @@ -1,98 +0,0 @@ -//! Algorithms for controlling concurrency limits. -use async_trait::async_trait; -use std::time::Duration; - -use super::{limiter::Outcome, Aimd}; - -/// An algorithm for controlling a concurrency limit. -#[async_trait] -pub trait LimitAlgorithm: Send + Sync + 'static { - /// Update the concurrency limit in response to a new job completion. - async fn update(&mut self, old_limit: usize, sample: Sample) -> usize; -} - -/// The result of a job (or jobs), including the [Outcome] (loss) and latency (delay). -#[derive(Debug, Clone, PartialEq, Eq)] -pub struct Sample { - pub(crate) latency: Duration, - /// Jobs in flight when the sample was taken. - pub(crate) in_flight: usize, - pub(crate) outcome: Outcome, -} - -#[derive(Clone, Copy, Debug, Default, clap::ValueEnum)] -pub enum RateLimitAlgorithm { - Fixed, - #[default] - Aimd, -} - -pub struct Fixed; - -#[async_trait] -impl LimitAlgorithm for Fixed { - async fn update(&mut self, old_limit: usize, _sample: Sample) -> usize { - old_limit - } -} - -#[derive(Clone, Copy, Debug)] -pub struct RateLimiterConfig { - pub disable: bool, - pub algorithm: RateLimitAlgorithm, - pub timeout: Duration, - pub initial_limit: usize, - pub aimd_config: Option, -} - -impl RateLimiterConfig { - pub fn create_rate_limit_algorithm(self) -> Box { - match self.algorithm { - RateLimitAlgorithm::Fixed => Box::new(Fixed), - RateLimitAlgorithm::Aimd => Box::new(Aimd::new(self.aimd_config.unwrap())), // For aimd algorithm config is mandatory. - } - } -} - -impl Default for RateLimiterConfig { - fn default() -> Self { - Self { - disable: true, - algorithm: RateLimitAlgorithm::Aimd, - timeout: Duration::from_secs(1), - initial_limit: 100, - aimd_config: Some(AimdConfig::default()), - } - } -} - -#[derive(clap::Parser, Clone, Copy, Debug)] -pub struct AimdConfig { - /// Minimum limit for AIMD algorithm. Makes sense only if `rate_limit_algorithm` is `Aimd`. - #[clap(long, default_value_t = 1)] - pub aimd_min_limit: usize, - /// Maximum limit for AIMD algorithm. Makes sense only if `rate_limit_algorithm` is `Aimd`. - #[clap(long, default_value_t = 1500)] - pub aimd_max_limit: usize, - /// Increase AIMD increase by value in case of success. Makes sense only if `rate_limit_algorithm` is `Aimd`. - #[clap(long, default_value_t = 10)] - pub aimd_increase_by: usize, - /// Decrease AIMD decrease by value in case of timout/429. Makes sense only if `rate_limit_algorithm` is `Aimd`. - #[clap(long, default_value_t = 0.9)] - pub aimd_decrease_factor: f32, - /// A threshold below which the limit won't be increased. Makes sense only if `rate_limit_algorithm` is `Aimd`. - #[clap(long, default_value_t = 0.8)] - pub aimd_min_utilisation_threshold: f32, -} - -impl Default for AimdConfig { - fn default() -> Self { - Self { - aimd_min_limit: 1, - aimd_max_limit: 1500, - aimd_increase_by: 10, - aimd_decrease_factor: 0.9, - aimd_min_utilisation_threshold: 0.8, - } - } -} diff --git a/proxy/src/rate_limiter/limiter.rs b/proxy/src/rate_limiter/limiter.rs index f590896dd9..3796b22ae9 100644 --- a/proxy/src/rate_limiter/limiter.rs +++ b/proxy/src/rate_limiter/limiter.rs @@ -2,10 +2,9 @@ use std::{ borrow::Cow, collections::hash_map::RandomState, hash::{BuildHasher, Hash}, - net::IpAddr, sync::{ atomic::{AtomicUsize, Ordering}, - Arc, Mutex, + Mutex, }, }; @@ -13,24 +12,18 @@ use anyhow::bail; use dashmap::DashMap; use itertools::Itertools; use rand::{rngs::StdRng, Rng, SeedableRng}; -use tokio::sync::{Mutex as AsyncMutex, Semaphore, SemaphorePermit}; -use tokio::time::{timeout, Duration, Instant}; +use tokio::time::{Duration, Instant}; use tracing::info; -use crate::{intern::EndpointIdInt, EndpointId}; +use crate::EndpointId; -use super::{ - limit_algorithm::{LimitAlgorithm, Sample}, - RateLimiterConfig, -}; - -pub struct RedisRateLimiter { +pub struct GlobalRateLimiter { data: Vec, - info: &'static [RateBucketInfo], + info: Vec, } -impl RedisRateLimiter { - pub fn new(info: &'static [RateBucketInfo]) -> Self { +impl GlobalRateLimiter { + pub fn new(info: Vec) -> Self { Self { data: vec![ RateBucket { @@ -50,7 +43,7 @@ impl RedisRateLimiter { let should_allow_request = self .data .iter_mut() - .zip(self.info) + .zip(&self.info) .all(|(bucket, info)| bucket.should_allow_request(info, now, 1)); if should_allow_request { @@ -75,9 +68,6 @@ impl RedisRateLimiter { // I went with a more expensive way that yields user-friendlier error messages. pub type EndpointRateLimiter = BucketRateLimiter; -// This can't be just per IP because that would limit some PaaS that share IP addresses -pub type AuthRateLimiter = BucketRateLimiter<(EndpointIdInt, IpAddr), StdRng, RandomState>; - pub struct BucketRateLimiter { map: DashMap, Hasher>, info: Cow<'static, [RateBucketInfo]>, @@ -149,19 +139,6 @@ impl RateBucketInfo { Self::new(100, Duration::from_secs(600)), ]; - /// All of these are per endpoint-ip pair. - /// Context: 4096 rounds of pbkdf2 take about 1ms of cpu time to execute (1 milli-cpu-second or 1mcpus). - /// - /// First bucket: 300mcpus total per endpoint-ip pair - /// * 1228800 requests per second with 1 hash rounds. (endpoint rate limiter will catch this first) - /// * 300 requests per second with 4096 hash rounds. - /// * 2 requests per second with 600000 hash rounds. - pub const DEFAULT_AUTH_SET: [Self; 3] = [ - Self::new(300 * 4096, Duration::from_secs(1)), - Self::new(200 * 4096, Duration::from_secs(60)), - Self::new(100 * 4096, Duration::from_secs(600)), - ]; - pub fn validate(info: &mut [Self]) -> anyhow::Result<()> { info.sort_unstable_by_key(|info| info.interval); let invalid = info @@ -259,419 +236,16 @@ impl BucketRateLimiter { } } -/// Limits the number of concurrent jobs. -/// -/// Concurrency is limited through the use of [Token]s. Acquire a token to run a job, and release the -/// token once the job is finished. -/// -/// The limit will be automatically adjusted based on observed latency (delay) and/or failures -/// caused by overload (loss). -pub struct Limiter { - limit_algo: AsyncMutex>, - semaphore: std::sync::Arc, - config: RateLimiterConfig, - - // ONLY WRITE WHEN LIMIT_ALGO IS LOCKED - limits: AtomicUsize, - - // ONLY USE ATOMIC ADD/SUB - in_flight: Arc, - - #[cfg(test)] - notifier: Option>, -} - -/// A concurrency token, required to run a job. -/// -/// Release the token back to the [Limiter] after the job is complete. -#[derive(Debug)] -pub struct Token<'t> { - permit: Option>, - start: Instant, - in_flight: Arc, -} - -/// A snapshot of the state of the [Limiter]. -/// -/// Not guaranteed to be consistent under high concurrency. -#[derive(Debug, Clone, Copy)] -pub struct LimiterState { - limit: usize, - in_flight: usize, -} - -/// Whether a job succeeded or failed as a result of congestion/overload. -/// -/// Errors not considered to be caused by overload should be ignored. -#[derive(Debug, Clone, Copy, PartialEq, Eq)] -pub enum Outcome { - /// The job succeeded, or failed in a way unrelated to overload. - Success, - /// The job failed because of overload, e.g. it timed out or an explicit backpressure signal - /// was observed. - Overload, -} - -impl Outcome { - fn from_reqwest_error(error: &reqwest_middleware::Error) -> Self { - match error { - reqwest_middleware::Error::Middleware(_) => Outcome::Success, - reqwest_middleware::Error::Reqwest(e) => { - if let Some(status) = e.status() { - if status.is_server_error() - || reqwest::StatusCode::TOO_MANY_REQUESTS.as_u16() == status - { - Outcome::Overload - } else { - Outcome::Success - } - } else { - Outcome::Success - } - } - } - } - fn from_reqwest_response(response: &reqwest::Response) -> Self { - if response.status().is_server_error() - || response.status() == reqwest::StatusCode::TOO_MANY_REQUESTS - { - Outcome::Overload - } else { - Outcome::Success - } - } -} - -impl Limiter { - /// Create a limiter with a given limit control algorithm. - pub fn new(config: RateLimiterConfig) -> Self { - assert!(config.initial_limit > 0); - Self { - limit_algo: AsyncMutex::new(config.create_rate_limit_algorithm()), - semaphore: Arc::new(Semaphore::new(config.initial_limit)), - config, - limits: AtomicUsize::new(config.initial_limit), - in_flight: Arc::new(AtomicUsize::new(0)), - #[cfg(test)] - notifier: None, - } - } - // pub fn new(limit_algorithm: T, timeout: Duration, initial_limit: usize) -> Self { - // assert!(initial_limit > 0); - - // Self { - // limit_algo: AsyncMutex::new(limit_algorithm), - // semaphore: Arc::new(Semaphore::new(initial_limit)), - // timeout, - // limits: AtomicUsize::new(initial_limit), - // in_flight: Arc::new(AtomicUsize::new(0)), - // #[cfg(test)] - // notifier: None, - // } - // } - - /// In some cases [Token]s are acquired asynchronously when updating the limit. - #[cfg(test)] - pub fn with_release_notifier(mut self, n: std::sync::Arc) -> Self { - self.notifier = Some(n); - self - } - - /// Try to immediately acquire a concurrency [Token]. - /// - /// Returns `None` if there are none available. - pub fn try_acquire(&self) -> Option { - let result = if self.config.disable { - // If the rate limiter is disabled, we can always acquire a token. - Some(Token::new(None, self.in_flight.clone())) - } else { - self.semaphore - .try_acquire() - .map(|permit| Token::new(Some(permit), self.in_flight.clone())) - .ok() - }; - if result.is_some() { - self.in_flight.fetch_add(1, Ordering::AcqRel); - } - result - } - - /// Try to acquire a concurrency [Token], waiting for `duration` if there are none available. - /// - /// Returns `None` if there are none available after `duration`. - pub async fn acquire_timeout(&self, duration: Duration) -> Option> { - info!("acquiring token: {:?}", self.semaphore.available_permits()); - let result = if self.config.disable { - // If the rate limiter is disabled, we can always acquire a token. - Some(Token::new(None, self.in_flight.clone())) - } else { - match timeout(duration, self.semaphore.acquire()).await { - Ok(maybe_permit) => maybe_permit - .map(|permit| Token::new(Some(permit), self.in_flight.clone())) - .ok(), - Err(_) => None, - } - }; - if result.is_some() { - self.in_flight.fetch_add(1, Ordering::AcqRel); - } - result - } - - /// Return the concurrency [Token], along with the outcome of the job. - /// - /// The [Outcome] of the job, and the time taken to perform it, may be used - /// to update the concurrency limit. - /// - /// Set the outcome to `None` to ignore the job. - pub async fn release(&self, mut token: Token<'_>, outcome: Option) { - tracing::info!("outcome is {:?}", outcome); - let in_flight = self.in_flight.load(Ordering::Acquire); - let old_limit = self.limits.load(Ordering::Acquire); - let available = if self.config.disable { - 0 // This is not used in the algorithm and can be anything. If the config disable it makes sense to set it to 0. - } else { - self.semaphore.available_permits() - }; - let total = in_flight + available; - - let mut algo = self.limit_algo.lock().await; - - let new_limit = if let Some(outcome) = outcome { - let sample = Sample { - latency: token.start.elapsed(), - in_flight, - outcome, - }; - algo.update(old_limit, sample).await - } else { - old_limit - }; - tracing::info!("new limit is {}", new_limit); - let actual_limit = if new_limit < total { - token.forget(); - total.saturating_sub(1) - } else { - if !self.config.disable { - self.semaphore.add_permits(new_limit.saturating_sub(total)); - } - new_limit - }; - crate::metrics::RATE_LIMITER_LIMIT - .with_label_values(&["expected"]) - .set(new_limit as i64); - crate::metrics::RATE_LIMITER_LIMIT - .with_label_values(&["actual"]) - .set(actual_limit as i64); - self.limits.store(new_limit, Ordering::Release); - #[cfg(test)] - if let Some(n) = &self.notifier { - n.notify_one(); - } - } - - /// The current state of the limiter. - pub fn state(&self) -> LimiterState { - let limit = self.limits.load(Ordering::Relaxed); - let in_flight = self.in_flight.load(Ordering::Relaxed); - LimiterState { limit, in_flight } - } -} - -impl<'t> Token<'t> { - fn new(permit: Option>, in_flight: Arc) -> Self { - Self { - permit, - start: Instant::now(), - in_flight, - } - } - - pub fn forget(&mut self) { - if let Some(permit) = self.permit.take() { - permit.forget(); - } - } -} - -impl Drop for Token<'_> { - fn drop(&mut self) { - self.in_flight.fetch_sub(1, Ordering::AcqRel); - } -} - -impl LimiterState { - /// The current concurrency limit. - pub fn limit(&self) -> usize { - self.limit - } - /// The number of jobs in flight. - pub fn in_flight(&self) -> usize { - self.in_flight - } -} - -#[async_trait::async_trait] -impl reqwest_middleware::Middleware for Limiter { - async fn handle( - &self, - req: reqwest::Request, - extensions: &mut task_local_extensions::Extensions, - next: reqwest_middleware::Next<'_>, - ) -> reqwest_middleware::Result { - let start = Instant::now(); - let token = self - .acquire_timeout(self.config.timeout) - .await - .ok_or_else(|| { - reqwest_middleware::Error::Middleware( - // TODO: Should we map it into user facing errors? - crate::console::errors::ApiError::Console { - status: crate::http::StatusCode::TOO_MANY_REQUESTS, - text: "Too many requests".into(), - } - .into(), - ) - })?; - info!(duration = ?start.elapsed(), "waiting for token to connect to the control plane"); - crate::metrics::RATE_LIMITER_ACQUIRE_LATENCY.observe(start.elapsed().as_secs_f64()); - match next.run(req, extensions).await { - Ok(response) => { - self.release(token, Some(Outcome::from_reqwest_response(&response))) - .await; - Ok(response) - } - Err(e) => { - self.release(token, Some(Outcome::from_reqwest_error(&e))) - .await; - Err(e) - } - } - } -} - #[cfg(test)] mod tests { - use std::{hash::BuildHasherDefault, pin::pin, task::Context, time::Duration}; + use std::{hash::BuildHasherDefault, time::Duration}; - use futures::{task::noop_waker_ref, Future}; use rand::SeedableRng; use rustc_hash::FxHasher; use tokio::time; - use super::{BucketRateLimiter, EndpointRateLimiter, Limiter, Outcome}; - use crate::{ - rate_limiter::{RateBucketInfo, RateLimitAlgorithm}, - EndpointId, - }; - - #[tokio::test] - async fn it_works() { - let config = super::RateLimiterConfig { - algorithm: RateLimitAlgorithm::Fixed, - timeout: Duration::from_secs(1), - initial_limit: 10, - disable: false, - ..Default::default() - }; - let limiter = Limiter::new(config); - - let token = limiter.try_acquire().unwrap(); - - limiter.release(token, Some(Outcome::Success)).await; - - assert_eq!(limiter.state().limit(), 10); - } - - #[tokio::test] - async fn is_fair() { - let config = super::RateLimiterConfig { - algorithm: RateLimitAlgorithm::Fixed, - timeout: Duration::from_secs(1), - initial_limit: 1, - disable: false, - ..Default::default() - }; - let limiter = Limiter::new(config); - - // === TOKEN 1 === - let token1 = limiter.try_acquire().unwrap(); - - let mut token2_fut = pin!(limiter.acquire_timeout(Duration::from_secs(1))); - assert!( - token2_fut - .as_mut() - .poll(&mut Context::from_waker(noop_waker_ref())) - .is_pending(), - "token is acquired by token1" - ); - - let mut token3_fut = pin!(limiter.acquire_timeout(Duration::from_secs(1))); - assert!( - token3_fut - .as_mut() - .poll(&mut Context::from_waker(noop_waker_ref())) - .is_pending(), - "token is acquired by token1" - ); - - limiter.release(token1, Some(Outcome::Success)).await; - // === END TOKEN 1 === - - // === TOKEN 2 === - assert!( - limiter.try_acquire().is_none(), - "token is acquired by token2" - ); - - assert!( - token3_fut - .as_mut() - .poll(&mut Context::from_waker(noop_waker_ref())) - .is_pending(), - "token is acquired by token2" - ); - - let token2 = token2_fut.await.unwrap(); - - limiter.release(token2, Some(Outcome::Success)).await; - // === END TOKEN 2 === - - // === TOKEN 3 === - assert!( - limiter.try_acquire().is_none(), - "token is acquired by token3" - ); - - let token3 = token3_fut.await.unwrap(); - limiter.release(token3, Some(Outcome::Success)).await; - // === END TOKEN 3 === - - // === TOKEN 4 === - let token4 = limiter.try_acquire().unwrap(); - limiter.release(token4, Some(Outcome::Success)).await; - } - - #[tokio::test] - async fn disable() { - let config = super::RateLimiterConfig { - algorithm: RateLimitAlgorithm::Fixed, - timeout: Duration::from_secs(1), - initial_limit: 1, - disable: true, - ..Default::default() - }; - let limiter = Limiter::new(config); - - // === TOKEN 1 === - let token1 = limiter.try_acquire().unwrap(); - let token2 = limiter.try_acquire().unwrap(); - let state = limiter.state(); - assert_eq!(state.limit(), 1); - assert_eq!(state.in_flight(), 2); // For disabled limiter, it's expected. - limiter.release(token1, None).await; - limiter.release(token2, None).await; - } + use super::{BucketRateLimiter, EndpointRateLimiter}; + use crate::{rate_limiter::RateBucketInfo, EndpointId}; #[test] fn rate_bucket_rpi() { @@ -773,31 +347,4 @@ mod tests { } assert!(limiter.map.len() < 150_000); } - - #[test] - fn test_default_auth_set() { - // these values used to exceed u32::MAX - assert_eq!( - RateBucketInfo::DEFAULT_AUTH_SET, - [ - RateBucketInfo { - interval: Duration::from_secs(1), - max_rpi: 300 * 4096, - }, - RateBucketInfo { - interval: Duration::from_secs(60), - max_rpi: 200 * 4096 * 60, - }, - RateBucketInfo { - interval: Duration::from_secs(600), - max_rpi: 100 * 4096 * 600, - } - ] - ); - - for x in RateBucketInfo::DEFAULT_AUTH_SET { - let y = x.to_string().parse().unwrap(); - assert_eq!(x, y); - } - } } diff --git a/proxy/src/redis/cancellation_publisher.rs b/proxy/src/redis/cancellation_publisher.rs index 422789813c..7baf104374 100644 --- a/proxy/src/redis/cancellation_publisher.rs +++ b/proxy/src/redis/cancellation_publisher.rs @@ -5,7 +5,7 @@ use redis::AsyncCommands; use tokio::sync::Mutex; use uuid::Uuid; -use crate::rate_limiter::{RateBucketInfo, RedisRateLimiter}; +use crate::rate_limiter::{GlobalRateLimiter, RateBucketInfo}; use super::{ connection_with_credentials_provider::ConnectionWithCredentialsProvider, @@ -80,7 +80,7 @@ impl CancellationPublisher for Arc> { pub struct RedisPublisherClient { client: ConnectionWithCredentialsProvider, region_id: String, - limiter: RedisRateLimiter, + limiter: GlobalRateLimiter, } impl RedisPublisherClient { @@ -92,7 +92,7 @@ impl RedisPublisherClient { Ok(Self { client, region_id, - limiter: RedisRateLimiter::new(info), + limiter: GlobalRateLimiter::new(info.into()), }) } diff --git a/proxy/src/redis/connection_with_credentials_provider.rs b/proxy/src/redis/connection_with_credentials_provider.rs index d183abb53a..3a90d911c2 100644 --- a/proxy/src/redis/connection_with_credentials_provider.rs +++ b/proxy/src/redis/connection_with_credentials_provider.rs @@ -77,10 +77,14 @@ impl ConnectionWithCredentialsProvider { } } + async fn ping(con: &mut MultiplexedConnection) -> RedisResult<()> { + redis::cmd("PING").query_async(con).await + } + pub async fn connect(&mut self) -> anyhow::Result<()> { let _guard = self.mutex.lock().await; if let Some(con) = self.con.as_mut() { - match redis::cmd("PING").query_async(con).await { + match Self::ping(con).await { Ok(()) => { return Ok(()); } @@ -96,7 +100,7 @@ impl ConnectionWithCredentialsProvider { if let Some(f) = self.refresh_token_task.take() { f.abort() } - let con = self + let mut con = self .get_client() .await? .get_multiplexed_tokio_connection() @@ -109,6 +113,14 @@ impl ConnectionWithCredentialsProvider { }); self.refresh_token_task = Some(f); } + match Self::ping(&mut con).await { + Ok(()) => { + info!("Connection succesfully established"); + } + Err(e) => { + error!("Connection is broken. Error during PING: {e:?}"); + } + } self.con = Some(con); Ok(()) } diff --git a/proxy/src/redis/notifications.rs b/proxy/src/redis/notifications.rs index 8b7e3e3419..5a38530faf 100644 --- a/proxy/src/redis/notifications.rs +++ b/proxy/src/redis/notifications.rs @@ -11,7 +11,7 @@ use crate::{ cache::project_info::ProjectInfoCache, cancellation::{CancelMap, CancellationHandler}, intern::{ProjectIdInt, RoleNameInt}, - metrics::{NUM_CANCELLATION_REQUESTS_SOURCE_FROM_REDIS, REDIS_BROKEN_MESSAGES}, + metrics::{Metrics, RedisErrors}, }; const CPLANE_CHANNEL_NAME: &str = "neondb-proxy-ws-updates"; @@ -104,9 +104,9 @@ impl MessageHandler { let msg: Notification = match serde_json::from_str(&payload) { Ok(msg) => msg, Err(e) => { - REDIS_BROKEN_MESSAGES - .with_label_values(&[msg.get_channel_name()]) - .inc(); + Metrics::get().proxy.redis_errors_total.inc(RedisErrors { + channel: msg.get_channel_name(), + }); tracing::error!("broken message: {e}"); return Ok(()); } @@ -183,7 +183,7 @@ where cache, Arc::new(CancellationHandler::<()>::new( cancel_map, - NUM_CANCELLATION_REQUESTS_SOURCE_FROM_REDIS, + crate::metrics::CancellationSource::FromRedis, )), region_id, ); diff --git a/proxy/src/serverless.rs b/proxy/src/serverless.rs index a2010fd613..f3c42cdb01 100644 --- a/proxy/src/serverless.rs +++ b/proxy/src/serverless.rs @@ -4,42 +4,48 @@ mod backend; mod conn_pool; +mod http_util; mod json; mod sql_over_http; -pub mod tls_listener; mod websocket; +use atomic_take::AtomicTake; +use bytes::Bytes; pub use conn_pool::GlobalConnPoolOptions; -use anyhow::bail; -use hyper::StatusCode; -use metrics::IntCounterPairGuard; +use anyhow::Context; +use futures::future::{select, Either}; +use futures::TryFutureExt; +use http::{Method, Response, StatusCode}; +use http_body_util::Full; +use hyper1::body::Incoming; +use hyper_util::rt::TokioExecutor; +use hyper_util::server::conn::auto::Builder; use rand::rngs::StdRng; use rand::SeedableRng; pub use reqwest_middleware::{ClientWithMiddleware, Error}; pub use reqwest_retry::{policies::ExponentialBackoff, RetryTransientMiddleware}; +use tokio::time::timeout; +use tokio_rustls::TlsAcceptor; use tokio_util::task::TaskTracker; -use tracing::instrument::Instrumented; use crate::cancellation::CancellationHandlerMain; use crate::config::ProxyConfig; use crate::context::RequestMonitoring; -use crate::protocol2::{ProxyProtocolAccept, WithClientIp, WithConnectionGuard}; +use crate::metrics::Metrics; +use crate::protocol2::WithClientIp; +use crate::proxy::run_until_cancelled; use crate::rate_limiter::EndpointRateLimiter; use crate::serverless::backend::PoolingBackend; -use hyper::{ - server::conn::{AddrIncoming, AddrStream}, - Body, Method, Request, Response, -}; +use crate::serverless::http_util::{api_error_into_response, json_response}; -use std::net::IpAddr; +use std::net::{IpAddr, SocketAddr}; +use std::pin::pin; use std::sync::Arc; -use std::task::Poll; -use tls_listener::TlsListener; -use tokio::net::TcpListener; -use tokio_util::sync::{CancellationToken, DropGuard}; +use tokio::net::{TcpListener, TcpStream}; +use tokio_util::sync::CancellationToken; use tracing::{error, info, warn, Instrument}; -use utils::http::{error::ApiError, json::json_response}; +use utils::http::error::ApiError; pub const SERVERLESS_DRIVER_SNI: &str = "api"; @@ -91,161 +97,183 @@ pub async fn task_main( tls_server_config.alpn_protocols = vec![b"h2".to_vec(), b"http/1.1".to_vec()]; let tls_acceptor: tokio_rustls::TlsAcceptor = Arc::new(tls_server_config).into(); - let mut addr_incoming = AddrIncoming::from_listener(ws_listener)?; - let _ = addr_incoming.set_nodelay(true); - let addr_incoming = ProxyProtocolAccept { - incoming: addr_incoming, - protocol: "http", - }; + let connections = tokio_util::task::task_tracker::TaskTracker::new(); + connections.close(); // allows `connections.wait to complete` - let ws_connections = tokio_util::task::task_tracker::TaskTracker::new(); - ws_connections.close(); // allows `ws_connections.wait to complete` + let server = Builder::new(hyper_util::rt::TokioExecutor::new()); - let tls_listener = TlsListener::new(tls_acceptor, addr_incoming, config.handshake_timeout); + while let Some(res) = run_until_cancelled(ws_listener.accept(), &cancellation_token).await { + let (conn, peer_addr) = res.context("could not accept TCP stream")?; + if let Err(e) = conn.set_nodelay(true) { + tracing::error!("could not set nodelay: {e}"); + continue; + } + let conn_id = uuid::Uuid::new_v4(); + let http_conn_span = tracing::info_span!("http_conn", ?conn_id); - let make_svc = hyper::service::make_service_fn( - |stream: &tokio_rustls::server::TlsStream< - WithConnectionGuard>, - >| { - let (conn, _) = stream.get_ref(); + connections.spawn( + connection_handler( + config, + backend.clone(), + connections.clone(), + cancellation_handler.clone(), + endpoint_rate_limiter.clone(), + cancellation_token.clone(), + server.clone(), + tls_acceptor.clone(), + conn, + peer_addr, + ) + .instrument(http_conn_span), + ); + } - // this is jank. should dissapear with hyper 1.0 migration. - let gauge = conn - .gauge - .lock() - .expect("lock should not be poisoned") - .take() - .expect("gauge should be set on connection start"); - - // Cancel all current inflight HTTP requests if the HTTP connection is closed. - let http_cancellation_token = CancellationToken::new(); - let cancel_connection = http_cancellation_token.clone().drop_guard(); - - let span = conn.span.clone(); - let client_addr = conn.inner.client_addr(); - let remote_addr = conn.inner.inner.remote_addr(); - let backend = backend.clone(); - let ws_connections = ws_connections.clone(); - let endpoint_rate_limiter = endpoint_rate_limiter.clone(); - let cancellation_handler = cancellation_handler.clone(); - async move { - let peer_addr = match client_addr { - Some(addr) => addr, - None if config.require_client_ip => bail!("missing required client ip"), - None => remote_addr, - }; - Ok(MetricService::new( - hyper::service::service_fn(move |req: Request| { - let backend = backend.clone(); - let ws_connections2 = ws_connections.clone(); - let endpoint_rate_limiter = endpoint_rate_limiter.clone(); - let cancellation_handler = cancellation_handler.clone(); - let http_cancellation_token = http_cancellation_token.child_token(); - - // `request_handler` is not cancel safe. It expects to be cancelled only at specific times. - // By spawning the future, we ensure it never gets cancelled until it decides to. - ws_connections.spawn( - async move { - // Cancel the current inflight HTTP request if the requets stream is closed. - // This is slightly different to `_cancel_connection` in that - // h2 can cancel individual requests with a `RST_STREAM`. - let _cancel_session = http_cancellation_token.clone().drop_guard(); - - let res = request_handler( - req, - config, - backend, - ws_connections2, - cancellation_handler, - peer_addr.ip(), - endpoint_rate_limiter, - http_cancellation_token, - ) - .await - .map_or_else(|e| e.into_response(), |r| r); - - _cancel_session.disarm(); - - res - } - .in_current_span(), - ) - }), - gauge, - cancel_connection, - span, - )) - } - }, - ); - - hyper::Server::builder(tls_listener) - .serve(make_svc) - .with_graceful_shutdown(cancellation_token.cancelled()) - .await?; - - // await websocket connections - ws_connections.wait().await; + connections.wait().await; Ok(()) } -struct MetricService { - inner: S, - _gauge: IntCounterPairGuard, - _cancel: DropGuard, - span: tracing::Span, -} +/// Handles the TCP lifecycle. +/// +/// 1. Parses PROXY protocol V2 +/// 2. Handles TLS handshake +/// 3. Handles HTTP connection +/// 1. With graceful shutdowns +/// 2. With graceful request cancellation with connection failure +/// 3. With websocket upgrade support. +#[allow(clippy::too_many_arguments)] +async fn connection_handler( + config: &'static ProxyConfig, + backend: Arc, + connections: TaskTracker, + cancellation_handler: Arc, + endpoint_rate_limiter: Arc, + cancellation_token: CancellationToken, + server: Builder, + tls_acceptor: TlsAcceptor, + conn: TcpStream, + peer_addr: SocketAddr, +) { + let session_id = uuid::Uuid::new_v4(); -impl MetricService { - fn new( - inner: S, - _gauge: IntCounterPairGuard, - _cancel: DropGuard, - span: tracing::Span, - ) -> MetricService { - MetricService { - inner, - _gauge, - _cancel, - span, + let _gauge = Metrics::get() + .proxy + .client_connections + .guard(crate::metrics::Protocol::Http); + + // handle PROXY protocol + let mut conn = WithClientIp::new(conn); + let peer = match conn.wait_for_addr().await { + Ok(peer) => peer, + Err(e) => { + tracing::error!(?session_id, %peer_addr, "failed to accept TCP connection: invalid PROXY protocol V2 header: {e:#}"); + return; } - } -} + }; -impl hyper::service::Service> for MetricService -where - S: hyper::service::Service>, -{ - type Response = S::Response; - type Error = S::Error; - type Future = Instrumented; + let peer_addr = peer.unwrap_or(peer_addr).ip(); + let has_private_peer_addr = match peer_addr { + IpAddr::V4(ip) => ip.is_private(), + _ => false, + }; + info!(?session_id, %peer_addr, "accepted new TCP connection"); - fn poll_ready(&mut self, cx: &mut std::task::Context<'_>) -> Poll> { - self.inner.poll_ready(cx) - } + // try upgrade to TLS, but with a timeout. + let conn = match timeout(config.handshake_timeout, tls_acceptor.accept(conn)).await { + Ok(Ok(conn)) => { + info!(?session_id, %peer_addr, "accepted new TLS connection"); + conn + } + // The handshake failed + Ok(Err(e)) => { + if !has_private_peer_addr { + Metrics::get().proxy.tls_handshake_failures.inc(); + } + warn!(?session_id, %peer_addr, "failed to accept TLS connection: {e:?}"); + return; + } + // The handshake timed out + Err(e) => { + if !has_private_peer_addr { + Metrics::get().proxy.tls_handshake_failures.inc(); + } + warn!(?session_id, %peer_addr, "failed to accept TLS connection: {e:?}"); + return; + } + }; - fn call(&mut self, req: Request) -> Self::Future { - self.span - .in_scope(|| self.inner.call(req)) - .instrument(self.span.clone()) + let session_id = AtomicTake::new(session_id); + + // Cancel all current inflight HTTP requests if the HTTP connection is closed. + let http_cancellation_token = CancellationToken::new(); + let _cancel_connection = http_cancellation_token.clone().drop_guard(); + + let conn = server.serve_connection_with_upgrades( + hyper_util::rt::TokioIo::new(conn), + hyper1::service::service_fn(move |req: hyper1::Request| { + // First HTTP request shares the same session ID + let session_id = session_id.take().unwrap_or_else(uuid::Uuid::new_v4); + + // Cancel the current inflight HTTP request if the requets stream is closed. + // This is slightly different to `_cancel_connection` in that + // h2 can cancel individual requests with a `RST_STREAM`. + let http_request_token = http_cancellation_token.child_token(); + let cancel_request = http_request_token.clone().drop_guard(); + + // `request_handler` is not cancel safe. It expects to be cancelled only at specific times. + // By spawning the future, we ensure it never gets cancelled until it decides to. + let handler = connections.spawn( + request_handler( + req, + config, + backend.clone(), + connections.clone(), + cancellation_handler.clone(), + session_id, + peer_addr, + endpoint_rate_limiter.clone(), + http_request_token, + ) + .in_current_span() + .map_ok_or_else(api_error_into_response, |r| r), + ); + + async move { + let res = handler.await; + cancel_request.disarm(); + res + } + }), + ); + + // On cancellation, trigger the HTTP connection handler to shut down. + let res = match select(pin!(cancellation_token.cancelled()), pin!(conn)).await { + Either::Left((_cancelled, mut conn)) => { + conn.as_mut().graceful_shutdown(); + conn.await + } + Either::Right((res, _)) => res, + }; + + match res { + Ok(()) => tracing::info!(%peer_addr, "HTTP connection closed"), + Err(e) => tracing::warn!(%peer_addr, "HTTP connection error {e}"), } } #[allow(clippy::too_many_arguments)] async fn request_handler( - mut request: Request, + mut request: hyper1::Request, config: &'static ProxyConfig, backend: Arc, ws_connections: TaskTracker, cancellation_handler: Arc, + session_id: uuid::Uuid, peer_addr: IpAddr, endpoint_rate_limiter: Arc, // used to cancel in-flight HTTP requests. not used to cancel websockets http_cancellation_token: CancellationToken, -) -> Result, ApiError> { - let session_id = uuid::Uuid::new_v4(); - +) -> Result>, ApiError> { let host = request .headers() .get("host") @@ -255,7 +283,13 @@ async fn request_handler( // Check if the request is a websocket upgrade request. if hyper_tungstenite::is_upgrade_request(&request) { - let ctx = RequestMonitoring::new(session_id, peer_addr, "ws", &config.region); + let ctx = RequestMonitoring::new( + session_id, + peer_addr, + crate::metrics::Protocol::Ws, + &config.region, + ); + let span = ctx.span.clone(); info!(parent: &span, "performing websocket upgrade"); @@ -282,14 +316,19 @@ async fn request_handler( // Return the response so the spawned future can continue. Ok(response) - } else if request.uri().path() == "/sql" && request.method() == Method::POST { - let ctx = RequestMonitoring::new(session_id, peer_addr, "http", &config.region); + } else if request.uri().path() == "/sql" && *request.method() == Method::POST { + let ctx = RequestMonitoring::new( + session_id, + peer_addr, + crate::metrics::Protocol::Http, + &config.region, + ); let span = ctx.span.clone(); sql_over_http::handle(config, ctx, request, backend, http_cancellation_token) .instrument(span) .await - } else if request.uri().path() == "/sql" && request.method() == Method::OPTIONS { + } else if request.uri().path() == "/sql" && *request.method() == Method::OPTIONS { Response::builder() .header("Allow", "OPTIONS, POST") .header("Access-Control-Allow-Origin", "*") @@ -299,7 +338,7 @@ async fn request_handler( ) .header("Access-Control-Max-Age", "86400" /* 24 hours */) .status(StatusCode::OK) // 204 is also valid, but see: https://developer.mozilla.org/en-US/docs/Web/HTTP/Methods/OPTIONS#status_code - .body(Body::empty()) + .body(Full::new(Bytes::new())) .map_err(|e| ApiError::InternalServerError(e.into())) } else { json_response(StatusCode::BAD_REQUEST, "query is not supported") diff --git a/proxy/src/serverless/backend.rs b/proxy/src/serverless/backend.rs index f10779d7ba..e74c63599a 100644 --- a/proxy/src/serverless/backend.rs +++ b/proxy/src/serverless/backend.rs @@ -6,10 +6,9 @@ use tracing::{field::display, info}; use crate::{ auth::{backend::ComputeCredentials, check_peer_addr_is_in_list, AuthError}, compute, - config::ProxyConfig, + config::{AuthenticationConfig, ProxyConfig}, console::{ errors::{GetAuthInfoError, WakeComputeError}, - messages::ColdStartInfo, CachedNodeInfo, }, context::RequestMonitoring, @@ -28,6 +27,7 @@ impl PoolingBackend { pub async fn authenticate( &self, ctx: &mut RequestMonitoring, + config: &AuthenticationConfig, conn_info: &ConnInfo, ) -> Result { let user_info = conn_info.user_info.clone(); @@ -44,6 +44,7 @@ impl PoolingBackend { let secret = match cached_secret.value.clone() { Some(secret) => self.config.authentication_config.check_rate_limit( ctx, + config, secret, &user_info.endpoint, true, @@ -57,7 +58,10 @@ impl PoolingBackend { let auth_outcome = crate::auth::validate_password_and_exchange(&conn_info.password, secret).await?; let res = match auth_outcome { - crate::sasl::Outcome::Success(key) => Ok(key), + crate::sasl::Outcome::Success(key) => { + info!("user successfully authenticated"); + Ok(key) + } crate::sasl::Outcome::Failure(reason) => { info!("auth backend failed with an error: {reason}"); Err(AuthError::auth_failed(&*conn_info.user_info.user)) @@ -89,8 +93,6 @@ impl PoolingBackend { }; if let Some(client) = maybe_client { - info!("cold_start_info=warm"); - ctx.set_cold_start_info(ColdStartInfo::Warm); return Ok(client); } let conn_id = uuid::Uuid::new_v4(); diff --git a/proxy/src/serverless/conn_pool.rs b/proxy/src/serverless/conn_pool.rs index c7e8eaef76..798e488509 100644 --- a/proxy/src/serverless/conn_pool.rs +++ b/proxy/src/serverless/conn_pool.rs @@ -1,6 +1,5 @@ use dashmap::DashMap; use futures::{future::poll_fn, Future}; -use metrics::IntCounterPairGuard; use parking_lot::RwLock; use rand::Rng; use smallvec::SmallVec; @@ -16,13 +15,13 @@ use std::{ use tokio::time::Instant; use tokio_postgres::tls::NoTlsStream; use tokio_postgres::{AsyncMessage, ReadyForQueryStatus, Socket}; +use tokio_util::sync::CancellationToken; -use crate::console::messages::MetricsAuxInfo; -use crate::metrics::{ENDPOINT_POOLS, GC_LATENCY, NUM_OPEN_CLIENTS_IN_HTTP_POOL}; +use crate::console::messages::{ColdStartInfo, MetricsAuxInfo}; +use crate::metrics::{HttpEndpointPoolsGuard, Metrics}; use crate::usage_metrics::{Ids, MetricCounter, USAGE_METRICS}; use crate::{ - auth::backend::ComputeUserInfo, context::RequestMonitoring, metrics::NUM_DB_CONNECTIONS_GAUGE, - DbName, EndpointCacheKey, RoleName, + auth::backend::ComputeUserInfo, context::RequestMonitoring, DbName, EndpointCacheKey, RoleName, }; use tracing::{debug, error, warn, Span}; @@ -78,7 +77,7 @@ pub struct EndpointConnPool { pools: HashMap<(DbName, RoleName), DbUserConnPool>, total_conns: usize, max_conns: usize, - _guard: IntCounterPairGuard, + _guard: HttpEndpointPoolsGuard<'static>, global_connections_count: Arc, global_pool_size_max_conns: usize, } @@ -110,7 +109,11 @@ impl EndpointConnPool { let removed = old_len - new_len; if removed > 0 { global_connections_count.fetch_sub(removed, atomic::Ordering::Relaxed); - NUM_OPEN_CLIENTS_IN_HTTP_POOL.sub(removed as i64); + Metrics::get() + .proxy + .http_pool_opened_connections + .get_metric() + .dec_by(removed as i64); } *total_conns -= removed; removed > 0 @@ -156,7 +159,11 @@ impl EndpointConnPool { pool.total_conns += 1; pool.global_connections_count .fetch_add(1, atomic::Ordering::Relaxed); - NUM_OPEN_CLIENTS_IN_HTTP_POOL.inc(); + Metrics::get() + .proxy + .http_pool_opened_connections + .get_metric() + .inc(); } pool.total_conns @@ -176,7 +183,11 @@ impl Drop for EndpointConnPool { if self.total_conns > 0 { self.global_connections_count .fetch_sub(self.total_conns, atomic::Ordering::Relaxed); - NUM_OPEN_CLIENTS_IN_HTTP_POOL.sub(self.total_conns as i64); + Metrics::get() + .proxy + .http_pool_opened_connections + .get_metric() + .dec_by(self.total_conns as i64); } } } @@ -215,7 +226,11 @@ impl DbUserConnPool { removed += 1; } global_connections_count.fetch_sub(removed, atomic::Ordering::Relaxed); - NUM_OPEN_CLIENTS_IN_HTTP_POOL.sub(removed as i64); + Metrics::get() + .proxy + .http_pool_opened_connections + .get_metric() + .dec_by(removed as i64); conn } } @@ -303,7 +318,10 @@ impl GlobalConnPool { // acquire a random shard lock let mut shard = self.global_pool.shards()[shard].write(); - let timer = GC_LATENCY.start_timer(); + let timer = Metrics::get() + .proxy + .http_pool_reclaimation_lag_seconds + .start_timer(); let current_len = shard.len(); let mut clients_removed = 0; shard.retain(|endpoint, x| { @@ -331,7 +349,7 @@ impl GlobalConnPool { let new_len = shard.len(); drop(shard); - timer.observe_duration(); + timer.observe(); // Do logging outside of the lock. if clients_removed > 0 { @@ -339,7 +357,11 @@ impl GlobalConnPool { .global_connections_count .fetch_sub(clients_removed, atomic::Ordering::Relaxed) - clients_removed; - NUM_OPEN_CLIENTS_IN_HTTP_POOL.sub(clients_removed as i64); + Metrics::get() + .proxy + .http_pool_opened_connections + .get_metric() + .dec_by(clients_removed as i64); info!("pool: performed global pool gc. removed {clients_removed} clients, total number of clients in pool is {size}"); } let removed = current_len - new_len; @@ -383,9 +405,12 @@ impl GlobalConnPool { "pid", &tracing::field::display(client.inner.get_process_id()), ); - info!("pool: reusing connection '{conn_info}'"); + info!( + cold_start_info = ColdStartInfo::HttpPoolHit.as_str(), + "pool: reusing connection '{conn_info}'" + ); client.session.send(ctx.session_id)?; - ctx.latency_timer.pool_hit(); + ctx.set_cold_start_info(ColdStartInfo::HttpPoolHit); ctx.latency_timer.success(); return Ok(Some(Client::new(client, conn_info.clone(), endpoint_pool))); } @@ -407,7 +432,7 @@ impl GlobalConnPool { pools: HashMap::new(), total_conns: 0, max_conns: self.config.pool_options.max_conns_per_endpoint, - _guard: ENDPOINT_POOLS.guard(), + _guard: Metrics::get().proxy.http_endpoint_pools.guard(), global_connections_count: self.global_connections_count.clone(), global_pool_size_max_conns: self.config.pool_options.max_total_conns, })); @@ -447,15 +472,14 @@ pub fn poll_client( conn_id: uuid::Uuid, aux: MetricsAuxInfo, ) -> Client { - let conn_gauge = NUM_DB_CONNECTIONS_GAUGE - .with_label_values(&[ctx.protocol]) - .guard(); + let conn_gauge = Metrics::get().proxy.db_connections.guard(ctx.protocol); let mut session_id = ctx.session_id; let (tx, mut rx) = tokio::sync::watch::channel(session_id); let span = info_span!(parent: None, "connection", %conn_id); + let cold_start_info = ctx.cold_start_info; span.in_scope(|| { - info!(%conn_info, %session_id, "new connection"); + info!(cold_start_info = cold_start_info.as_str(), %conn_info, %session_id, "new connection"); }); let pool = match conn_info.endpoint_cache_key() { Some(endpoint) => Arc::downgrade(&global_pool.get_or_create_endpoint_pool(&endpoint)), @@ -465,15 +489,32 @@ pub fn poll_client( let db_user = conn_info.db_and_user(); let idle = global_pool.get_idle_timeout(); + let cancel = CancellationToken::new(); + let cancelled = cancel.clone().cancelled_owned(); + tokio::spawn( async move { let _conn_gauge = conn_gauge; let mut idle_timeout = pin!(tokio::time::sleep(idle)); + let mut cancelled = pin!(cancelled); + poll_fn(move |cx| { - if matches!(rx.has_changed(), Ok(true)) { - session_id = *rx.borrow_and_update(); - info!(%session_id, "changed session"); - idle_timeout.as_mut().reset(Instant::now() + idle); + if cancelled.as_mut().poll(cx).is_ready() { + info!("connection dropped"); + return Poll::Ready(()) + } + + match rx.has_changed() { + Ok(true) => { + session_id = *rx.borrow_and_update(); + info!(%session_id, "changed session"); + idle_timeout.as_mut().reset(Instant::now() + idle); + } + Err(_) => { + info!("connection dropped"); + return Poll::Ready(()) + } + _ => {} } // 5 minute idle connection timeout @@ -528,6 +569,7 @@ pub fn poll_client( let inner = ClientInner { inner: client, session: tx, + cancel, aux, conn_id, }; @@ -537,10 +579,18 @@ pub fn poll_client( struct ClientInner { inner: C, session: tokio::sync::watch::Sender, + cancel: CancellationToken, aux: MetricsAuxInfo, conn_id: uuid::Uuid, } +impl Drop for ClientInner { + fn drop(&mut self) { + // on client drop, tell the conn to shut down + self.cancel.cancel(); + } +} + pub trait ClientInnerExt: Sync + Send + 'static { fn is_closed(&self) -> bool; fn get_process_id(&self) -> i32; @@ -565,8 +615,8 @@ impl Client { pub fn metrics(&self) -> Arc { let aux = &self.inner.as_ref().unwrap().aux; USAGE_METRICS.register(Ids { - endpoint_id: aux.endpoint_id.clone(), - branch_id: aux.branch_id.clone(), + endpoint_id: aux.endpoint_id, + branch_id: aux.branch_id, }) } } @@ -666,6 +716,8 @@ impl Drop for Client { mod tests { use std::{mem, sync::atomic::AtomicBool}; + use crate::{BranchId, EndpointId, ProjectId}; + use super::*; struct MockClient(Arc); @@ -691,7 +743,13 @@ mod tests { ClientInner { inner: client, session: tokio::sync::watch::Sender::new(uuid::Uuid::new_v4()), - aux: Default::default(), + cancel: CancellationToken::new(), + aux: MetricsAuxInfo { + endpoint_id: (&EndpointId::from("endpoint")).into(), + project_id: (&ProjectId::from("project")).into(), + branch_id: (&BranchId::from("branch")).into(), + cold_start_info: crate::console::messages::ColdStartInfo::Warm, + }, conn_id: uuid::Uuid::new_v4(), } } diff --git a/proxy/src/serverless/http_util.rs b/proxy/src/serverless/http_util.rs new file mode 100644 index 0000000000..ab9127b13e --- /dev/null +++ b/proxy/src/serverless/http_util.rs @@ -0,0 +1,92 @@ +//! Things stolen from `libs/utils/src/http` to add hyper 1.0 compatibility +//! Will merge back in at some point in the future. + +use bytes::Bytes; + +use anyhow::Context; +use http::{Response, StatusCode}; +use http_body_util::Full; + +use serde::Serialize; +use utils::http::error::ApiError; + +/// Like [`ApiError::into_response`] +pub fn api_error_into_response(this: ApiError) -> Response> { + match this { + ApiError::BadRequest(err) => HttpErrorBody::response_from_msg_and_status( + format!("{err:#?}"), // use debug printing so that we give the cause + StatusCode::BAD_REQUEST, + ), + ApiError::Forbidden(_) => { + HttpErrorBody::response_from_msg_and_status(this.to_string(), StatusCode::FORBIDDEN) + } + ApiError::Unauthorized(_) => { + HttpErrorBody::response_from_msg_and_status(this.to_string(), StatusCode::UNAUTHORIZED) + } + ApiError::NotFound(_) => { + HttpErrorBody::response_from_msg_and_status(this.to_string(), StatusCode::NOT_FOUND) + } + ApiError::Conflict(_) => { + HttpErrorBody::response_from_msg_and_status(this.to_string(), StatusCode::CONFLICT) + } + ApiError::PreconditionFailed(_) => HttpErrorBody::response_from_msg_and_status( + this.to_string(), + StatusCode::PRECONDITION_FAILED, + ), + ApiError::ShuttingDown => HttpErrorBody::response_from_msg_and_status( + "Shutting down".to_string(), + StatusCode::SERVICE_UNAVAILABLE, + ), + ApiError::ResourceUnavailable(err) => HttpErrorBody::response_from_msg_and_status( + err.to_string(), + StatusCode::SERVICE_UNAVAILABLE, + ), + ApiError::Timeout(err) => HttpErrorBody::response_from_msg_and_status( + err.to_string(), + StatusCode::REQUEST_TIMEOUT, + ), + ApiError::InternalServerError(err) => HttpErrorBody::response_from_msg_and_status( + err.to_string(), + StatusCode::INTERNAL_SERVER_ERROR, + ), + } +} + +/// Same as [`utils::http::error::HttpErrorBody`] +#[derive(Serialize)] +struct HttpErrorBody { + pub msg: String, +} + +impl HttpErrorBody { + /// Same as [`utils::http::error::HttpErrorBody::response_from_msg_and_status`] + fn response_from_msg_and_status(msg: String, status: StatusCode) -> Response> { + HttpErrorBody { msg }.to_response(status) + } + + /// Same as [`utils::http::error::HttpErrorBody::to_response`] + fn to_response(&self, status: StatusCode) -> Response> { + Response::builder() + .status(status) + .header(http::header::CONTENT_TYPE, "application/json") + // we do not have nested maps with non string keys so serialization shouldn't fail + .body(Full::new(Bytes::from(serde_json::to_string(self).unwrap()))) + .unwrap() + } +} + +/// Same as [`utils::http::json::json_response`] +pub fn json_response( + status: StatusCode, + data: T, +) -> Result>, ApiError> { + let json = serde_json::to_string(&data) + .context("Failed to serialize JSON response") + .map_err(ApiError::InternalServerError)?; + let response = Response::builder() + .status(status) + .header(http::header::CONTENT_TYPE, "application/json") + .body(Full::new(Bytes::from(json))) + .map_err(|e| ApiError::InternalServerError(e.into()))?; + Ok(response) +} diff --git a/proxy/src/serverless/sql_over_http.rs b/proxy/src/serverless/sql_over_http.rs index f675375ff1..e856053a7e 100644 --- a/proxy/src/serverless/sql_over_http.rs +++ b/proxy/src/serverless/sql_over_http.rs @@ -1,18 +1,22 @@ use std::pin::pin; use std::sync::Arc; +use bytes::Bytes; use futures::future::select; use futures::future::try_join; use futures::future::Either; use futures::StreamExt; use futures::TryFutureExt; -use hyper::body::HttpBody; -use hyper::header; -use hyper::http::HeaderName; -use hyper::http::HeaderValue; -use hyper::Response; -use hyper::StatusCode; -use hyper::{Body, HeaderMap, Request}; +use http_body_util::BodyExt; +use http_body_util::Full; +use hyper1::body::Body; +use hyper1::body::Incoming; +use hyper1::header; +use hyper1::http::HeaderName; +use hyper1::http::HeaderValue; +use hyper1::Response; +use hyper1::StatusCode; +use hyper1::{HeaderMap, Request}; use serde_json::json; use serde_json::Value; use tokio::time; @@ -29,7 +33,6 @@ use tracing::error; use tracing::info; use url::Url; use utils::http::error::ApiError; -use utils::http::json::json_response; use crate::auth::backend::ComputeUserInfo; use crate::auth::endpoint_sni; @@ -40,15 +43,19 @@ use crate::context::RequestMonitoring; use crate::error::ErrorKind; use crate::error::ReportableError; use crate::error::UserFacingError; -use crate::metrics::HTTP_CONTENT_LENGTH; -use crate::metrics::NUM_CONNECTION_REQUESTS_GAUGE; +use crate::metrics::HttpDirection; +use crate::metrics::Metrics; +use crate::proxy::run_until_cancelled; use crate::proxy::NeonOptions; use crate::serverless::backend::HttpConnError; +use crate::usage_metrics::MetricCounterRecorder; use crate::DbName; use crate::RoleName; use super::backend::PoolingBackend; +use super::conn_pool::Client; use super::conn_pool::ConnInfo; +use super::http_util::json_response; use super::json::json_to_pg_text; use super::json::pg_text_row_to_json; use super::json::JsonConversionError; @@ -215,18 +222,11 @@ fn get_conn_info( pub async fn handle( config: &'static ProxyConfig, mut ctx: RequestMonitoring, - request: Request, + request: Request, backend: Arc, cancel: CancellationToken, -) -> Result, ApiError> { - let cancel2 = cancel.clone(); - let handle = tokio::spawn(async move { - time::sleep(config.http_config.request_timeout).await; - cancel2.cancel(); - }); - +) -> Result>, ApiError> { let result = handle_inner(cancel, config, &mut ctx, request, backend).await; - handle.abort(); let mut response = match result { Ok(r) => { @@ -237,10 +237,7 @@ pub async fn handle( let error_kind = e.get_error_kind(); ctx.set_error_kind(error_kind); - let message = format!( - "Query cancelled, runtime exceeded. SQL queries over HTTP must not exceed {} seconds of runtime. Please consider using our websocket based connections", - config.http_config.request_timeout.as_secs_f64() - ); + let message = "Query cancelled, connection was terminated"; tracing::info!( kind=error_kind.to_metric_label(), @@ -339,10 +336,9 @@ pub async fn handle( } }; - response.headers_mut().insert( - "Access-Control-Allow-Origin", - hyper::http::HeaderValue::from_static("*"), - ); + response + .headers_mut() + .insert("Access-Control-Allow-Origin", HeaderValue::from_static("*")); Ok(response) } @@ -403,7 +399,7 @@ impl UserFacingError for SqlOverHttpError { #[derive(Debug, thiserror::Error)] pub enum ReadPayloadError { #[error("could not read the HTTP request body: {0}")] - Read(#[from] hyper::Error), + Read(#[from] hyper1::Error), #[error("could not parse the HTTP request body: {0}")] Parse(#[from] serde_json::Error), } @@ -434,59 +430,101 @@ impl ReportableError for SqlOverHttpCancel { } } +#[derive(Clone, Copy, Debug)] +struct HttpHeaders { + raw_output: bool, + default_array_mode: bool, + txn_isolation_level: Option, + txn_read_only: bool, + txn_deferrable: bool, +} + +impl HttpHeaders { + fn try_parse(headers: &hyper1::http::HeaderMap) -> Result { + // Determine the output options. Default behaviour is 'false'. Anything that is not + // strictly 'true' assumed to be false. + let raw_output = headers.get(&RAW_TEXT_OUTPUT) == Some(&HEADER_VALUE_TRUE); + let default_array_mode = headers.get(&ARRAY_MODE) == Some(&HEADER_VALUE_TRUE); + + // isolation level, read only and deferrable + let txn_isolation_level = match headers.get(&TXN_ISOLATION_LEVEL) { + Some(x) => Some( + map_header_to_isolation_level(x).ok_or(SqlOverHttpError::InvalidIsolationLevel)?, + ), + None => None, + }; + + let txn_read_only = headers.get(&TXN_READ_ONLY) == Some(&HEADER_VALUE_TRUE); + let txn_deferrable = headers.get(&TXN_DEFERRABLE) == Some(&HEADER_VALUE_TRUE); + + Ok(Self { + raw_output, + default_array_mode, + txn_isolation_level, + txn_read_only, + txn_deferrable, + }) + } +} + +fn map_header_to_isolation_level(level: &HeaderValue) -> Option { + match level.as_bytes() { + b"Serializable" => Some(IsolationLevel::Serializable), + b"ReadUncommitted" => Some(IsolationLevel::ReadUncommitted), + b"ReadCommitted" => Some(IsolationLevel::ReadCommitted), + b"RepeatableRead" => Some(IsolationLevel::RepeatableRead), + _ => None, + } +} + +fn map_isolation_level_to_headers(level: IsolationLevel) -> Option { + match level { + IsolationLevel::ReadUncommitted => Some(HeaderValue::from_static("ReadUncommitted")), + IsolationLevel::ReadCommitted => Some(HeaderValue::from_static("ReadCommitted")), + IsolationLevel::RepeatableRead => Some(HeaderValue::from_static("RepeatableRead")), + IsolationLevel::Serializable => Some(HeaderValue::from_static("Serializable")), + _ => None, + } +} + async fn handle_inner( cancel: CancellationToken, config: &'static ProxyConfig, ctx: &mut RequestMonitoring, - request: Request, + request: Request, backend: Arc, -) -> Result, SqlOverHttpError> { - let _request_gauge = NUM_CONNECTION_REQUESTS_GAUGE - .with_label_values(&[ctx.protocol]) - .guard(); - info!("handling interactive connection from client"); +) -> Result>, SqlOverHttpError> { + let _requeset_gauge = Metrics::get().proxy.connection_requests.guard(ctx.protocol); + info!( + protocol = %ctx.protocol, + "handling interactive connection from client" + ); // // Determine the destination and connection params // let headers = request.headers(); + // TLS config should be there. let conn_info = get_conn_info(ctx, headers, config.tls_config.as_ref().unwrap())?; info!(user = conn_info.user_info.user.as_str(), "credentials"); - // Determine the output options. Default behaviour is 'false'. Anything that is not - // strictly 'true' assumed to be false. - let raw_output = headers.get(&RAW_TEXT_OUTPUT) == Some(&HEADER_VALUE_TRUE); - let default_array_mode = headers.get(&ARRAY_MODE) == Some(&HEADER_VALUE_TRUE); - // Allow connection pooling only if explicitly requested // or if we have decided that http pool is no longer opt-in let allow_pool = !config.http_config.pool_options.opt_in || headers.get(&ALLOW_POOL) == Some(&HEADER_VALUE_TRUE); - // isolation level, read only and deferrable - - let txn_isolation_level_raw = headers.get(&TXN_ISOLATION_LEVEL).cloned(); - let txn_isolation_level = match txn_isolation_level_raw { - Some(ref x) => Some(match x.as_bytes() { - b"Serializable" => IsolationLevel::Serializable, - b"ReadUncommitted" => IsolationLevel::ReadUncommitted, - b"ReadCommitted" => IsolationLevel::ReadCommitted, - b"RepeatableRead" => IsolationLevel::RepeatableRead, - _ => return Err(SqlOverHttpError::InvalidIsolationLevel), - }), - None => None, - }; - - let txn_read_only = headers.get(&TXN_READ_ONLY) == Some(&HEADER_VALUE_TRUE); - let txn_deferrable = headers.get(&TXN_DEFERRABLE) == Some(&HEADER_VALUE_TRUE); + let parsed_headers = HttpHeaders::try_parse(headers)?; let request_content_length = match request.body().size_hint().upper() { Some(v) => v, None => MAX_REQUEST_SIZE + 1, }; info!(request_content_length, "request size in bytes"); - HTTP_CONTENT_LENGTH.observe(request_content_length as f64); + Metrics::get() + .proxy + .http_conn_content_length_bytes + .observe(HttpDirection::Request, request_content_length as f64); // we don't have a streaming request support yet so this is to prevent OOM // from a malicious user sending an extremely large request body @@ -495,7 +533,7 @@ async fn handle_inner( } let fetch_and_process_request = async { - let body = hyper::body::to_bytes(request.into_body()).await?; + let body = request.into_body().collect().await?.to_bytes(); info!(length = body.len(), "request payload read"); let payload: Payload = serde_json::from_slice(&body)?; Ok::(payload) // Adjust error type accordingly @@ -503,7 +541,9 @@ async fn handle_inner( .map_err(SqlOverHttpError::from); let authenticate_and_connect = async { - let keys = backend.authenticate(ctx, &conn_info).await?; + let keys = backend + .authenticate(ctx, &config.authentication_config, &conn_info) + .await?; let client = backend .connect_to_compute(ctx, conn_info, keys, !allow_pool) .await?; @@ -514,20 +554,18 @@ async fn handle_inner( } .map_err(SqlOverHttpError::from); - // Run both operations in parallel - let (payload, mut client) = match select( + let (payload, mut client) = match run_until_cancelled( + // Run both operations in parallel try_join( pin!(fetch_and_process_request), pin!(authenticate_and_connect), ), - pin!(cancel.cancelled()), + &cancel, ) .await { - Either::Left((result, _cancelled)) => result?, - Either::Right((_cancelled, _)) => { - return Err(SqlOverHttpError::Cancelled(SqlOverHttpCancel::Connect)) - } + Some(result) => result?, + None => return Err(SqlOverHttpError::Cancelled(SqlOverHttpCancel::Connect)), }; let mut response = Response::builder() @@ -537,95 +575,145 @@ async fn handle_inner( // // Now execute the query and return the result // - let mut size = 0; let result = match payload { - Payload::Single(stmt) => { - let mut size = 0; - let (inner, mut discard) = client.inner(); - let cancel_token = inner.cancel_token(); - let query = pin!(query_to_json( - &*inner, - stmt, - &mut size, - raw_output, - default_array_mode - )); - let cancelled = pin!(cancel.cancelled()); - let res = select(query, cancelled).await; - match res { - Either::Left((Ok((status, results)), _cancelled)) => { - discard.check_idle(status); - results - } - Either::Left((Err(e), _cancelled)) => { - discard.discard(); - return Err(e); - } - Either::Right((_cancelled, query)) => { - if let Err(err) = cancel_token.cancel_query(NoTls).await { - tracing::error!(?err, "could not cancel query"); - } - match time::timeout(time::Duration::from_millis(100), query).await { - Ok(Ok((status, results))) => { - discard.check_idle(status); - results - } - Ok(Err(error)) => { - let db_error = match &error { - SqlOverHttpError::ConnectCompute( - HttpConnError::ConnectionError(e), - ) - | SqlOverHttpError::Postgres(e) => e.as_db_error(), - _ => None, - }; - - // if errored for some other reason, it might not be safe to return - if !db_error.is_some_and(|e| *e.code() == SqlState::QUERY_CANCELED) { - discard.discard(); - } - - return Err(SqlOverHttpError::Cancelled(SqlOverHttpCancel::Postgres)); - } - Err(_timeout) => { - discard.discard(); - return Err(SqlOverHttpError::Cancelled(SqlOverHttpCancel::Postgres)); - } - } - } - } - } + Payload::Single(stmt) => stmt.process(cancel, &mut client, parsed_headers).await?, Payload::Batch(statements) => { - info!("starting transaction"); - let (inner, mut discard) = client.inner(); - let cancel_token = inner.cancel_token(); - let mut builder = inner.build_transaction(); - if let Some(isolation_level) = txn_isolation_level { - builder = builder.isolation_level(isolation_level); + if parsed_headers.txn_read_only { + response = response.header(TXN_READ_ONLY.clone(), &HEADER_VALUE_TRUE); } - if txn_read_only { - builder = builder.read_only(true); + if parsed_headers.txn_deferrable { + response = response.header(TXN_DEFERRABLE.clone(), &HEADER_VALUE_TRUE); } - if txn_deferrable { - builder = builder.deferrable(true); - } - - let transaction = builder.start().await.map_err(|e| { - // if we cannot start a transaction, we should return immediately - // and not return to the pool. connection is clearly broken - discard.discard(); - e - })?; - - let results = match query_batch( - cancel.child_token(), - &transaction, - statements, - &mut size, - raw_output, - default_array_mode, - ) - .await + if let Some(txn_isolation_level) = parsed_headers + .txn_isolation_level + .and_then(map_isolation_level_to_headers) { + response = response.header(TXN_ISOLATION_LEVEL.clone(), txn_isolation_level); + } + + statements + .process(cancel, &mut client, parsed_headers) + .await? + } + }; + + let metrics = client.metrics(); + + // how could this possibly fail + let body = serde_json::to_string(&result).expect("json serialization should not fail"); + let len = body.len(); + let response = response + .body(Full::new(Bytes::from(body))) + // only fails if invalid status code or invalid header/values are given. + // these are not user configurable so it cannot fail dynamically + .expect("building response payload should not fail"); + + // count the egress bytes - we miss the TLS and header overhead but oh well... + // moving this later in the stack is going to be a lot of effort and ehhhh + metrics.record_egress(len as u64); + Metrics::get() + .proxy + .http_conn_content_length_bytes + .observe(HttpDirection::Response, len as f64); + + Ok(response) +} + +impl QueryData { + async fn process( + self, + cancel: CancellationToken, + client: &mut Client, + parsed_headers: HttpHeaders, + ) -> Result { + let (inner, mut discard) = client.inner(); + let cancel_token = inner.cancel_token(); + + let res = match select( + pin!(query_to_json(&*inner, self, &mut 0, parsed_headers)), + pin!(cancel.cancelled()), + ) + .await + { + // The query successfully completed. + Either::Left((Ok((status, results)), __not_yet_cancelled)) => { + discard.check_idle(status); + Ok(results) + } + // The query failed with an error + Either::Left((Err(e), __not_yet_cancelled)) => { + discard.discard(); + return Err(e); + } + // The query was cancelled. + Either::Right((_cancelled, query)) => { + tracing::info!("cancelling query"); + if let Err(err) = cancel_token.cancel_query(NoTls).await { + tracing::error!(?err, "could not cancel query"); + } + // wait for the query cancellation + match time::timeout(time::Duration::from_millis(100), query).await { + // query successed before it was cancelled. + Ok(Ok((status, results))) => { + discard.check_idle(status); + Ok(results) + } + // query failed or was cancelled. + Ok(Err(error)) => { + let db_error = match &error { + SqlOverHttpError::ConnectCompute(HttpConnError::ConnectionError(e)) + | SqlOverHttpError::Postgres(e) => e.as_db_error(), + _ => None, + }; + + // if errored for some other reason, it might not be safe to return + if !db_error.is_some_and(|e| *e.code() == SqlState::QUERY_CANCELED) { + discard.discard(); + } + + Err(SqlOverHttpError::Cancelled(SqlOverHttpCancel::Postgres)) + } + Err(_timeout) => { + discard.discard(); + Err(SqlOverHttpError::Cancelled(SqlOverHttpCancel::Postgres)) + } + } + } + }; + res + } +} + +impl BatchQueryData { + async fn process( + self, + cancel: CancellationToken, + client: &mut Client, + parsed_headers: HttpHeaders, + ) -> Result { + info!("starting transaction"); + let (inner, mut discard) = client.inner(); + let cancel_token = inner.cancel_token(); + let mut builder = inner.build_transaction(); + if let Some(isolation_level) = parsed_headers.txn_isolation_level { + builder = builder.isolation_level(isolation_level); + } + if parsed_headers.txn_read_only { + builder = builder.read_only(true); + } + if parsed_headers.txn_deferrable { + builder = builder.deferrable(true); + } + + let transaction = builder.start().await.map_err(|e| { + // if we cannot start a transaction, we should return immediately + // and not return to the pool. connection is clearly broken + discard.discard(); + e + })?; + + let results = + match query_batch(cancel.child_token(), &transaction, self, parsed_headers).await { Ok(results) => { info!("commit"); let status = transaction.commit().await.map_err(|e| { @@ -659,44 +747,15 @@ async fn handle_inner( } }; - if txn_read_only { - response = response.header(TXN_READ_ONLY.clone(), &HEADER_VALUE_TRUE); - } - if txn_deferrable { - response = response.header(TXN_DEFERRABLE.clone(), &HEADER_VALUE_TRUE); - } - if let Some(txn_isolation_level) = txn_isolation_level_raw { - response = response.header(TXN_ISOLATION_LEVEL.clone(), txn_isolation_level); - } - json!({ "results": results }) - } - }; - - let metrics = client.metrics(); - - // how could this possibly fail - let body = serde_json::to_string(&result).expect("json serialization should not fail"); - let len = body.len(); - let response = response - .body(Body::from(body)) - // only fails if invalid status code or invalid header/values are given. - // these are not user configurable so it cannot fail dynamically - .expect("building response payload should not fail"); - - // count the egress bytes - we miss the TLS and header overhead but oh well... - // moving this later in the stack is going to be a lot of effort and ehhhh - metrics.record_egress(len as u64); - - Ok(response) + Ok(json!({ "results": results })) + } } async fn query_batch( cancel: CancellationToken, transaction: &Transaction<'_>, queries: BatchQueryData, - total_size: &mut usize, - raw_output: bool, - array_mode: bool, + parsed_headers: HttpHeaders, ) -> Result, SqlOverHttpError> { let mut results = Vec::with_capacity(queries.queries.len()); let mut current_size = 0; @@ -705,8 +764,7 @@ async fn query_batch( transaction, stmt, &mut current_size, - raw_output, - array_mode + parsed_headers, )); let cancelled = pin!(cancel.cancelled()); let res = select(query, cancelled).await; @@ -723,7 +781,6 @@ async fn query_batch( } } } - *total_size += current_size; Ok(results) } @@ -731,8 +788,7 @@ async fn query_to_json( client: &T, data: QueryData, current_size: &mut usize, - raw_output: bool, - default_array_mode: bool, + parsed_headers: HttpHeaders, ) -> Result<(ReadyForQueryStatus, Value), SqlOverHttpError> { info!("executing query"); let query_params = data.params; @@ -792,12 +848,12 @@ async fn query_to_json( columns.push(client.get_type(c.type_oid()).await?); } - let array_mode = data.array_mode.unwrap_or(default_array_mode); + let array_mode = data.array_mode.unwrap_or(parsed_headers.default_array_mode); // convert rows to JSON let rows = rows .iter() - .map(|row| pg_text_row_to_json(row, &columns, raw_output, array_mode)) + .map(|row| pg_text_row_to_json(row, &columns, parsed_headers.raw_output, array_mode)) .collect::, _>>()?; // resulting JSON format is based on the format of node-postgres result diff --git a/proxy/src/serverless/tls_listener.rs b/proxy/src/serverless/tls_listener.rs deleted file mode 100644 index 33f194dd59..0000000000 --- a/proxy/src/serverless/tls_listener.rs +++ /dev/null @@ -1,123 +0,0 @@ -use std::{ - convert::Infallible, - pin::Pin, - task::{Context, Poll}, - time::Duration, -}; - -use hyper::server::{accept::Accept, conn::AddrStream}; -use pin_project_lite::pin_project; -use tokio::{ - io::{AsyncRead, AsyncWrite}, - task::JoinSet, - time::timeout, -}; -use tokio_rustls::{server::TlsStream, TlsAcceptor}; -use tracing::{info, warn, Instrument}; - -use crate::{ - metrics::TLS_HANDSHAKE_FAILURES, - protocol2::{WithClientIp, WithConnectionGuard}, -}; - -pin_project! { - /// Wraps a `Stream` of connections (such as a TCP listener) so that each connection is itself - /// encrypted using TLS. - pub(crate) struct TlsListener { - #[pin] - listener: A, - tls: TlsAcceptor, - waiting: JoinSet>>, - timeout: Duration, - } -} - -impl TlsListener { - /// Create a `TlsListener` with default options. - pub(crate) fn new(tls: TlsAcceptor, listener: A, timeout: Duration) -> Self { - TlsListener { - listener, - tls, - waiting: JoinSet::new(), - timeout, - } - } -} - -impl Accept for TlsListener -where - A: Accept>>, - A::Error: std::error::Error, - A::Conn: AsyncRead + AsyncWrite + Unpin + Send + 'static, -{ - type Conn = TlsStream; - - type Error = Infallible; - - fn poll_accept( - self: Pin<&mut Self>, - cx: &mut Context<'_>, - ) -> Poll>> { - let mut this = self.project(); - - loop { - match this.listener.as_mut().poll_accept(cx) { - Poll::Pending => break, - Poll::Ready(Some(Ok(mut conn))) => { - let t = *this.timeout; - let tls = this.tls.clone(); - let span = conn.span.clone(); - this.waiting.spawn(async move { - let peer_addr = match conn.inner.wait_for_addr().await { - Ok(Some(addr)) => addr, - Err(e) => { - tracing::error!("failed to accept TCP connection: invalid PROXY protocol V2 header: {e:#}"); - return None; - } - Ok(None) => conn.inner.inner.remote_addr() - }; - - let accept = tls.accept(conn); - match timeout(t, accept).await { - Ok(Ok(conn)) => { - info!(%peer_addr, "accepted new TLS connection"); - Some(conn) - }, - // The handshake failed, try getting another connection from the queue - Ok(Err(e)) => { - TLS_HANDSHAKE_FAILURES.inc(); - warn!(%peer_addr, "failed to accept TLS connection: {e:?}"); - None - } - // The handshake timed out, try getting another connection from the queue - Err(_) => { - TLS_HANDSHAKE_FAILURES.inc(); - warn!(%peer_addr, "failed to accept TLS connection: timeout"); - None - } - } - }.instrument(span)); - } - Poll::Ready(Some(Err(e))) => { - tracing::error!("error accepting TCP connection: {e}"); - continue; - } - Poll::Ready(None) => return Poll::Ready(None), - } - } - - loop { - return match this.waiting.poll_join_next(cx) { - Poll::Ready(Some(Ok(Some(conn)))) => Poll::Ready(Some(Ok(conn))), - // The handshake failed to complete, try getting another connection from the queue - Poll::Ready(Some(Ok(None))) => continue, - // The handshake panicked or was cancelled. ignore and get another connection - Poll::Ready(Some(Err(e))) => { - tracing::warn!("handshake aborted: {e}"); - continue; - } - _ => Poll::Pending, - }; - } - } -} diff --git a/proxy/src/serverless/websocket.rs b/proxy/src/serverless/websocket.rs index ada6c974f4..d054877126 100644 --- a/proxy/src/serverless/websocket.rs +++ b/proxy/src/serverless/websocket.rs @@ -3,7 +3,7 @@ use crate::{ config::ProxyConfig, context::RequestMonitoring, error::{io_error, ReportableError}, - metrics::NUM_CLIENT_CONNECTION_GAUGE, + metrics::Metrics, proxy::{handle_client, ClientMode}, rate_limiter::EndpointRateLimiter, }; @@ -139,9 +139,10 @@ pub async fn serve_websocket( endpoint_rate_limiter: Arc, ) -> anyhow::Result<()> { let websocket = websocket.await?; - let conn_gauge = NUM_CLIENT_CONNECTION_GAUGE - .with_label_values(&["ws"]) - .guard(); + let conn_gauge = Metrics::get() + .proxy + .client_connections + .guard(crate::metrics::Protocol::Ws); let res = handle_client( config, diff --git a/proxy/src/stream.rs b/proxy/src/stream.rs index b6b7a85659..690e92ffb1 100644 --- a/proxy/src/stream.rs +++ b/proxy/src/stream.rs @@ -1,6 +1,6 @@ use crate::config::TlsServerEndPoint; use crate::error::{ErrorKind, ReportableError, UserFacingError}; -use crate::metrics::TLS_HANDSHAKE_FAILURES; +use crate::metrics::Metrics; use bytes::BytesMut; use pq_proto::framed::{ConnectionError, Framed}; @@ -223,12 +223,20 @@ pub enum StreamUpgradeError { impl Stream { /// If possible, upgrade raw stream into a secure TLS-based stream. - pub async fn upgrade(self, cfg: Arc) -> Result, StreamUpgradeError> { + pub async fn upgrade( + self, + cfg: Arc, + record_handshake_error: bool, + ) -> Result, StreamUpgradeError> { match self { Stream::Raw { raw } => Ok(tokio_rustls::TlsAcceptor::from(cfg) .accept(raw) .await - .inspect_err(|_| TLS_HANDSHAKE_FAILURES.inc())?), + .inspect_err(|_| { + if record_handshake_error { + Metrics::get().proxy.tls_handshake_failures.inc() + } + })?), Stream::Tls { .. } => Err(StreamUpgradeError::AlreadyTls), } } diff --git a/proxy/src/usage_metrics.rs b/proxy/src/usage_metrics.rs index d75aedf89b..56ed2145dc 100644 --- a/proxy/src/usage_metrics.rs +++ b/proxy/src/usage_metrics.rs @@ -1,20 +1,35 @@ //! Periodically collect proxy consumption metrics //! and push them to a HTTP endpoint. -use crate::{config::MetricCollectionConfig, http, BranchId, EndpointId}; -use chrono::{DateTime, Utc}; +use crate::{ + config::{MetricBackupCollectionConfig, MetricCollectionConfig}, + context::parquet::{FAILED_UPLOAD_MAX_RETRIES, FAILED_UPLOAD_WARN_THRESHOLD}, + http, + intern::{BranchIdInt, EndpointIdInt}, +}; +use anyhow::Context; +use async_compression::tokio::write::GzipEncoder; +use bytes::Bytes; +use chrono::{DateTime, Datelike, Timelike, Utc}; use consumption_metrics::{idempotency_key, Event, EventChunk, EventType, CHUNK_SIZE}; use dashmap::{mapref::entry::Entry, DashMap}; +use futures::future::select; use once_cell::sync::Lazy; +use remote_storage::{GenericRemoteStorage, RemotePath, TimeoutOrCancel}; use serde::{Deserialize, Serialize}; use std::{ convert::Infallible, + pin::pin, sync::{ atomic::{AtomicU64, AtomicUsize, Ordering}, Arc, }, time::Duration, }; +use tokio::io::AsyncWriteExt; +use tokio_util::sync::CancellationToken; use tracing::{error, info, instrument, trace}; +use utils::backoff; +use uuid::{NoContext, Timestamp}; const PROXY_IO_BYTES_PER_CLIENT: &str = "proxy_io_bytes_per_client"; @@ -29,23 +44,97 @@ const DEFAULT_HTTP_REPORTING_TIMEOUT: Duration = Duration::from_secs(60); /// because we enrich the event with project_id in the control-plane endpoint. #[derive(Eq, Hash, PartialEq, Serialize, Deserialize, Debug, Clone)] pub struct Ids { - pub endpoint_id: EndpointId, - pub branch_id: BranchId, + pub endpoint_id: EndpointIdInt, + pub branch_id: BranchIdInt, +} + +pub trait MetricCounterRecorder { + /// Record that some bytes were sent from the proxy to the client + fn record_egress(&self, bytes: u64); + /// Record that some connections were opened + fn record_connection(&self, count: usize); +} + +trait MetricCounterReporter { + fn get_metrics(&mut self) -> (u64, usize); + fn move_metrics(&self) -> (u64, usize); +} + +#[derive(Debug)] +struct MetricBackupCounter { + transmitted: AtomicU64, + opened_connections: AtomicUsize, +} + +impl MetricCounterRecorder for MetricBackupCounter { + fn record_egress(&self, bytes: u64) { + self.transmitted.fetch_add(bytes, Ordering::AcqRel); + } + + fn record_connection(&self, count: usize) { + self.opened_connections.fetch_add(count, Ordering::AcqRel); + } +} + +impl MetricCounterReporter for MetricBackupCounter { + fn get_metrics(&mut self) -> (u64, usize) { + ( + *self.transmitted.get_mut(), + *self.opened_connections.get_mut(), + ) + } + fn move_metrics(&self) -> (u64, usize) { + ( + self.transmitted.swap(0, Ordering::AcqRel), + self.opened_connections.swap(0, Ordering::AcqRel), + ) + } } #[derive(Debug)] pub struct MetricCounter { transmitted: AtomicU64, opened_connections: AtomicUsize, + backup: Arc, } -impl MetricCounter { +impl MetricCounterRecorder for MetricCounter { /// Record that some bytes were sent from the proxy to the client - pub fn record_egress(&self, bytes: u64) { + fn record_egress(&self, bytes: u64) { self.transmitted.fetch_add(bytes, Ordering::AcqRel); + self.backup.record_egress(bytes); } + /// Record that some connections were opened + fn record_connection(&self, count: usize) { + self.opened_connections.fetch_add(count, Ordering::AcqRel); + self.backup.record_connection(count); + } +} + +impl MetricCounterReporter for MetricCounter { + fn get_metrics(&mut self) -> (u64, usize) { + ( + *self.transmitted.get_mut(), + *self.opened_connections.get_mut(), + ) + } + fn move_metrics(&self) -> (u64, usize) { + ( + self.transmitted.swap(0, Ordering::AcqRel), + self.opened_connections.swap(0, Ordering::AcqRel), + ) + } +} + +trait Clearable { /// extract the value that should be reported + fn should_report(self: &Arc) -> Option; + /// Determine whether the counter should be cleared from the global map. + fn should_clear(self: &mut Arc) -> bool; +} + +impl Clearable for C { fn should_report(self: &Arc) -> Option { // heuristic to see if the branch is still open // if a clone happens while we are observing, the heuristic will be incorrect. @@ -54,13 +143,12 @@ impl MetricCounter { // However, for the strong count to be 1 it must have occured that at one instant // all the endpoints were closed, so missing a report because the endpoints are closed is valid. let is_open = Arc::strong_count(self) > 1; - let opened = self.opened_connections.swap(0, Ordering::AcqRel); // update cached metrics eagerly, even if they can't get sent // (to avoid sending the same metrics twice) // see the relevant discussion on why to do so even if the status is not success: // https://github.com/neondatabase/neon/pull/4563#discussion_r1246710956 - let value = self.transmitted.swap(0, Ordering::AcqRel); + let (value, opened) = self.move_metrics(); // Our only requirement is that we report in every interval if there was an open connection // if there were no opened connections since, then we don't need to report @@ -70,15 +158,12 @@ impl MetricCounter { Some(value) } } - - /// Determine whether the counter should be cleared from the global map. fn should_clear(self: &mut Arc) -> bool { // we can't clear this entry if it's acquired elsewhere let Some(counter) = Arc::get_mut(self) else { return false; }; - let opened = *counter.opened_connections.get_mut(); - let value = *counter.transmitted.get_mut(); + let (opened, value) = counter.get_metrics(); // clear if there's no data to report value == 0 && opened == 0 } @@ -90,11 +175,26 @@ type FastHasher = std::hash::BuildHasherDefault; #[derive(Default)] pub struct Metrics { endpoints: DashMap, FastHasher>, + backup_endpoints: DashMap, FastHasher>, } impl Metrics { /// Register a new byte metrics counter for this endpoint pub fn register(&self, ids: Ids) -> Arc { + let backup = if let Some(entry) = self.backup_endpoints.get(&ids) { + entry.clone() + } else { + self.backup_endpoints + .entry(ids.clone()) + .or_insert_with(|| { + Arc::new(MetricBackupCounter { + transmitted: AtomicU64::new(0), + opened_connections: AtomicUsize::new(0), + }) + }) + .clone() + }; + let entry = if let Some(entry) = self.endpoints.get(&ids) { entry.clone() } else { @@ -104,12 +204,13 @@ impl Metrics { Arc::new(MetricCounter { transmitted: AtomicU64::new(0), opened_connections: AtomicUsize::new(0), + backup: backup.clone(), }) }) .clone() }; - entry.opened_connections.fetch_add(1, Ordering::AcqRel); + entry.record_connection(1); entry } } @@ -132,7 +233,7 @@ pub async fn task_main(config: &MetricCollectionConfig) -> anyhow::Result anyhow::Result, - now: DateTime, -) { - info!( - "starting collect_metrics_iteration. metric_collection_endpoint: {}", - metric_collection_endpoint - ); - +fn collect_and_clear_metrics( + endpoints: &DashMap, FastHasher>, +) -> Vec<(Ids, u64)> { let mut metrics_to_clear = Vec::new(); - let metrics_to_send: Vec<(Ids, u64)> = metrics - .endpoints + let metrics_to_send: Vec<(Ids, u64)> = endpoints .iter() .filter_map(|counter| { let key = counter.key().clone(); @@ -173,33 +262,71 @@ async fn collect_metrics_iteration( }) .collect(); + for metric in metrics_to_clear { + match endpoints.entry(metric) { + Entry::Occupied(mut counter) => { + if counter.get_mut().should_clear() { + counter.remove_entry(); + } + } + Entry::Vacant(_) => {} + } + } + metrics_to_send +} + +fn create_event_chunks<'a>( + metrics_to_send: &'a [(Ids, u64)], + hostname: &'a str, + prev: DateTime, + now: DateTime, + chunk_size: usize, +) -> impl Iterator>> + 'a { + // Split into chunks of 1000 metrics to avoid exceeding the max request size + metrics_to_send + .chunks(chunk_size) + .map(move |chunk| EventChunk { + events: chunk + .iter() + .map(|(ids, value)| Event { + kind: EventType::Incremental { + start_time: prev, + stop_time: now, + }, + metric: PROXY_IO_BYTES_PER_CLIENT, + idempotency_key: idempotency_key(hostname), + value: *value, + extra: ids.clone(), + }) + .collect(), + }) +} + +#[instrument(skip_all)] +async fn collect_metrics_iteration( + endpoints: &DashMap, FastHasher>, + client: &http::ClientWithMiddleware, + metric_collection_endpoint: &reqwest::Url, + hostname: &str, + prev: DateTime, + now: DateTime, +) { + info!( + "starting collect_metrics_iteration. metric_collection_endpoint: {}", + metric_collection_endpoint + ); + + let metrics_to_send = collect_and_clear_metrics(endpoints); + if metrics_to_send.is_empty() { trace!("no new metrics to send"); } // Send metrics. - // Split into chunks of 1000 metrics to avoid exceeding the max request size - for chunk in metrics_to_send.chunks(CHUNK_SIZE) { - let events = chunk - .iter() - .map(|(ids, value)| Event { - kind: EventType::Incremental { - start_time: prev, - stop_time: now, - }, - metric: PROXY_IO_BYTES_PER_CLIENT, - idempotency_key: idempotency_key(hostname), - value: *value, - extra: Ids { - endpoint_id: ids.endpoint_id.clone(), - branch_id: ids.branch_id.clone(), - }, - }) - .collect(); - + for chunk in create_event_chunks(&metrics_to_send, hostname, prev, now, CHUNK_SIZE) { let res = client .post(metric_collection_endpoint.clone()) - .json(&EventChunk { events }) + .json(&chunk) .send() .await; @@ -213,23 +340,142 @@ async fn collect_metrics_iteration( if !res.status().is_success() { error!("metrics endpoint refused the sent metrics: {:?}", res); - for metric in chunk.iter().filter(|(_, value)| *value > (1u64 << 40)) { + for metric in chunk.events.iter().filter(|e| e.value > (1u64 << 40)) { // Report if the metric value is suspiciously large error!("potentially abnormal metric value: {:?}", metric); } } } +} - for metric in metrics_to_clear { - match metrics.endpoints.entry(metric) { - Entry::Occupied(mut counter) => { - if counter.get_mut().should_clear() { - counter.remove_entry(); - } - } - Entry::Vacant(_) => {} +pub async fn task_backup( + backup_config: &MetricBackupCollectionConfig, + cancellation_token: CancellationToken, +) -> anyhow::Result<()> { + info!("metrics backup config: {backup_config:?}"); + scopeguard::defer! { + info!("metrics backup has shut down"); + } + // Even if the remote storage is not configured, we still want to clear the metrics. + let storage = backup_config + .remote_storage_config + .as_ref() + .map(|config| GenericRemoteStorage::from_config(config).context("remote storage init")) + .transpose()?; + let mut ticker = tokio::time::interval(backup_config.interval); + let mut prev = Utc::now(); + let hostname = hostname::get()?.as_os_str().to_string_lossy().into_owned(); + loop { + select(pin!(ticker.tick()), pin!(cancellation_token.cancelled())).await; + let now = Utc::now(); + collect_metrics_backup_iteration( + &USAGE_METRICS.backup_endpoints, + &storage, + &hostname, + prev, + now, + backup_config.chunk_size, + ) + .await; + + prev = now; + if cancellation_token.is_cancelled() { + info!("metrics backup has been cancelled"); + break; } } + Ok(()) +} + +#[instrument(skip_all)] +async fn collect_metrics_backup_iteration( + endpoints: &DashMap, FastHasher>, + storage: &Option, + hostname: &str, + prev: DateTime, + now: DateTime, + chunk_size: usize, +) { + let year = now.year(); + let month = now.month(); + let day = now.day(); + let hour = now.hour(); + let minute = now.minute(); + let second = now.second(); + let cancel = CancellationToken::new(); + + info!("starting collect_metrics_backup_iteration"); + + let metrics_to_send = collect_and_clear_metrics(endpoints); + + if metrics_to_send.is_empty() { + trace!("no new metrics to send"); + } + + // Send metrics. + for chunk in create_event_chunks(&metrics_to_send, hostname, prev, now, chunk_size) { + let real_now = Utc::now(); + let id = uuid::Uuid::new_v7(Timestamp::from_unix( + NoContext, + real_now.second().into(), + real_now.nanosecond(), + )); + let path = format!("year={year:04}/month={month:02}/day={day:02}/{hour:02}:{minute:02}:{second:02}Z_{id}.json.gz"); + let remote_path = match RemotePath::from_string(&path) { + Ok(remote_path) => remote_path, + Err(e) => { + error!("failed to create remote path from str {path}: {:?}", e); + continue; + } + }; + + let res = upload_events_chunk(storage, chunk, &remote_path, &cancel).await; + + if let Err(e) = res { + error!( + "failed to upload consumption events to remote storage: {:?}", + e + ); + } + } +} + +async fn upload_events_chunk( + storage: &Option, + chunk: EventChunk<'_, Event>, + remote_path: &RemotePath, + cancel: &CancellationToken, +) -> anyhow::Result<()> { + let storage = match storage { + Some(storage) => storage, + None => { + error!("no remote storage configured"); + return Ok(()); + } + }; + let data = serde_json::to_vec(&chunk).context("serialize metrics")?; + let mut encoder = GzipEncoder::new(Vec::new()); + encoder.write_all(&data).await.context("compress metrics")?; + encoder.shutdown().await.context("compress metrics")?; + let compressed_data: Bytes = encoder.get_ref().clone().into(); + backoff::retry( + || async { + let stream = futures::stream::once(futures::future::ready(Ok(compressed_data.clone()))); + storage + .upload(stream, compressed_data.len(), remote_path, None, cancel) + .await + }, + TimeoutOrCancel::caused_by_cancel, + FAILED_UPLOAD_WARN_THRESHOLD, + FAILED_UPLOAD_MAX_RETRIES, + "request_data_upload", + cancel, + ) + .await + .ok_or_else(|| anyhow::Error::new(TimeoutOrCancel::Cancel)) + .and_then(|x| x) + .context("request_data_upload")?; + Ok(()) } #[cfg(test)] @@ -248,8 +494,8 @@ mod tests { }; use url::Url; - use super::{collect_metrics_iteration, Ids, Metrics}; - use crate::{http, rate_limiter::RateLimiterConfig}; + use super::*; + use crate::{http, BranchId, EndpointId}; #[tokio::test] async fn metrics() { @@ -279,23 +525,24 @@ mod tests { tokio::spawn(server); let metrics = Metrics::default(); - let client = http::new_client(RateLimiterConfig::default()); + let client = http::new_client(); let endpoint = Url::parse(&format!("http://{addr}")).unwrap(); let now = Utc::now(); // no counters have been registered - collect_metrics_iteration(&metrics, &client, &endpoint, "foo", now, now).await; + collect_metrics_iteration(&metrics.endpoints, &client, &endpoint, "foo", now, now).await; let r = std::mem::take(&mut *reports2.lock().unwrap()); assert!(r.is_empty()); // register a new counter + let counter = metrics.register(Ids { - endpoint_id: "e1".into(), - branch_id: "b1".into(), + endpoint_id: (&EndpointId::from("e1")).into(), + branch_id: (&BranchId::from("b1")).into(), }); // the counter should be observed despite 0 egress - collect_metrics_iteration(&metrics, &client, &endpoint, "foo", now, now).await; + collect_metrics_iteration(&metrics.endpoints, &client, &endpoint, "foo", now, now).await; let r = std::mem::take(&mut *reports2.lock().unwrap()); assert_eq!(r.len(), 1); assert_eq!(r[0].events.len(), 1); @@ -305,7 +552,7 @@ mod tests { counter.record_egress(1); // egress should be observered - collect_metrics_iteration(&metrics, &client, &endpoint, "foo", now, now).await; + collect_metrics_iteration(&metrics.endpoints, &client, &endpoint, "foo", now, now).await; let r = std::mem::take(&mut *reports2.lock().unwrap()); assert_eq!(r.len(), 1); assert_eq!(r[0].events.len(), 1); @@ -315,11 +562,19 @@ mod tests { drop(counter); // we do not observe the counter - collect_metrics_iteration(&metrics, &client, &endpoint, "foo", now, now).await; + collect_metrics_iteration(&metrics.endpoints, &client, &endpoint, "foo", now, now).await; let r = std::mem::take(&mut *reports2.lock().unwrap()); assert!(r.is_empty()); // counter is unregistered assert!(metrics.endpoints.is_empty()); + + collect_metrics_backup_iteration(&metrics.backup_endpoints, &None, "foo", now, now, 1000) + .await; + assert!(!metrics.backup_endpoints.is_empty()); + collect_metrics_backup_iteration(&metrics.backup_endpoints, &None, "foo", now, now, 1000) + .await; + // backup counter is unregistered after the second iteration + assert!(metrics.backup_endpoints.is_empty()); } } diff --git a/pyproject.toml b/pyproject.toml index e347d47cbf..156f135062 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -94,4 +94,5 @@ select = [ "I", # isort "W", # pycodestyle "B", # bugbear + "UP032", # f-string ] diff --git a/safekeeper/Cargo.toml b/safekeeper/Cargo.toml index cb4a1def1f..c8b732fee1 100644 --- a/safekeeper/Cargo.toml +++ b/safekeeper/Cargo.toml @@ -33,6 +33,7 @@ once_cell.workspace = true parking_lot.workspace = true postgres.workspace = true postgres-protocol.workspace = true +rand.workspace = true regex.workspace = true scopeguard.workspace = true reqwest = { workspace = true, features = ["json"] } diff --git a/safekeeper/src/bin/safekeeper.rs b/safekeeper/src/bin/safekeeper.rs index 3c4c81e499..e53ccaeb3d 100644 --- a/safekeeper/src/bin/safekeeper.rs +++ b/safekeeper/src/bin/safekeeper.rs @@ -28,7 +28,7 @@ use utils::pid_file; use metrics::set_build_info_metric; use safekeeper::defaults::{ DEFAULT_HEARTBEAT_TIMEOUT, DEFAULT_HTTP_LISTEN_ADDR, DEFAULT_MAX_OFFLOADER_LAG_BYTES, - DEFAULT_PG_LISTEN_ADDR, + DEFAULT_PARTIAL_BACKUP_TIMEOUT, DEFAULT_PG_LISTEN_ADDR, }; use safekeeper::wal_service; use safekeeper::GlobalTimelines; @@ -170,6 +170,13 @@ struct Args { /// still needed for existing replication connection. #[arg(long)] walsenders_keep_horizon: bool, + /// Enable partial backup. If disabled, safekeeper will not upload partial + /// segments to remote storage. + #[arg(long)] + partial_backup_enabled: bool, + /// Controls how long backup will wait until uploading the partial segment. + #[arg(long, value_parser = humantime::parse_duration, default_value = DEFAULT_PARTIAL_BACKUP_TIMEOUT, verbatim_doc_comment)] + partial_backup_timeout: Duration, } // Like PathBufValueParser, but allows empty string. @@ -300,6 +307,8 @@ async fn main() -> anyhow::Result<()> { http_auth, current_thread_runtime: args.current_thread_runtime, walsenders_keep_horizon: args.walsenders_keep_horizon, + partial_backup_enabled: args.partial_backup_enabled, + partial_backup_timeout: args.partial_backup_timeout, }; // initialize sentry if SENTRY_DSN is provided @@ -365,6 +374,8 @@ async fn start_safekeeper(conf: SafeKeeperConf) -> Result<()> { let (wal_backup_launcher_tx, wal_backup_launcher_rx) = mpsc::channel(100); + wal_backup::init_remote_storage(&conf); + // Keep handles to main tasks to die if any of them disappears. let mut tasks_handles: FuturesUnordered> = FuturesUnordered::new(); diff --git a/safekeeper/src/control_file.rs b/safekeeper/src/control_file.rs index d822c87c0e..fe9f2e6899 100644 --- a/safekeeper/src/control_file.rs +++ b/safekeeper/src/control_file.rs @@ -20,7 +20,7 @@ use utils::{bin_ser::LeSer, id::TenantTimelineId}; use crate::SafeKeeperConf; pub const SK_MAGIC: u32 = 0xcafeceefu32; -pub const SK_FORMAT_VERSION: u32 = 7; +pub const SK_FORMAT_VERSION: u32 = 8; // contains persistent metadata for safekeeper const CONTROL_FILE_NAME: &str = "safekeeper.control"; diff --git a/safekeeper/src/control_file_upgrade.rs b/safekeeper/src/control_file_upgrade.rs index 2fd719326d..8f4dfe9b43 100644 --- a/safekeeper/src/control_file_upgrade.rs +++ b/safekeeper/src/control_file_upgrade.rs @@ -2,6 +2,7 @@ use crate::{ safekeeper::{AcceptorState, PgUuid, ServerInfo, Term, TermHistory, TermLsn}, state::{PersistedPeers, TimelinePersistentState}, + wal_backup_partial, }; use anyhow::{bail, Result}; use pq_proto::SystemId; @@ -138,6 +139,50 @@ pub struct SafeKeeperStateV4 { pub peers: PersistedPeers, } +#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)] +pub struct SafeKeeperStateV7 { + #[serde(with = "hex")] + pub tenant_id: TenantId, + #[serde(with = "hex")] + pub timeline_id: TimelineId, + /// persistent acceptor state + pub acceptor_state: AcceptorState, + /// information about server + pub server: ServerInfo, + /// Unique id of the last *elected* proposer we dealt with. Not needed + /// for correctness, exists for monitoring purposes. + #[serde(with = "hex")] + pub proposer_uuid: PgUuid, + /// Since which LSN this timeline generally starts. Safekeeper might have + /// joined later. + pub timeline_start_lsn: Lsn, + /// Since which LSN safekeeper has (had) WAL for this timeline. + /// All WAL segments next to one containing local_start_lsn are + /// filled with data from the beginning. + pub local_start_lsn: Lsn, + /// Part of WAL acknowledged by quorum *and available locally*. Always points + /// to record boundary. + pub commit_lsn: Lsn, + /// LSN that points to the end of the last backed up segment. Useful to + /// persist to avoid finding out offloading progress on boot. + pub backup_lsn: Lsn, + /// Minimal LSN which may be needed for recovery of some safekeeper (end_lsn + /// of last record streamed to everyone). Persisting it helps skipping + /// recovery in walproposer, generally we compute it from peers. In + /// walproposer proto called 'truncate_lsn'. Updates are currently drived + /// only by walproposer. + pub peer_horizon_lsn: Lsn, + /// LSN of the oldest known checkpoint made by pageserver and successfully + /// pushed to s3. We don't remove WAL beyond it. Persisted only for + /// informational purposes, we receive it from pageserver (or broker). + pub remote_consistent_lsn: Lsn, + // Peers and their state as we remember it. Knowing peers themselves is + // fundamental; but state is saved here only for informational purposes and + // obviously can be stale. (Currently not saved at all, but let's provision + // place to have less file version upgrades). + pub peers: PersistedPeers, +} + pub fn upgrade_control_file(buf: &[u8], version: u32) -> Result { // migrate to storing full term history if version == 1 { @@ -167,6 +212,7 @@ pub fn upgrade_control_file(buf: &[u8], version: u32) -> Result Result Result Result Result>, pub current_thread_runtime: bool, pub walsenders_keep_horizon: bool, + pub partial_backup_enabled: bool, + pub partial_backup_timeout: Duration, } impl SafeKeeperConf { @@ -123,6 +127,8 @@ impl SafeKeeperConf { max_offloader_lag_bytes: defaults::DEFAULT_MAX_OFFLOADER_LAG_BYTES, current_thread_runtime: false, walsenders_keep_horizon: false, + partial_backup_enabled: false, + partial_backup_timeout: Duration::from_secs(0), } } } diff --git a/safekeeper/src/metrics.rs b/safekeeper/src/metrics.rs index e541527b6a..28ae042bb3 100644 --- a/safekeeper/src/metrics.rs +++ b/safekeeper/src/metrics.rs @@ -147,6 +147,21 @@ pub static RECEIVED_PS_FEEDBACKS: Lazy = Lazy::new(|| { ) .expect("Failed to register safekeeper_received_ps_feedbacks_total counter") }); +pub static PARTIAL_BACKUP_UPLOADS: Lazy = Lazy::new(|| { + register_int_counter_vec!( + "safekeeper_partial_backup_uploads_total", + "Number of partial backup uploads to the S3", + &["result"] + ) + .expect("Failed to register safekeeper_partial_backup_uploads_total counter") +}); +pub static PARTIAL_BACKUP_UPLOADED_BYTES: Lazy = Lazy::new(|| { + register_int_counter!( + "safekeeper_partial_backup_uploaded_bytes_total", + "Number of bytes uploaded to the S3 during partial backup" + ) + .expect("Failed to register safekeeper_partial_backup_uploaded_bytes_total counter") +}); pub const LABEL_UNKNOWN: &str = "unknown"; diff --git a/safekeeper/src/safekeeper.rs b/safekeeper/src/safekeeper.rs index d7c8fa6955..f2ee0403eb 100644 --- a/safekeeper/src/safekeeper.rs +++ b/safekeeper/src/safekeeper.rs @@ -1221,6 +1221,7 @@ mod tests { commit_lsn: Lsn(1234567600), }, )]), + partial_backup: crate::wal_backup_partial::State::default(), }; let ser = state.ser().unwrap(); @@ -1266,6 +1267,8 @@ mod tests { 0x2a, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x70, 0x02, 0x96, 0x49, 0x00, 0x00, 0x00, 0x00, 0xb0, 0x01, 0x96, 0x49, 0x00, 0x00, 0x00, 0x00, + // partial_backup + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, ]; assert_eq!(Hex(&ser), Hex(&expected)); diff --git a/safekeeper/src/state.rs b/safekeeper/src/state.rs index 82f7954051..be5e516296 100644 --- a/safekeeper/src/state.rs +++ b/safekeeper/src/state.rs @@ -13,6 +13,7 @@ use utils::{ use crate::{ control_file, safekeeper::{AcceptorState, PersistedPeerInfo, PgUuid, ServerInfo, TermHistory}, + wal_backup_partial::{self}, }; /// Persistent information stored on safekeeper node about timeline. @@ -54,11 +55,14 @@ pub struct TimelinePersistentState { /// pushed to s3. We don't remove WAL beyond it. Persisted only for /// informational purposes, we receive it from pageserver (or broker). pub remote_consistent_lsn: Lsn, - // Peers and their state as we remember it. Knowing peers themselves is - // fundamental; but state is saved here only for informational purposes and - // obviously can be stale. (Currently not saved at all, but let's provision - // place to have less file version upgrades). + /// Peers and their state as we remember it. Knowing peers themselves is + /// fundamental; but state is saved here only for informational purposes and + /// obviously can be stale. (Currently not saved at all, but let's provision + /// place to have less file version upgrades). pub peers: PersistedPeers, + /// Holds names of partial segments uploaded to remote storage. Used to + /// clean up old objects without leaving garbage in remote storage. + pub partial_backup: wal_backup_partial::State, } #[derive(Debug, Clone, Serialize, Deserialize, PartialEq)] @@ -93,6 +97,7 @@ impl TimelinePersistentState { .map(|p| (*p, PersistedPeerInfo::new())) .collect(), ), + partial_backup: wal_backup_partial::State::default(), } } diff --git a/safekeeper/src/timeline.rs b/safekeeper/src/timeline.rs index 4901b86acf..64f764f191 100644 --- a/safekeeper/src/timeline.rs +++ b/safekeeper/src/timeline.rs @@ -38,7 +38,7 @@ use crate::{control_file, safekeeper::UNKNOWN_SERVER_VERSION}; use crate::metrics::FullTimelineInfo; use crate::wal_storage::Storage as wal_storage_iface; -use crate::{debug_dump, wal_storage}; +use crate::{debug_dump, wal_backup_partial, wal_storage}; use crate::{GlobalTimelines, SafeKeeperConf}; /// Things safekeeper should know about timeline state on peers. @@ -503,6 +503,9 @@ impl Timeline { if conf.peer_recovery_enabled { tokio::spawn(recovery_main(self.clone(), conf.clone())); } + if conf.is_wal_backup_enabled() && conf.partial_backup_enabled { + tokio::spawn(wal_backup_partial::main_task(self.clone(), conf.clone())); + } } /// Delete timeline from disk completely, by removing timeline directory. @@ -667,8 +670,8 @@ impl Timeline { term_flush_lsn = TermLsn::from((shared_state.sk.get_term(), shared_state.sk.flush_lsn())); } - self.commit_lsn_watch_tx.send(commit_lsn)?; self.term_flush_lsn_watch_tx.send(term_flush_lsn)?; + self.commit_lsn_watch_tx.send(commit_lsn)?; Ok(rmsg) } diff --git a/safekeeper/src/wal_backup.rs b/safekeeper/src/wal_backup.rs index 944d80f777..e3f6a606a0 100644 --- a/safekeeper/src/wal_backup.rs +++ b/safekeeper/src/wal_backup.rs @@ -18,7 +18,7 @@ use std::time::Duration; use postgres_ffi::v14::xlog_utils::XLogSegNoOffsetToRecPtr; use postgres_ffi::XLogFileName; use postgres_ffi::{XLogSegNo, PG_TLI}; -use remote_storage::{GenericRemoteStorage, RemotePath}; +use remote_storage::{GenericRemoteStorage, RemotePath, StorageMetadata}; use tokio::fs::File; use tokio::select; @@ -180,6 +180,16 @@ fn get_configured_remote_storage() -> &'static GenericRemoteStorage { .unwrap() } +pub fn init_remote_storage(conf: &SafeKeeperConf) { + // TODO: refactor REMOTE_STORAGE to avoid using global variables, and provide + // dependencies to all tasks instead. + REMOTE_STORAGE.get_or_init(|| { + conf.remote_storage + .as_ref() + .map(|c| GenericRemoteStorage::from_config(c).expect("failed to create remote storage")) + }); +} + const CHECK_TASKS_INTERVAL_MSEC: u64 = 1000; /// Sits on wal_backup_launcher_rx and starts/stops per timeline wal backup @@ -194,14 +204,6 @@ pub async fn wal_backup_launcher_task_main( conf.remote_storage ); - let conf_ = conf.clone(); - REMOTE_STORAGE.get_or_init(|| { - conf_ - .remote_storage - .as_ref() - .map(|c| GenericRemoteStorage::from_config(c).expect("failed to create remote storage")) - }); - // Presence in this map means launcher is aware s3 offloading is needed for // the timeline, but task is started only if it makes sense for to offload // from this safekeeper. @@ -518,6 +520,35 @@ async fn backup_object( .await } +pub(crate) async fn backup_partial_segment( + source_file: &Utf8Path, + target_file: &RemotePath, + size: usize, +) -> Result<()> { + let storage = get_configured_remote_storage(); + + let file = File::open(&source_file) + .await + .with_context(|| format!("Failed to open file {source_file:?} for wal backup"))?; + + // limiting the file to read only the first `size` bytes + let limited_file = tokio::io::AsyncReadExt::take(file, size as u64); + + let file = tokio_util::io::ReaderStream::with_capacity(limited_file, BUFFER_SIZE); + + let cancel = CancellationToken::new(); + + storage + .upload( + file, + size, + target_file, + Some(StorageMetadata::from([("sk_type", "partial_segment")])), + &cancel, + ) + .await +} + pub async fn read_object( file_path: &RemotePath, offset: u64, @@ -604,6 +635,13 @@ pub async fn delete_timeline(ttid: &TenantTimelineId) -> Result<()> { Ok(()) } +/// Used by wal_backup_partial. +pub async fn delete_objects(paths: &[RemotePath]) -> Result<()> { + let cancel = CancellationToken::new(); // not really used + let storage = get_configured_remote_storage(); + storage.delete_objects(paths, &cancel).await +} + /// Copy segments from one timeline to another. Used in copy_timeline. pub async fn copy_s3_segments( wal_seg_size: usize, diff --git a/safekeeper/src/wal_backup_partial.rs b/safekeeper/src/wal_backup_partial.rs new file mode 100644 index 0000000000..200096ac5c --- /dev/null +++ b/safekeeper/src/wal_backup_partial.rs @@ -0,0 +1,407 @@ +//! Safekeeper timeline has a background task which is subscribed to `commit_lsn` +//! and `flush_lsn` updates. After the partial segment was updated (`flush_lsn` +//! was changed), the segment will be uploaded to S3 in about 15 minutes. +//! +//! The filename format for partial segments is +//! `Segment_Term_Flush_Commit_skNN.partial`, where: +//! - `Segment` – the segment name, like `000000010000000000000001` +//! - `Term` – current term +//! - `Flush` – flush_lsn in hex format `{:016X}`, e.g. `00000000346BC568` +//! - `Commit` – commit_lsn in the same hex format +//! - `NN` – safekeeper_id, like `1` +//! +//! The full object name example: +//! `000000010000000000000002_2_0000000002534868_0000000002534410_sk1.partial` +//! +//! Each safekeeper will keep info about remote partial segments in its control +//! file. Code updates state in the control file before doing any S3 operations. +//! This way control file stores information about all potentially existing +//! remote partial segments and can clean them up after uploading a newer version. + +use std::sync::Arc; + +use camino::Utf8PathBuf; +use postgres_ffi::{XLogFileName, XLogSegNo, PG_TLI}; +use rand::Rng; +use remote_storage::RemotePath; +use serde::{Deserialize, Serialize}; + +use tracing::{debug, error, info, instrument}; +use utils::lsn::Lsn; + +use crate::{ + metrics::{PARTIAL_BACKUP_UPLOADED_BYTES, PARTIAL_BACKUP_UPLOADS}, + safekeeper::Term, + timeline::Timeline, + wal_backup, SafeKeeperConf, +}; + +#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)] +pub enum UploadStatus { + /// Upload is in progress + InProgress, + /// Upload is finished + Uploaded, + /// Deletion is in progress + Deleting, +} + +#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)] +pub struct PartialRemoteSegment { + pub status: UploadStatus, + pub name: String, + pub commit_lsn: Lsn, + pub flush_lsn: Lsn, + pub term: Term, +} + +impl PartialRemoteSegment { + fn eq_without_status(&self, other: &Self) -> bool { + self.name == other.name + && self.commit_lsn == other.commit_lsn + && self.flush_lsn == other.flush_lsn + && self.term == other.term + } +} + +// NB: these structures are a part of a control_file, you can't change them without +// changing the control file format version. +#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Default)] +pub struct State { + pub segments: Vec, +} + +impl State { + /// Find an Uploaded segment. There should be only one Uploaded segment at a time. + fn uploaded_segment(&self) -> Option { + self.segments + .iter() + .find(|seg| seg.status == UploadStatus::Uploaded) + .cloned() + } +} + +struct PartialBackup { + wal_seg_size: usize, + tli: Arc, + conf: SafeKeeperConf, + local_prefix: Utf8PathBuf, + remote_prefix: Utf8PathBuf, + + state: State, +} + +// Read-only methods for getting segment names +impl PartialBackup { + fn segno(&self, lsn: Lsn) -> XLogSegNo { + lsn.segment_number(self.wal_seg_size) + } + + fn segment_name(&self, segno: u64) -> String { + XLogFileName(PG_TLI, segno, self.wal_seg_size) + } + + fn remote_segment_name( + &self, + segno: u64, + term: u64, + commit_lsn: Lsn, + flush_lsn: Lsn, + ) -> String { + format!( + "{}_{}_{:016X}_{:016X}_sk{}.partial", + self.segment_name(segno), + term, + flush_lsn.0, + commit_lsn.0, + self.conf.my_id.0, + ) + } + + fn local_segment_name(&self, segno: u64) -> String { + format!("{}.partial", self.segment_name(segno)) + } +} + +impl PartialBackup { + /// Takes a lock to read actual safekeeper state and returns a segment that should be uploaded. + async fn prepare_upload(&self) -> PartialRemoteSegment { + // this operation takes a lock to get the actual state + let sk_info = self.tli.get_safekeeper_info(&self.conf).await; + let flush_lsn = Lsn(sk_info.flush_lsn); + let commit_lsn = Lsn(sk_info.commit_lsn); + let term = sk_info.term; + let segno = self.segno(flush_lsn); + + let name = self.remote_segment_name(segno, term, commit_lsn, flush_lsn); + + PartialRemoteSegment { + status: UploadStatus::InProgress, + name, + commit_lsn, + flush_lsn, + term, + } + } + + /// Reads segment from disk and uploads it to the remote storage. + async fn upload_segment(&mut self, prepared: PartialRemoteSegment) -> anyhow::Result<()> { + let flush_lsn = prepared.flush_lsn; + let segno = self.segno(flush_lsn); + + // We're going to backup bytes from the start of the segment up to flush_lsn. + let backup_bytes = flush_lsn.segment_offset(self.wal_seg_size); + + let local_path = self.local_prefix.join(self.local_segment_name(segno)); + let remote_path = RemotePath::new(self.remote_prefix.join(&prepared.name).as_ref())?; + + // Upload first `backup_bytes` bytes of the segment to the remote storage. + wal_backup::backup_partial_segment(&local_path, &remote_path, backup_bytes).await?; + PARTIAL_BACKUP_UPLOADED_BYTES.inc_by(backup_bytes as u64); + + // We uploaded the segment, now let's verify that the data is still actual. + // If the term changed, we cannot guarantee the validity of the uploaded data. + // If the term is the same, we know the data is not corrupted. + let sk_info = self.tli.get_safekeeper_info(&self.conf).await; + if sk_info.term != prepared.term { + anyhow::bail!("term changed during upload"); + } + assert!(prepared.commit_lsn <= Lsn(sk_info.commit_lsn)); + assert!(prepared.flush_lsn <= Lsn(sk_info.flush_lsn)); + + Ok(()) + } + + /// Write new state to disk. If in-memory and on-disk states diverged, returns an error. + async fn commit_state(&mut self, new_state: State) -> anyhow::Result<()> { + self.tli + .map_control_file(|cf| { + if cf.partial_backup != self.state { + let memory = self.state.clone(); + self.state = cf.partial_backup.clone(); + anyhow::bail!( + "partial backup state diverged, memory={:?}, disk={:?}", + memory, + cf.partial_backup + ); + } + + cf.partial_backup = new_state.clone(); + Ok(()) + }) + .await?; + // update in-memory state + self.state = new_state; + Ok(()) + } + + /// Upload the latest version of the partial segment and garbage collect older versions. + #[instrument(name = "upload", skip_all, fields(name = %prepared.name))] + async fn do_upload(&mut self, prepared: &PartialRemoteSegment) -> anyhow::Result<()> { + info!("starting upload {:?}", prepared); + + let state_0 = self.state.clone(); + let state_1 = { + let mut state = state_0.clone(); + state.segments.push(prepared.clone()); + state + }; + + // we're going to upload a new segment, let's write it to disk to make GC later + self.commit_state(state_1).await?; + + self.upload_segment(prepared.clone()).await?; + + let state_2 = { + let mut state = state_0.clone(); + for seg in state.segments.iter_mut() { + seg.status = UploadStatus::Deleting; + } + let mut actual_remote_segment = prepared.clone(); + actual_remote_segment.status = UploadStatus::Uploaded; + state.segments.push(actual_remote_segment); + state + }; + + // we've uploaded new segment, it's actual, all other segments should be GCed + self.commit_state(state_2).await?; + self.gc().await?; + + Ok(()) + } + + /// Delete all non-Uploaded segments from the remote storage. There should be only one + /// Uploaded segment at a time. + #[instrument(name = "gc", skip_all)] + async fn gc(&mut self) -> anyhow::Result<()> { + let mut segments_to_delete = vec![]; + + let new_segments: Vec = self + .state + .segments + .iter() + .filter_map(|seg| { + if seg.status == UploadStatus::Uploaded { + Some(seg.clone()) + } else { + segments_to_delete.push(seg.name.clone()); + None + } + }) + .collect(); + + info!("deleting objects: {:?}", segments_to_delete); + let mut objects_to_delete = vec![]; + for seg in segments_to_delete.iter() { + let remote_path = RemotePath::new(self.remote_prefix.join(seg).as_ref())?; + objects_to_delete.push(remote_path); + } + + // removing segments from remote storage + wal_backup::delete_objects(&objects_to_delete).await?; + + // now we can update the state on disk + let new_state = { + let mut state = self.state.clone(); + state.segments = new_segments; + state + }; + self.commit_state(new_state).await?; + + Ok(()) + } +} + +#[instrument(name = "Partial backup", skip_all, fields(ttid = %tli.ttid))] +pub async fn main_task(tli: Arc, conf: SafeKeeperConf) { + debug!("started"); + let await_duration = conf.partial_backup_timeout; + + let mut cancellation_rx = match tli.get_cancellation_rx() { + Ok(rx) => rx, + Err(_) => { + info!("timeline canceled during task start"); + return; + } + }; + + // sleep for random time to avoid thundering herd + { + let randf64 = rand::thread_rng().gen_range(0.0..1.0); + let sleep_duration = await_duration.mul_f64(randf64); + tokio::time::sleep(sleep_duration).await; + } + + let (_, persistent_state) = tli.get_state().await; + let mut commit_lsn_rx = tli.get_commit_lsn_watch_rx(); + let mut flush_lsn_rx = tli.get_term_flush_lsn_watch_rx(); + let wal_seg_size = tli.get_wal_seg_size().await; + + let local_prefix = tli.timeline_dir.clone(); + let remote_prefix = match tli.timeline_dir.strip_prefix(&conf.workdir) { + Ok(path) => path.to_owned(), + Err(e) => { + error!("failed to strip workspace dir prefix: {:?}", e); + return; + } + }; + + let mut backup = PartialBackup { + wal_seg_size, + tli, + state: persistent_state.partial_backup, + conf, + local_prefix, + remote_prefix, + }; + + debug!("state: {:?}", backup.state); + + 'outer: loop { + // wait until we have something to upload + let uploaded_segment = backup.state.uploaded_segment(); + if let Some(seg) = &uploaded_segment { + // if we already uploaded something, wait until we have something new + while flush_lsn_rx.borrow().lsn == seg.flush_lsn + && *commit_lsn_rx.borrow() == seg.commit_lsn + && flush_lsn_rx.borrow().term == seg.term + { + tokio::select! { + _ = cancellation_rx.changed() => { + info!("timeline canceled"); + return; + } + _ = commit_lsn_rx.changed() => {} + _ = flush_lsn_rx.changed() => {} + } + } + } + + // if we don't have any data and zero LSNs, wait for something + while flush_lsn_rx.borrow().lsn == Lsn(0) { + tokio::select! { + _ = cancellation_rx.changed() => { + info!("timeline canceled"); + return; + } + _ = flush_lsn_rx.changed() => {} + } + } + + // fixing the segno and waiting some time to prevent reuploading the same segment too often + let pending_segno = backup.segno(flush_lsn_rx.borrow().lsn); + let timeout = tokio::time::sleep(await_duration); + tokio::pin!(timeout); + let mut timeout_expired = false; + + // waiting until timeout expires OR segno changes + 'inner: loop { + tokio::select! { + _ = cancellation_rx.changed() => { + info!("timeline canceled"); + return; + } + _ = commit_lsn_rx.changed() => {} + _ = flush_lsn_rx.changed() => { + let segno = backup.segno(flush_lsn_rx.borrow().lsn); + if segno != pending_segno { + // previous segment is no longer partial, aborting the wait + break 'inner; + } + } + _ = &mut timeout => { + // timeout expired, now we are ready for upload + timeout_expired = true; + break 'inner; + } + } + } + + if !timeout_expired { + // likely segno has changed, let's try again in the next iteration + continue 'outer; + } + + let prepared = backup.prepare_upload().await; + if let Some(seg) = &uploaded_segment { + if seg.eq_without_status(&prepared) { + // we already uploaded this segment, nothing to do + continue 'outer; + } + } + + match backup.do_upload(&prepared).await { + Ok(()) => { + debug!( + "uploaded {} up to flush_lsn {}", + prepared.name, prepared.flush_lsn + ); + PARTIAL_BACKUP_UPLOADS.with_label_values(&["ok"]).inc(); + } + Err(e) => { + info!("failed to upload {}: {:#}", prepared.name, e); + PARTIAL_BACKUP_UPLOADS.with_label_values(&["error"]).inc(); + } + } + } +} diff --git a/safekeeper/tests/walproposer_sim/safekeeper.rs b/safekeeper/tests/walproposer_sim/safekeeper.rs index e3aaf5d391..bc21c4d765 100644 --- a/safekeeper/tests/walproposer_sim/safekeeper.rs +++ b/safekeeper/tests/walproposer_sim/safekeeper.rs @@ -176,6 +176,8 @@ pub fn run_server(os: NodeOs, disk: Arc) -> Result<()> { http_auth: None, current_thread_runtime: false, walsenders_keep_horizon: false, + partial_backup_enabled: false, + partial_backup_timeout: Duration::from_secs(0), }; let mut global = GlobalMap::new(disk, conf.clone())?; diff --git a/scripts/export_import_between_pageservers.py b/scripts/export_import_between_pageservers.py index 980f343047..84b69cb36a 100755 --- a/scripts/export_import_between_pageservers.py +++ b/scripts/export_import_between_pageservers.py @@ -64,14 +64,14 @@ def subprocess_capture(capture_dir: str, cmd: List[str], **kwargs: Any) -> str: Returns basepath for files with captured output. """ assert isinstance(cmd, list) - base = os.path.basename(cmd[0]) + "_{}".format(global_counter()) + base = f"{os.path.basename(cmd[0])}_{global_counter()}" basepath = os.path.join(capture_dir, base) stdout_filename = basepath + ".stdout" stderr_filename = basepath + ".stderr" with open(stdout_filename, "w") as stdout_f: with open(stderr_filename, "w") as stderr_f: - print('(capturing output to "{}.stdout")'.format(base)) + print(f'(capturing output to "{base}.stdout")') subprocess.run(cmd, **kwargs, stdout=stdout_f, stderr=stderr_f) return basepath @@ -82,11 +82,9 @@ class PgBin: def __init__(self, log_dir: Path, pg_distrib_dir, pg_version): self.log_dir = log_dir - self.pg_bin_path = os.path.join(str(pg_distrib_dir), "v{}".format(pg_version), "bin") + self.pg_bin_path = os.path.join(str(pg_distrib_dir), f"v{pg_version}", "bin") self.env = os.environ.copy() - self.env["LD_LIBRARY_PATH"] = os.path.join( - str(pg_distrib_dir), "v{}".format(pg_version), "lib" - ) + self.env["LD_LIBRARY_PATH"] = os.path.join(str(pg_distrib_dir), f"v{pg_version}", "lib") def _fixpath(self, command: List[str]): if "/" not in command[0]: @@ -110,7 +108,7 @@ class PgBin: """ self._fixpath(command) - print('Running command "{}"'.format(" ".join(command))) + print(f'Running command "{" ".join(command)}"') env = self._build_env(env) subprocess.run(command, env=env, cwd=cwd, check=True) @@ -128,7 +126,7 @@ class PgBin: """ self._fixpath(command) - print('Running command "{}"'.format(" ".join(command))) + print(f'Running command "{" ".join(command)}"') env = self._build_env(env) return subprocess_capture( str(self.log_dir), command, env=env, cwd=cwd, check=True, **kwargs @@ -300,7 +298,7 @@ class NeonPageserverHttpClient(requests.Session): def lsn_to_hex(num: int) -> str: """Convert lsn from int to standard hex notation.""" - return "{:X}/{:X}".format(num >> 32, num & 0xFFFFFFFF) + return f"{num >> 32:X}/{num & 0xFFFFFFFF:X}" def lsn_from_hex(lsn_hex: str) -> int: @@ -331,16 +329,12 @@ def wait_for_upload( if current_lsn >= lsn: return print( - "waiting for remote_consistent_lsn to reach {}, now {}, iteration {}".format( - lsn_to_hex(lsn), lsn_to_hex(current_lsn), i + 1 - ) + f"waiting for remote_consistent_lsn to reach {lsn_to_hex(lsn)}, now {lsn_to_hex(current_lsn)}, iteration {i + 1}" ) time.sleep(1) raise Exception( - "timed out while waiting for remote_consistent_lsn to reach {}, was {}".format( - lsn_to_hex(lsn), lsn_to_hex(current_lsn) - ) + f"timed out while waiting for remote_consistent_lsn to reach {lsn_to_hex(lsn)}, was {lsn_to_hex(current_lsn)}" ) diff --git a/scripts/flaky_tests.py b/scripts/flaky_tests.py index 853c67d218..878840fcee 100755 --- a/scripts/flaky_tests.py +++ b/scripts/flaky_tests.py @@ -15,8 +15,7 @@ FLAKY_TESTS_QUERY = """ DISTINCT parent_suite, suite, name FROM results WHERE - started_at > CURRENT_DATE - INTERVAL '10' day - AND started_at > '2024-03-11 14:50:11.845+00' -- we switched the default PAGESERVER_VIRTUAL_FILE_IO_ENGINE to `tokio-epoll-uring` from `std-fs` on this date, we want to ignore the flaky tests for `std-fs` + started_at > CURRENT_DATE - INTERVAL '%s' day AND ( (status IN ('failed', 'broken') AND reference = 'refs/heads/main') OR flaky diff --git a/scripts/sk_cleanup_tenants/script.py b/scripts/sk_cleanup_tenants/script.py index fa22433614..c20a4bb830 100644 --- a/scripts/sk_cleanup_tenants/script.py +++ b/scripts/sk_cleanup_tenants/script.py @@ -22,7 +22,7 @@ parser.add_argument("--safekeeper-host", required=True, type=str) args = parser.parse_args() access_key = os.getenv("CONSOLE_API_TOKEN") -endpoint: str = "https://console.stage.neon.tech/api" +endpoint: str = "https://console-stage.neon.build/api" trash_dir: Path = args.trash_dir dry_run: bool = args.dry_run diff --git a/scripts/sk_collect_dumps/readme.md b/scripts/sk_collect_dumps/readme.md index 7494a6cb78..5ae55e058b 100644 --- a/scripts/sk_collect_dumps/readme.md +++ b/scripts/sk_collect_dumps/readme.md @@ -3,7 +3,7 @@ 3. Issue admin token (add/remove .stage from url for staging/prod and setting proper API key): ``` # staging: -AUTH_TOKEN=$(curl https://console.stage.neon.tech/regions/console/api/v1/admin/issue_token -H "Accept: application/json" -H "Content-Type: application/json" -H "Authorization: Bearer $NEON_STAGING_KEY" -X POST -d '{"ttl_seconds": 43200, "scope": "safekeeperdata"}' 2>/dev/null | jq --raw-output '.jwt') +AUTH_TOKEN=$(curl https://console-stage.neon.build/regions/console/api/v1/admin/issue_token -H "Accept: application/json" -H "Content-Type: application/json" -H "Authorization: Bearer $NEON_STAGING_KEY" -X POST -d '{"ttl_seconds": 43200, "scope": "safekeeperdata"}' 2>/dev/null | jq --raw-output '.jwt') # prod: AUTH_TOKEN=$(curl https://console.neon.tech/regions/console/api/v1/admin/issue_token -H "Accept: application/json" -H "Content-Type: application/json" -H "Authorization: Bearer $NEON_PROD_KEY" -X POST -d '{"ttl_seconds": 43200, "scope": "safekeeperdata"}' 2>/dev/null | jq --raw-output '.jwt') # check diff --git a/control_plane/attachment_service/Cargo.toml b/storage_controller/Cargo.toml similarity index 81% rename from control_plane/attachment_service/Cargo.toml rename to storage_controller/Cargo.toml index 0201e0ed86..165cafaf4e 100644 --- a/control_plane/attachment_service/Cargo.toml +++ b/storage_controller/Cargo.toml @@ -1,5 +1,5 @@ [package] -name = "attachment_service" +name = "storage_controller" version = "0.1.0" edition.workspace = true license.workspace = true @@ -25,6 +25,7 @@ git-version.workspace = true hex.workspace = true hyper.workspace = true humantime.workspace = true +itertools.workspace = true lasso.workspace = true once_cell.workspace = true pageserver_api.workspace = true @@ -44,8 +45,8 @@ diesel = { version = "2.1.4", features = ["serde_json", "postgres", "r2d2"] } diesel_migrations = { version = "2.1.0" } r2d2 = { version = "0.8.10" } -utils = { path = "../../libs/utils/" } -metrics = { path = "../../libs/metrics/" } -control_plane = { path = ".." } -workspace_hack = { version = "0.1", path = "../../workspace_hack" } +utils = { path = "../libs/utils/" } +metrics = { path = "../libs/metrics/" } +control_plane = { path = "../control_plane" } +workspace_hack = { version = "0.1", path = "../workspace_hack" } diff --git a/control_plane/attachment_service/migrations/.keep b/storage_controller/migrations/.keep similarity index 100% rename from control_plane/attachment_service/migrations/.keep rename to storage_controller/migrations/.keep diff --git a/control_plane/attachment_service/migrations/00000000000000_diesel_initial_setup/down.sql b/storage_controller/migrations/00000000000000_diesel_initial_setup/down.sql similarity index 100% rename from control_plane/attachment_service/migrations/00000000000000_diesel_initial_setup/down.sql rename to storage_controller/migrations/00000000000000_diesel_initial_setup/down.sql diff --git a/control_plane/attachment_service/migrations/00000000000000_diesel_initial_setup/up.sql b/storage_controller/migrations/00000000000000_diesel_initial_setup/up.sql similarity index 100% rename from control_plane/attachment_service/migrations/00000000000000_diesel_initial_setup/up.sql rename to storage_controller/migrations/00000000000000_diesel_initial_setup/up.sql diff --git a/control_plane/attachment_service/migrations/2024-01-07-211257_create_tenant_shards/down.sql b/storage_controller/migrations/2024-01-07-211257_create_tenant_shards/down.sql similarity index 100% rename from control_plane/attachment_service/migrations/2024-01-07-211257_create_tenant_shards/down.sql rename to storage_controller/migrations/2024-01-07-211257_create_tenant_shards/down.sql diff --git a/control_plane/attachment_service/migrations/2024-01-07-211257_create_tenant_shards/up.sql b/storage_controller/migrations/2024-01-07-211257_create_tenant_shards/up.sql similarity index 100% rename from control_plane/attachment_service/migrations/2024-01-07-211257_create_tenant_shards/up.sql rename to storage_controller/migrations/2024-01-07-211257_create_tenant_shards/up.sql diff --git a/control_plane/attachment_service/migrations/2024-01-07-212945_create_nodes/down.sql b/storage_controller/migrations/2024-01-07-212945_create_nodes/down.sql similarity index 100% rename from control_plane/attachment_service/migrations/2024-01-07-212945_create_nodes/down.sql rename to storage_controller/migrations/2024-01-07-212945_create_nodes/down.sql diff --git a/control_plane/attachment_service/migrations/2024-01-07-212945_create_nodes/up.sql b/storage_controller/migrations/2024-01-07-212945_create_nodes/up.sql similarity index 100% rename from control_plane/attachment_service/migrations/2024-01-07-212945_create_nodes/up.sql rename to storage_controller/migrations/2024-01-07-212945_create_nodes/up.sql diff --git a/control_plane/attachment_service/migrations/2024-02-29-094122_generations_null/down.sql b/storage_controller/migrations/2024-02-29-094122_generations_null/down.sql similarity index 100% rename from control_plane/attachment_service/migrations/2024-02-29-094122_generations_null/down.sql rename to storage_controller/migrations/2024-02-29-094122_generations_null/down.sql diff --git a/control_plane/attachment_service/migrations/2024-02-29-094122_generations_null/up.sql b/storage_controller/migrations/2024-02-29-094122_generations_null/up.sql similarity index 100% rename from control_plane/attachment_service/migrations/2024-02-29-094122_generations_null/up.sql rename to storage_controller/migrations/2024-02-29-094122_generations_null/up.sql diff --git a/control_plane/attachment_service/migrations/2024-03-18-184429_rename_policy/down.sql b/storage_controller/migrations/2024-03-18-184429_rename_policy/down.sql similarity index 100% rename from control_plane/attachment_service/migrations/2024-03-18-184429_rename_policy/down.sql rename to storage_controller/migrations/2024-03-18-184429_rename_policy/down.sql diff --git a/control_plane/attachment_service/migrations/2024-03-18-184429_rename_policy/up.sql b/storage_controller/migrations/2024-03-18-184429_rename_policy/up.sql similarity index 100% rename from control_plane/attachment_service/migrations/2024-03-18-184429_rename_policy/up.sql rename to storage_controller/migrations/2024-03-18-184429_rename_policy/up.sql diff --git a/storage_controller/migrations/2024-03-27-133204_tenant_policies/down.sql b/storage_controller/migrations/2024-03-27-133204_tenant_policies/down.sql new file mode 100644 index 0000000000..33c06dc03d --- /dev/null +++ b/storage_controller/migrations/2024-03-27-133204_tenant_policies/down.sql @@ -0,0 +1,3 @@ +-- This file should undo anything in `up.sql` + +ALTER TABLE tenant_shards drop scheduling_policy; \ No newline at end of file diff --git a/storage_controller/migrations/2024-03-27-133204_tenant_policies/up.sql b/storage_controller/migrations/2024-03-27-133204_tenant_policies/up.sql new file mode 100644 index 0000000000..aa00f0d2ca --- /dev/null +++ b/storage_controller/migrations/2024-03-27-133204_tenant_policies/up.sql @@ -0,0 +1,2 @@ + +ALTER TABLE tenant_shards add scheduling_policy VARCHAR NOT NULL DEFAULT '"Active"'; diff --git a/control_plane/attachment_service/src/auth.rs b/storage_controller/src/auth.rs similarity index 100% rename from control_plane/attachment_service/src/auth.rs rename to storage_controller/src/auth.rs diff --git a/control_plane/attachment_service/src/compute_hook.rs b/storage_controller/src/compute_hook.rs similarity index 57% rename from control_plane/attachment_service/src/compute_hook.rs rename to storage_controller/src/compute_hook.rs index bebc62ac2f..1ed8998713 100644 --- a/control_plane/attachment_service/src/compute_hook.rs +++ b/storage_controller/src/compute_hook.rs @@ -1,3 +1,4 @@ +use std::sync::Arc; use std::{collections::HashMap, time::Duration}; use control_plane::endpoint::{ComputeControlPlane, EndpointStatus}; @@ -14,19 +15,32 @@ use utils::{ use crate::service::Config; -const BUSY_DELAY: Duration = Duration::from_secs(1); const SLOWDOWN_DELAY: Duration = Duration::from_secs(5); +const NOTIFY_REQUEST_TIMEOUT: Duration = Duration::from_secs(10); + pub(crate) const API_CONCURRENCY: usize = 32; +struct UnshardedComputeHookTenant { + // Which node is this tenant attached to + node_id: NodeId, + + // Must hold this lock to send a notification. + send_lock: Arc>>, +} struct ShardedComputeHookTenant { stripe_size: ShardStripeSize, shard_count: ShardCount, shards: Vec<(ShardNumber, NodeId)>, + + // Must hold this lock to send a notification. The contents represent + // the last successfully sent notification, and are used to coalesce multiple + // updates by only sending when there is a chance since our last successful send. + send_lock: Arc>>, } enum ComputeHookTenant { - Unsharded(NodeId), + Unsharded(UnshardedComputeHookTenant), Sharded(ShardedComputeHookTenant), } @@ -38,9 +52,20 @@ impl ComputeHookTenant { shards: vec![(tenant_shard_id.shard_number, node_id)], stripe_size, shard_count: tenant_shard_id.shard_count, + send_lock: Arc::default(), }) } else { - Self::Unsharded(node_id) + Self::Unsharded(UnshardedComputeHookTenant { + node_id, + send_lock: Arc::default(), + }) + } + } + + fn get_send_lock(&self) -> &Arc>> { + match self { + Self::Unsharded(unsharded_tenant) => &unsharded_tenant.send_lock, + Self::Sharded(sharded_tenant) => &sharded_tenant.send_lock, } } @@ -53,8 +78,8 @@ impl ComputeHookTenant { node_id: NodeId, ) { match self { - Self::Unsharded(existing_node_id) if tenant_shard_id.shard_count.count() == 1 => { - *existing_node_id = node_id + Self::Unsharded(unsharded_tenant) if tenant_shard_id.shard_count.count() == 1 => { + unsharded_tenant.node_id = node_id } Self::Sharded(sharded_tenant) if sharded_tenant.stripe_size == stripe_size @@ -81,14 +106,14 @@ impl ComputeHookTenant { } } -#[derive(Serialize, Deserialize, Debug)] +#[derive(Serialize, Deserialize, Debug, Eq, PartialEq)] struct ComputeHookNotifyRequestShard { node_id: NodeId, shard_number: ShardNumber, } /// Request body that we send to the control plane to notify it of where a tenant is attached -#[derive(Serialize, Deserialize, Debug)] +#[derive(Serialize, Deserialize, Debug, Eq, PartialEq)] struct ComputeHookNotifyRequest { tenant_id: TenantId, stripe_size: Option, @@ -121,14 +146,44 @@ pub(crate) enum NotifyError { Fatal(StatusCode), } +enum MaybeSendResult { + // Please send this request while holding the lock, and if you succeed then write + // the request into the lock. + Transmit( + ( + ComputeHookNotifyRequest, + tokio::sync::OwnedMutexGuard>, + ), + ), + // Something requires sending, but you must wait for a current sender then call again + AwaitLock(Arc>>), + // Nothing requires sending + Noop, +} + impl ComputeHookTenant { - fn maybe_reconfigure(&self, tenant_id: TenantId) -> Option { - match self { - Self::Unsharded(node_id) => Some(ComputeHookNotifyRequest { + fn maybe_send( + &self, + tenant_id: TenantId, + lock: Option>>, + ) -> MaybeSendResult { + let locked = match lock { + Some(already_locked) => already_locked, + None => { + // Lock order: this _must_ be only a try_lock, because we are called inside of the [`ComputeHook::state`] lock. + let Ok(locked) = self.get_send_lock().clone().try_lock_owned() else { + return MaybeSendResult::AwaitLock(self.get_send_lock().clone()); + }; + locked + } + }; + + let request = match self { + Self::Unsharded(unsharded_tenant) => Some(ComputeHookNotifyRequest { tenant_id, shards: vec![ComputeHookNotifyRequestShard { shard_number: ShardNumber(0), - node_id: *node_id, + node_id: unsharded_tenant.node_id, }], stripe_size: None, }), @@ -152,12 +207,25 @@ impl ComputeHookTenant { // Sharded tenant doesn't yet have information for all its shards tracing::info!( - "ComputeHookTenant::maybe_reconfigure: not enough shards ({}/{})", + "ComputeHookTenant::maybe_send: not enough shards ({}/{})", sharded_tenant.shards.len(), sharded_tenant.shard_count.count() ); None } + }; + + match request { + None => { + // Not yet ready to emit a notification + tracing::info!("Tenant isn't yet ready to emit a notification"); + MaybeSendResult::Noop + } + Some(request) if Some(&request) == locked.as_ref() => { + // No change from the last value successfully sent + MaybeSendResult::Noop + } + Some(request) => MaybeSendResult::Transmit((request, locked)), } } } @@ -167,8 +235,19 @@ impl ComputeHookTenant { /// the compute connection string. pub(super) struct ComputeHook { config: Config, - state: tokio::sync::Mutex>, + state: std::sync::Mutex>, authorization_header: Option, + + // Concurrency limiter, so that we do not overload the cloud control plane when updating + // large numbers of tenants (e.g. when failing over after a node failure) + api_concurrency: tokio::sync::Semaphore, + + // This lock is only used in testing enviroments, to serialize calls into neon_lock + neon_local_lock: tokio::sync::Mutex<()>, + + // We share a client across all notifications to enable connection re-use etc when + // sending large numbers of notifications + client: reqwest::Client, } impl ComputeHook { @@ -178,18 +257,30 @@ impl ComputeHook { .clone() .map(|jwt| format!("Bearer {}", jwt)); + let client = reqwest::ClientBuilder::new() + .timeout(NOTIFY_REQUEST_TIMEOUT) + .build() + .expect("Failed to construct HTTP client"); + Self { state: Default::default(), config, authorization_header, + neon_local_lock: Default::default(), + api_concurrency: tokio::sync::Semaphore::new(API_CONCURRENCY), + client, } } /// For test environments: use neon_local's LocalEnv to update compute async fn do_notify_local( &self, - reconfigure_request: ComputeHookNotifyRequest, + reconfigure_request: &ComputeHookNotifyRequest, ) -> anyhow::Result<()> { + // neon_local updates are not safe to call concurrently, use a lock to serialize + // all calls to this function + let _locked = self.neon_local_lock.lock().await; + let env = match LocalEnv::load_config() { Ok(e) => e, Err(e) => { @@ -206,7 +297,7 @@ impl ComputeHook { } = reconfigure_request; let compute_pageservers = shards - .into_iter() + .iter() .map(|shard| { let ps_conf = env .get_pageserver_conf(shard.node_id) @@ -218,10 +309,10 @@ impl ComputeHook { .collect::>(); for (endpoint_name, endpoint) in &cplane.endpoints { - if endpoint.tenant_id == tenant_id && endpoint.status() == EndpointStatus::Running { + if endpoint.tenant_id == *tenant_id && endpoint.status() == EndpointStatus::Running { tracing::info!("Reconfiguring endpoint {}", endpoint_name,); endpoint - .reconfigure(compute_pageservers.clone(), stripe_size) + .reconfigure(compute_pageservers.clone(), *stripe_size) .await?; } } @@ -231,12 +322,11 @@ impl ComputeHook { async fn do_notify_iteration( &self, - client: &reqwest::Client, url: &String, reconfigure_request: &ComputeHookNotifyRequest, cancel: &CancellationToken, ) -> Result<(), NotifyError> { - let req = client.request(Method::PUT, url); + let req = self.client.request(Method::PUT, url); let req = if let Some(value) = &self.authorization_header { req.header(reqwest::header::AUTHORIZATION, value) } else { @@ -280,11 +370,10 @@ impl ComputeHook { Err(NotifyError::SlowDown) } StatusCode::LOCKED => { - // Delay our retry if busy: the usual fast exponential backoff in backoff::retry - // is not appropriate - tokio::time::timeout(BUSY_DELAY, cancel.cancelled()) - .await - .ok(); + // We consider this fatal, because it's possible that the operation blocking the control one is + // also the one that is waiting for this reconcile. We should let the reconciler calling + // this hook fail, to give control plane a chance to un-lock. + tracing::info!("Control plane reports tenant is locked, dropping out of notify"); Err(NotifyError::Busy) } StatusCode::SERVICE_UNAVAILABLE @@ -300,13 +389,27 @@ impl ComputeHook { async fn do_notify( &self, url: &String, - reconfigure_request: ComputeHookNotifyRequest, + reconfigure_request: &ComputeHookNotifyRequest, cancel: &CancellationToken, ) -> Result<(), NotifyError> { - let client = reqwest::Client::new(); + // We hold these semaphore units across all retries, rather than only across each + // HTTP request: this is to preserve fairness and avoid a situation where a retry might + // time out waiting for a semaphore. + let _units = self + .api_concurrency + .acquire() + .await + // Interpret closed semaphore as shutdown + .map_err(|_| NotifyError::ShuttingDown)?; + backoff::retry( - || self.do_notify_iteration(&client, url, &reconfigure_request, cancel), - |e| matches!(e, NotifyError::Fatal(_) | NotifyError::Unexpected(_)), + || self.do_notify_iteration(url, reconfigure_request, cancel), + |e| { + matches!( + e, + NotifyError::Fatal(_) | NotifyError::Unexpected(_) | NotifyError::Busy + ) + }, 3, 10, "Send compute notification", @@ -340,42 +443,70 @@ impl ComputeHook { stripe_size: ShardStripeSize, cancel: &CancellationToken, ) -> Result<(), NotifyError> { - let mut locked = self.state.lock().await; + let maybe_send_result = { + let mut state_locked = self.state.lock().unwrap(); - use std::collections::hash_map::Entry; - let tenant = match locked.entry(tenant_shard_id.tenant_id) { - Entry::Vacant(e) => e.insert(ComputeHookTenant::new( - tenant_shard_id, - stripe_size, - node_id, - )), - Entry::Occupied(e) => { - let tenant = e.into_mut(); - tenant.update(tenant_shard_id, stripe_size, node_id); - tenant + use std::collections::hash_map::Entry; + let tenant = match state_locked.entry(tenant_shard_id.tenant_id) { + Entry::Vacant(e) => e.insert(ComputeHookTenant::new( + tenant_shard_id, + stripe_size, + node_id, + )), + Entry::Occupied(e) => { + let tenant = e.into_mut(); + tenant.update(tenant_shard_id, stripe_size, node_id); + tenant + } + }; + tenant.maybe_send(tenant_shard_id.tenant_id, None) + }; + + // Process result: we may get an update to send, or we may have to wait for a lock + // before trying again. + let (request, mut send_lock_guard) = match maybe_send_result { + MaybeSendResult::Noop => { + return Ok(()); } + MaybeSendResult::AwaitLock(send_lock) => { + let send_locked = send_lock.lock_owned().await; + + // Lock order: maybe_send is called within the `[Self::state]` lock, and takes the send lock, but here + // we have acquired the send lock and take `[Self::state]` lock. This is safe because maybe_send only uses + // try_lock. + let state_locked = self.state.lock().unwrap(); + let Some(tenant) = state_locked.get(&tenant_shard_id.tenant_id) else { + return Ok(()); + }; + match tenant.maybe_send(tenant_shard_id.tenant_id, Some(send_locked)) { + MaybeSendResult::AwaitLock(_) => { + unreachable!("We supplied lock guard") + } + MaybeSendResult::Noop => { + return Ok(()); + } + MaybeSendResult::Transmit((request, lock)) => (request, lock), + } + } + MaybeSendResult::Transmit((request, lock)) => (request, lock), }; - let reconfigure_request = tenant.maybe_reconfigure(tenant_shard_id.tenant_id); - let Some(reconfigure_request) = reconfigure_request else { - // The tenant doesn't yet have pageservers for all its shards: we won't notify anything - // until it does. - tracing::info!("Tenant isn't yet ready to emit a notification"); - return Ok(()); - }; - - if let Some(notify_url) = &self.config.compute_hook_url { - self.do_notify(notify_url, reconfigure_request, cancel) - .await + let result = if let Some(notify_url) = &self.config.compute_hook_url { + self.do_notify(notify_url, &request, cancel).await } else { - self.do_notify_local(reconfigure_request) - .await - .map_err(|e| { - // This path is for testing only, so munge the error into our prod-style error type. - tracing::error!("Local notification hook failed: {e}"); - NotifyError::Fatal(StatusCode::INTERNAL_SERVER_ERROR) - }) + self.do_notify_local(&request).await.map_err(|e| { + // This path is for testing only, so munge the error into our prod-style error type. + tracing::error!("Local notification hook failed: {e}"); + NotifyError::Fatal(StatusCode::INTERNAL_SERVER_ERROR) + }) + }; + + if result.is_ok() { + // Before dropping the send lock, stash the request we just sent so that + // subsequent callers can avoid redundantly re-sending the same thing. + *send_lock_guard = Some(request); } + result } } @@ -399,21 +530,22 @@ pub(crate) mod tests { NodeId(1), ); - // An unsharded tenant is always ready to emit a notification - assert!(tenant_state.maybe_reconfigure(tenant_id).is_some()); - assert_eq!( - tenant_state - .maybe_reconfigure(tenant_id) - .unwrap() - .shards - .len(), - 1 - ); - assert!(tenant_state - .maybe_reconfigure(tenant_id) - .unwrap() - .stripe_size - .is_none()); + // An unsharded tenant is always ready to emit a notification, but won't + // send the same one twice + let send_result = tenant_state.maybe_send(tenant_id, None); + let MaybeSendResult::Transmit((request, mut guard)) = send_result else { + anyhow::bail!("Wrong send result"); + }; + assert_eq!(request.shards.len(), 1); + assert!(request.stripe_size.is_none()); + + // Simulate successful send + *guard = Some(request); + drop(guard); + + // Try asking again: this should be a no-op + let send_result = tenant_state.maybe_send(tenant_id, None); + assert!(matches!(send_result, MaybeSendResult::Noop)); // Writing the first shard of a multi-sharded situation (i.e. in a split) // resets the tenant state and puts it in an non-notifying state (need to @@ -427,7 +559,10 @@ pub(crate) mod tests { ShardStripeSize(32768), NodeId(1), ); - assert!(tenant_state.maybe_reconfigure(tenant_id).is_none()); + assert!(matches!( + tenant_state.maybe_send(tenant_id, None), + MaybeSendResult::Noop + )); // Writing the second shard makes it ready to notify tenant_state.update( @@ -440,22 +575,16 @@ pub(crate) mod tests { NodeId(1), ); - assert!(tenant_state.maybe_reconfigure(tenant_id).is_some()); - assert_eq!( - tenant_state - .maybe_reconfigure(tenant_id) - .unwrap() - .shards - .len(), - 2 - ); - assert_eq!( - tenant_state - .maybe_reconfigure(tenant_id) - .unwrap() - .stripe_size, - Some(ShardStripeSize(32768)) - ); + let send_result = tenant_state.maybe_send(tenant_id, None); + let MaybeSendResult::Transmit((request, mut guard)) = send_result else { + anyhow::bail!("Wrong send result"); + }; + assert_eq!(request.shards.len(), 2); + assert_eq!(request.stripe_size, Some(ShardStripeSize(32768))); + + // Simulate successful send + *guard = Some(request); + drop(guard); Ok(()) } diff --git a/control_plane/attachment_service/src/heartbeater.rs b/storage_controller/src/heartbeater.rs similarity index 100% rename from control_plane/attachment_service/src/heartbeater.rs rename to storage_controller/src/heartbeater.rs diff --git a/control_plane/attachment_service/src/http.rs b/storage_controller/src/http.rs similarity index 91% rename from control_plane/attachment_service/src/http.rs rename to storage_controller/src/http.rs index 036019cd38..2e83bbc5ed 100644 --- a/control_plane/attachment_service/src/http.rs +++ b/storage_controller/src/http.rs @@ -8,6 +8,7 @@ use futures::Future; use hyper::header::CONTENT_TYPE; use hyper::{Body, Request, Response}; use hyper::{StatusCode, Uri}; +use metrics::{BuildInfo, NeonMetrics}; use pageserver_api::models::{ TenantConfigRequest, TenantCreateRequest, TenantLocationConfigRequest, TenantShardSplitRequest, TenantTimeTravelRequest, TimelineCreateRequest, @@ -34,7 +35,8 @@ use utils::{ }; use pageserver_api::controller_api::{ - NodeAvailability, NodeConfigureRequest, NodeRegisterRequest, TenantShardMigrateRequest, + NodeAvailability, NodeConfigureRequest, NodeRegisterRequest, TenantPolicyRequest, + TenantShardMigrateRequest, }; use pageserver_api::upcall_api::{ReAttachRequest, ValidateRequest}; @@ -43,15 +45,19 @@ use control_plane::storage_controller::{AttachHookRequest, InspectRequest}; use routerify::Middleware; /// State available to HTTP request handlers -#[derive(Clone)] pub struct HttpState { service: Arc, auth: Option>, + neon_metrics: NeonMetrics, allowlist_routes: Vec, } impl HttpState { - pub fn new(service: Arc, auth: Option>) -> Self { + pub fn new( + service: Arc, + auth: Option>, + build_info: BuildInfo, + ) -> Self { let allowlist_routes = ["/status", "/ready", "/metrics"] .iter() .map(|v| v.parse().unwrap()) @@ -59,6 +65,7 @@ impl HttpState { Self { service, auth, + neon_metrics: NeonMetrics::new(build_info), allowlist_routes, } } @@ -398,6 +405,15 @@ async fn handle_tenant_describe( json_response(StatusCode::OK, service.tenant_describe(tenant_id)?) } +async fn handle_tenant_list( + service: Arc, + req: Request, +) -> Result, ApiError> { + check_permissions(&req, Scope::Admin)?; + + json_response(StatusCode::OK, service.tenant_list()) +} + async fn handle_node_register(mut req: Request) -> Result, ApiError> { check_permissions(&req, Scope::Admin)?; @@ -411,7 +427,10 @@ async fn handle_node_list(req: Request) -> Result, ApiError check_permissions(&req, Scope::Admin)?; let state = get_state(&req); - json_response(StatusCode::OK, state.service.node_list().await?) + let nodes = state.service.node_list().await?; + let api_nodes = nodes.into_iter().map(|n| n.describe()).collect::>(); + + json_response(StatusCode::OK, api_nodes) } async fn handle_node_drop(req: Request) -> Result, ApiError> { @@ -478,6 +497,22 @@ async fn handle_tenant_shard_migrate( ) } +async fn handle_tenant_update_policy(mut req: Request) -> Result, ApiError> { + check_permissions(&req, Scope::Admin)?; + + let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?; + let update_req = json_request::(&mut req).await?; + let state = get_state(&req); + + json_response( + StatusCode::OK, + state + .service + .tenant_update_policy(tenant_id, update_req) + .await?, + ) +} + async fn handle_tenant_drop(req: Request) -> Result, ApiError> { let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?; check_permissions(&req, Scope::PageServerApi)?; @@ -509,6 +544,14 @@ async fn handle_consistency_check(req: Request) -> Result, json_response(StatusCode::OK, state.service.consistency_check().await?) } +async fn handle_reconcile_all(req: Request) -> Result, ApiError> { + check_permissions(&req, Scope::Admin)?; + + let state = get_state(&req); + + json_response(StatusCode::OK, state.service.reconcile_all_now().await?) +} + /// Status endpoint is just used for checking that our HTTP listener is up async fn handle_status(_req: Request) -> Result, ApiError> { json_response(StatusCode::OK, ()) @@ -565,9 +608,17 @@ where .await } +/// Check if the required scope is held in the request's token, or if the request has +/// a token with 'admin' scope then always permit it. fn check_permissions(request: &Request, required_scope: Scope) -> Result<(), ApiError> { check_permission_with(request, |claims| { - crate::auth::check_permission(claims, required_scope) + match crate::auth::check_permission(claims, required_scope) { + Err(e) => match crate::auth::check_permission(claims, Scope::Admin) { + Ok(()) => Ok(()), + Err(_) => Err(e), + }, + Ok(()) => Ok(()), + } }) } @@ -627,10 +678,11 @@ fn epilogue_metrics_middleware }) } -pub async fn measured_metrics_handler(_req: Request) -> Result, ApiError> { +pub async fn measured_metrics_handler(req: Request) -> Result, ApiError> { pub const TEXT_FORMAT: &str = "text/plain; version=0.0.4"; - let payload = crate::metrics::METRICS_REGISTRY.encode(); + let state = get_state(&req); + let payload = crate::metrics::METRICS_REGISTRY.encode(&state.neon_metrics); let response = Response::builder() .status(200) .header(CONTENT_TYPE, TEXT_FORMAT) @@ -659,6 +711,7 @@ where pub fn make_router( service: Arc, auth: Option>, + build_info: BuildInfo, ) -> RouterBuilder { let mut router = endpoint::make_router() .middleware(prologue_metrics_middleware()) @@ -675,7 +728,7 @@ pub fn make_router( } router - .data(Arc::new(HttpState::new(service, auth))) + .data(Arc::new(HttpState::new(service, auth, build_info))) .get("/metrics", |r| { named_request_span(r, measured_metrics_handler, RequestName("metrics")) }) @@ -726,6 +779,9 @@ pub fn make_router( RequestName("debug_v1_consistency_check"), ) }) + .post("/debug/v1/reconcile_all", |r| { + request_span(r, handle_reconcile_all) + }) .put("/debug/v1/failpoints", |r| { request_span(r, |r| failpoints_handler(r, CancellationToken::new())) }) @@ -765,6 +821,16 @@ pub fn make_router( RequestName("control_v1_tenant_describe"), ) }) + .get("/control/v1/tenant", |r| { + tenant_service_handler(r, handle_tenant_list, RequestName("control_v1_tenant_list")) + }) + .put("/control/v1/tenant/:tenant_id/policy", |r| { + named_request_span( + r, + handle_tenant_update_policy, + RequestName("control_v1_tenant_policy"), + ) + }) // Tenant operations // The ^/v1/ endpoints act as a "Virtual Pageserver", enabling shard-naive clients to call into // this service to manage tenants that actually consist of many tenant shards, as if they are a single entity. diff --git a/control_plane/attachment_service/src/id_lock_map.rs b/storage_controller/src/id_lock_map.rs similarity index 100% rename from control_plane/attachment_service/src/id_lock_map.rs rename to storage_controller/src/id_lock_map.rs diff --git a/control_plane/attachment_service/src/lib.rs b/storage_controller/src/lib.rs similarity index 98% rename from control_plane/attachment_service/src/lib.rs rename to storage_controller/src/lib.rs index 8bcd5c0ac4..2ea490a14b 100644 --- a/control_plane/attachment_service/src/lib.rs +++ b/storage_controller/src/lib.rs @@ -14,7 +14,7 @@ mod reconciler; mod scheduler; mod schema; pub mod service; -mod tenant_state; +mod tenant_shard; #[derive(Ord, PartialOrd, Eq, PartialEq, Copy, Clone, Serialize)] struct Sequence(u64); diff --git a/control_plane/attachment_service/src/main.rs b/storage_controller/src/main.rs similarity index 95% rename from control_plane/attachment_service/src/main.rs rename to storage_controller/src/main.rs index bd8d7f5c59..6466b9f7a3 100644 --- a/control_plane/attachment_service/src/main.rs +++ b/storage_controller/src/main.rs @@ -1,18 +1,20 @@ use anyhow::{anyhow, Context}; -use attachment_service::http::make_router; -use attachment_service::metrics::preinitialize_metrics; -use attachment_service::persistence::Persistence; -use attachment_service::service::{Config, Service, MAX_UNAVAILABLE_INTERVAL_DEFAULT}; use camino::Utf8PathBuf; use clap::Parser; use diesel::Connection; use metrics::launch_timestamp::LaunchTimestamp; +use metrics::BuildInfo; use std::sync::Arc; +use storage_controller::http::make_router; +use storage_controller::metrics::preinitialize_metrics; +use storage_controller::persistence::Persistence; +use storage_controller::service::{Config, Service, MAX_UNAVAILABLE_INTERVAL_DEFAULT}; use tokio::signal::unix::SignalKind; use tokio_util::sync::CancellationToken; use utils::auth::{JwtAuth, SwappableJwtAuth}; use utils::logging::{self, LogFormat}; +use utils::sentry_init::init_sentry; use utils::{project_build_tag, project_git_version, tcp_listener}; project_git_version!(GIT_VERSION); @@ -50,7 +52,7 @@ struct Cli { #[arg(short, long)] path: Option, - /// URL to connect to postgres, like postgresql://localhost:1234/attachment_service + /// URL to connect to postgres, like postgresql://localhost:1234/storage_controller #[arg(long)] database_url: Option, @@ -158,6 +160,8 @@ fn main() -> anyhow::Result<()> { std::process::exit(1); })); + let _sentry_guard = init_sentry(Some(GIT_VERSION.into()), &[]); + tokio::runtime::Builder::new_current_thread() // We use spawn_blocking for database operations, so require approximately // as many blocking threads as we will open database connections. @@ -189,6 +193,11 @@ async fn async_main() -> anyhow::Result<()> { args.listen ); + let build_info = BuildInfo { + revision: GIT_VERSION, + build_tag: BUILD_TAG, + }; + let strict_mode = if args.dev { StrictMode::Dev } else { @@ -250,7 +259,7 @@ async fn async_main() -> anyhow::Result<()> { let auth = secrets .public_key .map(|jwt_auth| Arc::new(SwappableJwtAuth::new(jwt_auth))); - let router = make_router(service.clone(), auth) + let router = make_router(service.clone(), auth, build_info) .build() .map_err(|err| anyhow!(err))?; let router_service = utils::http::RouterService::new(router).unwrap(); diff --git a/control_plane/attachment_service/src/metrics.rs b/storage_controller/src/metrics.rs similarity index 63% rename from control_plane/attachment_service/src/metrics.rs rename to storage_controller/src/metrics.rs index ccf5e9b07c..ac9f22c739 100644 --- a/control_plane/attachment_service/src/metrics.rs +++ b/storage_controller/src/metrics.rs @@ -8,10 +8,8 @@ //! The rest of the code defines label group types and deals with converting outer types to labels. //! use bytes::Bytes; -use measured::{ - label::{LabelValue, StaticLabelSet}, - FixedCardinalityLabel, MetricGroup, -}; +use measured::{label::LabelValue, metric::histogram, FixedCardinalityLabel, MetricGroup}; +use metrics::NeonMetrics; use once_cell::sync::Lazy; use std::sync::Mutex; @@ -26,21 +24,28 @@ pub fn preinitialize_metrics() { pub(crate) struct StorageControllerMetrics { pub(crate) metrics_group: StorageControllerMetricGroup, - encoder: Mutex, + encoder: Mutex, } #[derive(measured::MetricGroup)] +#[metric(new())] pub(crate) struct StorageControllerMetricGroup { /// Count of how many times we spawn a reconcile task pub(crate) storage_controller_reconcile_spawn: measured::Counter, + /// Reconciler tasks completed, broken down by success/failure/cancelled pub(crate) storage_controller_reconcile_complete: measured::CounterVec, + /// Count of how many times we make an optimization change to a tenant's scheduling + pub(crate) storage_controller_schedule_optimization: measured::Counter, + /// HTTP request status counters for handled requests pub(crate) storage_controller_http_request_status: measured::CounterVec, + /// HTTP request handler latency across all status codes + #[metric(metadata = histogram::Thresholds::exponential_buckets(0.1, 2.0))] pub(crate) storage_controller_http_request_latency: measured::HistogramVec, @@ -52,6 +57,7 @@ pub(crate) struct StorageControllerMetricGroup { /// Latency of HTTP requests to the pageserver, broken down by pageserver /// node id, request name and method. This include both successful and unsuccessful /// requests. + #[metric(metadata = histogram::Thresholds::exponential_buckets(0.1, 2.0))] pub(crate) storage_controller_pageserver_request_latency: measured::HistogramVec, @@ -63,6 +69,7 @@ pub(crate) struct StorageControllerMetricGroup { /// Latency of pass-through HTTP requests to the pageserver, broken down by pageserver /// node id, request name and method. This include both successful and unsuccessful /// requests. + #[metric(metadata = histogram::Thresholds::exponential_buckets(0.1, 2.0))] pub(crate) storage_controller_passthrough_request_latency: measured::HistogramVec, @@ -71,75 +78,34 @@ pub(crate) struct StorageControllerMetricGroup { measured::CounterVec, /// Latency of database queries, broken down by operation. + #[metric(metadata = histogram::Thresholds::exponential_buckets(0.1, 2.0))] pub(crate) storage_controller_database_query_latency: measured::HistogramVec, } impl StorageControllerMetrics { - pub(crate) fn encode(&self) -> Bytes { + pub(crate) fn encode(&self, neon_metrics: &NeonMetrics) -> Bytes { let mut encoder = self.encoder.lock().unwrap(); - self.metrics_group.collect_into(&mut *encoder); + neon_metrics + .collect_group_into(&mut *encoder) + .unwrap_or_else(|infallible| match infallible {}); + self.metrics_group + .collect_group_into(&mut *encoder) + .unwrap_or_else(|infallible| match infallible {}); encoder.finish() } } impl Default for StorageControllerMetrics { fn default() -> Self { - Self { - metrics_group: StorageControllerMetricGroup::new(), - encoder: Mutex::new(measured::text::TextEncoder::new()), - } - } -} + let mut metrics_group = StorageControllerMetricGroup::new(); + metrics_group + .storage_controller_reconcile_complete + .init_all_dense(); -impl StorageControllerMetricGroup { - pub(crate) fn new() -> Self { Self { - storage_controller_reconcile_spawn: measured::Counter::new(), - storage_controller_reconcile_complete: measured::CounterVec::new( - ReconcileCompleteLabelGroupSet { - status: StaticLabelSet::new(), - }, - ), - storage_controller_http_request_status: measured::CounterVec::new( - HttpRequestStatusLabelGroupSet { - path: lasso::ThreadedRodeo::new(), - method: StaticLabelSet::new(), - status: StaticLabelSet::new(), - }, - ), - storage_controller_http_request_latency: measured::HistogramVec::new( - measured::metric::histogram::Thresholds::exponential_buckets(0.1, 2.0), - ), - storage_controller_pageserver_request_error: measured::CounterVec::new( - PageserverRequestLabelGroupSet { - pageserver_id: lasso::ThreadedRodeo::new(), - path: lasso::ThreadedRodeo::new(), - method: StaticLabelSet::new(), - }, - ), - storage_controller_pageserver_request_latency: measured::HistogramVec::new( - measured::metric::histogram::Thresholds::exponential_buckets(0.1, 2.0), - ), - storage_controller_passthrough_request_error: measured::CounterVec::new( - PageserverRequestLabelGroupSet { - pageserver_id: lasso::ThreadedRodeo::new(), - path: lasso::ThreadedRodeo::new(), - method: StaticLabelSet::new(), - }, - ), - storage_controller_passthrough_request_latency: measured::HistogramVec::new( - measured::metric::histogram::Thresholds::exponential_buckets(0.1, 2.0), - ), - storage_controller_database_query_error: measured::CounterVec::new( - DatabaseQueryErrorLabelGroupSet { - operation: StaticLabelSet::new(), - error_type: StaticLabelSet::new(), - }, - ), - storage_controller_database_query_latency: measured::HistogramVec::new( - measured::metric::histogram::Thresholds::exponential_buckets(0.1, 2.0), - ), + metrics_group, + encoder: Mutex::new(measured::text::BufferedTextEncoder::new()), } } } @@ -153,7 +119,7 @@ pub(crate) struct ReconcileCompleteLabelGroup { #[derive(measured::LabelGroup)] #[label(set = HttpRequestStatusLabelGroupSet)] pub(crate) struct HttpRequestStatusLabelGroup<'a> { - #[label(dynamic_with = lasso::ThreadedRodeo)] + #[label(dynamic_with = lasso::ThreadedRodeo, default)] pub(crate) path: &'a str, pub(crate) method: Method, pub(crate) status: StatusCode, @@ -162,40 +128,21 @@ pub(crate) struct HttpRequestStatusLabelGroup<'a> { #[derive(measured::LabelGroup)] #[label(set = HttpRequestLatencyLabelGroupSet)] pub(crate) struct HttpRequestLatencyLabelGroup<'a> { - #[label(dynamic_with = lasso::ThreadedRodeo)] + #[label(dynamic_with = lasso::ThreadedRodeo, default)] pub(crate) path: &'a str, pub(crate) method: Method, } -impl Default for HttpRequestLatencyLabelGroupSet { - fn default() -> Self { - Self { - path: lasso::ThreadedRodeo::new(), - method: StaticLabelSet::new(), - } - } -} - #[derive(measured::LabelGroup, Clone)] #[label(set = PageserverRequestLabelGroupSet)] pub(crate) struct PageserverRequestLabelGroup<'a> { - #[label(dynamic_with = lasso::ThreadedRodeo)] + #[label(dynamic_with = lasso::ThreadedRodeo, default)] pub(crate) pageserver_id: &'a str, - #[label(dynamic_with = lasso::ThreadedRodeo)] + #[label(dynamic_with = lasso::ThreadedRodeo, default)] pub(crate) path: &'a str, pub(crate) method: Method, } -impl Default for PageserverRequestLabelGroupSet { - fn default() -> Self { - Self { - pageserver_id: lasso::ThreadedRodeo::new(), - path: lasso::ThreadedRodeo::new(), - method: StaticLabelSet::new(), - } - } -} - #[derive(measured::LabelGroup)] #[label(set = DatabaseQueryErrorLabelGroupSet)] pub(crate) struct DatabaseQueryErrorLabelGroup { @@ -209,7 +156,7 @@ pub(crate) struct DatabaseQueryLatencyLabelGroup { pub(crate) operation: DatabaseOperation, } -#[derive(FixedCardinalityLabel)] +#[derive(FixedCardinalityLabel, Clone, Copy)] pub(crate) enum ReconcileOutcome { #[label(rename = "ok")] Success, @@ -217,7 +164,7 @@ pub(crate) enum ReconcileOutcome { Cancel, } -#[derive(FixedCardinalityLabel, Clone)] +#[derive(FixedCardinalityLabel, Copy, Clone)] pub(crate) enum Method { Get, Put, @@ -242,11 +189,12 @@ impl From for Method { } } +#[derive(Clone, Copy)] pub(crate) struct StatusCode(pub(crate) hyper::http::StatusCode); impl LabelValue for StatusCode { fn visit(&self, v: V) -> V::Output { - v.write_int(self.0.as_u16() as u64) + v.write_int(self.0.as_u16() as i64) } } @@ -264,7 +212,7 @@ impl FixedCardinalityLabel for StatusCode { } } -#[derive(FixedCardinalityLabel)] +#[derive(FixedCardinalityLabel, Clone, Copy)] pub(crate) enum DatabaseErrorLabel { Query, Connection, diff --git a/control_plane/attachment_service/src/node.rs b/storage_controller/src/node.rs similarity index 93% rename from control_plane/attachment_service/src/node.rs rename to storage_controller/src/node.rs index df40bff66f..7ba6828deb 100644 --- a/control_plane/attachment_service/src/node.rs +++ b/storage_controller/src/node.rs @@ -3,7 +3,8 @@ use std::{str::FromStr, time::Duration}; use hyper::StatusCode; use pageserver_api::{ controller_api::{ - NodeAvailability, NodeRegisterRequest, NodeSchedulingPolicy, TenantLocateResponseShard, + NodeAvailability, NodeDescribeResponse, NodeRegisterRequest, NodeSchedulingPolicy, + TenantLocateResponseShard, }, shard::TenantShardId, }; @@ -256,6 +257,19 @@ impl Node { ) .await } + + /// Generate the simplified API-friendly description of a node's state + pub(crate) fn describe(&self) -> NodeDescribeResponse { + NodeDescribeResponse { + id: self.id, + availability: self.availability.into(), + scheduling: self.scheduling, + listen_http_addr: self.listen_http_addr.clone(), + listen_http_port: self.listen_http_port, + listen_pg_addr: self.listen_pg_addr.clone(), + listen_pg_port: self.listen_pg_port, + } + } } impl std::fmt::Display for Node { diff --git a/control_plane/attachment_service/src/pageserver_client.rs b/storage_controller/src/pageserver_client.rs similarity index 100% rename from control_plane/attachment_service/src/pageserver_client.rs rename to storage_controller/src/pageserver_client.rs diff --git a/control_plane/attachment_service/src/persistence.rs b/storage_controller/src/persistence.rs similarity index 92% rename from control_plane/attachment_service/src/persistence.rs rename to storage_controller/src/persistence.rs index dafd52017b..5312e1e218 100644 --- a/control_plane/attachment_service/src/persistence.rs +++ b/storage_controller/src/persistence.rs @@ -9,6 +9,7 @@ use camino::Utf8PathBuf; use diesel::pg::PgConnection; use diesel::prelude::*; use diesel::Connection; +use pageserver_api::controller_api::ShardSchedulingPolicy; use pageserver_api::controller_api::{NodeSchedulingPolicy, PlacementPolicy}; use pageserver_api::models::TenantConfig; use pageserver_api::shard::ShardConfigError; @@ -78,7 +79,7 @@ pub(crate) enum DatabaseError { Logical(String), } -#[derive(measured::FixedCardinalityLabel, Clone)] +#[derive(measured::FixedCardinalityLabel, Copy, Clone)] pub(crate) enum DatabaseOperation { InsertNode, UpdateNode, @@ -107,6 +108,12 @@ pub(crate) enum AbortShardSplitStatus { pub(crate) type DatabaseResult = Result; +/// Some methods can operate on either a whole tenant or a single shard +pub(crate) enum TenantFilter { + Tenant(TenantId), + Shard(TenantShardId), +} + impl Persistence { // The default postgres connection limit is 100. We use up to 99, to leave one free for a human admin under // normal circumstances. This assumes we have exclusive use of the database cluster to which we connect. @@ -140,15 +147,13 @@ impl Persistence { /// Wraps `with_conn` in order to collect latency and error metrics async fn with_measured_conn(&self, op: DatabaseOperation, func: F) -> DatabaseResult where - F: Fn(&mut PgConnection) -> DatabaseResult + Send + 'static, + F: FnOnce(&mut PgConnection) -> DatabaseResult + Send + 'static, R: Send + 'static, { let latency = &METRICS_REGISTRY .metrics_group .storage_controller_database_query_latency; - let _timer = latency.start_timer(DatabaseQueryLatencyLabelGroup { - operation: op.clone(), - }); + let _timer = latency.start_timer(DatabaseQueryLatencyLabelGroup { operation: op }); let res = self.with_conn(func).await; @@ -168,7 +173,7 @@ impl Persistence { /// Call the provided function in a tokio blocking thread, with a Diesel database connection. async fn with_conn(&self, func: F) -> DatabaseResult where - F: Fn(&mut PgConnection) -> DatabaseResult + Send + 'static, + F: FnOnce(&mut PgConnection) -> DatabaseResult + Send + 'static, R: Send + 'static, { let mut conn = self.connection_pool.get()?; @@ -275,6 +280,11 @@ impl Persistence { // Backward compat for test data after PR https://github.com/neondatabase/neon/pull/7165 shard.placement_policy = "{\"Attached\":0}".to_string(); } + + if shard.scheduling_policy.is_empty() { + shard.scheduling_policy = + serde_json::to_string(&ShardSchedulingPolicy::default()).unwrap(); + } } let tenants: Vec = decoded.tenants.into_values().collect(); @@ -465,59 +475,45 @@ impl Persistence { /// that we only do the first time a tenant is set to an attached policy via /location_config. pub(crate) async fn update_tenant_shard( &self, - tenant_shard_id: TenantShardId, - input_placement_policy: PlacementPolicy, - input_config: TenantConfig, + tenant: TenantFilter, + input_placement_policy: Option, + input_config: Option, input_generation: Option, + input_scheduling_policy: Option, ) -> DatabaseResult<()> { use crate::schema::tenant_shards::dsl::*; self.with_measured_conn(DatabaseOperation::UpdateTenantShard, move |conn| { - let query = diesel::update(tenant_shards) - .filter(tenant_id.eq(tenant_shard_id.tenant_id.to_string())) - .filter(shard_number.eq(tenant_shard_id.shard_number.0 as i32)) - .filter(shard_count.eq(tenant_shard_id.shard_count.literal() as i32)); + let query = match tenant { + TenantFilter::Shard(tenant_shard_id) => diesel::update(tenant_shards) + .filter(tenant_id.eq(tenant_shard_id.tenant_id.to_string())) + .filter(shard_number.eq(tenant_shard_id.shard_number.0 as i32)) + .filter(shard_count.eq(tenant_shard_id.shard_count.literal() as i32)) + .into_boxed(), + TenantFilter::Tenant(input_tenant_id) => diesel::update(tenant_shards) + .filter(tenant_id.eq(input_tenant_id.to_string())) + .into_boxed(), + }; - if let Some(input_generation) = input_generation { - // Update includes generation column - query - .set(( - generation.eq(Some(input_generation.into().unwrap() as i32)), - placement_policy - .eq(serde_json::to_string(&input_placement_policy).unwrap()), - config.eq(serde_json::to_string(&input_config).unwrap()), - )) - .execute(conn)?; - } else { - // Update does not include generation column - query - .set(( - placement_policy - .eq(serde_json::to_string(&input_placement_policy).unwrap()), - config.eq(serde_json::to_string(&input_config).unwrap()), - )) - .execute(conn)?; + #[derive(AsChangeset)] + #[diesel(table_name = crate::schema::tenant_shards)] + struct ShardUpdate { + generation: Option, + placement_policy: Option, + config: Option, + scheduling_policy: Option, } - Ok(()) - }) - .await?; + let update = ShardUpdate { + generation: input_generation.map(|g| g.into().unwrap() as i32), + placement_policy: input_placement_policy + .map(|p| serde_json::to_string(&p).unwrap()), + config: input_config.map(|c| serde_json::to_string(&c).unwrap()), + scheduling_policy: input_scheduling_policy + .map(|p| serde_json::to_string(&p).unwrap()), + }; - Ok(()) - } - - pub(crate) async fn update_tenant_config( - &self, - input_tenant_id: TenantId, - input_config: TenantConfig, - ) -> DatabaseResult<()> { - use crate::schema::tenant_shards::dsl::*; - - self.with_measured_conn(DatabaseOperation::UpdateTenantConfig, move |conn| { - diesel::update(tenant_shards) - .filter(tenant_id.eq(input_tenant_id.to_string())) - .set((config.eq(serde_json::to_string(&input_config).unwrap()),)) - .execute(conn)?; + query.set(update).execute(conn)?; Ok(()) }) @@ -698,7 +694,7 @@ impl Persistence { } } -/// Parts of [`crate::tenant_state::TenantState`] that are stored durably +/// Parts of [`crate::tenant_shard::TenantShard`] that are stored durably #[derive(Queryable, Selectable, Insertable, Serialize, Deserialize, Clone, Eq, PartialEq)] #[diesel(table_name = crate::schema::tenant_shards)] pub(crate) struct TenantShardPersistence { @@ -728,6 +724,8 @@ pub(crate) struct TenantShardPersistence { pub(crate) splitting: SplitState, #[serde(default)] pub(crate) config: String, + #[serde(default)] + pub(crate) scheduling_policy: String, } impl TenantShardPersistence { diff --git a/control_plane/attachment_service/src/persistence/split_state.rs b/storage_controller/src/persistence/split_state.rs similarity index 100% rename from control_plane/attachment_service/src/persistence/split_state.rs rename to storage_controller/src/persistence/split_state.rs diff --git a/control_plane/attachment_service/src/reconciler.rs b/storage_controller/src/reconciler.rs similarity index 98% rename from control_plane/attachment_service/src/reconciler.rs rename to storage_controller/src/reconciler.rs index a62357f9ac..49cfaad569 100644 --- a/control_plane/attachment_service/src/reconciler.rs +++ b/storage_controller/src/reconciler.rs @@ -18,14 +18,14 @@ use utils::sync::gate::GateGuard; use crate::compute_hook::{ComputeHook, NotifyError}; use crate::node::Node; -use crate::tenant_state::{IntentState, ObservedState, ObservedStateLocation}; +use crate::tenant_shard::{IntentState, ObservedState, ObservedStateLocation}; const DEFAULT_HEATMAP_PERIOD: &str = "60s"; /// Object with the lifetime of the background reconcile task that is created /// for tenants which have a difference between their intent and observed states. pub(super) struct Reconciler { - /// See [`crate::tenant_state::TenantState`] for the meanings of these fields: they are a snapshot + /// See [`crate::tenant_shard::TenantShard`] for the meanings of these fields: they are a snapshot /// of a tenant's state from when we spawned a reconcile task. pub(super) tenant_shard_id: TenantShardId, pub(crate) shard: ShardIdentity, @@ -48,11 +48,11 @@ pub(super) struct Reconciler { /// To avoid stalling if the cloud control plane is unavailable, we may proceed /// past failures in [`ComputeHook::notify`], but we _must_ remember that we failed - /// so that we can set [`crate::tenant_state::TenantState::pending_compute_notification`] to ensure a later retry. + /// so that we can set [`crate::tenant_shard::TenantShard::pending_compute_notification`] to ensure a later retry. pub(crate) compute_notify_failure: bool, /// A means to abort background reconciliation: it is essential to - /// call this when something changes in the original TenantState that + /// call this when something changes in the original TenantShard that /// will make this reconciliation impossible or unnecessary, for /// example when a pageserver node goes offline, or the PlacementPolicy for /// the tenant is changed. @@ -66,7 +66,7 @@ pub(super) struct Reconciler { pub(crate) persistence: Arc, } -/// This is a snapshot of [`crate::tenant_state::IntentState`], but it does not do any +/// This is a snapshot of [`crate::tenant_shard::IntentState`], but it does not do any /// reference counting for Scheduler. The IntentState is what the scheduler works with, /// and the TargetState is just the instruction for a particular Reconciler run. #[derive(Debug)] @@ -487,6 +487,7 @@ impl Reconciler { while let Err(e) = self.compute_notify().await { match e { NotifyError::Fatal(_) => return Err(ReconcileError::Notify(e)), + NotifyError::ShuttingDown => return Err(ReconcileError::Cancel), _ => { tracing::warn!( "Live migration blocked by compute notification error, retrying: {e}" diff --git a/control_plane/attachment_service/src/scheduler.rs b/storage_controller/src/scheduler.rs similarity index 66% rename from control_plane/attachment_service/src/scheduler.rs rename to storage_controller/src/scheduler.rs index 981ba26cce..3ff0d87988 100644 --- a/control_plane/attachment_service/src/scheduler.rs +++ b/storage_controller/src/scheduler.rs @@ -1,4 +1,4 @@ -use crate::{node::Node, tenant_state::TenantState}; +use crate::{node::Node, tenant_shard::TenantShard}; use pageserver_api::controller_api::UtilizationScore; use serde::Serialize; use std::collections::HashMap; @@ -27,7 +27,7 @@ pub enum MaySchedule { #[derive(Serialize)] struct SchedulerNode { - /// How many shards are currently scheduled on this node, via their [`crate::tenant_state::IntentState`]. + /// How many shards are currently scheduled on this node, via their [`crate::tenant_shard::IntentState`]. shard_count: usize, /// Whether this node is currently elegible to have new shards scheduled (this is derived @@ -58,6 +58,86 @@ pub(crate) struct Scheduler { nodes: HashMap, } +/// Score for soft constraint scheduling: lower scores are preferred to higher scores. +/// +/// For example, we may set an affinity score based on the number of shards from the same +/// tenant already on a node, to implicitly prefer to balance out shards. +#[derive(Copy, Clone, Debug, Eq, PartialEq, PartialOrd, Ord)] +pub(crate) struct AffinityScore(pub(crate) usize); + +impl AffinityScore { + /// If we have no anti-affinity at all toward a node, this is its score. It means + /// the scheduler has a free choice amongst nodes with this score, and may pick a node + /// based on other information such as total utilization. + pub(crate) const FREE: Self = Self(0); + + pub(crate) fn inc(&mut self) { + self.0 += 1; + } +} + +impl std::ops::Add for AffinityScore { + type Output = Self; + + fn add(self, rhs: Self) -> Self::Output { + Self(self.0 + rhs.0) + } +} + +/// Hint for whether this is a sincere attempt to schedule, or a speculative +/// check for where we _would_ schedule (done during optimization) +#[derive(Debug)] +pub(crate) enum ScheduleMode { + Normal, + Speculative, +} + +impl Default for ScheduleMode { + fn default() -> Self { + Self::Normal + } +} + +// For carrying state between multiple calls to [`TenantShard::schedule`], e.g. when calling +// it for many shards in the same tenant. +#[derive(Debug, Default)] +pub(crate) struct ScheduleContext { + /// Sparse map of nodes: omitting a node implicitly makes its affinity [`AffinityScore::FREE`] + pub(crate) nodes: HashMap, + + /// Specifically how many _attached_ locations are on each node + pub(crate) attached_nodes: HashMap, + + pub(crate) mode: ScheduleMode, +} + +impl ScheduleContext { + /// Input is a list of nodes we would like to avoid using again within this context. The more + /// times a node is passed into this call, the less inclined we are to use it. + pub(crate) fn avoid(&mut self, nodes: &[NodeId]) { + for node_id in nodes { + let entry = self.nodes.entry(*node_id).or_insert(AffinityScore::FREE); + entry.inc() + } + } + + pub(crate) fn push_attached(&mut self, node_id: NodeId) { + let entry = self.attached_nodes.entry(node_id).or_default(); + *entry += 1; + } + + pub(crate) fn get_node_affinity(&self, node_id: NodeId) -> AffinityScore { + self.nodes + .get(&node_id) + .copied() + .unwrap_or(AffinityScore::FREE) + } + + pub(crate) fn get_node_attachments(&self, node_id: NodeId) -> usize { + self.attached_nodes.get(&node_id).copied().unwrap_or(0) + } +} + impl Scheduler { pub(crate) fn new<'a>(nodes: impl Iterator) -> Self { let mut scheduler_nodes = HashMap::new(); @@ -83,7 +163,7 @@ impl Scheduler { pub(crate) fn consistency_check<'a>( &self, nodes: impl Iterator, - shards: impl Iterator, + shards: impl Iterator, ) -> anyhow::Result<()> { let mut expect_nodes: HashMap = HashMap::new(); for node in nodes { @@ -224,53 +304,87 @@ impl Scheduler { node.and_then(|(node_id, may_schedule)| if may_schedule { Some(node_id) } else { None }) } - pub(crate) fn schedule_shard(&self, hard_exclude: &[NodeId]) -> Result { + /// hard_exclude: it is forbidden to use nodes in this list, typically becacuse they + /// are already in use by this shard -- we use this to avoid picking the same node + /// as both attached and secondary location. This is a hard constraint: if we cannot + /// find any nodes that aren't in this list, then we will return a [`ScheduleError::ImpossibleConstraint`]. + /// + /// context: we prefer to avoid using nodes identified in the context, according + /// to their anti-affinity score. We use this to prefeer to avoid placing shards in + /// the same tenant on the same node. This is a soft constraint: the context will never + /// cause us to fail to schedule a shard. + pub(crate) fn schedule_shard( + &self, + hard_exclude: &[NodeId], + context: &ScheduleContext, + ) -> Result { if self.nodes.is_empty() { return Err(ScheduleError::NoPageservers); } - let mut tenant_counts: Vec<(NodeId, usize)> = self + let mut scores: Vec<(NodeId, AffinityScore, usize)> = self .nodes .iter() .filter_map(|(k, v)| { if hard_exclude.contains(k) || v.may_schedule == MaySchedule::No { None } else { - Some((*k, v.shard_count)) + Some(( + *k, + context.nodes.get(k).copied().unwrap_or(AffinityScore::FREE), + v.shard_count, + )) } }) .collect(); - // Sort by tenant count. Nodes with the same tenant count are sorted by ID. - tenant_counts.sort_by_key(|i| (i.1, i.0)); + // Sort by, in order of precedence: + // 1st: Affinity score. We should never pick a higher-score node if a lower-score node is available + // 2nd: Utilization. Within nodes with the same affinity, use the least loaded nodes. + // 3rd: Node ID. This is a convenience to make selection deterministic in tests and empty systems. + scores.sort_by_key(|i| (i.1, i.2, i.0)); - if tenant_counts.is_empty() { - // After applying constraints, no pageservers were left. We log some detail about - // the state of nodes to help understand why this happened. This is not logged as an error because - // it is legitimately possible for enough nodes to be Offline to prevent scheduling a shard. - tracing::info!("Scheduling failure, while excluding {hard_exclude:?}, node states:"); - for (node_id, node) in &self.nodes { + if scores.is_empty() { + // After applying constraints, no pageservers were left. + if !matches!(context.mode, ScheduleMode::Speculative) { + // If this was not a speculative attempt, log details to understand why we couldn't + // schedule: this may help an engineer understand if some nodes are marked offline + // in a way that's preventing progress. tracing::info!( - "Node {node_id}: may_schedule={} shards={}", - node.may_schedule != MaySchedule::No, - node.shard_count + "Scheduling failure, while excluding {hard_exclude:?}, node states:" ); + for (node_id, node) in &self.nodes { + tracing::info!( + "Node {node_id}: may_schedule={} shards={}", + node.may_schedule != MaySchedule::No, + node.shard_count + ); + } } - return Err(ScheduleError::ImpossibleConstraint); } - let node_id = tenant_counts.first().unwrap().0; - tracing::info!( - "scheduler selected node {node_id} (elegible nodes {:?}, exclude: {hard_exclude:?})", - tenant_counts.iter().map(|i| i.0 .0).collect::>() + // Lowest score wins + let node_id = scores.first().unwrap().0; + + if !matches!(context.mode, ScheduleMode::Speculative) { + tracing::info!( + "scheduler selected node {node_id} (elegible nodes {:?}, hard exclude: {hard_exclude:?}, soft exclude: {context:?})", + scores.iter().map(|i| i.0 .0).collect::>() ); + } // Note that we do not update shard count here to reflect the scheduling: that // is IntentState's job when the scheduled location is used. Ok(node_id) } + + /// Unit test access to internal state + #[cfg(test)] + pub(crate) fn get_node_shard_count(&self, node_id: NodeId) -> usize { + self.nodes.get(&node_id).unwrap().shard_count + } } #[cfg(test)] @@ -307,7 +421,7 @@ pub(crate) mod test_utils { mod tests { use super::*; - use crate::tenant_state::IntentState; + use crate::tenant_shard::IntentState; #[test] fn scheduler_basic() -> anyhow::Result<()> { let nodes = test_utils::make_test_nodes(2); @@ -316,15 +430,17 @@ mod tests { let mut t1_intent = IntentState::new(); let mut t2_intent = IntentState::new(); - let scheduled = scheduler.schedule_shard(&[])?; + let context = ScheduleContext::default(); + + let scheduled = scheduler.schedule_shard(&[], &context)?; t1_intent.set_attached(&mut scheduler, Some(scheduled)); - let scheduled = scheduler.schedule_shard(&[])?; + let scheduled = scheduler.schedule_shard(&[], &context)?; t2_intent.set_attached(&mut scheduler, Some(scheduled)); assert_eq!(scheduler.nodes.get(&NodeId(1)).unwrap().shard_count, 1); assert_eq!(scheduler.nodes.get(&NodeId(2)).unwrap().shard_count, 1); - let scheduled = scheduler.schedule_shard(&t1_intent.all_pageservers())?; + let scheduled = scheduler.schedule_shard(&t1_intent.all_pageservers(), &context)?; t1_intent.push_secondary(&mut scheduler, scheduled); assert_eq!(scheduler.nodes.get(&NodeId(1)).unwrap().shard_count, 1); diff --git a/control_plane/attachment_service/src/schema.rs b/storage_controller/src/schema.rs similarity index 95% rename from control_plane/attachment_service/src/schema.rs rename to storage_controller/src/schema.rs index 76e4e56a66..ff37d0fe77 100644 --- a/control_plane/attachment_service/src/schema.rs +++ b/storage_controller/src/schema.rs @@ -22,6 +22,7 @@ diesel::table! { placement_policy -> Varchar, splitting -> Int2, config -> Text, + scheduling_policy -> Varchar, } } diff --git a/control_plane/attachment_service/src/service.rs b/storage_controller/src/service.rs similarity index 89% rename from control_plane/attachment_service/src/service.rs rename to storage_controller/src/service.rs index 925910253b..0565f8e7b4 100644 --- a/control_plane/attachment_service/src/service.rs +++ b/storage_controller/src/service.rs @@ -8,7 +8,10 @@ use std::{ }; use crate::{ - id_lock_map::IdLockMap, persistence::AbortShardSplitStatus, reconciler::ReconcileError, + id_lock_map::IdLockMap, + persistence::{AbortShardSplitStatus, TenantFilter}, + reconciler::ReconcileError, + scheduler::{ScheduleContext, ScheduleMode}, }; use anyhow::Context; use control_plane::storage_controller::{ @@ -17,12 +20,14 @@ use control_plane::storage_controller::{ use diesel::result::DatabaseErrorKind; use futures::{stream::FuturesUnordered, StreamExt}; use hyper::StatusCode; +use itertools::Itertools; use pageserver_api::{ controller_api::{ NodeAvailability, NodeRegisterRequest, NodeSchedulingPolicy, PlacementPolicy, - TenantCreateResponse, TenantCreateResponseShard, TenantDescribeResponse, - TenantDescribeResponseShard, TenantLocateResponse, TenantShardMigrateRequest, - TenantShardMigrateResponse, UtilizationScore, + ShardSchedulingPolicy, TenantCreateResponse, TenantCreateResponseShard, + TenantDescribeResponse, TenantDescribeResponseShard, TenantLocateResponse, + TenantPolicyRequest, TenantShardMigrateRequest, TenantShardMigrateResponse, + UtilizationScore, }, models::{SecondaryProgress, TenantConfigRequest}, }; @@ -51,7 +56,6 @@ use utils::{ generation::Generation, http::error::ApiError, id::{NodeId, TenantId, TimelineId}, - seqwait::SeqWait, sync::gate::Gate, }; @@ -62,11 +66,10 @@ use crate::{ persistence::{split_state::SplitState, DatabaseError, Persistence, TenantShardPersistence}, reconciler::attached_location_conf, scheduler::Scheduler, - tenant_state::{ + tenant_shard::{ IntentState, ObservedState, ObservedStateLocation, ReconcileResult, ReconcileWaitError, - ReconcilerWaiter, TenantState, + ReconcilerWaiter, TenantShard, }, - Sequence, }; // For operations that should be quick, like attaching a new tenant @@ -89,7 +92,7 @@ pub const MAX_UNAVAILABLE_INTERVAL_DEFAULT: Duration = Duration::from_secs(30); // Top level state available to all HTTP handlers struct ServiceState { - tenants: BTreeMap, + tenants: BTreeMap, nodes: Arc>, @@ -99,7 +102,7 @@ struct ServiceState { impl ServiceState { fn new( nodes: HashMap, - tenants: BTreeMap, + tenants: BTreeMap, scheduler: Scheduler, ) -> Self { Self { @@ -113,7 +116,7 @@ impl ServiceState { &mut self, ) -> ( &mut Arc>, - &mut BTreeMap, + &mut BTreeMap, &mut Scheduler, ) { (&mut self.nodes, &mut self.tenants, &mut self.scheduler) @@ -332,11 +335,11 @@ impl Service { for (tenant_shard_id, shard_observations) in observed { for (node_id, observed_loc) in shard_observations { - let Some(tenant_state) = tenants.get_mut(&tenant_shard_id) else { + let Some(tenant_shard) = tenants.get_mut(&tenant_shard_id) else { cleanup.push((tenant_shard_id, node_id)); continue; }; - tenant_state + tenant_shard .observed .locations .insert(node_id, ObservedStateLocation { conf: observed_loc }); @@ -344,9 +347,15 @@ impl Service { } // Populate each tenant's intent state - for (tenant_shard_id, tenant_state) in tenants.iter_mut() { - tenant_state.intent_from_observed(scheduler); - if let Err(e) = tenant_state.schedule(scheduler) { + let mut schedule_context = ScheduleContext::default(); + for (tenant_shard_id, tenant_shard) in tenants.iter_mut() { + if tenant_shard_id.shard_number == ShardNumber(0) { + // Reset scheduling context each time we advance to the next Tenant + schedule_context = ScheduleContext::default(); + } + + tenant_shard.intent_from_observed(scheduler); + if let Err(e) = tenant_shard.schedule(scheduler, &mut schedule_context) { // Non-fatal error: we are unable to properly schedule the tenant, perhaps because // not enough pageservers are available. The tenant may well still be available // to clients. @@ -355,11 +364,11 @@ impl Service { // If we're both intending and observed to be attached at a particular node, we will // emit a compute notification for this. In the case where our observed state does not // yet match our intent, we will eventually reconcile, and that will emit a compute notification. - if let Some(attached_at) = tenant_state.stably_attached() { + if let Some(attached_at) = tenant_shard.stably_attached() { compute_notifications.push(( *tenant_shard_id, attached_at, - tenant_state.shard.stripe_size, + tenant_shard.shard.stripe_size, )); } } @@ -670,7 +679,13 @@ impl Service { let mut interval = tokio::time::interval(BACKGROUND_RECONCILE_PERIOD); while !self.cancel.is_cancelled() { tokio::select! { - _ = interval.tick() => { self.reconcile_all(); } + _ = interval.tick() => { + let reconciles_spawned = self.reconcile_all(); + if reconciles_spawned == 0 { + // Run optimizer only when we didn't find any other work to do + self.optimize_all(); + } + } _ = self.cancel.cancelled() => return } } @@ -728,7 +743,7 @@ impl Service { /// Apply the contents of a [`ReconcileResult`] to our in-memory state: if the reconciliation /// was successful, this will update the observed state of the tenant such that subsequent - /// calls to [`TenantState::maybe_reconcile`] will do nothing. + /// calls to [`TenantShard::maybe_reconcile`] will do nothing. #[instrument(skip_all, fields( tenant_id=%result.tenant_shard_id.tenant_id, shard_id=%result.tenant_shard_id.shard_slug(), sequence=%result.sequence @@ -746,10 +761,10 @@ impl Service { tenant.generation = std::cmp::max(tenant.generation, result.generation); // If the reconciler signals that it failed to notify compute, set this state on - // the shard so that a future [`TenantState::maybe_reconcile`] will try again. + // the shard so that a future [`TenantShard::maybe_reconcile`] will try again. tenant.pending_compute_notification = result.pending_compute_notification; - // Let the TenantState know it is idle. + // Let the TenantShard know it is idle. tenant.reconcile_complete(result.sequence); match result.result { @@ -957,30 +972,14 @@ impl Service { } for tsp in tenant_shard_persistence { let tenant_shard_id = tsp.get_tenant_shard_id()?; - let shard_identity = tsp.get_shard_identity()?; + // We will populate intent properly later in [`Self::startup_reconcile`], initially populate // it with what we can infer: the node for which a generation was most recently issued. let mut intent = IntentState::new(); if let Some(generation_pageserver) = tsp.generation_pageserver { intent.set_attached(&mut scheduler, Some(NodeId(generation_pageserver as u64))); } - - let new_tenant = TenantState { - tenant_shard_id, - shard: shard_identity, - sequence: Sequence::initial(), - generation: tsp.generation.map(|g| Generation::new(g as u32)), - policy: serde_json::from_str(&tsp.placement_policy).unwrap(), - intent, - observed: ObservedState::new(), - config: serde_json::from_str(&tsp.config).unwrap(), - reconciler: None, - splitting: tsp.splitting, - waiter: Arc::new(SeqWait::new(Sequence::initial())), - error_waiter: Arc::new(SeqWait::new(Sequence::initial())), - last_error: Arc::default(), - pending_compute_notification: false, - }; + let new_tenant = TenantShard::from_persistent(tsp, intent)?; tenants.insert(tenant_shard_id, new_tenant); } @@ -1104,6 +1103,8 @@ impl Service { placement_policy: serde_json::to_string(&PlacementPolicy::Attached(0)).unwrap(), config: serde_json::to_string(&TenantConfig::default()).unwrap(), splitting: SplitState::default(), + scheduling_policy: serde_json::to_string(&ShardSchedulingPolicy::default()) + .unwrap(), }; match self.persistence.insert_tenant_shards(vec![tsp]).await { @@ -1125,7 +1126,7 @@ impl Service { let mut locked = self.inner.write().unwrap(); locked.tenants.insert( attach_req.tenant_shard_id, - TenantState::new( + TenantShard::new( attach_req.tenant_shard_id, ShardIdentity::unsharded(), PlacementPolicy::Attached(0), @@ -1156,9 +1157,10 @@ impl Service { // when we reattaching a detached tenant. self.persistence .update_tenant_shard( - attach_req.tenant_shard_id, - PlacementPolicy::Attached(0), - conf, + TenantFilter::Shard(attach_req.tenant_shard_id), + Some(PlacementPolicy::Attached(0)), + Some(conf), + None, None, ) .await?; @@ -1176,32 +1178,32 @@ impl Service { let mut locked = self.inner.write().unwrap(); let (_nodes, tenants, scheduler) = locked.parts_mut(); - let tenant_state = tenants + let tenant_shard = tenants .get_mut(&attach_req.tenant_shard_id) .expect("Checked for existence above"); if let Some(new_generation) = new_generation { - tenant_state.generation = Some(new_generation); - tenant_state.policy = PlacementPolicy::Attached(0); + tenant_shard.generation = Some(new_generation); + tenant_shard.policy = PlacementPolicy::Attached(0); } else { // This is a detach notification. We must update placement policy to avoid re-attaching // during background scheduling/reconciliation, or during storage controller restart. assert!(attach_req.node_id.is_none()); - tenant_state.policy = PlacementPolicy::Detached; + tenant_shard.policy = PlacementPolicy::Detached; } if let Some(attaching_pageserver) = attach_req.node_id.as_ref() { tracing::info!( tenant_id = %attach_req.tenant_shard_id, ps_id = %attaching_pageserver, - generation = ?tenant_state.generation, + generation = ?tenant_shard.generation, "issuing", ); - } else if let Some(ps_id) = tenant_state.intent.get_attached() { + } else if let Some(ps_id) = tenant_shard.intent.get_attached() { tracing::info!( tenant_id = %attach_req.tenant_shard_id, %ps_id, - generation = ?tenant_state.generation, + generation = ?tenant_shard.generation, "dropping", ); } else { @@ -1209,14 +1211,14 @@ impl Service { tenant_id = %attach_req.tenant_shard_id, "no-op: tenant already has no pageserver"); } - tenant_state + tenant_shard .intent .set_attached(scheduler, attach_req.node_id); tracing::info!( "attach_hook: tenant {} set generation {:?}, pageserver {}", attach_req.tenant_shard_id, - tenant_state.generation, + tenant_shard.generation, // TODO: this is an odd number of 0xf's attach_req.node_id.unwrap_or(utils::id::NodeId(0xfffffff)) ); @@ -1228,36 +1230,36 @@ impl Service { #[cfg(feature = "testing")] { if let Some(node_id) = attach_req.node_id { - tenant_state.observed.locations = HashMap::from([( + tenant_shard.observed.locations = HashMap::from([( node_id, ObservedStateLocation { conf: Some(attached_location_conf( - tenant_state.generation.unwrap(), - &tenant_state.shard, - &tenant_state.config, + tenant_shard.generation.unwrap(), + &tenant_shard.shard, + &tenant_shard.config, false, )), }, )]); } else { - tenant_state.observed.locations.clear(); + tenant_shard.observed.locations.clear(); } } Ok(AttachHookResponse { gen: attach_req .node_id - .map(|_| tenant_state.generation.expect("Test hook, not used on tenants that are mid-onboarding with a NULL generation").into().unwrap()), + .map(|_| tenant_shard.generation.expect("Test hook, not used on tenants that are mid-onboarding with a NULL generation").into().unwrap()), }) } pub(crate) fn inspect(&self, inspect_req: InspectRequest) -> InspectResponse { let locked = self.inner.read().unwrap(); - let tenant_state = locked.tenants.get(&inspect_req.tenant_shard_id); + let tenant_shard = locked.tenants.get(&inspect_req.tenant_shard_id); InspectResponse { - attachment: tenant_state.and_then(|s| { + attachment: tenant_shard.and_then(|s| { s.intent .get_attached() .map(|ps| (s.generation.expect("Test hook, not used on tenants that are mid-onboarding with a NULL generation").into().unwrap(), ps)) @@ -1319,11 +1321,11 @@ impl Service { let mut locked = self.inner.write().unwrap(); for (tenant_shard_id, observed_loc) in configs.tenant_shards { - let Some(tenant_state) = locked.tenants.get_mut(&tenant_shard_id) else { + let Some(tenant_shard) = locked.tenants.get_mut(&tenant_shard_id) else { cleanup.push(tenant_shard_id); continue; }; - tenant_state + tenant_shard .observed .locations .insert(node.get_id(), ObservedStateLocation { conf: observed_loc }); @@ -1494,13 +1496,13 @@ impl Service { }; for req_tenant in validate_req.tenants { - if let Some(tenant_state) = locked.tenants.get(&req_tenant.id) { - let valid = tenant_state.generation == Some(Generation::new(req_tenant.gen)); + if let Some(tenant_shard) = locked.tenants.get(&req_tenant.id) { + let valid = tenant_shard.generation == Some(Generation::new(req_tenant.gen)); tracing::info!( "handle_validate: {}(gen {}): valid={valid} (latest {:?})", req_tenant.id, req_tenant.gen, - tenant_state.generation + tenant_shard.generation ); response.tenants.push(ValidateResponseTenant { id: req_tenant.id, @@ -1615,6 +1617,8 @@ impl Service { placement_policy: serde_json::to_string(&placement_policy).unwrap(), config: serde_json::to_string(&create_req.config).unwrap(), splitting: SplitState::default(), + scheduling_policy: serde_json::to_string(&ShardSchedulingPolicy::default()) + .unwrap(), }) .collect(); @@ -1637,6 +1641,8 @@ impl Service { Err(e) => return Err(ApiError::InternalServerError(anyhow::anyhow!(e))), }; + let mut schedule_context = ScheduleContext::default(); + let (waiters, response_shards) = { let mut locked = self.inner.write().unwrap(); let (nodes, tenants, scheduler) = locked.parts_mut(); @@ -1658,11 +1664,14 @@ impl Service { // attached and secondary locations (independently) away frorm those // pageservers also holding a shard for this tenant. - entry.get_mut().schedule(scheduler).map_err(|e| { - ApiError::Conflict(format!( - "Failed to schedule shard {tenant_shard_id}: {e}" - )) - })?; + entry + .get_mut() + .schedule(scheduler, &mut schedule_context) + .map_err(|e| { + ApiError::Conflict(format!( + "Failed to schedule shard {tenant_shard_id}: {e}" + )) + })?; if let Some(node_id) = entry.get().intent.get_attached() { let generation = entry @@ -1679,7 +1688,7 @@ impl Service { continue; } Entry::Vacant(entry) => { - let state = entry.insert(TenantState::new( + let state = entry.insert(TenantShard::new( tenant_shard_id, ShardIdentity::from_params( tenant_shard_id.shard_number, @@ -1690,7 +1699,7 @@ impl Service { state.generation = initial_generation; state.config = create_req.config.clone(); - if let Err(e) = state.schedule(scheduler) { + if let Err(e) = state.schedule(scheduler, &mut schedule_context) { schcedule_error = Some(e); } @@ -1754,6 +1763,9 @@ impl Service { /// Part of [`Self::tenant_location_config`]: dissect an incoming location config request, /// and transform it into either a tenant creation of a series of shard updates. + /// + /// If the incoming request makes no changes, a [`TenantCreateOrUpdate::Update`] result will + /// still be returned. fn tenant_location_config_prepare( &self, tenant_id: TenantId, @@ -1801,17 +1813,12 @@ impl Service { _ => None, }; - if shard.policy != placement_policy - || shard.config != req.config.tenant_conf - || set_generation.is_some() - { - updates.push(ShardUpdate { - tenant_shard_id: *shard_id, - placement_policy: placement_policy.clone(), - tenant_config: req.config.tenant_conf.clone(), - generation: set_generation, - }); - } + updates.push(ShardUpdate { + tenant_shard_id: *shard_id, + placement_policy: placement_policy.clone(), + tenant_config: req.config.tenant_conf.clone(), + generation: set_generation, + }); } if create { @@ -1840,6 +1847,7 @@ impl Service { }, ) } else { + assert!(!updates.is_empty()); TenantCreateOrUpdate::Update(updates) } } @@ -1898,6 +1906,7 @@ impl Service { // Persist updates // Ordering: write to the database before applying changes in-memory, so that // we will not appear time-travel backwards on a restart. + let mut schedule_context = ScheduleContext::default(); for ShardUpdate { tenant_shard_id, placement_policy, @@ -1907,10 +1916,11 @@ impl Service { { self.persistence .update_tenant_shard( - *tenant_shard_id, - placement_policy.clone(), - tenant_config.clone(), + TenantFilter::Shard(*tenant_shard_id), + Some(placement_policy.clone()), + Some(tenant_config.clone()), *generation, + None, ) .await?; } @@ -1944,7 +1954,7 @@ impl Service { shard.generation = Some(generation); } - shard.schedule(scheduler)?; + shard.schedule(scheduler, &mut schedule_context)?; let maybe_waiter = self.maybe_reconcile_shard(shard, nodes); if let Some(waiter) = maybe_waiter { @@ -1988,7 +1998,13 @@ impl Service { let config = req.config; self.persistence - .update_tenant_config(req.tenant_id, config.clone()) + .update_tenant_shard( + TenantFilter::Tenant(req.tenant_id), + None, + Some(config.clone()), + None, + None, + ) .await?; let waiters = { @@ -2098,7 +2114,7 @@ impl Service { let scheduler = &locked.scheduler; // Right now we only perform the operation on a single node without parallelization // TODO fan out the operation to multiple nodes for better performance - let node_id = scheduler.schedule_shard(&[])?; + let node_id = scheduler.schedule_shard(&[], &ScheduleContext::default())?; let node = locked .nodes .get(&node_id) @@ -2341,6 +2357,58 @@ impl Service { Ok(StatusCode::NOT_FOUND) } + /// Naming: this configures the storage controller's policies for a tenant, whereas [`Self::tenant_config_set`] is "set the TenantConfig" + /// for a tenant. The TenantConfig is passed through to pageservers, whereas this function modifies + /// the tenant's policies (configuration) within the storage controller + pub(crate) async fn tenant_update_policy( + &self, + tenant_id: TenantId, + req: TenantPolicyRequest, + ) -> Result<(), ApiError> { + // We require an exclusive lock, because we are updating persistent and in-memory state + let _tenant_lock = self.tenant_op_locks.exclusive(tenant_id).await; + + let TenantPolicyRequest { + placement, + scheduling, + } = req; + + self.persistence + .update_tenant_shard( + TenantFilter::Tenant(tenant_id), + placement.clone(), + None, + None, + scheduling, + ) + .await?; + + let mut schedule_context = ScheduleContext::default(); + let mut locked = self.inner.write().unwrap(); + let (nodes, tenants, scheduler) = locked.parts_mut(); + for (shard_id, shard) in tenants.range_mut(TenantShardId::tenant_range(tenant_id)) { + if let Some(placement) = &placement { + shard.policy = placement.clone(); + + tracing::info!(tenant_id=%shard_id.tenant_id, shard_id=%shard_id.shard_slug(), + "Updated placement policy to {placement:?}"); + } + + if let Some(scheduling) = &scheduling { + shard.set_scheduling_policy(*scheduling); + + tracing::info!(tenant_id=%shard_id.tenant_id, shard_id=%shard_id.shard_slug(), + "Updated scheduling policy to {scheduling:?}"); + } + + // In case scheduling is being switched back on, try it now. + shard.schedule(scheduler, &mut schedule_context).ok(); + self.maybe_reconcile_shard(shard, nodes); + } + + Ok(()) + } + pub(crate) async fn tenant_timeline_create( &self, tenant_id: TenantId, @@ -2667,47 +2735,73 @@ impl Service { }) } - pub(crate) fn tenant_describe( + /// Returns None if the input iterator of shards does not include a shard with number=0 + fn tenant_describe_impl<'a>( &self, - tenant_id: TenantId, - ) -> Result { - let locked = self.inner.read().unwrap(); - + shards: impl Iterator, + ) -> Option { let mut shard_zero = None; - let mut shards = Vec::new(); + let mut describe_shards = Vec::new(); - for (tenant_shard_id, shard) in locked.tenants.range(TenantShardId::tenant_range(tenant_id)) - { - if tenant_shard_id.is_zero() { + for shard in shards { + if shard.tenant_shard_id.is_shard_zero() { shard_zero = Some(shard); } - let response_shard = TenantDescribeResponseShard { - tenant_shard_id: *tenant_shard_id, + describe_shards.push(TenantDescribeResponseShard { + tenant_shard_id: shard.tenant_shard_id, node_attached: *shard.intent.get_attached(), node_secondary: shard.intent.get_secondary().to_vec(), last_error: shard.last_error.lock().unwrap().clone(), is_reconciling: shard.reconciler.is_some(), is_pending_compute_notification: shard.pending_compute_notification, is_splitting: matches!(shard.splitting, SplitState::Splitting), - }; - shards.push(response_shard); + scheduling_policy: *shard.get_scheduling_policy(), + }) } - let Some(shard_zero) = shard_zero else { - return Err(ApiError::NotFound( - anyhow::anyhow!("Tenant {tenant_id} not found").into(), - )); - }; + let shard_zero = shard_zero?; - Ok(TenantDescribeResponse { - shards, + Some(TenantDescribeResponse { + tenant_id: shard_zero.tenant_shard_id.tenant_id, + shards: describe_shards, stripe_size: shard_zero.shard.stripe_size, policy: shard_zero.policy.clone(), config: shard_zero.config.clone(), }) } + pub(crate) fn tenant_describe( + &self, + tenant_id: TenantId, + ) -> Result { + let locked = self.inner.read().unwrap(); + + self.tenant_describe_impl( + locked + .tenants + .range(TenantShardId::tenant_range(tenant_id)) + .map(|(_k, v)| v), + ) + .ok_or_else(|| ApiError::NotFound(anyhow::anyhow!("Tenant {tenant_id} not found").into())) + } + + pub(crate) fn tenant_list(&self) -> Vec { + let locked = self.inner.read().unwrap(); + + let mut result = Vec::new(); + for (_tenant_id, tenant_shards) in + &locked.tenants.iter().group_by(|(id, _shard)| id.tenant_id) + { + result.push( + self.tenant_describe_impl(tenant_shards.map(|(_k, v)| v)) + .expect("Groups are always non-empty"), + ); + } + + result + } + #[instrument(skip_all, fields(tenant_id=%op.tenant_id))] async fn abort_tenant_shard_split( &self, @@ -2798,7 +2892,7 @@ impl Service { tracing::info!("Restoring parent shard {tenant_shard_id}"); shard.splitting = SplitState::Idle; - if let Err(e) = shard.schedule(scheduler) { + if let Err(e) = shard.schedule(scheduler, &mut ScheduleContext::default()) { // If this shard can't be scheduled now (perhaps due to offline nodes or // capacity issues), that must not prevent us rolling back a split. In this // case it should be eventually scheduled in the background. @@ -2922,6 +3016,7 @@ impl Service { ) }; + let mut schedule_context = ScheduleContext::default(); for child in child_ids { let mut child_shard = parent_ident; child_shard.number = child.shard_number; @@ -2943,7 +3038,7 @@ impl Service { }, ); - let mut child_state = TenantState::new(child, child_shard, policy.clone()); + let mut child_state = TenantShard::new(child, child_shard, policy.clone()); child_state.intent = IntentState::single(scheduler, Some(pageserver)); child_state.observed = ObservedState { locations: child_observed, @@ -2951,13 +3046,13 @@ impl Service { child_state.generation = Some(generation); child_state.config = config.clone(); - // The child's TenantState::splitting is intentionally left at the default value of Idle, + // The child's TenantShard::splitting is intentionally left at the default value of Idle, // as at this point in the split process we have succeeded and this part is infallible: // we will never need to do any special recovery from this state. child_locations.push((child, pageserver, child_shard.stripe_size)); - if let Err(e) = child_state.schedule(scheduler) { + if let Err(e) = child_state.schedule(scheduler, &mut schedule_context) { // This is not fatal, because we've implicitly already got an attached // location for the child shard. Failure here just means we couldn't // find a secondary (e.g. because cluster is overloaded). @@ -3250,6 +3345,10 @@ impl Service { placement_policy: serde_json::to_string(&policy).unwrap(), config: serde_json::to_string(&config).unwrap(), splitting: SplitState::Splitting, + + // Scheduling policies do not carry through to children + scheduling_policy: serde_json::to_string(&ShardSchedulingPolicy::default()) + .unwrap(), }); } @@ -3496,8 +3595,8 @@ impl Service { Ok(()) } - /// For debug/support: a full JSON dump of TenantStates. Returns a response so that - /// we don't have to make TenantState clonable in the return path. + /// For debug/support: a full JSON dump of TenantShards. Returns a response so that + /// we don't have to make TenantShard clonable in the return path. pub(crate) fn tenants_dump(&self) -> Result, ApiError> { let serialized = { let locked = self.inner.read().unwrap(); @@ -3601,7 +3700,7 @@ impl Service { } /// For debug/support: a JSON dump of the [`Scheduler`]. Returns a response so that - /// we don't have to make TenantState clonable in the return path. + /// we don't have to make TenantShard clonable in the return path. pub(crate) fn scheduler_dump(&self) -> Result, ApiError> { let serialized = { let locked = self.inner.read().unwrap(); @@ -3817,8 +3916,9 @@ impl Service { AvailabilityTransition::ToOffline => { tracing::info!("Node {} transition to offline", node_id); let mut tenants_affected: usize = 0; - for (tenant_shard_id, tenant_state) in tenants { - if let Some(observed_loc) = tenant_state.observed.locations.get_mut(&node_id) { + + for (tenant_shard_id, tenant_shard) in tenants { + if let Some(observed_loc) = tenant_shard.observed.locations.get_mut(&node_id) { // When a node goes offline, we set its observed configuration to None, indicating unknown: we will // not assume our knowledge of the node's configuration is accurate until it comes back online observed_loc.conf = None; @@ -3831,18 +3931,24 @@ impl Service { continue; } - if tenant_state.intent.demote_attached(node_id) { - tenant_state.sequence = tenant_state.sequence.next(); - match tenant_state.schedule(scheduler) { + if tenant_shard.intent.demote_attached(node_id) { + tenant_shard.sequence = tenant_shard.sequence.next(); + + // TODO: populate a ScheduleContext including all shards in the same tenant_id (only matters + // for tenants without secondary locations: if they have a secondary location, then this + // schedule() call is just promoting an existing secondary) + let mut schedule_context = ScheduleContext::default(); + + match tenant_shard.schedule(scheduler, &mut schedule_context) { Err(e) => { // It is possible that some tenants will become unschedulable when too many pageservers // go offline: in this case there isn't much we can do other than make the issue observable. - // TODO: give TenantState a scheduling error attribute to be queried later. + // TODO: give TenantShard a scheduling error attribute to be queried later. tracing::warn!(%tenant_shard_id, "Scheduling error when marking pageserver {} offline: {e}", node_id); } Ok(()) => { if self - .maybe_reconcile_shard(tenant_state, &new_nodes) + .maybe_reconcile_shard(tenant_shard, &new_nodes) .is_some() { tenants_affected += 1; @@ -3861,10 +3967,10 @@ impl Service { tracing::info!("Node {} transition to active", node_id); // When a node comes back online, we must reconcile any tenant that has a None observed // location on the node. - for tenant_state in locked.tenants.values_mut() { - if let Some(observed_loc) = tenant_state.observed.locations.get_mut(&node_id) { + for tenant_shard in locked.tenants.values_mut() { + if let Some(observed_loc) = tenant_shard.observed.locations.get_mut(&node_id) { if observed_loc.conf.is_none() { - self.maybe_reconcile_shard(tenant_state, &new_nodes); + self.maybe_reconcile_shard(tenant_shard, &new_nodes); } } } @@ -3884,9 +3990,6 @@ impl Service { /// Helper for methods that will try and call pageserver APIs for /// a tenant, such as timeline CRUD: they cannot proceed unless the tenant /// is attached somewhere. - /// - /// TODO: this doesn't actually ensure attached unless the PlacementPolicy is - /// an attached policy. We should error out if it isn't. fn ensure_attached_schedule( &self, mut locked: std::sync::RwLockWriteGuard<'_, ServiceState>, @@ -3895,10 +3998,27 @@ impl Service { let mut waiters = Vec::new(); let (nodes, tenants, scheduler) = locked.parts_mut(); - for (_tenant_shard_id, shard) in tenants.range_mut(TenantShardId::tenant_range(tenant_id)) { - shard.schedule(scheduler)?; + let mut schedule_context = ScheduleContext::default(); + for (tenant_shard_id, shard) in tenants.range_mut(TenantShardId::tenant_range(tenant_id)) { + shard.schedule(scheduler, &mut schedule_context)?; + + // The shard's policies may not result in an attached location being scheduled: this + // is an error because our caller needs it attached somewhere. + if shard.intent.get_attached().is_none() { + return Err(anyhow::anyhow!( + "Tenant {tenant_id} not scheduled to be attached" + )); + }; + + if shard.stably_attached().is_some() { + // We do not require the shard to be totally up to date on reconciliation: we just require + // that it has been attached on the intended node. Other dirty state such as unattached secondary + // locations, or compute hook notifications can be ignored. + continue; + } if let Some(waiter) = self.maybe_reconcile_shard(shard, nodes) { + tracing::info!("Waiting for shard {tenant_shard_id} to reconcile, in order to ensure it is attached"); waiters.push(waiter); } } @@ -3933,11 +4053,11 @@ impl Service { Ok(()) } - /// Convenience wrapper around [`TenantState::maybe_reconcile`] that provides + /// Convenience wrapper around [`TenantShard::maybe_reconcile`] that provides /// all the references to parts of Self that are needed fn maybe_reconcile_shard( &self, - shard: &mut TenantState, + shard: &mut TenantShard, nodes: &Arc>, ) -> Option { shard.maybe_reconcile( @@ -3960,8 +4080,145 @@ impl Service { let (nodes, tenants, _scheduler) = locked.parts_mut(); let pageservers = nodes.clone(); + let mut schedule_context = ScheduleContext::default(); + let mut reconciles_spawned = 0; - for (_tenant_shard_id, shard) in tenants.iter_mut() { + for (tenant_shard_id, shard) in tenants.iter_mut() { + if tenant_shard_id.is_shard_zero() { + schedule_context = ScheduleContext::default(); + } + + // Eventual consistency: if an earlier reconcile job failed, and the shard is still + // dirty, spawn another rone + if self.maybe_reconcile_shard(shard, &pageservers).is_some() { + reconciles_spawned += 1; + } + + schedule_context.avoid(&shard.intent.all_pageservers()); + } + + reconciles_spawned + } + + /// `optimize` in this context means identifying shards which have valid scheduled locations, but + /// could be scheduled somewhere better: + /// - Cutting over to a secondary if the node with the secondary is more lightly loaded + /// * e.g. after a node fails then recovers, to move some work back to it + /// - Cutting over to a secondary if it improves the spread of shard attachments within a tenant + /// * e.g. after a shard split, the initial attached locations will all be on the node where + /// we did the split, but are probably better placed elsewhere. + /// - Creating new secondary locations if it improves the spreading of a sharded tenant + /// * e.g. after a shard split, some locations will be on the same node (where the split + /// happened), and will probably be better placed elsewhere. + /// + /// To put it more briefly: whereas the scheduler respects soft constraints in a ScheduleContext at + /// the time of scheduling, this function looks for cases where a better-scoring location is available + /// according to those same soft constraints. + fn optimize_all(&self) -> usize { + let mut locked = self.inner.write().unwrap(); + let (nodes, tenants, scheduler) = locked.parts_mut(); + let pageservers = nodes.clone(); + + let mut schedule_context = ScheduleContext::default(); + + let mut reconciles_spawned = 0; + + let mut tenant_shards: Vec<&TenantShard> = Vec::new(); + + // Limit on how many shards' optmizations each call to this function will execute. Combined + // with the frequency of background calls, this acts as an implicit rate limit that runs a small + // trickle of optimizations in the background, rather than executing a large number in parallel + // when a change occurs. + const MAX_OPTIMIZATIONS_PER_PASS: usize = 2; + + let mut work = Vec::new(); + + for (tenant_shard_id, shard) in tenants.iter() { + if tenant_shard_id.is_shard_zero() { + // Reset accumulators on the first shard in a tenant + schedule_context = ScheduleContext::default(); + schedule_context.mode = ScheduleMode::Speculative; + tenant_shards.clear(); + } + + if work.len() >= MAX_OPTIMIZATIONS_PER_PASS { + break; + } + + match shard.get_scheduling_policy() { + ShardSchedulingPolicy::Active => { + // Ok to do optimization + } + ShardSchedulingPolicy::Essential + | ShardSchedulingPolicy::Pause + | ShardSchedulingPolicy::Stop => { + // Policy prevents optimizing this shard. + continue; + } + } + + // Accumulate the schedule context for all the shards in a tenant: we must have + // the total view of all shards before we can try to optimize any of them. + schedule_context.avoid(&shard.intent.all_pageservers()); + if let Some(attached) = shard.intent.get_attached() { + schedule_context.push_attached(*attached); + } + tenant_shards.push(shard); + + // Once we have seen the last shard in the tenant, proceed to search across all shards + // in the tenant for optimizations + if shard.shard.number.0 == shard.shard.count.count() - 1 { + if tenant_shards.iter().any(|s| s.reconciler.is_some()) { + // Do not start any optimizations while another change to the tenant is ongoing: this + // is not necessary for correctness, but simplifies operations and implicitly throttles + // optimization changes to happen in a "trickle" over time. + continue; + } + + if tenant_shards.iter().any(|s| { + !matches!(s.splitting, SplitState::Idle) + || matches!(s.policy, PlacementPolicy::Detached) + }) { + // Never attempt to optimize a tenant that is currently being split, or + // a tenant that is meant to be detached + continue; + } + + // TODO: optimization calculations are relatively expensive: create some fast-path for + // the common idle case (avoiding the search on tenants that we have recently checked) + + for shard in &tenant_shards { + if let Some(optimization) = + // If idle, maybe ptimize attachments: if a shard has a secondary location that is preferable to + // its primary location based on soft constraints, cut it over. + shard.optimize_attachment(nodes, &schedule_context) + { + work.push((shard.tenant_shard_id, optimization)); + break; + } else if let Some(optimization) = + // If idle, maybe optimize secondary locations: if a shard has a secondary location that would be + // better placed on another node, based on ScheduleContext, then adjust it. This + // covers cases like after a shard split, where we might have too many shards + // in the same tenant with secondary locations on the node where they originally split. + shard.optimize_secondary(scheduler, &schedule_context) + { + work.push((shard.tenant_shard_id, optimization)); + break; + } + + // TODO: extend this mechanism to prefer attaching on nodes with fewer attached + // tenants (i.e. extend schedule state to distinguish attached from secondary counts), + // for the total number of attachments on a node (not just within a tenant.) + } + } + } + + for (tenant_shard_id, optimization) in work { + let shard = tenants + .get_mut(&tenant_shard_id) + .expect("We held lock from place we got this ID"); + shard.apply_optimization(scheduler, optimization); + if self.maybe_reconcile_shard(shard, &pageservers).is_some() { reconciles_spawned += 1; } @@ -3970,9 +4227,35 @@ impl Service { reconciles_spawned } + /// Useful for tests: run whatever work a background [`Self::reconcile_all`] would have done, but + /// also wait for any generated Reconcilers to complete. Calling this until it returns zero should + /// put the system into a quiescent state where future background reconciliations won't do anything. + pub(crate) async fn reconcile_all_now(&self) -> Result { + let reconciles_spawned = self.reconcile_all(); + if reconciles_spawned == 0 { + // Only optimize when we are otherwise idle + self.optimize_all(); + } + + let waiters = { + let mut waiters = Vec::new(); + let locked = self.inner.read().unwrap(); + for (_tenant_shard_id, shard) in locked.tenants.iter() { + if let Some(waiter) = shard.get_waiter() { + waiters.push(waiter); + } + } + waiters + }; + + let waiter_count = waiters.len(); + self.await_waiters(waiters, RECONCILE_TIMEOUT).await?; + Ok(waiter_count) + } + pub async fn shutdown(&self) { // Note that this already stops processing any results from reconciles: so - // we do not expect that our [`TenantState`] objects will reach a neat + // we do not expect that our [`TenantShard`] objects will reach a neat // final state. self.cancel.cancel(); diff --git a/control_plane/attachment_service/src/tenant_state.rs b/storage_controller/src/tenant_shard.rs similarity index 60% rename from control_plane/attachment_service/src/tenant_state.rs rename to storage_controller/src/tenant_shard.rs index 83c921dc58..58b8ef8d5d 100644 --- a/control_plane/attachment_service/src/tenant_state.rs +++ b/storage_controller/src/tenant_shard.rs @@ -7,8 +7,9 @@ use std::{ use crate::{ metrics::{self, ReconcileCompleteLabelGroup, ReconcileOutcome}, persistence::TenantShardPersistence, + scheduler::{AffinityScore, MaySchedule, ScheduleContext}, }; -use pageserver_api::controller_api::PlacementPolicy; +use pageserver_api::controller_api::{PlacementPolicy, ShardSchedulingPolicy}; use pageserver_api::{ models::{LocationConfig, LocationConfigMode, TenantConfig}, shard::{ShardIdentity, TenantShardId}, @@ -49,7 +50,7 @@ where /// This struct implement Serialize for debugging purposes, but is _not_ persisted /// itself: see [`crate::persistence`] for the subset of tenant shard state that is persisted. #[derive(Serialize)] -pub(crate) struct TenantState { +pub(crate) struct TenantShard { pub(crate) tenant_shard_id: TenantShardId, pub(crate) shard: ShardIdentity, @@ -116,6 +117,10 @@ pub(crate) struct TenantState { /// sending it. This is the mechanism by which compute notifications are included in the scope /// of state that we publish externally in an eventually consistent way. pub(crate) pending_compute_notification: bool, + + // Support/debug tool: if something is going wrong or flapping with scheduling, this may + // be set to a non-active state to avoid making changes while the issue is fixed. + scheduling_policy: ShardSchedulingPolicy, } #[derive(Default, Clone, Debug, Serialize)] @@ -246,8 +251,13 @@ impl IntentState { impl Drop for IntentState { fn drop(&mut self) { - // Must clear before dropping, to avoid leaving stale refcounts in the Scheduler - debug_assert!(self.attached.is_none() && self.secondary.is_empty()); + // Must clear before dropping, to avoid leaving stale refcounts in the Scheduler. + // We do not check this while panicking, to avoid polluting unit test failures or + // other assertions with this assertion's output. It's still wrong to leak these, + // but if we already have a panic then we don't need to independently flag this case. + if !(std::thread::panicking()) { + debug_assert!(self.attached.is_none() && self.secondary.is_empty()); + } } } @@ -292,6 +302,26 @@ pub enum ReconcileWaitError { Failed(TenantShardId, String), } +#[derive(Eq, PartialEq, Debug)] +pub(crate) struct ReplaceSecondary { + old_node_id: NodeId, + new_node_id: NodeId, +} + +#[derive(Eq, PartialEq, Debug)] +pub(crate) struct MigrateAttachment { + old_attached_node_id: NodeId, + new_attached_node_id: NodeId, +} + +#[derive(Eq, PartialEq, Debug)] +pub(crate) enum ScheduleOptimization { + // Replace one of our secondary locations with a different node + ReplaceSecondary(ReplaceSecondary), + // Migrate attachment to an existing secondary location + MigrateAttachment(MigrateAttachment), +} + impl ReconcilerWaiter { pub(crate) async fn wait_timeout(&self, timeout: Duration) -> Result<(), ReconcileWaitError> { tokio::select! { @@ -324,7 +354,7 @@ pub(crate) struct ReconcilerHandle { } /// When a reconcile task completes, it sends this result object -/// to be applied to the primary TenantState. +/// to be applied to the primary TenantShard. pub(crate) struct ReconcileResult { pub(crate) sequence: Sequence, /// On errors, `observed` should be treated as an incompleted description @@ -337,7 +367,7 @@ pub(crate) struct ReconcileResult { pub(crate) generation: Option, pub(crate) observed: ObservedState, - /// Set [`TenantState::pending_compute_notification`] from this flag + /// Set [`TenantShard::pending_compute_notification`] from this flag pub(crate) pending_compute_notification: bool, } @@ -349,7 +379,7 @@ impl ObservedState { } } -impl TenantState { +impl TenantShard { pub(crate) fn new( tenant_shard_id: TenantShardId, shard: ShardIdentity, @@ -370,6 +400,7 @@ impl TenantState { error_waiter: Arc::new(SeqWait::new(Sequence(0))), last_error: Arc::default(), pending_compute_notification: false, + scheduling_policy: ShardSchedulingPolicy::default(), } } @@ -425,6 +456,7 @@ impl TenantState { fn schedule_attached( &mut self, scheduler: &mut Scheduler, + context: &ScheduleContext, ) -> Result<(bool, NodeId), ScheduleError> { // No work to do if we already have an attached tenant if let Some(node_id) = self.intent.attached { @@ -438,14 +470,33 @@ impl TenantState { Ok((true, promote_secondary)) } else { // Pick a fresh node: either we had no secondaries or none were schedulable - let node_id = scheduler.schedule_shard(&self.intent.secondary)?; + let node_id = scheduler.schedule_shard(&self.intent.secondary, context)?; tracing::debug!("Selected {} as attached", node_id); self.intent.set_attached(scheduler, Some(node_id)); Ok((true, node_id)) } } - pub(crate) fn schedule(&mut self, scheduler: &mut Scheduler) -> Result<(), ScheduleError> { + pub(crate) fn schedule( + &mut self, + scheduler: &mut Scheduler, + context: &mut ScheduleContext, + ) -> Result<(), ScheduleError> { + let r = self.do_schedule(scheduler, context); + + context.avoid(&self.intent.all_pageservers()); + if let Some(attached) = self.intent.get_attached() { + context.push_attached(*attached); + } + + r + } + + pub(crate) fn do_schedule( + &mut self, + scheduler: &mut Scheduler, + context: &ScheduleContext, + ) -> Result<(), ScheduleError> { // TODO: before scheduling new nodes, check if any existing content in // self.intent refers to pageservers that are offline, and pick other // pageservers if so. @@ -453,6 +504,16 @@ impl TenantState { // TODO: respect the splitting bit on tenants: if they are currently splitting then we may not // change their attach location. + match self.scheduling_policy { + ShardSchedulingPolicy::Active | ShardSchedulingPolicy::Essential => {} + ShardSchedulingPolicy::Pause | ShardSchedulingPolicy::Stop => { + // Warn to make it obvious why other things aren't happening/working, if we skip scheduling + tracing::warn!(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug(), + "Scheduling is disabled by policy {:?}", self.scheduling_policy); + return Ok(()); + } + } + // Build the set of pageservers already in use by this tenant, to avoid scheduling // more work on the same pageservers we're already using. let mut modified = false; @@ -479,12 +540,13 @@ impl TenantState { } // Should have exactly one attached, and N secondaries - let (modified_attached, attached_node_id) = self.schedule_attached(scheduler)?; + let (modified_attached, attached_node_id) = + self.schedule_attached(scheduler, context)?; modified |= modified_attached; let mut used_pageservers = vec![attached_node_id]; while self.intent.secondary.len() < secondary_count { - let node_id = scheduler.schedule_shard(&used_pageservers)?; + let node_id = scheduler.schedule_shard(&used_pageservers, context)?; self.intent.push_secondary(scheduler, node_id); used_pageservers.push(node_id); modified = true; @@ -497,7 +559,7 @@ impl TenantState { modified = true; } else if self.intent.secondary.is_empty() { // Populate secondary by scheduling a fresh node - let node_id = scheduler.schedule_shard(&[])?; + let node_id = scheduler.schedule_shard(&[], context)?; self.intent.push_secondary(scheduler, node_id); modified = true; } @@ -524,6 +586,167 @@ impl TenantState { Ok(()) } + /// Optimize attachments: if a shard has a secondary location that is preferable to + /// its primary location based on soft constraints, switch that secondary location + /// to be attached. + #[instrument(skip_all, fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug()))] + pub(crate) fn optimize_attachment( + &self, + nodes: &HashMap, + schedule_context: &ScheduleContext, + ) -> Option { + let attached = (*self.intent.get_attached())?; + if self.intent.secondary.is_empty() { + // We can only do useful work if we have both attached and secondary locations: this + // function doesn't schedule new locations, only swaps between attached and secondaries. + return None; + } + + let current_affinity_score = schedule_context.get_node_affinity(attached); + let current_attachment_count = schedule_context.get_node_attachments(attached); + + // Generate score for each node, dropping any un-schedulable nodes. + let all_pageservers = self.intent.all_pageservers(); + let mut scores = all_pageservers + .iter() + .flat_map(|node_id| { + if matches!( + nodes + .get(node_id) + .map(|n| n.may_schedule()) + .unwrap_or(MaySchedule::No), + MaySchedule::No + ) { + None + } else { + let affinity_score = schedule_context.get_node_affinity(*node_id); + let attachment_count = schedule_context.get_node_attachments(*node_id); + Some((*node_id, affinity_score, attachment_count)) + } + }) + .collect::>(); + + // Sort precedence: + // 1st - prefer nodes with the lowest total affinity score + // 2nd - prefer nodes with the lowest number of attachments in this context + // 3rd - if all else is equal, sort by node ID for determinism in tests. + scores.sort_by_key(|i| (i.1, i.2, i.0)); + + if let Some((preferred_node, preferred_affinity_score, preferred_attachment_count)) = + scores.first() + { + if attached != *preferred_node { + // The best alternative must be more than 1 better than us, otherwise we could end + // up flapping back next time we're called (e.g. there's no point migrating from + // a location with score 1 to a score zero, because on next location the situation + // would be the same, but in reverse). + if current_affinity_score > *preferred_affinity_score + AffinityScore(1) + || current_attachment_count > *preferred_attachment_count + 1 + { + tracing::info!( + "Identified optimization: migrate attachment {attached}->{preferred_node} (secondaries {:?})", + self.intent.get_secondary() + ); + return Some(ScheduleOptimization::MigrateAttachment(MigrateAttachment { + old_attached_node_id: attached, + new_attached_node_id: *preferred_node, + })); + } + } else { + tracing::debug!( + "Node {} is already preferred (score {:?})", + preferred_node, + preferred_affinity_score + ); + } + } + + // Fall-through: we didn't find an optimization + None + } + + #[instrument(skip_all, fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug()))] + pub(crate) fn optimize_secondary( + &self, + scheduler: &Scheduler, + schedule_context: &ScheduleContext, + ) -> Option { + if self.intent.secondary.is_empty() { + // We can only do useful work if we have both attached and secondary locations: this + // function doesn't schedule new locations, only swaps between attached and secondaries. + return None; + } + + for secondary in self.intent.get_secondary() { + let Some(affinity_score) = schedule_context.nodes.get(secondary) else { + // We're already on a node unaffected any affinity constraints, + // so we won't change it. + continue; + }; + + // Let the scheduler suggest a node, where it would put us if we were scheduling afresh + // This implicitly limits the choice to nodes that are available, and prefers nodes + // with lower utilization. + let Ok(candidate_node) = + scheduler.schedule_shard(&self.intent.all_pageservers(), schedule_context) + else { + // A scheduling error means we have no possible candidate replacements + continue; + }; + + let candidate_affinity_score = schedule_context + .nodes + .get(&candidate_node) + .unwrap_or(&AffinityScore::FREE); + + // The best alternative must be more than 1 better than us, otherwise we could end + // up flapping back next time we're called. + if *candidate_affinity_score + AffinityScore(1) < *affinity_score { + // If some other node is available and has a lower score than this node, then + // that other node is a good place to migrate to. + tracing::info!( + "Identified optimization: replace secondary {secondary}->{candidate_node} (current secondaries {:?})", + self.intent.get_secondary() + ); + return Some(ScheduleOptimization::ReplaceSecondary(ReplaceSecondary { + old_node_id: *secondary, + new_node_id: candidate_node, + })); + } + } + + None + } + + pub(crate) fn apply_optimization( + &mut self, + scheduler: &mut Scheduler, + optimization: ScheduleOptimization, + ) { + metrics::METRICS_REGISTRY + .metrics_group + .storage_controller_schedule_optimization + .inc(); + + match optimization { + ScheduleOptimization::MigrateAttachment(MigrateAttachment { + old_attached_node_id, + new_attached_node_id, + }) => { + self.intent.demote_attached(old_attached_node_id); + self.intent + .promote_attached(scheduler, new_attached_node_id); + } + ScheduleOptimization::ReplaceSecondary(ReplaceSecondary { + old_node_id, + new_node_id, + }) => { + self.intent.remove_secondary(scheduler, old_node_id); + self.intent.push_secondary(scheduler, new_node_id); + } + } + } + /// Query whether the tenant's observed state for attached node matches its intent state, and if so, /// yield the node ID. This is appropriate for emitting compute hook notifications: we are checking that /// the node in question is not only where we intend to attach, but that the tenant is indeed already attached there. @@ -668,6 +891,19 @@ impl TenantState { } } + // Pre-checks done: finally check whether we may actually do the work + match self.scheduling_policy { + ShardSchedulingPolicy::Active + | ShardSchedulingPolicy::Essential + | ShardSchedulingPolicy::Pause => {} + ShardSchedulingPolicy::Stop => { + // We only reach this point if there is work to do and we're going to skip + // doing it: warn it obvious why this tenant isn't doing what it ought to. + tracing::warn!("Skipping reconcile for policy {:?}", self.scheduling_policy); + return None; + } + } + // Build list of nodes from which the reconciler should detach let mut detach = Vec::new(); for node_id in self.observed.locations.keys() { @@ -804,6 +1040,22 @@ impl TenantState { }) } + /// Get a waiter for any reconciliation in flight, but do not start reconciliation + /// if it is not already running + pub(crate) fn get_waiter(&self) -> Option { + if self.reconciler.is_some() { + Some(ReconcilerWaiter { + tenant_shard_id: self.tenant_shard_id, + seq_wait: self.waiter.clone(), + error_seq_wait: self.error_waiter.clone(), + error: self.last_error.clone(), + seq: self.sequence, + }) + } else { + None + } + } + /// Called when a ReconcileResult has been emitted and the service is updating /// our state: if the result is from a sequence >= my ReconcileHandle, then drop /// the handle to indicate there is no longer a reconciliation in progress. @@ -829,6 +1081,40 @@ impl TenantState { debug_assert!(!self.intent.all_pageservers().contains(&node_id)); } + pub(crate) fn set_scheduling_policy(&mut self, p: ShardSchedulingPolicy) { + self.scheduling_policy = p; + } + + pub(crate) fn get_scheduling_policy(&self) -> &ShardSchedulingPolicy { + &self.scheduling_policy + } + + pub(crate) fn from_persistent( + tsp: TenantShardPersistence, + intent: IntentState, + ) -> anyhow::Result { + let tenant_shard_id = tsp.get_tenant_shard_id()?; + let shard_identity = tsp.get_shard_identity()?; + + Ok(Self { + tenant_shard_id, + shard: shard_identity, + sequence: Sequence::initial(), + generation: tsp.generation.map(|g| Generation::new(g as u32)), + policy: serde_json::from_str(&tsp.placement_policy).unwrap(), + intent, + observed: ObservedState::new(), + config: serde_json::from_str(&tsp.config).unwrap(), + reconciler: None, + splitting: tsp.splitting, + waiter: Arc::new(SeqWait::new(Sequence::initial())), + error_waiter: Arc::new(SeqWait::new(Sequence::initial())), + last_error: Arc::default(), + pending_compute_notification: false, + scheduling_policy: serde_json::from_str(&tsp.scheduling_policy).unwrap(), + }) + } + pub(crate) fn to_persistent(&self) -> TenantShardPersistence { TenantShardPersistence { tenant_id: self.tenant_shard_id.tenant_id.to_string(), @@ -840,6 +1126,7 @@ impl TenantState { placement_policy: serde_json::to_string(&self.policy).unwrap(), config: serde_json::to_string(&self.config).unwrap(), splitting: SplitState::default(), + scheduling_policy: serde_json::to_string(&self.scheduling_policy).unwrap(), } } } @@ -856,7 +1143,7 @@ pub(crate) mod tests { use super::*; - fn make_test_tenant_shard(policy: PlacementPolicy) -> TenantState { + fn make_test_tenant_shard(policy: PlacementPolicy) -> TenantShard { let tenant_id = TenantId::generate(); let shard_number = ShardNumber(0); let shard_count = ShardCount::new(1); @@ -866,7 +1153,7 @@ pub(crate) mod tests { shard_number, shard_count, }; - TenantState::new( + TenantShard::new( tenant_shard_id, ShardIdentity::new( shard_number, @@ -878,6 +1165,32 @@ pub(crate) mod tests { ) } + fn make_test_tenant(policy: PlacementPolicy, shard_count: ShardCount) -> Vec { + let tenant_id = TenantId::generate(); + + (0..shard_count.count()) + .map(|i| { + let shard_number = ShardNumber(i); + + let tenant_shard_id = TenantShardId { + tenant_id, + shard_number, + shard_count, + }; + TenantShard::new( + tenant_shard_id, + ShardIdentity::new( + shard_number, + shard_count, + pageserver_api::shard::ShardStripeSize(32768), + ) + .unwrap(), + policy.clone(), + ) + }) + .collect() + } + /// Test the scheduling behaviors used when a tenant configured for HA is subject /// to nodes being marked offline. #[test] @@ -887,25 +1200,26 @@ pub(crate) mod tests { let mut nodes = make_test_nodes(3); let mut scheduler = Scheduler::new(nodes.values()); + let mut context = ScheduleContext::default(); - let mut tenant_state = make_test_tenant_shard(PlacementPolicy::Attached(1)); - tenant_state - .schedule(&mut scheduler) + let mut tenant_shard = make_test_tenant_shard(PlacementPolicy::Attached(1)); + tenant_shard + .schedule(&mut scheduler, &mut context) .expect("we have enough nodes, scheduling should work"); // Expect to initially be schedule on to different nodes - assert_eq!(tenant_state.intent.secondary.len(), 1); - assert!(tenant_state.intent.attached.is_some()); + assert_eq!(tenant_shard.intent.secondary.len(), 1); + assert!(tenant_shard.intent.attached.is_some()); - let attached_node_id = tenant_state.intent.attached.unwrap(); - let secondary_node_id = *tenant_state.intent.secondary.iter().last().unwrap(); + let attached_node_id = tenant_shard.intent.attached.unwrap(); + let secondary_node_id = *tenant_shard.intent.secondary.iter().last().unwrap(); assert_ne!(attached_node_id, secondary_node_id); // Notifying the attached node is offline should demote it to a secondary - let changed = tenant_state.intent.demote_attached(attached_node_id); + let changed = tenant_shard.intent.demote_attached(attached_node_id); assert!(changed); - assert!(tenant_state.intent.attached.is_none()); - assert_eq!(tenant_state.intent.secondary.len(), 2); + assert!(tenant_shard.intent.attached.is_none()); + assert_eq!(tenant_shard.intent.secondary.len(), 2); // Update the scheduler state to indicate the node is offline nodes @@ -915,18 +1229,18 @@ pub(crate) mod tests { scheduler.node_upsert(nodes.get(&attached_node_id).unwrap()); // Scheduling the node should promote the still-available secondary node to attached - tenant_state - .schedule(&mut scheduler) + tenant_shard + .schedule(&mut scheduler, &mut context) .expect("active nodes are available"); - assert_eq!(tenant_state.intent.attached.unwrap(), secondary_node_id); + assert_eq!(tenant_shard.intent.attached.unwrap(), secondary_node_id); // The original attached node should have been retained as a secondary assert_eq!( - *tenant_state.intent.secondary.iter().last().unwrap(), + *tenant_shard.intent.secondary.iter().last().unwrap(), attached_node_id ); - tenant_state.intent.clear(&mut scheduler); + tenant_shard.intent.clear(&mut scheduler); Ok(()) } @@ -936,48 +1250,263 @@ pub(crate) mod tests { let nodes = make_test_nodes(3); let mut scheduler = Scheduler::new(nodes.values()); - let mut tenant_state = make_test_tenant_shard(PlacementPolicy::Attached(1)); + let mut tenant_shard = make_test_tenant_shard(PlacementPolicy::Attached(1)); - tenant_state.observed.locations.insert( + tenant_shard.observed.locations.insert( NodeId(3), ObservedStateLocation { conf: Some(LocationConfig { mode: LocationConfigMode::AttachedMulti, generation: Some(2), secondary_conf: None, - shard_number: tenant_state.shard.number.0, - shard_count: tenant_state.shard.count.literal(), - shard_stripe_size: tenant_state.shard.stripe_size.0, + shard_number: tenant_shard.shard.number.0, + shard_count: tenant_shard.shard.count.literal(), + shard_stripe_size: tenant_shard.shard.stripe_size.0, tenant_conf: TenantConfig::default(), }), }, ); - tenant_state.observed.locations.insert( + tenant_shard.observed.locations.insert( NodeId(2), ObservedStateLocation { conf: Some(LocationConfig { mode: LocationConfigMode::AttachedStale, generation: Some(1), secondary_conf: None, - shard_number: tenant_state.shard.number.0, - shard_count: tenant_state.shard.count.literal(), - shard_stripe_size: tenant_state.shard.stripe_size.0, + shard_number: tenant_shard.shard.number.0, + shard_count: tenant_shard.shard.count.literal(), + shard_stripe_size: tenant_shard.shard.stripe_size.0, tenant_conf: TenantConfig::default(), }), }, ); - tenant_state.intent_from_observed(&mut scheduler); + tenant_shard.intent_from_observed(&mut scheduler); // The highest generationed attached location gets used as attached - assert_eq!(tenant_state.intent.attached, Some(NodeId(3))); + assert_eq!(tenant_shard.intent.attached, Some(NodeId(3))); // Other locations get used as secondary - assert_eq!(tenant_state.intent.secondary, vec![NodeId(2)]); + assert_eq!(tenant_shard.intent.secondary, vec![NodeId(2)]); - scheduler.consistency_check(nodes.values(), [&tenant_state].into_iter())?; + scheduler.consistency_check(nodes.values(), [&tenant_shard].into_iter())?; + + tenant_shard.intent.clear(&mut scheduler); + Ok(()) + } + + #[test] + fn scheduling_mode() -> anyhow::Result<()> { + let nodes = make_test_nodes(3); + let mut scheduler = Scheduler::new(nodes.values()); + + let mut tenant_shard = make_test_tenant_shard(PlacementPolicy::Attached(1)); + + // In pause mode, schedule() shouldn't do anything + tenant_shard.scheduling_policy = ShardSchedulingPolicy::Pause; + assert!(tenant_shard + .schedule(&mut scheduler, &mut ScheduleContext::default()) + .is_ok()); + assert!(tenant_shard.intent.all_pageservers().is_empty()); + + // In active mode, schedule() works + tenant_shard.scheduling_policy = ShardSchedulingPolicy::Active; + assert!(tenant_shard + .schedule(&mut scheduler, &mut ScheduleContext::default()) + .is_ok()); + assert!(!tenant_shard.intent.all_pageservers().is_empty()); + + tenant_shard.intent.clear(&mut scheduler); + Ok(()) + } + + #[test] + fn optimize_attachment() -> anyhow::Result<()> { + let nodes = make_test_nodes(3); + let mut scheduler = Scheduler::new(nodes.values()); + + let mut shard_a = make_test_tenant_shard(PlacementPolicy::Attached(1)); + let mut shard_b = make_test_tenant_shard(PlacementPolicy::Attached(1)); + + // Initially: both nodes attached on shard 1, and both have secondary locations + // on different nodes. + shard_a.intent.set_attached(&mut scheduler, Some(NodeId(1))); + shard_a.intent.push_secondary(&mut scheduler, NodeId(2)); + shard_b.intent.set_attached(&mut scheduler, Some(NodeId(1))); + shard_b.intent.push_secondary(&mut scheduler, NodeId(3)); + + let mut schedule_context = ScheduleContext::default(); + schedule_context.avoid(&shard_a.intent.all_pageservers()); + schedule_context.push_attached(shard_a.intent.get_attached().unwrap()); + schedule_context.avoid(&shard_b.intent.all_pageservers()); + schedule_context.push_attached(shard_b.intent.get_attached().unwrap()); + + let optimization_a = shard_a.optimize_attachment(&nodes, &schedule_context); + + // Either shard should recognize that it has the option to switch to a secondary location where there + // would be no other shards from the same tenant, and request to do so. + assert_eq!( + optimization_a, + Some(ScheduleOptimization::MigrateAttachment(MigrateAttachment { + old_attached_node_id: NodeId(1), + new_attached_node_id: NodeId(2) + })) + ); + + // Note that these optimizing two shards in the same tenant with the same ScheduleContext is + // mutually exclusive (the optimization of one invalidates the stats) -- it is the responsibility + // of [`Service::optimize_all`] to avoid trying + // to do optimizations for multiple shards in the same tenant at the same time. Generating + // both optimizations is just done for test purposes + let optimization_b = shard_b.optimize_attachment(&nodes, &schedule_context); + assert_eq!( + optimization_b, + Some(ScheduleOptimization::MigrateAttachment(MigrateAttachment { + old_attached_node_id: NodeId(1), + new_attached_node_id: NodeId(3) + })) + ); + + // Applying these optimizations should result in the end state proposed + shard_a.apply_optimization(&mut scheduler, optimization_a.unwrap()); + assert_eq!(shard_a.intent.get_attached(), &Some(NodeId(2))); + assert_eq!(shard_a.intent.get_secondary(), &vec![NodeId(1)]); + shard_b.apply_optimization(&mut scheduler, optimization_b.unwrap()); + assert_eq!(shard_b.intent.get_attached(), &Some(NodeId(3))); + assert_eq!(shard_b.intent.get_secondary(), &vec![NodeId(1)]); + + shard_a.intent.clear(&mut scheduler); + shard_b.intent.clear(&mut scheduler); + + Ok(()) + } + + #[test] + fn optimize_secondary() -> anyhow::Result<()> { + let nodes = make_test_nodes(4); + let mut scheduler = Scheduler::new(nodes.values()); + + let mut shard_a = make_test_tenant_shard(PlacementPolicy::Attached(1)); + let mut shard_b = make_test_tenant_shard(PlacementPolicy::Attached(1)); + + // Initially: both nodes attached on shard 1, and both have secondary locations + // on different nodes. + shard_a.intent.set_attached(&mut scheduler, Some(NodeId(1))); + shard_a.intent.push_secondary(&mut scheduler, NodeId(3)); + shard_b.intent.set_attached(&mut scheduler, Some(NodeId(2))); + shard_b.intent.push_secondary(&mut scheduler, NodeId(3)); + + let mut schedule_context = ScheduleContext::default(); + schedule_context.avoid(&shard_a.intent.all_pageservers()); + schedule_context.push_attached(shard_a.intent.get_attached().unwrap()); + schedule_context.avoid(&shard_b.intent.all_pageservers()); + schedule_context.push_attached(shard_b.intent.get_attached().unwrap()); + + let optimization_a = shard_a.optimize_secondary(&scheduler, &schedule_context); + + // Since there is a node with no locations available, the node with two locations for the + // same tenant should generate an optimization to move one away + assert_eq!( + optimization_a, + Some(ScheduleOptimization::ReplaceSecondary(ReplaceSecondary { + old_node_id: NodeId(3), + new_node_id: NodeId(4) + })) + ); + + shard_a.apply_optimization(&mut scheduler, optimization_a.unwrap()); + assert_eq!(shard_a.intent.get_attached(), &Some(NodeId(1))); + assert_eq!(shard_a.intent.get_secondary(), &vec![NodeId(4)]); + + shard_a.intent.clear(&mut scheduler); + shard_b.intent.clear(&mut scheduler); + + Ok(()) + } + + // Optimize til quiescent: this emulates what Service::optimize_all does, when + // called repeatedly in the background. + fn optimize_til_idle( + nodes: &HashMap, + scheduler: &mut Scheduler, + shards: &mut [TenantShard], + ) { + let mut loop_n = 0; + loop { + let mut schedule_context = ScheduleContext::default(); + let mut any_changed = false; + + for shard in shards.iter() { + schedule_context.avoid(&shard.intent.all_pageservers()); + if let Some(attached) = shard.intent.get_attached() { + schedule_context.push_attached(*attached); + } + } + + for shard in shards.iter_mut() { + let optimization = shard.optimize_attachment(nodes, &schedule_context); + if let Some(optimization) = optimization { + shard.apply_optimization(scheduler, optimization); + any_changed = true; + break; + } + + let optimization = shard.optimize_secondary(scheduler, &schedule_context); + if let Some(optimization) = optimization { + shard.apply_optimization(scheduler, optimization); + any_changed = true; + break; + } + } + + if !any_changed { + break; + } + + // Assert no infinite loop + loop_n += 1; + assert!(loop_n < 1000); + } + } + + /// Test the balancing behavior of shard scheduling: that it achieves a balance, and + /// that it converges. + #[test] + fn optimize_add_nodes() -> anyhow::Result<()> { + let nodes = make_test_nodes(4); + + // Only show the scheduler a couple of nodes + let mut scheduler = Scheduler::new([].iter()); + scheduler.node_upsert(nodes.get(&NodeId(1)).unwrap()); + scheduler.node_upsert(nodes.get(&NodeId(2)).unwrap()); + + let mut shards = make_test_tenant(PlacementPolicy::Attached(1), ShardCount::new(4)); + let mut schedule_context = ScheduleContext::default(); + for shard in &mut shards { + assert!(shard + .schedule(&mut scheduler, &mut schedule_context) + .is_ok()); + } + + // We should see equal number of locations on the two nodes. + assert_eq!(scheduler.get_node_shard_count(NodeId(1)), 4); + assert_eq!(scheduler.get_node_shard_count(NodeId(2)), 4); + + // Add another two nodes: we should see the shards spread out when their optimize + // methods are called + scheduler.node_upsert(nodes.get(&NodeId(3)).unwrap()); + scheduler.node_upsert(nodes.get(&NodeId(4)).unwrap()); + optimize_til_idle(&nodes, &mut scheduler, &mut shards); + + assert_eq!(scheduler.get_node_shard_count(NodeId(1)), 2); + assert_eq!(scheduler.get_node_shard_count(NodeId(2)), 2); + assert_eq!(scheduler.get_node_shard_count(NodeId(3)), 2); + assert_eq!(scheduler.get_node_shard_count(NodeId(4)), 2); + + for shard in shards.iter_mut() { + shard.intent.clear(&mut scheduler); + } - tenant_state.intent.clear(&mut scheduler); Ok(()) } } diff --git a/test_runner/fixtures/benchmark_fixture.py b/test_runner/fixtures/benchmark_fixture.py index e7959c1764..c32748f6f0 100644 --- a/test_runner/fixtures/benchmark_fixture.py +++ b/test_runner/fixtures/benchmark_fixture.py @@ -482,20 +482,18 @@ def pytest_terminal_summary( terminalreporter.section("Benchmark results", "-") is_header_printed = True - terminalreporter.write( - "{}.{}: ".format(test_report.head_line, recorded_property["name"]) - ) + terminalreporter.write(f"{test_report.head_line}.{recorded_property['name']}: ") unit = recorded_property["unit"] value = recorded_property["value"] if unit == "MB": - terminalreporter.write("{0:,.0f}".format(value), green=True) + terminalreporter.write(f"{value:,.0f}", green=True) elif unit in ("s", "ms") and isinstance(value, float): - terminalreporter.write("{0:,.3f}".format(value), green=True) + terminalreporter.write(f"{value:,.3f}", green=True) elif isinstance(value, float): - terminalreporter.write("{0:,.4f}".format(value), green=True) + terminalreporter.write(f"{value:,.4f}", green=True) else: terminalreporter.write(str(value), green=True) - terminalreporter.line(" {}".format(unit)) + terminalreporter.line(f" {unit}") result_entry.append(recorded_property) diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 3d60f9bef5..c2c661088b 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -520,9 +520,9 @@ class NeonEnvBuilder: self.env = NeonEnv(self) return self.env - def start(self, register_pageservers=False): + def start(self): assert self.env is not None, "environment is not already initialized, call init() first" - self.env.start(register_pageservers=register_pageservers) + self.env.start() def init_start( self, @@ -1115,8 +1115,8 @@ class NeonEnv: log.info(f"Config: {cfg}") self.neon_cli.init(cfg, force=config.config_init_force) - def start(self, register_pageservers=False): - # storage controller starts first, so that pageserver /re-attach calls don't + def start(self): + # Storage controller starts first, so that pageserver /re-attach calls don't # bounce through retries on startup self.storage_controller.start() @@ -1127,11 +1127,6 @@ class NeonEnv: # reconcile. wait_until(30, 1, storage_controller_ready) - if register_pageservers: - # Special case for forward compat tests, this can be removed later. - for pageserver in self.pageservers: - self.storage_controller.node_register(pageserver) - # Start up broker, pageserver and all safekeepers futs = [] with concurrent.futures.ThreadPoolExecutor( @@ -2116,6 +2111,7 @@ class NeonStorageController(MetricsGetter): shard_count: Optional[int] = None, shard_stripe_size: Optional[int] = None, tenant_config: Optional[Dict[Any, Any]] = None, + placement_policy: Optional[str] = None, ): """ Use this rather than pageserver_api() when you need to include shard parameters @@ -2135,6 +2131,8 @@ class NeonStorageController(MetricsGetter): for k, v in tenant_config.items(): body[k] = v + body["placement_policy"] = placement_policy + response = self.request( "POST", f"{self.env.storage_controller_api}/v1/tenant", @@ -2193,6 +2191,34 @@ class NeonStorageController(MetricsGetter): log.info(f"Migrated tenant {tenant_shard_id} to pageserver {dest_ps_id}") assert self.env.get_tenant_pageserver(tenant_shard_id).id == dest_ps_id + def tenant_policy_update(self, tenant_id: TenantId, body: dict[str, Any]): + log.info(f"tenant_policy_update({tenant_id}, {body})") + self.request( + "PUT", + f"{self.env.storage_controller_api}/control/v1/tenant/{tenant_id}/policy", + json=body, + headers=self.headers(TokenScope.ADMIN), + ) + + def reconcile_all(self): + r = self.request( + "POST", + f"{self.env.storage_controller_api}/debug/v1/reconcile_all", + headers=self.headers(TokenScope.ADMIN), + ) + r.raise_for_status() + n = r.json() + log.info(f"reconcile_all waited for {n} shards") + return n + + def reconcile_until_idle(self, timeout_secs=30): + start_at = time.time() + n = 1 + while n > 0: + n = self.reconcile_all() + if time.time() - start_at > timeout_secs: + raise RuntimeError("Timeout in reconcile_until_idle") + def consistency_check(self): """ Throw an exception if the service finds any inconsistencies in its state @@ -2423,10 +2449,12 @@ class NeonPageserver(PgProtocol): if cur_line_no < skip_until_line_no: cur_line_no += 1 continue - if contains_re.search(line): + elif contains_re.search(line): # found it! cur_line_no += 1 return (line, LogCursor(cur_line_no)) + else: + cur_line_no += 1 return None def tenant_attach( @@ -3574,7 +3602,7 @@ class Safekeeper: return self def stop(self, immediate: bool = False) -> "Safekeeper": - log.info("Stopping safekeeper {}".format(self.id)) + log.info(f"Stopping safekeeper {self.id}") self.env.neon_cli.safekeeper_stop(self.id, immediate) self.running = False return self @@ -4006,13 +4034,13 @@ def check_restored_datadir_content(test_output_dir: Path, env: NeonEnv, endpoint for f in mismatch: f1 = os.path.join(endpoint.pgdata_dir, f) f2 = os.path.join(restored_dir_path, f) - stdout_filename = "{}.filediff".format(f2) + stdout_filename = f"{f2}.filediff" with open(stdout_filename, "w") as stdout_f: - subprocess.run("xxd -b {} > {}.hex ".format(f1, f1), shell=True) - subprocess.run("xxd -b {} > {}.hex ".format(f2, f2), shell=True) + subprocess.run(f"xxd -b {f1} > {f1}.hex ", shell=True) + subprocess.run(f"xxd -b {f2} > {f2}.hex ", shell=True) - cmd = "diff {}.hex {}.hex".format(f1, f2) + cmd = f"diff {f1}.hex {f2}.hex" subprocess.run([cmd], stdout=stdout_f, shell=True) assert (mismatch, error) == ([], []) diff --git a/test_runner/fixtures/pageserver/http.py b/test_runner/fixtures/pageserver/http.py index 6aebfbc99c..b899b0dac8 100644 --- a/test_runner/fixtures/pageserver/http.py +++ b/test_runner/fixtures/pageserver/http.py @@ -308,6 +308,7 @@ class PageserverHttpClient(requests.Session, MetricsGetter): params=params, ) self.verbose_error(res) + return res.json() def tenant_list_locations(self): res = self.get( @@ -341,8 +342,21 @@ class PageserverHttpClient(requests.Session, MetricsGetter): res = self.post(f"http://localhost:{self.port}/v1/tenant/{tenant_id}/ignore") self.verbose_error(res) - def tenant_status(self, tenant_id: Union[TenantId, TenantShardId]) -> Dict[Any, Any]: - res = self.get(f"http://localhost:{self.port}/v1/tenant/{tenant_id}") + def tenant_status( + self, tenant_id: Union[TenantId, TenantShardId], activate: bool = False + ) -> Dict[Any, Any]: + """ + :activate: hint the server not to accelerate activation of this tenant in response + to this query. False by default for tests, because they generally want to observed the + system rather than interfering with it. This is true by default on the server side, + because in the field if the control plane is GET'ing a tenant it's a sign that it wants + to do something with it. + """ + params = {} + if not activate: + params["activate"] = "false" + + res = self.get(f"http://localhost:{self.port}/v1/tenant/{tenant_id}", params=params) self.verbose_error(res) res_json = res.json() assert isinstance(res_json, dict) diff --git a/test_runner/fixtures/pageserver/utils.py b/test_runner/fixtures/pageserver/utils.py index 693771dd3d..4b0dd7a815 100644 --- a/test_runner/fixtures/pageserver/utils.py +++ b/test_runner/fixtures/pageserver/utils.py @@ -204,13 +204,11 @@ def wait_for_last_record_lsn( return current_lsn if i % 10 == 0: log.info( - "{}/{} waiting for last_record_lsn to reach {}, now {}, iteration {}".format( - tenant, timeline, lsn, current_lsn, i + 1 - ) + f"{tenant}/{timeline} waiting for last_record_lsn to reach {lsn}, now {current_lsn}, iteration {i + 1}" ) time.sleep(0.1) raise Exception( - "timed out while waiting for last_record_lsn to reach {}, was {}".format(lsn, current_lsn) + f"timed out while waiting for last_record_lsn to reach {lsn}, was {current_lsn}" ) diff --git a/test_runner/fixtures/workload.py b/test_runner/fixtures/workload.py index ab8717de54..c44628ce06 100644 --- a/test_runner/fixtures/workload.py +++ b/test_runner/fixtures/workload.py @@ -81,9 +81,13 @@ class Workload: return self._endpoint - def __del__(self): + def stop(self): if self._endpoint is not None: self._endpoint.stop() + self._endpoint = None + + def __del__(self): + self.stop() def init(self, pageserver_id: Optional[int] = None): endpoint = self.endpoint(pageserver_id) diff --git a/test_runner/performance/pageserver/pagebench/test_large_slru_basebackup.py b/test_runner/performance/pageserver/pagebench/test_large_slru_basebackup.py index 324ef0d516..b66db4d0ab 100644 --- a/test_runner/performance/pageserver/pagebench/test_large_slru_basebackup.py +++ b/test_runner/performance/pageserver/pagebench/test_large_slru_basebackup.py @@ -125,19 +125,19 @@ async def run_update_loop_worker(ep: Endpoint, n_txns: int, idx: int): await conn.execute(f"ALTER TABLE {table} SET (autovacuum_enabled = false)") await conn.execute(f"INSERT INTO {table} VALUES (1, 0)") await conn.execute( + f""" + CREATE PROCEDURE updating{table}() as + $$ + DECLARE + i integer; + BEGIN + FOR i IN 1..{n_txns} LOOP + UPDATE {table} SET x = x + 1 WHERE pk=1; + COMMIT; + END LOOP; + END + $$ LANGUAGE plpgsql """ - CREATE PROCEDURE updating{0}() as - $$ - DECLARE - i integer; - BEGIN - FOR i IN 1..{1} LOOP - UPDATE {0} SET x = x + 1 WHERE pk=1; - COMMIT; - END LOOP; - END - $$ LANGUAGE plpgsql - """.format(table, n_txns) ) await conn.execute("SET statement_timeout=0") await conn.execute(f"call updating{table}()") diff --git a/test_runner/performance/test_branch_creation.py b/test_runner/performance/test_branch_creation.py index 9777bf6748..54905759bd 100644 --- a/test_runner/performance/test_branch_creation.py +++ b/test_runner/performance/test_branch_creation.py @@ -78,7 +78,7 @@ def test_branch_creation_heavy_write(neon_compare: NeonCompare, n_branches: int) p = random.randint(0, i) timer = timeit.default_timer() - env.neon_cli.create_branch("b{}".format(i + 1), "b{}".format(p), tenant_id=tenant) + env.neon_cli.create_branch(f"b{i + 1}", f"b{p}", tenant_id=tenant) dur = timeit.default_timer() - timer log.info(f"Creating branch b{i+1} took {dur}s") diff --git a/test_runner/regress/test_attach_tenant_config.py b/test_runner/regress/test_attach_tenant_config.py index 3058926b25..909d25980b 100644 --- a/test_runner/regress/test_attach_tenant_config.py +++ b/test_runner/regress/test_attach_tenant_config.py @@ -189,6 +189,7 @@ def test_fully_custom_config(positive_env: NeonEnv): }, "trace_read_requests": True, "walreceiver_connect_timeout": "13m", + "image_layer_creation_check_threshold": 1, } ps_http = env.pageserver.http_client() diff --git a/test_runner/regress/test_branching.py b/test_runner/regress/test_branching.py index 2a7a3c41ac..5b69649007 100644 --- a/test_runner/regress/test_branching.py +++ b/test_runner/regress/test_branching.py @@ -84,11 +84,11 @@ def test_branching_with_pgbench( threads = [] if ty == "cascade": - env.neon_cli.create_branch("b{}".format(i + 1), "b{}".format(i), tenant_id=tenant) + env.neon_cli.create_branch(f"b{i + 1}", f"b{i}", tenant_id=tenant) else: - env.neon_cli.create_branch("b{}".format(i + 1), "b0", tenant_id=tenant) + env.neon_cli.create_branch(f"b{i + 1}", "b0", tenant_id=tenant) - endpoints.append(env.endpoints.create_start("b{}".format(i + 1), tenant_id=tenant)) + endpoints.append(env.endpoints.create_start(f"b{i + 1}", tenant_id=tenant)) threads.append( threading.Thread(target=run_pgbench, args=(endpoints[-1].connstr(),), daemon=True) diff --git a/test_runner/regress/test_compatibility.py b/test_runner/regress/test_compatibility.py index 5406acc005..ddad98a5fa 100644 --- a/test_runner/regress/test_compatibility.py +++ b/test_runner/regress/test_compatibility.py @@ -226,10 +226,6 @@ def test_forward_compatibility( ) try: - # TODO: remove this once the previous pageserrver version understands - # the 'get_vectored_impl' config - neon_env_builder.pageserver_get_vectored_impl = None - neon_env_builder.num_safekeepers = 3 neon_local_binpath = neon_env_builder.neon_binpath env = neon_env_builder.from_repo_dir( @@ -238,15 +234,11 @@ def test_forward_compatibility( pg_distrib_dir=compatibility_postgres_distrib_dir, ) - # TODO: remove this workaround after release-5090 is no longer the most recent release. - # There was a bug in that code that generates a warning in the storage controller log. - env.storage_controller.allowed_errors.append(".*no tenant_shard_id specified.*") - # Use current neon_local even though we're using old binaries for # everything else: our test code is written for latest CLI args. env.neon_local_binpath = neon_local_binpath - neon_env_builder.start(register_pageservers=True) + neon_env_builder.start() check_neon_works( env, diff --git a/test_runner/regress/test_large_schema.py b/test_runner/regress/test_large_schema.py index b6ac1aa41f..c5d5b5fe64 100644 --- a/test_runner/regress/test_large_schema.py +++ b/test_runner/regress/test_large_schema.py @@ -74,8 +74,8 @@ def test_large_schema(neon_env_builder: NeonEnvBuilder): cur.execute("select * from pg_depend order by refclassid, refobjid, refobjsubid") # Check layer file sizes - timeline_path = "{}/tenants/{}/timelines/{}/".format( - env.pageserver.workdir, env.initial_tenant, env.initial_timeline + timeline_path = ( + f"{env.pageserver.workdir}/tenants/{env.initial_tenant}/timelines/{env.initial_timeline}/" ) for filename in os.listdir(timeline_path): if filename.startswith("00000"): diff --git a/test_runner/regress/test_layer_bloating.py b/test_runner/regress/test_layer_bloating.py index 2fdee89389..77dc8a35b5 100644 --- a/test_runner/regress/test_layer_bloating.py +++ b/test_runner/regress/test_layer_bloating.py @@ -57,9 +57,7 @@ def test_layer_bloating(neon_simple_env: NeonEnv, vanilla_pg): time.sleep(10) # Check layer file sizes - timeline_path = "{}/tenants/{}/timelines/{}/".format( - env.pageserver.workdir, env.initial_tenant, timeline - ) + timeline_path = f"{env.pageserver.workdir}/tenants/{env.initial_tenant}/timelines/{timeline}/" log.info(f"Check {timeline_path}") for filename in os.listdir(timeline_path): if filename.startswith("00000"): diff --git a/test_runner/regress/test_layer_eviction.py b/test_runner/regress/test_layer_eviction.py index 7bbc0cc160..fefb30bbdd 100644 --- a/test_runner/regress/test_layer_eviction.py +++ b/test_runner/regress/test_layer_eviction.py @@ -165,6 +165,7 @@ def test_gc_of_remote_layers(neon_env_builder: NeonEnvBuilder): "compaction_threshold": "3", # "image_creation_threshold": set at runtime "compaction_target_size": f"{128 * (1024**2)}", # make it so that we only have 1 partition => image coverage for delta layers => enables gc of delta layers + "image_layer_creation_check_threshold": "0", # always check if a new image layer can be created } def tenant_update_config(changes): diff --git a/test_runner/regress/test_layers_from_future.py b/test_runner/regress/test_layers_from_future.py index ca4295c5cb..f311a8bf2c 100644 --- a/test_runner/regress/test_layers_from_future.py +++ b/test_runner/regress/test_layers_from_future.py @@ -53,6 +53,7 @@ def test_issue_5878(neon_env_builder: NeonEnvBuilder): "checkpoint_timeout": "24h", # something we won't reach "checkpoint_distance": f"{50 * (1024**2)}", # something we won't reach, we checkpoint manually "image_creation_threshold": "100", # we want to control when image is created + "image_layer_creation_check_threshold": "0", "compaction_threshold": f"{l0_l1_threshold}", "compaction_target_size": f"{128 * (1024**3)}", # make it so that we only have 1 partition => image coverage for delta layers => enables gc of delta layers } diff --git a/test_runner/regress/test_logical_replication.py b/test_runner/regress/test_logical_replication.py index 3f4ca8070d..1bac528397 100644 --- a/test_runner/regress/test_logical_replication.py +++ b/test_runner/regress/test_logical_replication.py @@ -364,3 +364,67 @@ def test_slots_and_branching(neon_simple_env: NeonEnv): # Check that we can create slot with the same name ws_cur = ws_branch.connect().cursor() ws_cur.execute("select pg_create_logical_replication_slot('my_slot', 'pgoutput')") + + +def test_replication_shutdown(neon_simple_env: NeonEnv): + # Ensure Postgres can exit without stuck when a replication job is active + neon extension installed + env = neon_simple_env + env.neon_cli.create_branch("test_replication_shutdown_publisher", "empty") + pub = env.endpoints.create("test_replication_shutdown_publisher") + + env.neon_cli.create_branch("test_replication_shutdown_subscriber") + sub = env.endpoints.create("test_replication_shutdown_subscriber") + + pub.respec(skip_pg_catalog_updates=False) + pub.start() + + sub.respec(skip_pg_catalog_updates=False) + sub.start() + + pub.wait_for_migrations() + sub.wait_for_migrations() + + with pub.cursor() as cur: + cur.execute( + "CREATE ROLE mr_whiskers WITH PASSWORD 'cat' LOGIN INHERIT CREATEROLE CREATEDB BYPASSRLS REPLICATION IN ROLE neon_superuser" + ) + cur.execute("CREATE DATABASE neondb WITH OWNER mr_whiskers") + cur.execute("GRANT ALL PRIVILEGES ON DATABASE neondb TO neon_superuser") + + # If we don't do this, creating the subscription will fail later on PG16 + pub.edit_hba(["host all mr_whiskers 0.0.0.0/0 md5"]) + + with sub.cursor() as cur: + cur.execute( + "CREATE ROLE mr_whiskers WITH PASSWORD 'cat' LOGIN INHERIT CREATEROLE CREATEDB BYPASSRLS REPLICATION IN ROLE neon_superuser" + ) + cur.execute("CREATE DATABASE neondb WITH OWNER mr_whiskers") + cur.execute("GRANT ALL PRIVILEGES ON DATABASE neondb TO neon_superuser") + + with pub.cursor(dbname="neondb", user="mr_whiskers", password="cat") as cur: + cur.execute("CREATE PUBLICATION pub FOR ALL TABLES") + cur.execute("CREATE TABLE t (a int)") + cur.execute("INSERT INTO t VALUES (10), (20)") + cur.execute("SELECT * from t") + res = cur.fetchall() + assert [r[0] for r in res] == [10, 20] + + with sub.cursor(dbname="neondb", user="mr_whiskers", password="cat") as cur: + cur.execute("CREATE TABLE t (a int)") + + pub_conn = f"host=localhost port={pub.pg_port} dbname=neondb user=mr_whiskers password=cat" + query = f"CREATE SUBSCRIPTION sub CONNECTION '{pub_conn}' PUBLICATION pub" + log.info(f"Creating subscription: {query}") + cur.execute(query) + + with pub.cursor(dbname="neondb", user="mr_whiskers", password="cat") as pcur: + pcur.execute("INSERT INTO t VALUES (30), (40)") + + def check_that_changes_propagated(): + cur.execute("SELECT * FROM t") + res = cur.fetchall() + log.info(res) + assert len(res) == 4 + assert [r[0] for r in res] == [10, 20, 30, 40] + + wait_until(10, 0.5, check_that_changes_propagated) diff --git a/test_runner/regress/test_neon_extension.py b/test_runner/regress/test_neon_extension.py index e31e1cab51..39b4865026 100644 --- a/test_runner/regress/test_neon_extension.py +++ b/test_runner/regress/test_neon_extension.py @@ -1,3 +1,4 @@ +import time from contextlib import closing from fixtures.log_helper import log @@ -43,6 +44,12 @@ def test_neon_extension_compatibility(neon_env_builder: NeonEnvBuilder): with closing(endpoint_main.connect()) as conn: with conn.cursor() as cur: + cur.execute("SELECT extversion from pg_extension where extname='neon'") + # IMPORTANT: + # If the version has changed, the test should be updated. + # Ensure that the default version is also updated in the neon.control file + assert cur.fetchone() == ("1.3",) + cur.execute("SELECT * from neon.NEON_STAT_FILE_CACHE") all_versions = ["1.3", "1.2", "1.1", "1.0"] current_version = "1.3" for idx, begin_version in enumerate(all_versions): @@ -60,3 +67,30 @@ def test_neon_extension_compatibility(neon_env_builder: NeonEnvBuilder): cur.execute( f"ALTER EXTENSION neon UPDATE TO '{begin_version}'; -- {target_version}->{begin_version}" ) + + +# Verify that the neon extension can be auto-upgraded to the latest version. +def test_neon_extension_auto_upgrade(neon_env_builder: NeonEnvBuilder): + env = neon_env_builder.init_start() + env.neon_cli.create_branch("test_neon_extension_auto_upgrade") + + endpoint_main = env.endpoints.create("test_neon_extension_auto_upgrade") + # don't skip pg_catalog updates - it runs CREATE EXTENSION neon + endpoint_main.respec(skip_pg_catalog_updates=False) + endpoint_main.start() + + with closing(endpoint_main.connect()) as conn: + with conn.cursor() as cur: + cur.execute("ALTER EXTENSION neon UPDATE TO '1.0';") + cur.execute("SELECT extversion from pg_extension where extname='neon'") + assert cur.fetchone() == ("1.0",) # Ensure the extension gets downgraded + + endpoint_main.stop() + time.sleep(1) + endpoint_main.start() + time.sleep(1) + + with closing(endpoint_main.connect()) as conn: + with conn.cursor() as cur: + cur.execute("SELECT extversion from pg_extension where extname='neon'") + assert cur.fetchone() != ("1.0",) # Ensure the extension gets upgraded diff --git a/test_runner/regress/test_ondemand_download.py b/test_runner/regress/test_ondemand_download.py index 914f068afb..ba0d53704b 100644 --- a/test_runner/regress/test_ondemand_download.py +++ b/test_runner/regress/test_ondemand_download.py @@ -568,6 +568,8 @@ def test_compaction_downloads_on_demand_with_image_creation(neon_env_builder: Ne "image_creation_threshold": 100, # repartitioning parameter, unused "compaction_target_size": 128 * 1024**2, + # Always check if a new image layer can be created + "image_layer_creation_check_threshold": 0, # pitr_interval and gc_horizon are not interesting because we dont run gc } @@ -632,7 +634,8 @@ def test_compaction_downloads_on_demand_with_image_creation(neon_env_builder: Ne # threshold to expose image creation to downloading all of the needed # layers -- threshold of 2 would sound more reasonable, but keeping it as 1 # to be less flaky - env.neon_cli.config_tenant(tenant_id, {"image_creation_threshold": "1"}) + conf["image_creation_threshold"] = "1" + env.neon_cli.config_tenant(tenant_id, {k: str(v) for k, v in conf.items()}) pageserver_http.timeline_compact(tenant_id, timeline_id) layers = pageserver_http.layer_map_info(tenant_id, timeline_id) diff --git a/test_runner/regress/test_pageserver_config.py b/test_runner/regress/test_pageserver_config.py new file mode 100644 index 0000000000..c04348b488 --- /dev/null +++ b/test_runner/regress/test_pageserver_config.py @@ -0,0 +1,35 @@ +import pytest +from fixtures.neon_fixtures import ( + NeonEnvBuilder, + last_flush_lsn_upload, +) + + +@pytest.mark.parametrize("kind", ["sync", "async"]) +def test_walredo_process_kind_config(neon_env_builder: NeonEnvBuilder, kind: str): + neon_env_builder.pageserver_config_override = f"walredo_process_kind = '{kind}'" + # ensure it starts + env = neon_env_builder.init_start() + # ensure the metric is set + ps_http = env.pageserver.http_client() + metrics = ps_http.get_metrics() + samples = metrics.query_all("pageserver_wal_redo_process_kind") + assert [(s.labels, s.value) for s in samples] == [({"kind": kind}, 1)] + # ensure default tenant's config kind matches + # => write some data to force-spawn walredo + ep = env.endpoints.create_start("main") + with ep.connect() as conn: + with conn.cursor() as cur: + cur.execute("create table foo(bar text)") + cur.execute("insert into foo select from generate_series(1, 100)") + last_flush_lsn_upload(env, ep, env.initial_tenant, env.initial_timeline) + ep.stop() + ep.start() + with ep.connect() as conn: + with conn.cursor() as cur: + cur.execute("select count(*) from foo") + [(count,)] = cur.fetchall() + assert count == 100 + + status = ps_http.tenant_status(env.initial_tenant) + assert status["walredo"]["process"]["kind"] == kind diff --git a/test_runner/regress/test_pageserver_generations.py b/test_runner/regress/test_pageserver_generations.py index 56b4548b64..67f68a62af 100644 --- a/test_runner/regress/test_pageserver_generations.py +++ b/test_runner/regress/test_pageserver_generations.py @@ -9,7 +9,6 @@ of the pageserver are: - Updates to remote_consistent_lsn may only be made visible after validating generation """ - import enum import re import time @@ -23,6 +22,7 @@ from fixtures.neon_fixtures import ( NeonPageserver, PgBin, S3Scrubber, + flush_ep_to_pageserver, last_flush_lsn_upload, ) from fixtures.pageserver.http import PageserverApiException @@ -31,6 +31,7 @@ from fixtures.pageserver.utils import ( list_prefix, wait_for_last_record_lsn, wait_for_upload, + wait_for_upload_queue_empty, ) from fixtures.remote_storage import ( RemoteStorageKind, @@ -53,6 +54,7 @@ TENANT_CONF = { "compaction_period": "0s", # create image layers eagerly, so that GC can remove some layers "image_creation_threshold": "1", + "image_layer_creation_check_threshold": "0", } @@ -111,7 +113,6 @@ def generate_uploads_and_deletions( last_flush_lsn_upload( env, endpoint, tenant_id, timeline_id, pageserver_id=pageserver.id ) - ps_http.timeline_checkpoint(tenant_id, timeline_id) # Compaction should generate some GC-elegible layers for i in range(0, 2): @@ -121,6 +122,17 @@ def generate_uploads_and_deletions( print_gc_result(gc_result) assert gc_result["layers_removed"] > 0 + # Stop endpoint and flush all data to pageserver, then checkpoint it: this + # ensures that the pageserver is in a fully idle state: there will be no more + # background ingest, no more uploads pending, and therefore no non-determinism + # in subsequent actions like pageserver restarts. + final_lsn = flush_ep_to_pageserver(env, endpoint, tenant_id, timeline_id, pageserver.id) + ps_http.timeline_checkpoint(tenant_id, timeline_id) + # Finish uploads + wait_for_upload(ps_http, tenant_id, timeline_id, final_lsn) + # Finish all remote writes (including deletions) + wait_for_upload_queue_empty(ps_http, tenant_id, timeline_id) + def read_all( env: NeonEnv, tenant_id: Optional[TenantId] = None, timeline_id: Optional[TimelineId] = None @@ -385,9 +397,8 @@ def test_deletion_queue_recovery( if validate_before == ValidateBefore.NO_VALIDATE: failpoints.append( # Prevent deletion lists from being validated, we will test that they are - # dropped properly during recovery. 'pause' is okay here because we kill - # the pageserver with immediate=true - ("control-plane-client-validate", "pause") + # dropped properly during recovery. This is such a long sleep as to be equivalent to "never" + ("control-plane-client-validate", "return(3600000)") ) ps_http.configure_failpoints(failpoints) diff --git a/test_runner/regress/test_pageserver_layer_rolling.py b/test_runner/regress/test_pageserver_layer_rolling.py index c7e1e88468..c5dc0f2919 100644 --- a/test_runner/regress/test_pageserver_layer_rolling.py +++ b/test_runner/regress/test_pageserver_layer_rolling.py @@ -1,6 +1,7 @@ import asyncio import os -from typing import Tuple +import time +from typing import Optional, Tuple import psutil import pytest @@ -20,20 +21,30 @@ ENTRIES_PER_TIMELINE = 10_000 CHECKPOINT_TIMEOUT_SECONDS = 60 -async def run_worker(env: NeonEnv, tenant_conf, entries: int) -> Tuple[TenantId, TimelineId, Lsn]: - tenant, timeline = env.neon_cli.create_tenant(conf=tenant_conf) +async def run_worker_for_tenant( + env: NeonEnv, entries: int, tenant: TenantId, offset: Optional[int] = None +) -> Lsn: + if offset is None: + offset = 0 + with env.endpoints.create_start("main", tenant_id=tenant) as ep: conn = await ep.connect_async() try: await conn.execute("CREATE TABLE IF NOT EXISTS t(key serial primary key, value text)") await conn.execute( - f"INSERT INTO t SELECT i, CONCAT('payload_', i) FROM generate_series(0,{entries}) as i" + f"INSERT INTO t SELECT i, CONCAT('payload_', i) FROM generate_series({offset},{entries}) as i" ) finally: await conn.close(timeout=10) last_flush_lsn = Lsn(ep.safe_psql("SELECT pg_current_wal_flush_lsn()")[0][0]) - return tenant, timeline, last_flush_lsn + return last_flush_lsn + + +async def run_worker(env: NeonEnv, tenant_conf, entries: int) -> Tuple[TenantId, TimelineId, Lsn]: + tenant, timeline = env.neon_cli.create_tenant(conf=tenant_conf) + last_flush_lsn = await run_worker_for_tenant(env, entries, tenant) + return tenant, timeline, last_flush_lsn async def workload( @@ -89,7 +100,9 @@ def assert_dirty_bytes(env, v): def assert_dirty_bytes_nonzero(env): - assert get_dirty_bytes(env) > 0 + dirty_bytes = get_dirty_bytes(env) + assert dirty_bytes > 0 + return dirty_bytes @pytest.mark.parametrize("immediate_shutdown", [True, False]) @@ -182,6 +195,31 @@ def test_idle_checkpoints(neon_env_builder: NeonEnvBuilder): log.info("Waiting for background checkpoints...") wait_until(CHECKPOINT_TIMEOUT_SECONDS * 2, 1, lambda: assert_dirty_bytes(env, 0)) # type: ignore + # The code below verifies that we do not flush on the first write + # after an idle period longer than the checkpoint timeout. + + # Sit quietly for longer than the checkpoint timeout + time.sleep(CHECKPOINT_TIMEOUT_SECONDS + CHECKPOINT_TIMEOUT_SECONDS / 2) + + # Restart the safekeepers and write a bit of extra data into one tenant + for sk in env.safekeepers: + sk.start() + + tenant_with_extra_writes = last_flush_lsns[0][0] + asyncio.run( + run_worker_for_tenant(env, 5, tenant_with_extra_writes, offset=ENTRIES_PER_TIMELINE) + ) + + dirty_after_write = wait_until(10, 1, lambda: assert_dirty_bytes_nonzero(env)) # type: ignore + + # We shouldn't flush since we've just opened a new layer + waited_for = 0 + while waited_for < CHECKPOINT_TIMEOUT_SECONDS // 4: + time.sleep(5) + waited_for += 5 + + assert get_dirty_bytes(env) >= dirty_after_write + @pytest.mark.skipif( # We have to use at least ~100MB of data to hit the lowest limit we can configure, which is diff --git a/test_runner/regress/test_pageserver_secondary.py b/test_runner/regress/test_pageserver_secondary.py index ca6f77c75f..345abdc072 100644 --- a/test_runner/regress/test_pageserver_secondary.py +++ b/test_runner/regress/test_pageserver_secondary.py @@ -498,9 +498,19 @@ def test_secondary_downloads(neon_env_builder: NeonEnvBuilder): ps_secondary.http_client().tenant_secondary_download(tenant_id) - assert list_layers(ps_attached, tenant_id, timeline_id) == list_layers( - ps_secondary, tenant_id, timeline_id - ) + try: + assert list_layers(ps_attached, tenant_id, timeline_id) == list_layers( + ps_secondary, tenant_id, timeline_id + ) + except: + # Do a full listing of the secondary location on errors, to help debug of + # https://github.com/neondatabase/neon/issues/6966 + timeline_path = ps_secondary.timeline_dir(tenant_id, timeline_id) + for path, _dirs, files in os.walk(timeline_path): + for f in files: + log.info(f"Secondary file: {os.path.join(path, f)}") + + raise # FIXME: this sleep is needed to avoid on-demand promotion of the layers we evict, while # walreceiver is still doing something. diff --git a/test_runner/regress/test_proxy.py b/test_runner/regress/test_proxy.py index 3e986a8f7b..f446f4f200 100644 --- a/test_runner/regress/test_proxy.py +++ b/test_runner/regress/test_proxy.py @@ -566,38 +566,6 @@ async def test_sql_over_http2(static_proxy: NeonProxy): assert resp["rows"] == [{"answer": 42}] -def test_sql_over_http_timeout_cancel(static_proxy: NeonProxy): - static_proxy.safe_psql("create role http with login password 'http' superuser") - - static_proxy.safe_psql("create table test_table ( id int primary key )") - - # insert into a table, with a unique constraint, after sleeping for n seconds - query = "WITH temp AS ( \ - SELECT pg_sleep($1) as sleep, $2::int as id \ - ) INSERT INTO test_table (id) SELECT id FROM temp" - - # expect to fail with timeout - res = static_proxy.http_query( - query, - [static_proxy.http_timeout_seconds + 1, 1], - user="http", - password="http", - expected_code=400, - ) - assert "Query cancelled, runtime exceeded" in res["message"], "HTTP query should time out" - - time.sleep(2) - - res = static_proxy.http_query(query, [1, 1], user="http", password="http", expected_code=200) - assert res["command"] == "INSERT", "HTTP query should insert" - assert res["rowCount"] == 1, "HTTP query should insert" - - res = static_proxy.http_query(query, [0, 1], user="http", password="http", expected_code=400) - assert ( - "duplicate key value violates unique constraint" in res["message"] - ), "HTTP query should conflict" - - def test_sql_over_http_connection_cancel(static_proxy: NeonProxy): static_proxy.safe_psql("create role http with login password 'http' superuser") diff --git a/test_runner/regress/test_proxy_rate_limiter.py b/test_runner/regress/test_proxy_rate_limiter.py deleted file mode 100644 index f39f0cad07..0000000000 --- a/test_runner/regress/test_proxy_rate_limiter.py +++ /dev/null @@ -1,84 +0,0 @@ -import asyncio -import time -from pathlib import Path -from typing import Iterator - -import pytest -from fixtures.neon_fixtures import ( - PSQL, - NeonProxy, -) -from fixtures.port_distributor import PortDistributor -from pytest_httpserver import HTTPServer -from werkzeug.wrappers.response import Response - - -def waiting_handler(status_code: int) -> Response: - # wait more than timeout to make sure that both (two) connections are open. - # It would be better to use a barrier here, but I don't know how to do that together with pytest-httpserver. - time.sleep(2) - return Response(status=status_code) - - -@pytest.fixture(scope="function") -def proxy_with_rate_limit( - port_distributor: PortDistributor, - neon_binpath: Path, - httpserver_listen_address, - test_output_dir: Path, -) -> Iterator[NeonProxy]: - """Neon proxy that routes directly to vanilla postgres.""" - - proxy_port = port_distributor.get_port() - mgmt_port = port_distributor.get_port() - http_port = port_distributor.get_port() - external_http_port = port_distributor.get_port() - (host, port) = httpserver_listen_address - endpoint = f"http://{host}:{port}/billing/api/v1/usage_events" - - with NeonProxy( - neon_binpath=neon_binpath, - test_output_dir=test_output_dir, - proxy_port=proxy_port, - http_port=http_port, - mgmt_port=mgmt_port, - external_http_port=external_http_port, - auth_backend=NeonProxy.Console(endpoint, fixed_rate_limit=5), - ) as proxy: - proxy.start() - yield proxy - - -@pytest.mark.asyncio -async def test_proxy_rate_limit( - httpserver: HTTPServer, - proxy_with_rate_limit: NeonProxy, -): - uri = "/billing/api/v1/usage_events/proxy_get_role_secret" - # mock control plane service - httpserver.expect_ordered_request(uri, method="GET").respond_with_handler( - lambda _: Response(status=200) - ) - httpserver.expect_ordered_request(uri, method="GET").respond_with_handler( - lambda _: waiting_handler(429) - ) - httpserver.expect_ordered_request(uri, method="GET").respond_with_handler( - lambda _: waiting_handler(500) - ) - - psql = PSQL(host=proxy_with_rate_limit.host, port=proxy_with_rate_limit.proxy_port) - f = await psql.run("select 42;") - await proxy_with_rate_limit.find_auth_link(uri, f) - # Limit should be 2. - - # Run two queries in parallel. - f1, f2 = await asyncio.gather(psql.run("select 42;"), psql.run("select 42;")) - await proxy_with_rate_limit.find_auth_link(uri, f1) - await proxy_with_rate_limit.find_auth_link(uri, f2) - - # Now limit should be 0. - f = await psql.run("select 42;") - await proxy_with_rate_limit.find_auth_link(uri, f) - - # There last query shouldn't reach the http-server. - assert httpserver.assertions == [] diff --git a/test_runner/regress/test_read_validation.py b/test_runner/regress/test_read_validation.py index effb7e83f9..868b80a561 100644 --- a/test_runner/regress/test_read_validation.py +++ b/test_runner/regress/test_read_validation.py @@ -22,7 +22,7 @@ def test_read_validation(neon_simple_env: NeonEnv): with closing(endpoint.connect()) as con: with con.cursor() as c: for e in extensions: - c.execute("create extension if not exists {};".format(e)) + c.execute(f"create extension if not exists {e};") c.execute("create table foo (c int) with (autovacuum_enabled = false)") c.execute("insert into foo values (1)") @@ -42,14 +42,12 @@ def test_read_validation(neon_simple_env: NeonEnv): log.info("Test table is populated, validating buffer cache") cache_entries = query_scalar( - c, "select count(*) from pg_buffercache where relfilenode = {}".format(relfilenode) + c, f"select count(*) from pg_buffercache where relfilenode = {relfilenode}" ) assert cache_entries > 0, "No buffers cached for the test relation" c.execute( - "select reltablespace, reldatabase, relfilenode from pg_buffercache where relfilenode = {}".format( - relfilenode - ) + f"select reltablespace, reldatabase, relfilenode from pg_buffercache where relfilenode = {relfilenode}" ) reln = c.fetchone() assert reln is not None @@ -59,22 +57,20 @@ def test_read_validation(neon_simple_env: NeonEnv): c.execute("select clear_buffer_cache()") cache_entries = query_scalar( - c, "select count(*) from pg_buffercache where relfilenode = {}".format(relfilenode) + c, f"select count(*) from pg_buffercache where relfilenode = {relfilenode}" ) assert cache_entries == 0, "Failed to clear buffer cache" log.info("Cache is clear, reading stale page version") c.execute( - "select lsn, lower, upper from page_header(get_raw_page_at_lsn('foo', 'main', 0, '{}'))".format( - first[0] - ) + f"select lsn, lower, upper from page_header(get_raw_page_at_lsn('foo', 'main', 0, '{first[0]}'))" ) direct_first = c.fetchone() assert first == direct_first, "Failed fetch page at historic lsn" cache_entries = query_scalar( - c, "select count(*) from pg_buffercache where relfilenode = {}".format(relfilenode) + c, f"select count(*) from pg_buffercache where relfilenode = {relfilenode}" ) assert cache_entries == 0, "relation buffers detected after invalidation" @@ -87,7 +83,7 @@ def test_read_validation(neon_simple_env: NeonEnv): assert second == direct_latest, "Failed fetch page at latest lsn" cache_entries = query_scalar( - c, "select count(*) from pg_buffercache where relfilenode = {}".format(relfilenode) + c, f"select count(*) from pg_buffercache where relfilenode = {relfilenode}" ) assert cache_entries == 0, "relation buffers detected after invalidation" @@ -96,9 +92,7 @@ def test_read_validation(neon_simple_env: NeonEnv): ) c.execute( - "select lsn, lower, upper from page_header(get_raw_page_at_lsn( {}, {}, {}, 0, 0, '{}' ))".format( - reln[0], reln[1], reln[2], first[0] - ) + f"select lsn, lower, upper from page_header(get_raw_page_at_lsn({reln[0]}, {reln[1]}, {reln[2]}, 0, 0, '{first[0]}'))" ) direct_first = c.fetchone() assert first == direct_first, "Failed fetch page at historic lsn using oid" @@ -108,9 +102,7 @@ def test_read_validation(neon_simple_env: NeonEnv): ) c.execute( - "select lsn, lower, upper from page_header(get_raw_page_at_lsn( {}, {}, {}, 0, 0, NULL ))".format( - reln[0], reln[1], reln[2] - ) + f"select lsn, lower, upper from page_header(get_raw_page_at_lsn({reln[0]}, {reln[1]}, {reln[2]}, 0, 0, NULL))" ) direct_latest = c.fetchone() assert second == direct_latest, "Failed fetch page at latest lsn" @@ -122,9 +114,7 @@ def test_read_validation(neon_simple_env: NeonEnv): ) c.execute( - "select lsn, lower, upper from page_header(get_raw_page_at_lsn( {}, {}, {}, 0, 0, '{}' ))".format( - reln[0], reln[1], reln[2], first[0] - ) + f"select lsn, lower, upper from page_header(get_raw_page_at_lsn({reln[0]}, {reln[1]}, {reln[2]}, 0, 0, '{first[0]}'))" ) direct_first = c.fetchone() assert first == direct_first, "Failed fetch page at historic lsn using oid" @@ -134,7 +124,7 @@ def test_read_validation(neon_simple_env: NeonEnv): c.execute("select * from page_header(get_raw_page('foo', 'main', 0));") raise AssertionError("query should have failed") except UndefinedTable as e: - log.info("Caught an expected failure: {}".format(e)) + log.info(f"Caught an expected failure: {e}") def test_read_validation_neg(neon_simple_env: NeonEnv): @@ -148,7 +138,7 @@ def test_read_validation_neg(neon_simple_env: NeonEnv): with closing(endpoint.connect()) as con: with con.cursor() as c: for e in extensions: - c.execute("create extension if not exists {};".format(e)) + c.execute(f"create extension if not exists {e};") log.info("read a page of a missing relation") try: @@ -157,7 +147,7 @@ def test_read_validation_neg(neon_simple_env: NeonEnv): ) raise AssertionError("query should have failed") except UndefinedTable as e: - log.info("Caught an expected failure: {}".format(e)) + log.info(f"Caught an expected failure: {e}") c.execute("create table foo (c int) with (autovacuum_enabled = false)") c.execute("insert into foo values (1)") @@ -169,7 +159,7 @@ def test_read_validation_neg(neon_simple_env: NeonEnv): ) raise AssertionError("query should have failed") except IoError as e: - log.info("Caught an expected failure: {}".format(e)) + log.info(f"Caught an expected failure: {e}") log.info("Pass NULL as an input") expected = (None, None, None) diff --git a/test_runner/regress/test_remote_storage.py b/test_runner/regress/test_remote_storage.py index 986d6c4dbf..47200a856e 100644 --- a/test_runner/regress/test_remote_storage.py +++ b/test_runner/regress/test_remote_storage.py @@ -245,6 +245,7 @@ def test_remote_storage_upload_queue_retries( "compaction_period": "0s", # create image layers eagerly, so that GC can remove some layers "image_creation_threshold": "1", + "image_layer_creation_check_threshold": "0", } ) diff --git a/test_runner/regress/test_sharding.py b/test_runner/regress/test_sharding.py index 9aebf16c68..bfaab9125f 100644 --- a/test_runner/regress/test_sharding.py +++ b/test_runner/regress/test_sharding.py @@ -10,9 +10,13 @@ from fixtures.log_helper import log from fixtures.neon_fixtures import ( NeonEnv, NeonEnvBuilder, + S3Scrubber, StorageControllerApiException, + last_flush_lsn_upload, tenant_get_shards, + wait_for_last_flush_lsn, ) +from fixtures.pageserver.utils import assert_prefix_empty, assert_prefix_not_empty from fixtures.remote_storage import s3_storage from fixtures.types import Lsn, TenantId, TenantShardId, TimelineId from fixtures.utils import wait_until @@ -67,6 +71,15 @@ def test_sharding_smoke( log.info(f"sizes = {sizes}") return sizes + # The imported initdb for timeline creation should + # not be fully imported on every shard. We use a 1MB strripe size so expect + # pretty good distribution: no one shard should have more than half the data + sizes = get_sizes() + physical_initdb_total = sum(sizes.values()) + expect_initdb_size = 20 * 1024 * 1024 + assert physical_initdb_total > expect_initdb_size + assert all(s < expect_initdb_size // 2 for s in sizes.values()) + # Test that timeline creation works on a sharded tenant timeline_b = env.neon_cli.create_branch("branch_b", tenant_id=tenant_id) @@ -99,6 +112,38 @@ def test_sharding_smoke( env.storage_controller.consistency_check() + # Validate that deleting a sharded tenant removes all files in the prefix + + # Before deleting, stop the client and check we have some objects to delete + workload.stop() + assert_prefix_not_empty( + neon_env_builder.pageserver_remote_storage, + prefix="/".join( + ( + "tenants", + str(tenant_id), + ) + ), + ) + + # Check the scrubber isn't confused by sharded content, then disable + # it during teardown because we'll have deleted by then + S3Scrubber(neon_env_builder).scan_metadata() + neon_env_builder.scrub_on_exit = False + + env.storage_controller.pageserver_api().tenant_delete(tenant_id) + assert_prefix_empty( + neon_env_builder.pageserver_remote_storage, + prefix="/".join( + ( + "tenants", + str(tenant_id), + ) + ), + ) + + env.storage_controller.consistency_check() + def test_sharding_split_unsharded( neon_env_builder: NeonEnvBuilder, @@ -146,7 +191,7 @@ def test_sharding_split_smoke( # 8 shards onto separate pageservers shard_count = 4 split_shard_count = 8 - neon_env_builder.num_pageservers = split_shard_count + neon_env_builder.num_pageservers = split_shard_count * 2 # 1MiB stripes: enable getting some meaningful data distribution without # writing large quantities of data in this test. The stripe size is given @@ -174,6 +219,7 @@ def test_sharding_split_smoke( placement_policy='{"Attached": 1}', conf=non_default_tenant_config, ) + workload = Workload(env, tenant_id, timeline_id, branch_name="main") workload.init() @@ -252,6 +298,10 @@ def test_sharding_split_smoke( # The old parent shards should no longer exist on disk assert not shards_on_disk(old_shard_ids) + # Enough background reconciliations should result in the shards being properly distributed. + # Run this before the workload, because its LSN-waiting code presumes stable locations. + env.storage_controller.reconcile_until_idle() + workload.validate() workload.churn_rows(256) @@ -265,27 +315,6 @@ def test_sharding_split_smoke( pageserver.http_client().timeline_gc(tenant_shard_id, timeline_id, None) workload.validate() - migrate_to_pageserver_ids = list( - set(p.id for p in env.pageservers) - set(pre_split_pageserver_ids) - ) - assert len(migrate_to_pageserver_ids) == split_shard_count - shard_count - - # Migrate shards away from the node where the split happened - for ps_id in pre_split_pageserver_ids: - shards_here = [ - tenant_shard_id - for (tenant_shard_id, pageserver) in all_shards - if pageserver.id == ps_id - ] - assert len(shards_here) == 2 - migrate_shard = shards_here[0] - destination = migrate_to_pageserver_ids.pop() - - log.info(f"Migrating shard {migrate_shard} from {ps_id} to {destination}") - env.storage_controller.tenant_shard_migrate(migrate_shard, destination) - - workload.validate() - # Assert on how many reconciles happened during the process. This is something of an # implementation detail, but it is useful to detect any bugs that might generate spurious # extra reconcile iterations. @@ -294,8 +323,9 @@ def test_sharding_split_smoke( # - shard_count reconciles for the original setup of the tenant # - shard_count reconciles for detaching the original secondary locations during split # - split_shard_count reconciles during shard splitting, for setting up secondaries. - # - shard_count reconciles for the migrations we did to move child shards away from their split location - expect_reconciles = shard_count * 2 + split_shard_count + shard_count + # - shard_count of the child shards will need to fail over to their secondaries + # - shard_count of the child shard secondary locations will get moved to emptier nodes + expect_reconciles = shard_count * 2 + split_shard_count + shard_count * 2 reconcile_ok = env.storage_controller.get_metric_value( "storage_controller_reconcile_complete_total", filter={"status": "ok"} ) @@ -343,6 +373,31 @@ def test_sharding_split_smoke( assert sum(total.values()) == split_shard_count * 2 check_effective_tenant_config() + # More specific check: that we are fully balanced. This is deterministic because + # the order in which we consider shards for optimization is deterministic, and the + # order of preference of nodes is also deterministic (lower node IDs win). + log.info(f"total: {total}") + assert total == { + 1: 1, + 2: 1, + 3: 1, + 4: 1, + 5: 1, + 6: 1, + 7: 1, + 8: 1, + 9: 1, + 10: 1, + 11: 1, + 12: 1, + 13: 1, + 14: 1, + 15: 1, + 16: 1, + } + log.info(f"attached: {attached}") + assert attached == {1: 1, 2: 1, 3: 1, 5: 1, 6: 1, 7: 1, 9: 1, 11: 1} + # Ensure post-split pageserver locations survive a restart (i.e. the child shards # correctly wrote config to disk, and the storage controller responds correctly # to /re-attach) @@ -401,6 +456,7 @@ def test_sharding_split_stripe_size( env.storage_controller.tenant_shard_split( tenant_id, shard_count=2, shard_stripe_size=new_stripe_size ) + env.storage_controller.reconcile_until_idle() # Check that we ended up with the stripe size that we expected, both on the pageserver # and in the notifications to compute @@ -455,13 +511,11 @@ def test_sharding_split_stripe_size( os.getenv("BUILD_TYPE") == "debug", reason="Avoid running bulkier ingest tests in debug mode", ) -def test_sharding_ingest( +def test_sharding_ingest_layer_sizes( neon_env_builder: NeonEnvBuilder, ): """ - Check behaviors related to ingest: - - That we generate properly sized layers - - TODO: that updates to remote_consistent_lsn are made correctly via safekeepers + Check that when ingesting data to a sharded tenant, we properly respect layer size limts. """ # Set a small stripe size and checkpoint distance, so that we can exercise rolling logic @@ -492,6 +546,7 @@ def test_sharding_ingest( workload.write_rows(4096, upload=False) workload.write_rows(4096, upload=False) workload.write_rows(4096, upload=False) + workload.validate() small_layer_count = 0 @@ -504,7 +559,9 @@ def test_sharding_ingest( shard_id = shard["shard_id"] layer_map = pageserver.http_client().layer_map_info(shard_id, timeline_id) - for layer in layer_map.historic_layers: + historic_layers = sorted(layer_map.historic_layers, key=lambda layer: layer.lsn_start) + + for layer in historic_layers: assert layer.layer_file_size is not None if layer.layer_file_size < expect_layer_size // 2: classification = "Small" @@ -541,6 +598,93 @@ def test_sharding_ingest( assert huge_layer_count <= shard_count +def test_sharding_ingest_gaps( + neon_env_builder: NeonEnvBuilder, +): + """ + Check ingest behavior when the incoming data results in some shards having gaps where + no data is ingested: they should advance their disk_consistent_lsn and remote_consistent_lsn + even if they aren't writing out layers. + """ + + # Set a small stripe size and checkpoint distance, so that we can exercise rolling logic + # without writing a lot of data. + expect_layer_size = 131072 + checkpoint_interval_secs = 5 + TENANT_CONF = { + # small checkpointing and compaction targets to ensure we generate many upload operations + "checkpoint_distance": f"{expect_layer_size}", + "compaction_target_size": f"{expect_layer_size}", + # Set a short checkpoint interval as we will wait for uploads to happen + "checkpoint_timeout": f"{checkpoint_interval_secs}s", + # Background checkpointing is done from compaction loop, so set that interval short too + "compaction_period": "1s", + } + shard_count = 4 + neon_env_builder.num_pageservers = shard_count + env = neon_env_builder.init_start( + initial_tenant_conf=TENANT_CONF, + initial_tenant_shard_count=shard_count, + initial_tenant_shard_stripe_size=128, + ) + tenant_id = env.initial_tenant + timeline_id = env.initial_timeline + + # Just a few writes: we aim to produce a situation where some shards are skipping + # ingesting some records and thereby won't have layer files that advance their + # consistent LSNs, to exercise the code paths that explicitly handle this case by + # advancing consistent LSNs in the background if there is no open layer. + workload = Workload(env, tenant_id, timeline_id) + workload.init() + workload.write_rows(128, upload=False) + workload.churn_rows(128, upload=False) + + # Checkpoint, so that we won't get a background checkpoint happening during the next step + workload.endpoint().safe_psql("checkpoint") + # Freeze + flush, so that subsequent writes will start from a position of no open layers + last_flush_lsn_upload(env, workload.endpoint(), tenant_id, timeline_id) + + # This write is tiny: at least some of the shards should find they don't have any + # data to ingest. This will exercise how they handle that. + workload.churn_rows(1, upload=False) + + # The LSN that has reached pageservers, but may not have been flushed to historic layers yet + expect_lsn = wait_for_last_flush_lsn(env, workload.endpoint(), tenant_id, timeline_id) + + # Don't leave the endpoint running, we don't want it writing in the background + workload.stop() + + log.info(f"Waiting for shards' consistent LSNs to reach {expect_lsn}") + + shards = tenant_get_shards(env, tenant_id, None) + + def assert_all_disk_consistent(): + """ + Assert that all the shards' disk_consistent_lsns have reached expect_lsn + """ + for tenant_shard_id, pageserver in shards: + timeline_detail = pageserver.http_client().timeline_detail(tenant_shard_id, timeline_id) + log.info(f"{tenant_shard_id} (ps {pageserver.id}) detail: {timeline_detail}") + assert Lsn(timeline_detail["disk_consistent_lsn"]) >= expect_lsn + + # We set a short checkpoint timeout: expect things to get frozen+flushed within that + wait_until(checkpoint_interval_secs * 3, 1, assert_all_disk_consistent) + + def assert_all_remote_consistent(): + """ + Assert that all the shards' remote_consistent_lsns have reached expect_lsn + """ + for tenant_shard_id, pageserver in shards: + timeline_detail = pageserver.http_client().timeline_detail(tenant_shard_id, timeline_id) + log.info(f"{tenant_shard_id} (ps {pageserver.id}) detail: {timeline_detail}") + assert Lsn(timeline_detail["remote_consistent_lsn"]) >= expect_lsn + + # We set a short checkpoint timeout: expect things to get frozen+flushed within that + wait_until(checkpoint_interval_secs * 3, 1, assert_all_remote_consistent) + + workload.validate() + + class Failure: pageserver_id: Optional[int] @@ -784,6 +928,8 @@ def test_sharding_split_failures( ".*Reconcile error: receive body: error sending request for url.*", # Node offline cases will fail inside reconciler when detaching secondaries ".*Reconcile error on shard.*: receive body: error sending request for url.*", + # While parent shard's client is stopped during split, flush loop updating LSNs will emit this warning + ".*Failed to schedule metadata upload after updating disk_consistent_lsn.*", ] ) @@ -869,6 +1015,7 @@ def test_sharding_split_failures( # Having failed+rolled back, we should be able to split again # No failures this time; it will succeed env.storage_controller.tenant_shard_split(tenant_id, shard_count=split_shard_count) + env.storage_controller.reconcile_until_idle(timeout_secs=30) workload.churn_rows(10) workload.validate() @@ -922,6 +1069,10 @@ def test_sharding_split_failures( finish_split() assert_split_done() + # Having completed the split, pump the background reconciles to ensure that + # the scheduler reaches an idle state + env.storage_controller.reconcile_until_idle(timeout_secs=30) + env.storage_controller.consistency_check() diff --git a/test_runner/regress/test_sharding_service.py b/test_runner/regress/test_storage_controller.py similarity index 80% rename from test_runner/regress/test_sharding_service.py rename to test_runner/regress/test_storage_controller.py index fc6c137667..840f354142 100644 --- a/test_runner/regress/test_sharding_service.py +++ b/test_runner/regress/test_storage_controller.py @@ -1,3 +1,4 @@ +import json import time from collections import defaultdict from datetime import datetime, timezone @@ -24,7 +25,7 @@ from fixtures.pageserver.utils import ( from fixtures.pg_version import PgVersion from fixtures.remote_storage import RemoteStorageKind, s3_storage from fixtures.types import TenantId, TenantShardId, TimelineId -from fixtures.utils import run_pg_bench_small, wait_until +from fixtures.utils import run_pg_bench_small, subprocess_capture, wait_until from mypy_boto3_s3.type_defs import ( ObjectTypeDef, ) @@ -41,11 +42,11 @@ def get_node_shard_counts(env: NeonEnv, tenant_ids): return counts -def test_sharding_service_smoke( +def test_storage_controller_smoke( neon_env_builder: NeonEnvBuilder, ): """ - Test the basic lifecycle of a sharding service: + Test the basic lifecycle of a storage controller: - Restarting - Restarting a pageserver - Creating and deleting tenants and timelines @@ -203,7 +204,7 @@ def test_node_status_after_restart( env.storage_controller.consistency_check() -def test_sharding_service_passthrough( +def test_storage_controller_passthrough( neon_env_builder: NeonEnvBuilder, ): """ @@ -230,7 +231,7 @@ def test_sharding_service_passthrough( env.storage_controller.consistency_check() -def test_sharding_service_restart(neon_env_builder: NeonEnvBuilder): +def test_storage_controller_restart(neon_env_builder: NeonEnvBuilder): env = neon_env_builder.init_start() tenant_a = env.initial_tenant tenant_b = TenantId.generate() @@ -265,7 +266,7 @@ def test_sharding_service_restart(neon_env_builder: NeonEnvBuilder): @pytest.mark.parametrize("warm_up", [True, False]) -def test_sharding_service_onboarding(neon_env_builder: NeonEnvBuilder, warm_up: bool): +def test_storage_controller_onboarding(neon_env_builder: NeonEnvBuilder, warm_up: bool): """ We onboard tenants to the sharding service by treating it as a 'virtual pageserver' which provides the /location_config API. This is similar to creating a tenant, @@ -302,7 +303,7 @@ def test_sharding_service_onboarding(neon_env_builder: NeonEnvBuilder, warm_up: origin_ps.http_client().tenant_create(tenant_id, generation=generation) # As if doing a live migration, first configure origin into stale mode - origin_ps.http_client().tenant_location_conf( + r = origin_ps.http_client().tenant_location_conf( tenant_id, { "mode": "AttachedStale", @@ -311,6 +312,7 @@ def test_sharding_service_onboarding(neon_env_builder: NeonEnvBuilder, warm_up: "generation": generation, }, ) + assert len(r["shards"]) == 1 if warm_up: origin_ps.http_client().tenant_heatmap_upload(tenant_id) @@ -331,7 +333,7 @@ def test_sharding_service_onboarding(neon_env_builder: NeonEnvBuilder, warm_up: # Call into storage controller to onboard the tenant generation += 1 - virtual_ps_http.tenant_location_conf( + r = virtual_ps_http.tenant_location_conf( tenant_id, { "mode": "AttachedMulti", @@ -340,6 +342,7 @@ def test_sharding_service_onboarding(neon_env_builder: NeonEnvBuilder, warm_up: "generation": generation, }, ) + assert len(r["shards"]) == 1 # As if doing a live migration, detach the original pageserver origin_ps.http_client().tenant_location_conf( @@ -356,7 +359,7 @@ def test_sharding_service_onboarding(neon_env_builder: NeonEnvBuilder, warm_up: # set it to AttachedSingle: this is a no-op, but we test it because the # cloud control plane may call this for symmetry with live migration to # an individual pageserver - virtual_ps_http.tenant_location_conf( + r = virtual_ps_http.tenant_location_conf( tenant_id, { "mode": "AttachedSingle", @@ -365,6 +368,7 @@ def test_sharding_service_onboarding(neon_env_builder: NeonEnvBuilder, warm_up: "generation": generation, }, ) + assert len(r["shards"]) == 1 # We should see the tenant is now attached to the pageserver managed # by the sharding service @@ -395,7 +399,7 @@ def test_sharding_service_onboarding(neon_env_builder: NeonEnvBuilder, warm_up: # The generation has moved on since we onboarded assert generation != dest_tenant_before_conf_change["generation"] - virtual_ps_http.tenant_location_conf( + r = virtual_ps_http.tenant_location_conf( tenant_id, { "mode": "AttachedSingle", @@ -405,6 +409,7 @@ def test_sharding_service_onboarding(neon_env_builder: NeonEnvBuilder, warm_up: "generation": generation, }, ) + assert len(r["shards"]) == 1 dest_tenant_after_conf_change = dest_ps.http_client().tenant_status(tenant_id) assert ( dest_tenant_after_conf_change["generation"] == dest_tenant_before_conf_change["generation"] @@ -415,7 +420,7 @@ def test_sharding_service_onboarding(neon_env_builder: NeonEnvBuilder, warm_up: env.storage_controller.consistency_check() -def test_sharding_service_compute_hook( +def test_storage_controller_compute_hook( httpserver: HTTPServer, neon_env_builder: NeonEnvBuilder, httpserver_listen_address, @@ -433,10 +438,13 @@ def test_sharding_service_compute_hook( # Set up fake HTTP notify endpoint notifications = [] + handle_params = {"status": 200} + def handler(request: Request): - log.info(f"Notify request: {request}") + status = handle_params["status"] + log.info(f"Notify request[{status}]: {request}") notifications.append(request.json) - return Response(status=200) + return Response(status=status) httpserver.expect_request("/notify", method="PUT").respond_with_handler(handler) @@ -504,10 +512,28 @@ def test_sharding_service_compute_hook( wait_until(10, 1, received_split_notification) + # If the compute hook is unavailable, that should not block creating a tenant and + # creating a timeline. This simulates a control plane refusing to accept notifications + handle_params["status"] = 423 + degraded_tenant_id = TenantId.generate() + degraded_timeline_id = TimelineId.generate() + env.storage_controller.tenant_create(degraded_tenant_id) + env.storage_controller.pageserver_api().timeline_create( + PgVersion.NOT_SET, degraded_tenant_id, degraded_timeline_id + ) + + # Ensure we hit the handler error path + env.storage_controller.allowed_errors.append( + ".*Failed to notify compute of attached pageserver.*tenant busy.*" + ) + env.storage_controller.allowed_errors.append(".*Reconcile error.*tenant busy.*") + assert notifications[-1] is not None + assert notifications[-1]["tenant_id"] == str(degraded_tenant_id) + env.storage_controller.consistency_check() -def test_sharding_service_debug_apis(neon_env_builder: NeonEnvBuilder): +def test_storage_controller_debug_apis(neon_env_builder: NeonEnvBuilder): """ Verify that occasional-use debug APIs work as expected. This is a lightweight test that just hits the endpoints to check that they don't bitrot. @@ -568,7 +594,7 @@ def test_sharding_service_debug_apis(neon_env_builder: NeonEnvBuilder): env.storage_controller.consistency_check() -def test_sharding_service_s3_time_travel_recovery( +def test_storage_controller_s3_time_travel_recovery( neon_env_builder: NeonEnvBuilder, pg_bin: PgBin, ): @@ -678,7 +704,7 @@ def test_sharding_service_s3_time_travel_recovery( env.storage_controller.consistency_check() -def test_sharding_service_auth(neon_env_builder: NeonEnvBuilder): +def test_storage_controller_auth(neon_env_builder: NeonEnvBuilder): neon_env_builder.auth_enabled = True env = neon_env_builder.init_start() svc = env.storage_controller @@ -702,13 +728,18 @@ def test_sharding_service_auth(neon_env_builder: NeonEnvBuilder): StorageControllerApiException, match="Forbidden: JWT authentication error", ): - svc.request("POST", f"{api}/v1/tenant", json=body, headers=svc.headers(TokenScope.ADMIN)) + svc.request( + "POST", f"{api}/v1/tenant", json=body, headers=svc.headers(TokenScope.SAFEKEEPER_DATA) + ) # Token with correct scope svc.request( "POST", f"{api}/v1/tenant", json=body, headers=svc.headers(TokenScope.PAGE_SERVER_API) ) + # Token with admin scope should also be permitted + svc.request("POST", f"{api}/v1/tenant", json=body, headers=svc.headers(TokenScope.ADMIN)) + # No token with pytest.raises( StorageControllerApiException, @@ -742,7 +773,7 @@ def test_sharding_service_auth(neon_env_builder: NeonEnvBuilder): ) -def test_sharding_service_tenant_conf(neon_env_builder: NeonEnvBuilder): +def test_storage_controller_tenant_conf(neon_env_builder: NeonEnvBuilder): """ Validate the pageserver-compatible API endpoints for setting and getting tenant conf, without supplying the whole LocationConf. @@ -845,7 +876,7 @@ def build_node_to_tenants_map(env: NeonEnv) -> dict[int, list[TenantId]]: PageserverFailpoint(pageserver_id=1, failpoint="get-utilization-http-handler"), ], ) -def test_sharding_service_heartbeats( +def test_storage_controller_heartbeats( neon_env_builder: NeonEnvBuilder, pg_bin: PgBin, failure: Failure ): neon_env_builder.num_pageservers = 2 @@ -955,7 +986,7 @@ def test_sharding_service_heartbeats( wait_until(10, 1, storage_controller_consistent) -def test_sharding_service_re_attach(neon_env_builder: NeonEnvBuilder): +def test_storage_controller_re_attach(neon_env_builder: NeonEnvBuilder): """ Exercise the behavior of the /re-attach endpoint on pageserver startup when pageservers have a mixture of attached and secondary locations @@ -1015,3 +1046,194 @@ def test_sharding_service_re_attach(neon_env_builder: NeonEnvBuilder): "storage_controller_reconcile_complete_total", filter={"status": "ok"} ) assert reconciles_after_restart == reconciles_before_restart + + +def test_storage_controller_shard_scheduling_policy(neon_env_builder: NeonEnvBuilder): + """ + Check that emergency hooks for disabling rogue tenants' reconcilers work as expected. + """ + env = neon_env_builder.init_configs() + env.start() + + tenant_id = TenantId.generate() + + env.storage_controller.allowed_errors.extend( + [ + # We will intentionally cause reconcile errors + ".*Reconcile error.*", + # Message from using a scheduling policy + ".*Scheduling is disabled by policy.*", + ".*Skipping reconcile for policy.*", + # Message from a node being offline + ".*Call to node .* management API .* failed", + ] + ) + + # Stop pageserver so that reconcile cannot complete + env.pageserver.stop() + + env.storage_controller.tenant_create(tenant_id, placement_policy="Detached") + + # Try attaching it: we should see reconciles failing + env.storage_controller.tenant_policy_update( + tenant_id, + { + "placement": {"Attached": 0}, + }, + ) + + def reconcile_errors() -> int: + return int( + env.storage_controller.get_metric_value( + "storage_controller_reconcile_complete_total", filter={"status": "error"} + ) + or 0 + ) + + def reconcile_ok() -> int: + return int( + env.storage_controller.get_metric_value( + "storage_controller_reconcile_complete_total", filter={"status": "ok"} + ) + or 0 + ) + + def assert_errors_gt(n) -> int: + e = reconcile_errors() + assert e > n + return e + + errs = wait_until(10, 1, lambda: assert_errors_gt(0)) + + # Try reconciling again, it should fail again + with pytest.raises(StorageControllerApiException): + env.storage_controller.reconcile_all() + errs = wait_until(10, 1, lambda: assert_errors_gt(errs)) + + # Configure the tenant to disable reconciles + env.storage_controller.tenant_policy_update( + tenant_id, + { + "scheduling": "Stop", + }, + ) + + # Try reconciling again, it should not cause an error (silently skip) + env.storage_controller.reconcile_all() + assert reconcile_errors() == errs + + # Start the pageserver and re-enable reconciles + env.pageserver.start() + env.storage_controller.tenant_policy_update( + tenant_id, + { + "scheduling": "Active", + }, + ) + + def assert_ok_gt(n) -> int: + o = reconcile_ok() + assert o > n + return o + + # We should see a successful reconciliation + wait_until(10, 1, lambda: assert_ok_gt(0)) + + # And indeed the tenant should be attached + assert len(env.pageserver.http_client().tenant_list_locations()["tenant_shards"]) == 1 + + +def test_storcon_cli(neon_env_builder: NeonEnvBuilder): + """ + The storage controller command line interface (storcon-cli) is an internal tool. Most tests + just use the APIs directly: this test exercises some basics of the CLI as a regression test + that the client remains usable as the server evolves. + """ + output_dir = neon_env_builder.test_output_dir + shard_count = 4 + env = neon_env_builder.init_start(initial_tenant_shard_count=shard_count) + base_args = [env.neon_binpath / "storcon_cli", "--api", env.storage_controller_api] + + def storcon_cli(args): + """ + CLI wrapper: returns stdout split into a list of non-empty strings + """ + (output_path, stdout, status_code) = subprocess_capture( + output_dir, + [str(s) for s in base_args + args], + echo_stderr=True, + echo_stdout=True, + env={}, + check=False, + capture_stdout=True, + timeout=10, + ) + if status_code: + log.warning(f"Command {args} failed") + log.warning(f"Output at: {output_path}") + + raise RuntimeError("CLI failure (check logs for stderr)") + + assert stdout is not None + return [line.strip() for line in stdout.split("\n") if line.strip()] + + # List nodes + node_lines = storcon_cli(["nodes"]) + # Table header, footer, and one line of data + assert len(node_lines) == 5 + assert "localhost" in node_lines[3] + + # Pause scheduling onto a node + storcon_cli(["node-configure", "--node-id", "1", "--scheduling", "pause"]) + assert "Pause" in storcon_cli(["nodes"])[3] + + # We will simulate a node death and then marking it offline + env.pageservers[0].stop(immediate=True) + # Sleep to make it unlikely that the controller's heartbeater will race handling + # a /utilization response internally, such that it marks the node back online. IRL + # there would always be a longer delay than this before a node failing and a human + # intervening. + time.sleep(2) + + storcon_cli(["node-configure", "--node-id", "1", "--availability", "offline"]) + assert "Offline" in storcon_cli(["nodes"])[3] + + # List tenants + tenant_lines = storcon_cli(["tenants"]) + assert len(tenant_lines) == 5 + assert str(env.initial_tenant) in tenant_lines[3] + + # Setting scheduling policies intentionally result in warnings, they're for rare use. + env.storage_controller.allowed_errors.extend( + [".*Skipping reconcile for policy.*", ".*Scheduling is disabled by policy.*"] + ) + + # Describe a tenant + tenant_lines = storcon_cli(["tenant-describe", "--tenant-id", str(env.initial_tenant)]) + assert len(tenant_lines) == 3 + shard_count * 2 + assert str(env.initial_tenant) in tenant_lines[3] + + # Pause changes on a tenant + storcon_cli(["tenant-policy", "--tenant-id", str(env.initial_tenant), "--scheduling", "stop"]) + assert "Stop" in storcon_cli(["tenants"])[3] + + # Change a tenant's placement + storcon_cli( + ["tenant-policy", "--tenant-id", str(env.initial_tenant), "--placement", "secondary"] + ) + assert "Secondary" in storcon_cli(["tenants"])[3] + + # Modify a tenant's config + storcon_cli( + [ + "tenant-config", + "--tenant-id", + str(env.initial_tenant), + "--config", + json.dumps({"pitr_interval": "1m"}), + ] + ) + + # Quiesce any background reconciliation before doing consistency check + env.storage_controller.reconcile_until_idle(timeout_secs=10) + env.storage_controller.consistency_check() diff --git a/test_runner/regress/test_tenant_delete.py b/test_runner/regress/test_tenant_delete.py index a164c7f60a..c115c0375b 100644 --- a/test_runner/regress/test_tenant_delete.py +++ b/test_runner/regress/test_tenant_delete.py @@ -469,7 +469,8 @@ def test_tenant_delete_concurrent( ): """ Validate that concurrent delete requests to the same tenant behave correctly: - exactly one should succeed. + exactly one should execute: the rest should give 202 responses but not start + another deletion. This is a reproducer for https://github.com/neondatabase/neon/issues/5936 """ @@ -484,14 +485,10 @@ def test_tenant_delete_concurrent( run_pg_bench_small(pg_bin, endpoint.connstr()) last_flush_lsn_upload(env, endpoint, tenant_id, timeline_id) - CONFLICT_MESSAGE = "Precondition failed: Invalid state Stopping. Expected Active or Broken" - env.pageserver.allowed_errors.extend( [ # lucky race with stopping from flushing a layer we fail to schedule any uploads ".*layer flush task.+: could not flush frozen layer: update_metadata_file", - # Errors logged from our 4xx requests - f".*{CONFLICT_MESSAGE}.*", ] ) @@ -507,7 +504,7 @@ def test_tenant_delete_concurrent( return ps_http.tenant_delete(tenant_id) def hit_remove_failpoint(): - env.pageserver.assert_log_contains(f"at failpoint {BEFORE_REMOVE_FAILPOINT}") + return env.pageserver.assert_log_contains(f"at failpoint {BEFORE_REMOVE_FAILPOINT}")[1] def hit_run_failpoint(): env.pageserver.assert_log_contains(f"at failpoint {BEFORE_RUN_FAILPOINT}") @@ -518,11 +515,14 @@ def test_tenant_delete_concurrent( # Wait until the first request completes its work and is blocked on removing # the TenantSlot from tenant manager. - wait_until(100, 0.1, hit_remove_failpoint) + log_cursor = wait_until(100, 0.1, hit_remove_failpoint) + assert log_cursor is not None - # Start another request: this should fail when it sees a tenant in Stopping state - with pytest.raises(PageserverApiException, match=CONFLICT_MESSAGE): - ps_http.tenant_delete(tenant_id) + # Start another request: this should succeed without actually entering the deletion code + ps_http.tenant_delete(tenant_id) + assert not env.pageserver.log_contains( + f"at failpoint {BEFORE_RUN_FAILPOINT}", offset=log_cursor + ) # Start another background request, which will pause after acquiring a TenantSlotGuard # but before completing. @@ -539,8 +539,10 @@ def test_tenant_delete_concurrent( # Permit the duplicate background request to run to completion and fail. ps_http.configure_failpoints((BEFORE_RUN_FAILPOINT, "off")) - with pytest.raises(PageserverApiException, match=CONFLICT_MESSAGE): - background_4xx_req.result(timeout=10) + background_4xx_req.result(timeout=10) + assert not env.pageserver.log_contains( + f"at failpoint {BEFORE_RUN_FAILPOINT}", offset=log_cursor + ) # Physical deletion should have happened assert_prefix_empty( diff --git a/test_runner/regress/test_tenant_size.py b/test_runner/regress/test_tenant_size.py index 025cc930d7..4c8fd4b0e5 100644 --- a/test_runner/regress/test_tenant_size.py +++ b/test_runner/regress/test_tenant_size.py @@ -20,9 +20,10 @@ from fixtures.pg_version import PgVersion from fixtures.types import Lsn, TenantId, TimelineId -@pytest.mark.xfail -def test_empty_tenant_size(neon_simple_env: NeonEnv, test_output_dir: Path): - env = neon_simple_env +def test_empty_tenant_size(neon_env_builder: NeonEnvBuilder): + env = neon_env_builder.init_configs() + env.start() + (tenant_id, _) = env.neon_cli.create_tenant() http_client = env.pageserver.http_client() initial_size = http_client.tenant_size(tenant_id) @@ -35,66 +36,25 @@ def test_empty_tenant_size(neon_simple_env: NeonEnv, test_output_dir: Path): branch_name, main_timeline_id = env.neon_cli.list_timelines(tenant_id)[0] assert branch_name == main_branch_name - with env.endpoints.create_start( + endpoint = env.endpoints.create_start( main_branch_name, tenant_id=tenant_id, config_lines=["autovacuum=off", "checkpoint_timeout=10min"], - ) as endpoint: - with endpoint.cursor() as cur: - cur.execute("SELECT 1") - row = cur.fetchone() - assert row is not None - assert row[0] == 1 - size = http_client.tenant_size(tenant_id) - # we've disabled the autovacuum and checkpoint - # so background processes should not change the size. - # If this test will flake we should probably loosen the check - assert ( - size == initial_size - ), f"starting idle compute should not change the tenant size (Currently {size}, expected {initial_size})" + ) - # the size should be the same, until we increase the size over the - # gc_horizon - size, inputs = http_client.tenant_size_and_modelinputs(tenant_id) - assert ( - size == initial_size - ), f"tenant_size should not be affected by shutdown of compute (Currently {size}, expected {initial_size})" + with endpoint.cursor() as cur: + cur.execute("SELECT 1") + row = cur.fetchone() + assert row is not None + assert row[0] == 1 - expected_inputs = { - "segments": [ - { - "segment": {"parent": None, "lsn": 23694408, "size": 25362432, "needed": True}, - "timeline_id": f"{main_timeline_id}", - "kind": "BranchStart", - }, - { - "segment": {"parent": 0, "lsn": 23694528, "size": None, "needed": True}, - "timeline_id": f"{main_timeline_id}", - "kind": "BranchEnd", - }, - ], - "timeline_inputs": [ - { - "timeline_id": f"{main_timeline_id}", - "ancestor_id": None, - "ancestor_lsn": "0/0", - "last_record": "0/1698CC0", - "latest_gc_cutoff": "0/1698C48", - "horizon_cutoff": "0/0", - "pitr_cutoff": "0/0", - "next_gc_cutoff": "0/0", - "retention_param_cutoff": None, - } - ], - } - expected_inputs = mask_model_inputs(expected_inputs) - actual_inputs = mask_model_inputs(inputs) + # The transaction above will make the compute generate a checkpoint. + # In turn, the pageserver persists the checkpoint. This should only be + # one key with a size of a couple hundred bytes. + wait_for_last_flush_lsn(env, endpoint, tenant_id, main_timeline_id) + size = http_client.tenant_size(tenant_id) - assert expected_inputs == actual_inputs - - size_debug_file = open(test_output_dir / "size_debug.html", "w") - size_debug = http_client.tenant_size_debug(tenant_id) - size_debug_file.write(size_debug) + assert size >= initial_size and size - initial_size < 1024 def test_branched_empty_timeline_size(neon_simple_env: NeonEnv, test_output_dir: Path): @@ -190,7 +150,6 @@ def test_branched_from_many_empty_parents_size(neon_simple_env: NeonEnv, test_ou size_debug_file.write(size_debug) -@pytest.mark.skip("This should work, but is left out because assumed covered by other tests") def test_branch_point_within_horizon(neon_simple_env: NeonEnv, test_output_dir: Path): """ gc_horizon = 15 @@ -233,7 +192,6 @@ def test_branch_point_within_horizon(neon_simple_env: NeonEnv, test_output_dir: size_debug_file.write(size_debug) -@pytest.mark.skip("This should work, but is left out because assumed covered by other tests") def test_parent_within_horizon(neon_simple_env: NeonEnv, test_output_dir: Path): """ gc_horizon = 5 @@ -282,7 +240,6 @@ def test_parent_within_horizon(neon_simple_env: NeonEnv, test_output_dir: Path): size_debug_file.write(size_debug) -@pytest.mark.skip("This should work, but is left out because assumed covered by other tests") def test_only_heads_within_horizon(neon_simple_env: NeonEnv, test_output_dir: Path): """ gc_horizon = small diff --git a/test_runner/regress/test_wal_acceptor.py b/test_runner/regress/test_wal_acceptor.py index 2cac58dc1a..ac1a747df3 100644 --- a/test_runner/regress/test_wal_acceptor.py +++ b/test_runner/regress/test_wal_acceptor.py @@ -103,9 +103,7 @@ def test_many_timelines(neon_env_builder: NeonEnvBuilder): n_timelines = 3 - branch_names = [ - "test_safekeepers_many_timelines_{}".format(tlin) for tlin in range(n_timelines) - ] + branch_names = [f"test_safekeepers_many_timelines_{tlin}" for tlin in range(n_timelines)] # pageserver, safekeeper operate timelines via their ids (can be represented in hex as 'ad50847381e248feaac9876cc71ae418') # that's not really human readable, so the branch names are introduced in Neon CLI. # Neon CLI stores its branch <-> timeline mapping in its internals, @@ -1136,13 +1134,13 @@ def cmp_sk_wal(sks: List[Safekeeper], tenant_id: TenantId, timeline_id: Timeline for f in mismatch: f1 = os.path.join(sk0.timeline_dir(tenant_id, timeline_id), f) f2 = os.path.join(sk.timeline_dir(tenant_id, timeline_id), f) - stdout_filename = "{}.filediff".format(f2) + stdout_filename = f"{f2}.filediff" with open(stdout_filename, "w") as stdout_f: - subprocess.run("xxd {} > {}.hex ".format(f1, f1), shell=True) - subprocess.run("xxd {} > {}.hex ".format(f2, f2), shell=True) + subprocess.run(f"xxd {f1} > {f1}.hex ", shell=True) + subprocess.run(f"xxd {f2} > {f2}.hex ", shell=True) - cmd = "diff {}.hex {}.hex".format(f1, f2) + cmd = f"diff {f1}.hex {f2}.hex" subprocess.run([cmd], stdout=stdout_f, shell=True) assert (mismatch, not_regular) == ( diff --git a/test_runner/regress/test_wal_acceptor_async.py b/test_runner/regress/test_wal_acceptor_async.py index 720633189e..5902eb3217 100644 --- a/test_runner/regress/test_wal_acceptor_async.py +++ b/test_runner/regress/test_wal_acceptor_async.py @@ -10,6 +10,7 @@ import pytest import toml from fixtures.log_helper import getLogger from fixtures.neon_fixtures import Endpoint, NeonEnv, NeonEnvBuilder, Safekeeper +from fixtures.remote_storage import RemoteStorageKind from fixtures.types import Lsn, TenantId, TimelineId log = getLogger("root.safekeeper_async") @@ -76,20 +77,20 @@ class WorkerStats(object): self.counters[worker_id] += 1 def check_progress(self): - log.debug("Workers progress: {}".format(self.counters)) + log.debug(f"Workers progress: {self.counters}") # every worker should finish at least one tx assert all(cnt > 0 for cnt in self.counters) progress = sum(self.counters) - log.info("All workers made {} transactions".format(progress)) + log.info(f"All workers made {progress} transactions") async def run_random_worker( stats: WorkerStats, endpoint: Endpoint, worker_id, n_accounts, max_transfer ): pg_conn = await endpoint.connect_async() - log.debug("Started worker {}".format(worker_id)) + log.debug(f"Started worker {worker_id}") while stats.running: from_uid = random.randint(0, n_accounts - 1) @@ -99,9 +100,9 @@ async def run_random_worker( await bank_transfer(pg_conn, from_uid, to_uid, amount) stats.inc_progress(worker_id) - log.debug("Executed transfer({}) {} => {}".format(amount, from_uid, to_uid)) + log.debug(f"Executed transfer({amount}) {from_uid} => {to_uid}") - log.debug("Finished worker {}".format(worker_id)) + log.debug(f"Finished worker {worker_id}") await pg_conn.close() @@ -199,7 +200,9 @@ async def run_restarts_under_load( # assert that at least one transaction has completed in every worker stats.check_progress() - victim.start() + # testing #6530, temporary here + # TODO: remove afer partial backup is enabled by default + victim.start(extra_opts=["--partial-backup-enabled", "--partial-backup-timeout=2s"]) log.info("Iterations are finished, exiting coroutines...") stats.running = False @@ -213,6 +216,7 @@ async def run_restarts_under_load( # Restart acceptors one by one, while executing and validating bank transactions def test_restarts_under_load(neon_env_builder: NeonEnvBuilder): neon_env_builder.num_safekeepers = 3 + neon_env_builder.enable_safekeeper_remote_storage(RemoteStorageKind.LOCAL_FS) env = neon_env_builder.init_start() env.neon_cli.create_branch("test_safekeepers_restarts_under_load") diff --git a/vendor/postgres-v14 b/vendor/postgres-v14 index 748643b468..d9149dc59a 160000 --- a/vendor/postgres-v14 +++ b/vendor/postgres-v14 @@ -1 +1 @@ -Subproject commit 748643b4683e9fe3b105011a6ba8a687d032cd65 +Subproject commit d9149dc59abcbeeb26293707509aef51752db28f diff --git a/vendor/postgres-v15 b/vendor/postgres-v15 index e7651e79c0..85d809c124 160000 --- a/vendor/postgres-v15 +++ b/vendor/postgres-v15 @@ -1 +1 @@ -Subproject commit e7651e79c0c27fbddc3c724f5b9553222c28e395 +Subproject commit 85d809c124a898847a97d66a211f7d5ef4f8e0cb diff --git a/vendor/postgres-v16 b/vendor/postgres-v16 index 3946b2e2ea..261497dd63 160000 --- a/vendor/postgres-v16 +++ b/vendor/postgres-v16 @@ -1 +1 @@ -Subproject commit 3946b2e2ea71d07af092099cb5bcae76a69b90d6 +Subproject commit 261497dd63ace434045058b1453bcbaaa83f23e5 diff --git a/vendor/revisions.json b/vendor/revisions.json index 3c1b866137..dfc0aa04c3 100644 --- a/vendor/revisions.json +++ b/vendor/revisions.json @@ -1,5 +1,5 @@ { - "postgres-v16": "3946b2e2ea71d07af092099cb5bcae76a69b90d6", - "postgres-v15": "e7651e79c0c27fbddc3c724f5b9553222c28e395", - "postgres-v14": "748643b4683e9fe3b105011a6ba8a687d032cd65" + "postgres-v16": "261497dd63ace434045058b1453bcbaaa83f23e5", + "postgres-v15": "85d809c124a898847a97d66a211f7d5ef4f8e0cb", + "postgres-v14": "d9149dc59abcbeeb26293707509aef51752db28f" } diff --git a/vm-image-spec.yaml b/vm-image-spec.yaml index 5b93088303..c760744491 100644 --- a/vm-image-spec.yaml +++ b/vm-image-spec.yaml @@ -187,6 +187,14 @@ files: query: | select sum(pg_database_size(datname)) as total from pg_database; + - metric_name: lfc_approximate_working_set_size + type: gauge + help: 'Approximate working set size in pages of 8192 bytes' + key_labels: + values: [approximate_working_set_size] + query: | + select neon.approximate_working_set_size(false) as approximate_working_set_size; + build: | # Build cgroup-tools # diff --git a/workspace_hack/Cargo.toml b/workspace_hack/Cargo.toml index 7b8228a082..d6e2cc2996 100644 --- a/workspace_hack/Cargo.toml +++ b/workspace_hack/Cargo.toml @@ -37,8 +37,7 @@ futures-io = { version = "0.3" } futures-sink = { version = "0.3" } futures-util = { version = "0.3", features = ["channel", "io", "sink"] } getrandom = { version = "0.2", default-features = false, features = ["std"] } -hashbrown-582f2526e08bb6a0 = { package = "hashbrown", version = "0.14", features = ["raw"] } -hashbrown-594e8ee84c453af0 = { package = "hashbrown", version = "0.13", features = ["raw"] } +hashbrown = { version = "0.14", features = ["raw"] } hex = { version = "0.4", features = ["serde"] } hmac = { version = "0.12", default-features = false, features = ["reset"] } hyper = { version = "0.14", features = ["full"] } @@ -64,7 +63,7 @@ scopeguard = { version = "1" } serde = { version = "1", features = ["alloc", "derive"] } serde_json = { version = "1", features = ["raw_value"] } sha2 = { version = "0.10", features = ["asm"] } -smallvec = { version = "1", default-features = false, features = ["write"] } +smallvec = { version = "1", default-features = false, features = ["const_new", "write"] } subtle = { version = "2" } time = { version = "0.3", features = ["local-offset", "macros", "serde-well-known"] } tokio = { version = "1", features = ["fs", "io-std", "io-util", "macros", "net", "process", "rt-multi-thread", "signal", "test-util"] } @@ -76,7 +75,6 @@ tonic = { version = "0.9", features = ["tls-roots"] } tower = { version = "0.4", default-features = false, features = ["balance", "buffer", "limit", "log", "timeout", "util"] } tracing = { version = "0.1", features = ["log"] } tracing-core = { version = "0.1" } -tungstenite = { version = "0.20" } url = { version = "2", features = ["serde"] } uuid = { version = "1", features = ["serde", "v4", "v7"] } zeroize = { version = "1", features = ["derive"] } @@ -91,7 +89,7 @@ cc = { version = "1", default-features = false, features = ["parallel"] } chrono = { version = "0.4", default-features = false, features = ["clock", "serde", "wasmbind"] } either = { version = "1" } getrandom = { version = "0.2", default-features = false, features = ["std"] } -hashbrown-582f2526e08bb6a0 = { package = "hashbrown", version = "0.14", features = ["raw"] } +hashbrown = { version = "0.14", features = ["raw"] } indexmap = { version = "1", default-features = false, features = ["std"] } itertools = { version = "0.10" } libc = { version = "0.2", features = ["extra_traits", "use_std"] }