diff --git a/.github/file-filters.yaml b/.github/file-filters.yaml new file mode 100644 index 0000000000..886cd3919a --- /dev/null +++ b/.github/file-filters.yaml @@ -0,0 +1,12 @@ +rust_code: ['**/*.rs', '**/Cargo.toml', '**/Cargo.lock'] + +v14: ['vendor/postgres-v14/**', 'Makefile', 'pgxn/**'] +v15: ['vendor/postgres-v15/**', 'Makefile', 'pgxn/**'] +v16: ['vendor/postgres-v16/**', 'Makefile', 'pgxn/**'] +v17: ['vendor/postgres-v17/**', 'Makefile', 'pgxn/**'] + +rebuild_neon_extra: + - .github/workflows/neon_extra_builds.yml + +rebuild_macos: + - .github/workflows/build-macos.yml diff --git a/.github/workflows/build-macos.yml b/.github/workflows/build-macos.yml new file mode 100644 index 0000000000..01d82a1ed2 --- /dev/null +++ b/.github/workflows/build-macos.yml @@ -0,0 +1,241 @@ +name: Check neon with MacOS builds + +on: + workflow_call: + inputs: + pg_versions: + description: "Array of the pg versions to build for, for example: ['v14', 'v17']" + type: string + default: '[]' + required: false + rebuild_rust_code: + description: "Rebuild Rust code" + type: boolean + default: false + required: false + rebuild_everything: + description: "If true, rebuild for all versions" + type: boolean + default: false + required: false + +env: + RUST_BACKTRACE: 1 + COPT: '-Werror' + +# TODO: move `check-*` and `files-changed` jobs to the "Caller" Workflow +# We should care about that as Github has limitations: +# - You can connect up to four levels of workflows +# - You can call a maximum of 20 unique reusable workflows from a single workflow file. +# https://docs.github.com/en/actions/sharing-automations/reusing-workflows#limitations +jobs: + build-pgxn: + if: | + (inputs.pg_versions != '[]' || inputs.rebuild_everything) && ( + contains(github.event.pull_request.labels.*.name, 'run-extra-build-macos') || + contains(github.event.pull_request.labels.*.name, 'run-extra-build-*') || + github.ref_name == 'main' + ) + timeout-minutes: 30 + runs-on: macos-15 + strategy: + matrix: + postgres-version: ${{ inputs.rebuild_everything && fromJson('["v14", "v15", "v16", "v17"]') || fromJSON(inputs.pg_versions) }} + env: + # Use release build only, to have less debug info around + # Hence keeping target/ (and general cache size) smaller + BUILD_TYPE: release + steps: + - name: Checkout main repo + uses: actions/checkout@v4 + + - name: Set pg ${{ matrix.postgres-version }} for caching + id: pg_rev + run: echo pg_rev=$(git rev-parse HEAD:vendor/postgres-${{ matrix.postgres-version }}) | tee -a "${GITHUB_OUTPUT}" + + - name: Cache postgres ${{ matrix.postgres-version }} build + id: cache_pg + uses: actions/cache@v4 + with: + path: pg_install/${{ matrix.postgres-version }} + key: v1-${{ runner.os }}-${{ runner.arch }}-${{ env.BUILD_TYPE }}-pg-${{ matrix.postgres-version }}-${{ steps.pg_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }} + + - name: Checkout submodule vendor/postgres-${{ matrix.postgres-version }} + if: steps.cache_pg.outputs.cache-hit != 'true' + run: | + git submodule init vendor/postgres-${{ matrix.postgres-version }} + git submodule update --depth 1 --recursive + + - name: Install build dependencies + if: steps.cache_pg.outputs.cache-hit != 'true' + run: | + brew install flex bison openssl protobuf icu4c + + - name: Set extra env for macOS + if: steps.cache_pg.outputs.cache-hit != 'true' + run: | + echo 'LDFLAGS=-L/usr/local/opt/openssl@3/lib' >> $GITHUB_ENV + echo 'CPPFLAGS=-I/usr/local/opt/openssl@3/include' >> $GITHUB_ENV + + - name: Build Postgres ${{ matrix.postgres-version }} + if: 
steps.cache_pg.outputs.cache-hit != 'true' + run: | + make postgres-${{ matrix.postgres-version }} -j$(sysctl -n hw.ncpu) + + - name: Build Neon Pg Ext ${{ matrix.postgres-version }} + if: steps.cache_pg.outputs.cache-hit != 'true' + run: | + make "neon-pg-ext-${{ matrix.postgres-version }}" -j$(sysctl -n hw.ncpu) + + - name: Get postgres headers ${{ matrix.postgres-version }} + if: steps.cache_pg.outputs.cache-hit != 'true' + run: | + make postgres-headers-${{ matrix.postgres-version }} -j$(sysctl -n hw.ncpu) + + build-walproposer-lib: + if: | + (inputs.pg_versions != '[]' || inputs.rebuild_everything) && ( + contains(github.event.pull_request.labels.*.name, 'run-extra-build-macos') || + contains(github.event.pull_request.labels.*.name, 'run-extra-build-*') || + github.ref_name == 'main' + ) + timeout-minutes: 30 + runs-on: macos-15 + needs: [build-pgxn] + env: + # Use release build only, to have less debug info around + # Hence keeping target/ (and general cache size) smaller + BUILD_TYPE: release + steps: + - name: Checkout main repo + uses: actions/checkout@v4 + + - name: Set pg v17 for caching + id: pg_rev + run: echo pg_rev=$(git rev-parse HEAD:vendor/postgres-v17) | tee -a "${GITHUB_OUTPUT}" + + - name: Cache postgres v17 build + id: cache_pg + uses: actions/cache@v4 + with: + path: pg_install/v17 + key: v1-${{ runner.os }}-${{ runner.arch }}-${{ env.BUILD_TYPE }}-pg-v17-${{ steps.pg_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }} + + - name: Cache walproposer-lib + id: cache_walproposer_lib + uses: actions/cache@v4 + with: + path: pg_install/build/walproposer-lib + key: v1-${{ runner.os }}-${{ runner.arch }}-${{ env.BUILD_TYPE }}-walproposer_lib-v17-${{ steps.pg_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }} + + - name: Checkout submodule vendor/postgres-v17 + if: steps.cache_walproposer_lib.outputs.cache-hit != 'true' + run: | + git submodule init vendor/postgres-v17 + git submodule update --depth 1 --recursive + + - name: Install build dependencies + if: steps.cache_walproposer_lib.outputs.cache-hit != 'true' + run: | + brew install flex bison openssl protobuf icu4c + + - name: Set extra env for macOS + if: steps.cache_walproposer_lib.outputs.cache-hit != 'true' + run: | + echo 'LDFLAGS=-L/usr/local/opt/openssl@3/lib' >> $GITHUB_ENV + echo 'CPPFLAGS=-I/usr/local/opt/openssl@3/include' >> $GITHUB_ENV + + - name: Build walproposer-lib (only for v17) + if: steps.cache_walproposer_lib.outputs.cache-hit != 'true' + run: + make walproposer-lib -j$(sysctl -n hw.ncpu) + + cargo-build: + if: | + (inputs.pg_versions != '[]' || inputs.rebuild_rust_code || inputs.rebuild_everything) && ( + contains(github.event.pull_request.labels.*.name, 'run-extra-build-macos') || + contains(github.event.pull_request.labels.*.name, 'run-extra-build-*') || + github.ref_name == 'main' + ) + timeout-minutes: 30 + runs-on: macos-15 + needs: [build-pgxn, build-walproposer-lib] + env: + # Use release build only, to have less debug info around + # Hence keeping target/ (and general cache size) smaller + BUILD_TYPE: release + steps: + - name: Checkout main repo + uses: actions/checkout@v4 + with: + submodules: true + + - name: Set pg v14 for caching + id: pg_rev_v14 + run: echo pg_rev=$(git rev-parse HEAD:vendor/postgres-v14) | tee -a "${GITHUB_OUTPUT}" + - name: Set pg v15 for caching + id: pg_rev_v15 + run: echo pg_rev=$(git rev-parse HEAD:vendor/postgres-v15) | tee -a "${GITHUB_OUTPUT}" + - name: Set pg v16 for caching + id: pg_rev_v16 + run: echo pg_rev=$(git rev-parse HEAD:vendor/postgres-v16) | tee -a 
"${GITHUB_OUTPUT}" + - name: Set pg v17 for caching + id: pg_rev_v17 + run: echo pg_rev=$(git rev-parse HEAD:vendor/postgres-v17) | tee -a "${GITHUB_OUTPUT}" + + - name: Cache postgres v14 build + id: cache_pg + uses: actions/cache@v4 + with: + path: pg_install/v14 + key: v1-${{ runner.os }}-${{ runner.arch }}-${{ env.BUILD_TYPE }}-pg-v14-${{ steps.pg_rev_v14.outputs.pg_rev }}-${{ hashFiles('Makefile') }} + - name: Cache postgres v15 build + id: cache_pg_v15 + uses: actions/cache@v4 + with: + path: pg_install/v15 + key: v1-${{ runner.os }}-${{ runner.arch }}-${{ env.BUILD_TYPE }}-pg-v15-${{ steps.pg_rev_v15.outputs.pg_rev }}-${{ hashFiles('Makefile') }} + - name: Cache postgres v16 build + id: cache_pg_v16 + uses: actions/cache@v4 + with: + path: pg_install/v16 + key: v1-${{ runner.os }}-${{ runner.arch }}-${{ env.BUILD_TYPE }}-pg-v16-${{ steps.pg_rev_v16.outputs.pg_rev }}-${{ hashFiles('Makefile') }} + - name: Cache postgres v17 build + id: cache_pg_v17 + uses: actions/cache@v4 + with: + path: pg_install/v17 + key: v1-${{ runner.os }}-${{ runner.arch }}-${{ env.BUILD_TYPE }}-pg-v17-${{ steps.pg_rev_v17.outputs.pg_rev }}-${{ hashFiles('Makefile') }} + + - name: Cache cargo deps (only for v17) + uses: actions/cache@v4 + with: + path: | + ~/.cargo/registry + !~/.cargo/registry/src + ~/.cargo/git + target + key: v1-${{ runner.os }}-${{ runner.arch }}-cargo-${{ hashFiles('./Cargo.lock') }}-${{ hashFiles('./rust-toolchain.toml') }}-rust + + - name: Cache walproposer-lib + id: cache_walproposer_lib + uses: actions/cache@v4 + with: + path: pg_install/build/walproposer-lib + key: v1-${{ runner.os }}-${{ runner.arch }}-${{ env.BUILD_TYPE }}-walproposer_lib-v17-${{ steps.pg_rev_v17.outputs.pg_rev }}-${{ hashFiles('Makefile') }} + + - name: Install build dependencies + run: | + brew install flex bison openssl protobuf icu4c + + - name: Set extra env for macOS + run: | + echo 'LDFLAGS=-L/usr/local/opt/openssl@3/lib' >> $GITHUB_ENV + echo 'CPPFLAGS=-I/usr/local/opt/openssl@3/include' >> $GITHUB_ENV + + - name: Run cargo build (only for v17) + run: PQ_LIB_DIR=$(pwd)/pg_install/v17/lib cargo build --all --release -j$(sysctl -n hw.ncpu) + + - name: Check that no warnings are produced (only for v17) + run: ./run_clippy.sh diff --git a/.github/workflows/neon_extra_builds.yml b/.github/workflows/neon_extra_builds.yml index 1f85c2e102..5b5910badf 100644 --- a/.github/workflows/neon_extra_builds.yml +++ b/.github/workflows/neon_extra_builds.yml @@ -31,19 +31,15 @@ jobs: uses: ./.github/workflows/build-build-tools-image.yml secrets: inherit - check-macos-build: - needs: [ check-permissions ] - if: | - contains(github.event.pull_request.labels.*.name, 'run-extra-build-macos') || - contains(github.event.pull_request.labels.*.name, 'run-extra-build-*') || - github.ref_name == 'main' - timeout-minutes: 90 - runs-on: macos-15 - - env: - # Use release build only, to have less debug info around - # Hence keeping target/ (and general cache size) smaller - BUILD_TYPE: release + files-changed: + name: Detect what files changed + runs-on: ubuntu-22.04 + timeout-minutes: 3 + outputs: + v17: ${{ steps.files_changed.outputs.v17 }} + postgres_changes: ${{ steps.postgres_changes.outputs.changes }} + rebuild_rust_code: ${{ steps.files_changed.outputs.rust_code }} + rebuild_everything: ${{ steps.files_changed.outputs.rebuild_neon_extra || steps.files_changed.outputs.rebuild_macos }} steps: - name: Checkout @@ -51,106 +47,45 @@ jobs: with: submodules: true - - name: Install macOS postgres dependencies - run: brew install flex 
bison openssl protobuf icu4c - - - name: Set pg 14 revision for caching - id: pg_v14_rev - run: echo pg_rev=$(git rev-parse HEAD:vendor/postgres-v14) >> $GITHUB_OUTPUT - - - name: Set pg 15 revision for caching - id: pg_v15_rev - run: echo pg_rev=$(git rev-parse HEAD:vendor/postgres-v15) >> $GITHUB_OUTPUT - - - name: Set pg 16 revision for caching - id: pg_v16_rev - run: echo pg_rev=$(git rev-parse HEAD:vendor/postgres-v16) >> $GITHUB_OUTPUT - - - name: Set pg 17 revision for caching - id: pg_v17_rev - run: echo pg_rev=$(git rev-parse HEAD:vendor/postgres-v17) >> $GITHUB_OUTPUT - - - name: Cache postgres v14 build - id: cache_pg_14 - uses: actions/cache@v4 + - name: Check for Postgres changes + uses: dorny/paths-filter@1441771bbfdd59dcd748680ee64ebd8faab1a242 #v3 + id: files_changed with: - path: pg_install/v14 - key: v1-${{ runner.os }}-${{ runner.arch }}-${{ env.BUILD_TYPE }}-pg-${{ steps.pg_v14_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }} + token: ${{ github.token }} + filters: .github/file-filters.yaml + base: ${{ github.event_name != 'pull_request' && (github.event.merge_group.base_ref || github.ref_name) || '' }} + ref: ${{ github.event_name != 'pull_request' && (github.event.merge_group.head_ref || github.ref) || '' }} - - name: Cache postgres v15 build - id: cache_pg_15 - uses: actions/cache@v4 - with: - path: pg_install/v15 - key: v1-${{ runner.os }}-${{ runner.arch }}-${{ env.BUILD_TYPE }}-pg-${{ steps.pg_v15_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }} - - - name: Cache postgres v16 build - id: cache_pg_16 - uses: actions/cache@v4 - with: - path: pg_install/v16 - key: v1-${{ runner.os }}-${{ runner.arch }}-${{ env.BUILD_TYPE }}-pg-${{ steps.pg_v16_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }} - - - name: Cache postgres v17 build - id: cache_pg_17 - uses: actions/cache@v4 - with: - path: pg_install/v17 - key: v1-${{ runner.os }}-${{ runner.arch }}-${{ env.BUILD_TYPE }}-pg-${{ steps.pg_v17_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }} - - - name: Set extra env for macOS + - name: Filter out only v-strings for build matrix + id: postgres_changes run: | - echo 'LDFLAGS=-L/usr/local/opt/openssl@3/lib' >> $GITHUB_ENV - echo 'CPPFLAGS=-I/usr/local/opt/openssl@3/include' >> $GITHUB_ENV + v_strings_only_as_json_array=$(echo ${{ steps.files_changed.outputs.changes }} | jq '.[]|select(test("v\\d+"))' | jq --slurp -c) + echo "changes=${v_strings_only_as_json_array}" | tee -a "${GITHUB_OUTPUT}" - - name: Cache cargo deps - uses: actions/cache@v4 - with: - path: | - ~/.cargo/registry - !~/.cargo/registry/src - ~/.cargo/git - target - key: v1-${{ runner.os }}-${{ runner.arch }}-cargo-${{ hashFiles('./Cargo.lock') }}-${{ hashFiles('./rust-toolchain.toml') }}-rust - - - name: Build postgres v14 - if: steps.cache_pg_14.outputs.cache-hit != 'true' - run: make postgres-v14 -j$(sysctl -n hw.ncpu) - - - name: Build postgres v15 - if: steps.cache_pg_15.outputs.cache-hit != 'true' - run: make postgres-v15 -j$(sysctl -n hw.ncpu) - - - name: Build postgres v16 - if: steps.cache_pg_16.outputs.cache-hit != 'true' - run: make postgres-v16 -j$(sysctl -n hw.ncpu) - - - name: Build postgres v17 - if: steps.cache_pg_17.outputs.cache-hit != 'true' - run: make postgres-v17 -j$(sysctl -n hw.ncpu) - - - name: Build neon extensions - run: make neon-pg-ext -j$(sysctl -n hw.ncpu) - - - name: Build walproposer-lib - run: make walproposer-lib -j$(sysctl -n hw.ncpu) - - - name: Run cargo build - run: PQ_LIB_DIR=$(pwd)/pg_install/v16/lib cargo build --all --release - - - name: Check that no warnings
are produced - run: ./run_clippy.sh + check-macos-build: + needs: [ check-permissions, files-changed ] + if: | + contains(github.event.pull_request.labels.*.name, 'run-extra-build-macos') || + contains(github.event.pull_request.labels.*.name, 'run-extra-build-*') || + github.ref_name == 'main' + uses: ./.github/workflows/build-macos.yml + with: + pg_versions: ${{ needs.files-changed.outputs.postgres_changes }} + rebuild_rust_code: ${{ needs.files-changed.outputs.rebuild_rust_code }} + rebuild_everything: ${{ fromJson(needs.files-changed.outputs.rebuild_everything) }} gather-rust-build-stats: - needs: [ check-permissions, build-build-tools-image ] + needs: [ check-permissions, build-build-tools-image, files-changed ] permissions: id-token: write # aws-actions/configure-aws-credentials statuses: write contents: write if: | - contains(github.event.pull_request.labels.*.name, 'run-extra-build-stats') || - contains(github.event.pull_request.labels.*.name, 'run-extra-build-*') || - github.ref_name == 'main' + (needs.files-changed.outputs.v17 == 'true' || needs.files-changed.outputs.rebuild_everything == 'true') && ( + contains(github.event.pull_request.labels.*.name, 'run-extra-build-stats') || + contains(github.event.pull_request.labels.*.name, 'run-extra-build-*') || + github.ref_name == 'main' + ) runs-on: [ self-hosted, large ] container: image: ${{ needs.build-build-tools-image.outputs.image }}-bookworm diff --git a/Cargo.lock b/Cargo.lock index 9e0e343996..f727741883 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -718,13 +718,13 @@ dependencies = [ [[package]] name = "axum" -version = "0.7.5" +version = "0.7.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3a6c9af12842a67734c9a2e355436e5d03b22383ed60cf13cd0c18fbfe3dcbcf" +checksum = "edca88bc138befd0323b20752846e6587272d3b03b0343c8ea28a6f819e6e71f" dependencies = [ "async-trait", "axum-core", - "base64 0.21.1", + "base64 0.22.1", "bytes", "futures-util", "http 1.1.0", @@ -746,8 +746,8 @@ dependencies = [ "sha1", "sync_wrapper 1.0.1", "tokio", - "tokio-tungstenite", - "tower", + "tokio-tungstenite 0.24.0", + "tower 0.5.2", "tower-layer", "tower-service", "tracing", @@ -1267,6 +1267,7 @@ dependencies = [ "aws-config", "aws-sdk-kms", "aws-sdk-s3", + "axum", "base64 0.13.1", "bytes", "camino", @@ -1277,7 +1278,7 @@ dependencies = [ "fail", "flate2", "futures", - "hyper 0.14.30", + "http 1.1.0", "metrics", "nix 0.27.1", "notify", @@ -1303,6 +1304,8 @@ dependencies = [ "tokio-postgres", "tokio-stream", "tokio-util", + "tower 0.5.2", + "tower-http", "tracing", "tracing-opentelemetry", "tracing-subscriber", @@ -1650,6 +1653,20 @@ dependencies = [ "parking_lot_core 0.9.8", ] +[[package]] +name = "dashmap" +version = "6.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5041cc499144891f3790297212f32a74fb938e5136a14943f338ef9e0ae276cf" +dependencies = [ + "cfg-if", + "crossbeam-utils", + "hashbrown 0.14.5", + "lock_api", + "once_cell", + "parking_lot_core 0.9.8", +] + [[package]] name = "data-encoding" version = "2.4.0" @@ -1949,6 +1966,15 @@ dependencies = [ "syn 2.0.90", ] +[[package]] +name = "env_filter" +version = "0.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "186e05a59d4c50738528153b83b0b0194d3a29507dfec16eccd4b342903397d0" +dependencies = [ + "log", +] + [[package]] name = "env_logger" version = "0.10.2" @@ -1962,6 +1988,16 @@ dependencies = [ "termcolor", ] +[[package]] +name = "env_logger" +version = "0.11.2" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "6c012a26a7f605efc424dd53697843a72be7dc86ad2d01f7814337794a12231d" +dependencies = [ + "env_filter", + "log", +] + [[package]] name = "equator" version = "0.2.2" @@ -2720,7 +2756,7 @@ dependencies = [ "pin-project-lite", "socket2", "tokio", - "tower", + "tower 0.4.13", "tower-service", "tracing", ] @@ -2945,6 +2981,28 @@ dependencies = [ "str_stack", ] +[[package]] +name = "inferno" +version = "0.12.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "75a5d75fee4d36809e6b021e4b96b686e763d365ffdb03af2bd00786353f84fe" +dependencies = [ + "ahash", + "clap", + "crossbeam-channel", + "crossbeam-utils", + "dashmap 6.1.0", + "env_logger 0.11.2", + "indexmap 2.0.1", + "itoa", + "log", + "num-format", + "once_cell", + "quick-xml 0.37.1", + "rgb", + "str_stack", +] + [[package]] name = "inotify" version = "0.9.6" @@ -3152,7 +3210,7 @@ version = "0.7.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4644821e1c3d7a560fe13d842d13f587c07348a1a05d3a797152d41c90c56df2" dependencies = [ - "dashmap", + "dashmap 5.5.0", "hashbrown 0.13.2", ] @@ -3260,9 +3318,9 @@ checksum = "b87248edafb776e59e6ee64a79086f65890d3510f2c656c000bf2a7e8a0aea40" [[package]] name = "matchit" -version = "0.8.2" +version = "0.8.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "540f1c43aed89909c0cc0cc604e3bb2f7e7a341a3728a9e6cfe760e733cd11ed" +checksum = "47e1ffaa40ddd1f3ed91f717a33c8c0ee23fff369e3aa8772b9605cc1d22f4c3" [[package]] name = "md-5" @@ -3690,23 +3748,23 @@ checksum = "ff011a302c396a5197692431fc1948019154afc178baf7d8e37367442a4601cf" [[package]] name = "opentelemetry" -version = "0.26.0" +version = "0.27.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "570074cc999d1a58184080966e5bd3bf3a9a4af650c3b05047c2621e7405cd17" +checksum = "ab70038c28ed37b97d8ed414b6429d343a8bbf44c9f79ec854f3a643029ba6d7" dependencies = [ "futures-core", "futures-sink", "js-sys", - "once_cell", "pin-project-lite", "thiserror", + "tracing", ] [[package]] name = "opentelemetry-http" -version = "0.26.0" +version = "0.27.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6351496aeaa49d7c267fb480678d85d1cd30c5edb20b497c48c56f62a8c14b99" +checksum = "10a8a7f5f6ba7c1b286c2fbca0454eaba116f63bbe69ed250b642d36fbb04d80" dependencies = [ "async-trait", "bytes", @@ -3717,9 +3775,9 @@ dependencies = [ [[package]] name = "opentelemetry-otlp" -version = "0.26.0" +version = "0.27.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "29e1f9c8b032d4f635c730c0efcf731d5e2530ea13fa8bef7939ddc8420696bd" +checksum = "91cf61a1868dacc576bf2b2a1c3e9ab150af7272909e80085c3173384fe11f76" dependencies = [ "async-trait", "futures-core", @@ -3735,9 +3793,9 @@ dependencies = [ [[package]] name = "opentelemetry-proto" -version = "0.26.1" +version = "0.27.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c9d3968ce3aefdcca5c27e3c4ea4391b37547726a70893aab52d3de95d5f8b34" +checksum = "a6e05acbfada5ec79023c85368af14abd0b307c015e9064d249b2a950ef459a6" dependencies = [ "opentelemetry", "opentelemetry_sdk", @@ -3747,22 +3805,21 @@ dependencies = [ [[package]] name = "opentelemetry-semantic-conventions" -version = "0.26.0" +version = "0.27.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "db945c1eaea8ac6a9677185357480d215bb6999faa9f691d0c4d4d641eab7a09" +checksum = 
"bc1b6902ff63b32ef6c489e8048c5e253e2e4a803ea3ea7e783914536eb15c52" [[package]] name = "opentelemetry_sdk" -version = "0.26.0" +version = "0.27.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d2c627d9f4c9cdc1f21a29ee4bfbd6028fcb8bcf2a857b43f3abdf72c9c862f3" +checksum = "231e9d6ceef9b0b2546ddf52335785ce41252bc7474ee8ba05bfad277be13ab8" dependencies = [ "async-trait", "futures-channel", "futures-executor", "futures-util", "glob", - "once_cell", "opentelemetry", "percent-encoding", "rand 0.8.5", @@ -3770,6 +3827,7 @@ dependencies = [ "thiserror", "tokio", "tokio-stream", + "tracing", ] [[package]] @@ -4418,7 +4476,7 @@ dependencies = [ "bytes", "crc32c", "criterion", - "env_logger", + "env_logger 0.10.2", "log", "memoffset 0.9.0", "once_cell", @@ -4459,7 +4517,7 @@ dependencies = [ "cfg-if", "criterion", "findshlibs", - "inferno", + "inferno 0.11.21", "libc", "log", "nix 0.26.4", @@ -4685,9 +4743,9 @@ dependencies = [ "clap", "compute_api", "consumption_metrics", - "dashmap", + "dashmap 5.5.0", "ecdsa 0.16.9", - "env_logger", + "env_logger 0.10.2", "fallible-iterator", "flate2", "framed-websockets", @@ -4758,7 +4816,7 @@ dependencies = [ "tokio-postgres", "tokio-postgres2", "tokio-rustls 0.26.0", - "tokio-tungstenite", + "tokio-tungstenite 0.21.0", "tokio-util", "tracing", "tracing-subscriber", @@ -4794,6 +4852,15 @@ dependencies = [ "serde", ] +[[package]] +name = "quick-xml" +version = "0.37.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f22f29bdff3987b4d8632ef95fd6424ec7e4e0a57e2f4fc63e489e75357f6a03" +dependencies = [ + "memchr", +] + [[package]] name = "quote" version = "1.0.37" @@ -5178,15 +5245,15 @@ dependencies = [ [[package]] name = "reqwest-tracing" -version = "0.5.4" +version = "0.5.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ff82cf5730a1311fb9413b0bc2b8e743e0157cd73f010ab4ec374a923873b6a2" +checksum = "73e6153390585f6961341b50e5a1931d6be6dee4292283635903c26ef9d980d2" dependencies = [ "anyhow", "async-trait", "getrandom 0.2.11", "http 1.1.0", - "matchit 0.8.2", + "matchit 0.8.4", "opentelemetry", "reqwest", "reqwest-middleware", @@ -6800,7 +6867,19 @@ dependencies = [ "futures-util", "log", "tokio", - "tungstenite", + "tungstenite 0.21.0", +] + +[[package]] +name = "tokio-tungstenite" +version = "0.24.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "edc5f74e248dc973e0dbb7b74c7e0d6fcc301c694ff50049504004ef4d0cdcd9" +dependencies = [ + "futures-util", + "log", + "tokio", + "tungstenite 0.24.0", ] [[package]] @@ -6881,7 +6960,7 @@ dependencies = [ "tokio", "tokio-rustls 0.26.0", "tokio-stream", - "tower", + "tower 0.4.13", "tower-layer", "tower-service", "tracing", @@ -6922,16 +7001,49 @@ dependencies = [ ] [[package]] -name = "tower-layer" -version = "0.3.2" +name = "tower" +version = "0.5.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c20c8dbed6283a09604c3e69b4b7eeb54e298b8a600d4d5ecb5ad39de609f1d0" +checksum = "d039ad9159c98b70ecfd540b2573b97f7f52c3e8d9f8ad57a24b916a536975f9" +dependencies = [ + "futures-core", + "futures-util", + "pin-project-lite", + "sync_wrapper 1.0.1", + "tokio", + "tower-layer", + "tower-service", + "tracing", +] + +[[package]] +name = "tower-http" +version = "0.6.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "403fa3b783d4b626a8ad51d766ab03cb6d2dbfc46b1c5d4448395e6628dc9697" +dependencies = [ + "bitflags 2.4.1", + "bytes", + "http 1.1.0", + "http-body 
1.0.0", + "pin-project-lite", + "tower-layer", + "tower-service", + "tracing", + "uuid", +] + +[[package]] +name = "tower-layer" +version = "0.3.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "121c2a6cda46980bb0fcd1647ffaf6cd3fc79a013de288782836f6df9c48780e" [[package]] name = "tower-service" -version = "0.3.2" +version = "0.3.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b6bc1c9ce2b5135ac7f93c72918fc37feb872bdc6a5533a8b85eb4b86bfdae52" +checksum = "8df9b6e13f2d32c91b9bd719c00d1958837bc7dec474d94952798cc8e69eeec3" [[package]] name = "tracing" @@ -7000,9 +7112,9 @@ dependencies = [ [[package]] name = "tracing-opentelemetry" -version = "0.27.0" +version = "0.28.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dc58af5d3f6c5811462cabb3289aec0093f7338e367e5a33d28c0433b3c7360b" +checksum = "97a971f6058498b5c0f1affa23e7ea202057a7301dbff68e968b2d578bcbd053" dependencies = [ "js-sys", "once_cell", @@ -7086,6 +7198,24 @@ dependencies = [ "utf-8", ] +[[package]] +name = "tungstenite" +version = "0.24.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "18e5b8366ee7a95b16d32197d0b2604b43a0be89dc5fac9f8e96ccafbaedda8a" +dependencies = [ + "byteorder", + "bytes", + "data-encoding", + "http 1.1.0", + "httparse", + "log", + "rand 0.8.5", + "sha1", + "thiserror", + "utf-8", +] + [[package]] name = "twox-hash" version = "1.6.3" @@ -7253,6 +7383,7 @@ dependencies = [ "hex-literal", "humantime", "hyper 0.14.30", + "inferno 0.12.0", "itertools 0.10.5", "jemalloc_pprof", "jsonwebtoken", @@ -7356,7 +7487,7 @@ dependencies = [ "anyhow", "camino-tempfile", "clap", - "env_logger", + "env_logger 0.10.2", "log", "postgres", "postgres_ffi", @@ -7867,7 +7998,8 @@ dependencies = [ "tokio-util", "toml_edit", "tonic", - "tower", + "tower 0.4.13", + "tower 0.5.2", "tracing", "tracing-core", "url", diff --git a/Cargo.toml b/Cargo.toml index 197808d5ae..a4e601bb58 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -65,7 +65,7 @@ aws-smithy-types = "1.2" aws-credential-types = "1.2.0" aws-sigv4 = { version = "1.2", features = ["sign-http"] } aws-types = "1.3" -axum = { version = "0.7.5", features = ["ws"] } +axum = { version = "0.7.9", features = ["ws"] } base64 = "0.13.0" bincode = "1.3" bindgen = "0.70" @@ -110,6 +110,7 @@ hyper-util = "0.1" tokio-tungstenite = "0.21.0" indexmap = "2" indoc = "2" +inferno = "0.12.0" ipnet = "2.10.0" itertools = "0.10" itoa = "1.0.11" @@ -126,10 +127,10 @@ notify = "6.0.0" num_cpus = "1.15" num-traits = "0.2.15" once_cell = "1.13" -opentelemetry = "0.26" -opentelemetry_sdk = "0.26" -opentelemetry-otlp = { version = "0.26", default-features=false, features = ["http-proto", "trace", "http", "reqwest-client"] } -opentelemetry-semantic-conventions = "0.26" +opentelemetry = "0.27" +opentelemetry_sdk = "0.27" +opentelemetry-otlp = { version = "0.27", default-features = false, features = ["http-proto", "trace", "http", "reqwest-client"] } +opentelemetry-semantic-conventions = "0.27" parking_lot = "0.12" parquet = { version = "53", default-features = false, features = ["zstd"] } parquet_derive = "53" @@ -143,7 +144,7 @@ rand = "0.8" redis = { version = "0.25.2", features = ["tokio-rustls-comp", "keep-alive"] } regex = "1.10.2" reqwest = { version = "0.12", default-features = false, features = ["rustls-tls"] } -reqwest-tracing = { version = "0.5", features = ["opentelemetry_0_26"] } +reqwest-tracing = { version = "0.5", features = ["opentelemetry_0_27"] } reqwest-middleware = 
"0.4" reqwest-retry = "0.7" routerify = "3" @@ -187,10 +188,12 @@ tokio-util = { version = "0.7.10", features = ["io", "rt"] } toml = "0.8" toml_edit = "0.22" tonic = {version = "0.12.3", features = ["tls", "tls-roots"]} -tower-service = "0.3.2" +tower = { version = "0.5.2", default-features = false } +tower-http = { version = "0.6.2", features = ["request-id", "trace"] } +tower-service = "0.3.3" tracing = "0.1" tracing-error = "0.2" -tracing-opentelemetry = "0.27" +tracing-opentelemetry = "0.28" tracing-subscriber = { version = "0.3", default-features = false, features = ["smallvec", "fmt", "tracing-log", "std", "env-filter", "json"] } try-lock = "0.2.5" twox-hash = { version = "1.6.3", default-features = false } diff --git a/Dockerfile b/Dockerfile index 2c157b3b2a..d3659f917a 100644 --- a/Dockerfile +++ b/Dockerfile @@ -103,11 +103,6 @@ RUN mkdir -p /data/.neon/ && \ > /data/.neon/pageserver.toml && \ chown -R neon:neon /data/.neon -# When running a binary that links with libpq, default to using our most recent postgres version. Binaries -# that want a particular postgres version will select it explicitly: this is just a default. -ENV LD_LIBRARY_PATH=/usr/local/v${DEFAULT_PG_VERSION}/lib - - VOLUME ["/data"] USER neon EXPOSE 6400 diff --git a/build-tools.Dockerfile b/build-tools.Dockerfile index fa84e467ad..79210a2e1b 100644 --- a/build-tools.Dockerfile +++ b/build-tools.Dockerfile @@ -258,7 +258,7 @@ WORKDIR /home/nonroot # Rust # Please keep the version of llvm (installed above) in sync with rust llvm (`rustc --version --verbose | grep LLVM`) -ENV RUSTC_VERSION=1.83.0 +ENV RUSTC_VERSION=1.84.0 ENV RUSTUP_HOME="/home/nonroot/.rustup" ENV PATH="/home/nonroot/.cargo/bin:${PATH}" ARG RUSTFILT_VERSION=0.2.1 diff --git a/compute/compute-node.Dockerfile b/compute/compute-node.Dockerfile index 06aaf9e7f4..303daec240 100644 --- a/compute/compute-node.Dockerfile +++ b/compute/compute-node.Dockerfile @@ -1167,22 +1167,13 @@ FROM rust-extensions-build AS pg-mooncake-build ARG PG_VERSION COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ -# The topmost commit in the `neon` branch at the time of writing this -# https://github.com/Mooncake-Labs/pg_mooncake/commits/neon/ -# https://github.com/Mooncake-Labs/pg_mooncake/commit/077c92c452bb6896a7b7776ee95f039984f076af -ENV PG_MOONCAKE_VERSION=077c92c452bb6896a7b7776ee95f039984f076af ENV PATH="/usr/local/pgsql/bin/:$PATH" -RUN case "${PG_VERSION}" in \ - 'v14') \ - echo "pg_mooncake is not supported on Postgres ${PG_VERSION}" && exit 0;; \ - esac && \ - git clone --depth 1 --branch neon https://github.com/Mooncake-Labs/pg_mooncake.git pg_mooncake-src && \ - cd pg_mooncake-src && \ - git checkout "${PG_MOONCAKE_VERSION}" && \ - git submodule update --init --depth 1 --recursive && \ - make BUILD_TYPE=release -j $(getconf _NPROCESSORS_ONLN) && \ - make BUILD_TYPE=release -j $(getconf _NPROCESSORS_ONLN) install && \ +RUN wget https://github.com/Mooncake-Labs/pg_mooncake/releases/download/v0.1.0/pg_mooncake-0.1.0.tar.gz -O pg_mooncake.tar.gz && \ + echo "eafd059b77f541f11525eb8affcd66a176968cbd8fe7c0d436e733f2aa4da59f pg_mooncake.tar.gz" | sha256sum --check && \ + mkdir pg_mooncake-src && cd pg_mooncake-src && tar xzf ../pg_mooncake.tar.gz --strip-components=1 -C . 
&& \ + make release -j $(getconf _NPROCESSORS_ONLN) && \ + make install -j $(getconf _NPROCESSORS_ONLN) && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/pg_mooncake.control ######################################################################################### diff --git a/compute_tools/Cargo.toml b/compute_tools/Cargo.toml index 9525b27818..33892813c4 100644 --- a/compute_tools/Cargo.toml +++ b/compute_tools/Cargo.toml @@ -15,6 +15,7 @@ aws-config.workspace = true aws-sdk-s3.workspace = true aws-sdk-kms.workspace = true anyhow.workspace = true +axum = { workspace = true, features = [] } camino.workspace = true chrono.workspace = true cfg-if.workspace = true @@ -22,7 +23,7 @@ clap.workspace = true fail.workspace = true flate2.workspace = true futures.workspace = true -hyper0 = { workspace = true, features = ["full"] } +http.workspace = true metrics.workspace = true nix.workspace = true notify.workspace = true @@ -37,6 +38,8 @@ serde_with.workspace = true serde_json.workspace = true signal-hook.workspace = true tar.workspace = true +tower.workspace = true +tower-http.workspace = true reqwest = { workspace = true, features = ["json"] } tokio = { workspace = true, features = ["rt", "rt-multi-thread"] } tokio-postgres.workspace = true diff --git a/compute_tools/src/bin/compute_ctl.rs b/compute_tools/src/bin/compute_ctl.rs index 6ede5fdceb..b98cf706d3 100644 --- a/compute_tools/src/bin/compute_ctl.rs +++ b/compute_tools/src/bin/compute_ctl.rs @@ -60,7 +60,7 @@ use compute_tools::compute::{ }; use compute_tools::configurator::launch_configurator; use compute_tools::extension_server::get_pg_version_string; -use compute_tools::http::api::launch_http_server; +use compute_tools::http::launch_http_server; use compute_tools::logger::*; use compute_tools::monitor::launch_monitor; use compute_tools::params::*; @@ -111,11 +111,6 @@ fn main() -> Result<()> { fn init() -> Result<(String, clap::ArgMatches)> { init_tracing_and_logging(DEFAULT_LOG_LEVEL)?; - opentelemetry::global::set_error_handler(|err| { - tracing::info!("OpenTelemetry error: {err}"); - }) - .expect("global error handler lock poisoned"); - let mut signals = Signals::new([SIGINT, SIGTERM, SIGQUIT])?; thread::spawn(move || { for sig in signals.forever() { @@ -493,7 +488,10 @@ fn start_postgres( let mut pg = None; if !prestartup_failed { pg = match compute.start_compute() { - Ok(pg) => Some(pg), + Ok(pg) => { + info!(postmaster_pid = %pg.0.id(), "Postgres was started"); + Some(pg) + } Err(err) => { error!("could not start the compute node: {:#}", err); compute.set_failed_status(err); @@ -591,6 +589,8 @@ fn wait_postgres(pg: Option) -> Result { // propagate to Postgres and it will be shut down as well. let mut exit_code = None; if let Some((mut pg, logs_handle)) = pg { + info!(postmaster_pid = %pg.id(), "Waiting for Postgres to exit"); + let ecode = pg .wait() .expect("failed to start waiting on Postgres process"); diff --git a/compute_tools/src/catalog.rs b/compute_tools/src/catalog.rs index 72198a9479..4a297cfacf 100644 --- a/compute_tools/src/catalog.rs +++ b/compute_tools/src/catalog.rs @@ -36,11 +36,11 @@ pub async fn get_dbs_and_roles(compute: &Arc) -> anyhow::Result Result { let mut conf = conf.clone(); @@ -943,6 +945,78 @@ impl ComputeNode { dbs: databases, })); + // Apply special pre drop database phase. + // NOTE: we use the code of RunInEachDatabase phase for parallelism + // and connection management, but we don't really run it in *each* database, + // only in databases, we're about to drop. 
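+    // That is, run the `DropSubscriptionsForDeletedDatabases` subphase in each database that has a
+    // pending `delete_db` delta operation, so its logical replication subscriptions are cleaned up
+    // before the database itself is dropped.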
+ info!("Applying PerDatabase (pre-dropdb) phase"); + let concurrency_token = Arc::new(tokio::sync::Semaphore::new(concurrency)); + + // Run the phase for each database that we're about to drop. + let db_processes = spec + .delta_operations + .iter() + .flatten() + .filter_map(move |op| { + if op.action.as_str() == "delete_db" { + Some(op.name.clone()) + } else { + None + } + }) + .map(|dbname| { + let spec = spec.clone(); + let ctx = ctx.clone(); + let jwks_roles = jwks_roles.clone(); + let mut conf = conf.as_ref().clone(); + let concurrency_token = concurrency_token.clone(); + // We only need dbname field for this phase, so set other fields to dummy values + let db = DB::UserDB(Database { + name: dbname.clone(), + owner: "cloud_admin".to_string(), + options: None, + restrict_conn: false, + invalid: false, + }); + + debug!("Applying per-database phases for Database {:?}", &db); + + match &db { + DB::SystemDB => {} + DB::UserDB(db) => { + conf.dbname(db.name.as_str()); + } + } + + let conf = Arc::new(conf); + let fut = Self::apply_spec_sql_db( + spec.clone(), + conf, + ctx.clone(), + jwks_roles.clone(), + concurrency_token.clone(), + db, + [DropSubscriptionsForDeletedDatabases].to_vec(), + ); + + Ok(spawn(fut)) + }) + .collect::>>(); + + for process in db_processes.into_iter() { + let handle = process?; + if let Err(e) = handle.await? { + // Handle the error case where the database does not exist + // We do not check whether the DB exists or not in the deletion phase, + // so we shouldn't be strict about it in pre-deletion cleanup as well. + if e.to_string().contains("does not exist") { + warn!("Error dropping subscription: {}", e); + } else { + return Err(e); + } + }; + } + for phase in [ CreateSuperUser, DropInvalidDatabases, @@ -962,7 +1036,7 @@ impl ComputeNode { .await?; } - info!("Applying RunInEachDatabase phase"); + info!("Applying RunInEachDatabase2 phase"); let concurrency_token = Arc::new(tokio::sync::Semaphore::new(concurrency)); let db_processes = spec @@ -997,6 +1071,12 @@ impl ComputeNode { jwks_roles.clone(), concurrency_token.clone(), db, + [ + DeleteDBRoleReferences, + ChangeSchemaPerms, + HandleAnonExtension, + ] + .to_vec(), ); Ok(spawn(fut)) @@ -1043,16 +1123,13 @@ impl ComputeNode { jwks_roles: Arc>, concurrency_token: Arc, db: DB, + subphases: Vec, ) -> Result<()> { let _permit = concurrency_token.acquire().await?; let mut client_conn = None; - for subphase in [ - DeleteDBRoleReferences, - ChangeSchemaPerms, - HandleAnonExtension, - ] { + for subphase in subphases { apply_operations( spec.clone(), ctx.clone(), diff --git a/compute_tools/src/http/api.rs b/compute_tools/src/http/api.rs deleted file mode 100644 index a4b1a63e6d..0000000000 --- a/compute_tools/src/http/api.rs +++ /dev/null @@ -1,606 +0,0 @@ -use std::convert::Infallible; -use std::net::IpAddr; -use std::net::Ipv6Addr; -use std::net::SocketAddr; -use std::sync::Arc; -use std::thread; - -use crate::catalog::SchemaDumpError; -use crate::catalog::{get_database_schema, get_dbs_and_roles}; -use crate::compute::forward_termination_signal; -use crate::compute::{ComputeNode, ComputeState, ParsedSpec}; -use crate::installed_extensions; -use compute_api::requests::{ConfigurationRequest, ExtensionInstallRequest, SetRoleGrantsRequest}; -use compute_api::responses::{ - ComputeStatus, ComputeStatusResponse, ExtensionInstallResult, GenericAPIError, - SetRoleGrantsResponse, -}; - -use anyhow::Result; -use hyper::header::CONTENT_TYPE; -use hyper::service::{make_service_fn, service_fn}; -use hyper::{Body, Method, Request, 
Response, Server, StatusCode}; -use metrics::proto::MetricFamily; -use metrics::Encoder; -use metrics::TextEncoder; -use tokio::task; -use tokio_util::sync::CancellationToken; -use tracing::{debug, error, info, warn}; -use tracing_utils::http::OtelName; -use utils::failpoint_support::failpoints_handler; -use utils::http::error::ApiError; -use utils::http::request::must_get_query_param; - -fn status_response_from_state(state: &ComputeState) -> ComputeStatusResponse { - ComputeStatusResponse { - start_time: state.start_time, - tenant: state - .pspec - .as_ref() - .map(|pspec| pspec.tenant_id.to_string()), - timeline: state - .pspec - .as_ref() - .map(|pspec| pspec.timeline_id.to_string()), - status: state.status, - last_active: state.last_active, - error: state.error.clone(), - } -} - -// Service function to handle all available routes. -async fn routes(req: Request, compute: &Arc) -> Response { - // - // NOTE: The URI path is currently included in traces. That's OK because - // it doesn't contain any variable parts or sensitive information. But - // please keep that in mind if you change the routing here. - // - match (req.method(), req.uri().path()) { - // Serialized compute state. - (&Method::GET, "/status") => { - debug!("serving /status GET request"); - let state = compute.state.lock().unwrap(); - let status_response = status_response_from_state(&state); - Response::new(Body::from(serde_json::to_string(&status_response).unwrap())) - } - - // Startup metrics in JSON format. Keep /metrics reserved for a possible - // future use for Prometheus metrics format. - (&Method::GET, "/metrics.json") => { - info!("serving /metrics.json GET request"); - let metrics = compute.state.lock().unwrap().metrics.clone(); - Response::new(Body::from(serde_json::to_string(&metrics).unwrap())) - } - - // Prometheus metrics - (&Method::GET, "/metrics") => { - debug!("serving /metrics GET request"); - - // When we call TextEncoder::encode() below, it will immediately - // return an error if a metric family has no metrics, so we need to - // preemptively filter out metric families with no metrics. 
- let metrics = installed_extensions::collect() - .into_iter() - .filter(|m| !m.get_metric().is_empty()) - .collect::>(); - - let encoder = TextEncoder::new(); - let mut buffer = vec![]; - - if let Err(err) = encoder.encode(&metrics, &mut buffer) { - let msg = format!("error handling /metrics request: {err}"); - error!(msg); - return render_json_error(&msg, StatusCode::INTERNAL_SERVER_ERROR); - } - - match Response::builder() - .status(StatusCode::OK) - .header(CONTENT_TYPE, encoder.format_type()) - .body(Body::from(buffer)) - { - Ok(response) => response, - Err(err) => { - let msg = format!("error handling /metrics request: {err}"); - error!(msg); - render_json_error(&msg, StatusCode::INTERNAL_SERVER_ERROR) - } - } - } - // Collect Postgres current usage insights - (&Method::GET, "/insights") => { - info!("serving /insights GET request"); - let status = compute.get_status(); - if status != ComputeStatus::Running { - let msg = format!("compute is not running, current status: {:?}", status); - error!(msg); - return Response::new(Body::from(msg)); - } - - let insights = compute.collect_insights().await; - Response::new(Body::from(insights)) - } - - (&Method::POST, "/check_writability") => { - info!("serving /check_writability POST request"); - let status = compute.get_status(); - if status != ComputeStatus::Running { - let msg = format!( - "invalid compute status for check_writability request: {:?}", - status - ); - error!(msg); - return Response::new(Body::from(msg)); - } - - let res = crate::checker::check_writability(compute).await; - match res { - Ok(_) => Response::new(Body::from("true")), - Err(e) => { - error!("check_writability failed: {}", e); - Response::new(Body::from(e.to_string())) - } - } - } - - (&Method::POST, "/extensions") => { - info!("serving /extensions POST request"); - let status = compute.get_status(); - if status != ComputeStatus::Running { - let msg = format!( - "invalid compute status for extensions request: {:?}", - status - ); - error!(msg); - return render_json_error(&msg, StatusCode::PRECONDITION_FAILED); - } - - let request = hyper::body::to_bytes(req.into_body()).await.unwrap(); - let request = serde_json::from_slice::(&request).unwrap(); - let res = compute - .install_extension(&request.extension, &request.database, request.version) - .await; - match res { - Ok(version) => render_json(Body::from( - serde_json::to_string(&ExtensionInstallResult { - extension: request.extension, - version, - }) - .unwrap(), - )), - Err(e) => { - error!("install_extension failed: {}", e); - render_json_error(&e.to_string(), StatusCode::INTERNAL_SERVER_ERROR) - } - } - } - - (&Method::GET, "/info") => { - let num_cpus = num_cpus::get_physical(); - info!("serving /info GET request. num_cpus: {}", num_cpus); - Response::new(Body::from( - serde_json::json!({ - "num_cpus": num_cpus, - }) - .to_string(), - )) - } - - // Accept spec in JSON format and request compute configuration. If - // anything goes wrong after we set the compute status to `ConfigurationPending` - // and update compute state with new spec, we basically leave compute - // in the potentially wrong state. That said, it's control-plane's - // responsibility to watch compute state after reconfiguration request - // and to clean restart in case of errors. 
- (&Method::POST, "/configure") => { - info!("serving /configure POST request"); - match handle_configure_request(req, compute).await { - Ok(msg) => Response::new(Body::from(msg)), - Err((msg, code)) => { - error!("error handling /configure request: {msg}"); - render_json_error(&msg, code) - } - } - } - - (&Method::POST, "/terminate") => { - info!("serving /terminate POST request"); - match handle_terminate_request(compute).await { - Ok(()) => Response::new(Body::empty()), - Err((msg, code)) => { - error!("error handling /terminate request: {msg}"); - render_json_error(&msg, code) - } - } - } - - (&Method::GET, "/dbs_and_roles") => { - info!("serving /dbs_and_roles GET request",); - match get_dbs_and_roles(compute).await { - Ok(res) => render_json(Body::from(serde_json::to_string(&res).unwrap())), - Err(_) => { - render_json_error("can't get dbs and roles", StatusCode::INTERNAL_SERVER_ERROR) - } - } - } - - (&Method::GET, "/database_schema") => { - let database = match must_get_query_param(&req, "database") { - Err(e) => return e.into_response(), - Ok(database) => database, - }; - info!("serving /database_schema GET request with database: {database}",); - match get_database_schema(compute, &database).await { - Ok(res) => render_plain(Body::wrap_stream(res)), - Err(SchemaDumpError::DatabaseDoesNotExist) => { - render_json_error("database does not exist", StatusCode::NOT_FOUND) - } - Err(e) => { - error!("can't get schema dump: {}", e); - render_json_error("can't get schema dump", StatusCode::INTERNAL_SERVER_ERROR) - } - } - } - - (&Method::POST, "/grants") => { - info!("serving /grants POST request"); - let status = compute.get_status(); - if status != ComputeStatus::Running { - let msg = format!( - "invalid compute status for set_role_grants request: {:?}", - status - ); - error!(msg); - return render_json_error(&msg, StatusCode::PRECONDITION_FAILED); - } - - let request = hyper::body::to_bytes(req.into_body()).await.unwrap(); - let request = serde_json::from_slice::(&request).unwrap(); - - let res = compute - .set_role_grants( - &request.database, - &request.schema, - &request.privileges, - &request.role, - ) - .await; - match res { - Ok(()) => render_json(Body::from( - serde_json::to_string(&SetRoleGrantsResponse { - database: request.database, - schema: request.schema, - role: request.role, - privileges: request.privileges, - }) - .unwrap(), - )), - Err(e) => render_json_error( - &format!("could not grant role privileges to the schema: {e}"), - // TODO: can we filter on role/schema not found errors - // and return appropriate error code? 
- StatusCode::INTERNAL_SERVER_ERROR, - ), - } - } - - // get the list of installed extensions - // currently only used in python tests - // TODO: call it from cplane - (&Method::GET, "/installed_extensions") => { - info!("serving /installed_extensions GET request"); - let status = compute.get_status(); - if status != ComputeStatus::Running { - let msg = format!( - "invalid compute status for extensions request: {:?}", - status - ); - error!(msg); - return Response::new(Body::from(msg)); - } - - let conf = compute.get_conn_conf(None); - let res = - task::spawn_blocking(move || installed_extensions::get_installed_extensions(conf)) - .await - .unwrap(); - - match res { - Ok(res) => render_json(Body::from(serde_json::to_string(&res).unwrap())), - Err(e) => render_json_error( - &format!("could not get list of installed extensions: {}", e), - StatusCode::INTERNAL_SERVER_ERROR, - ), - } - } - - (&Method::POST, "/failpoints") if cfg!(feature = "testing") => { - match failpoints_handler(req, CancellationToken::new()).await { - Ok(r) => r, - Err(ApiError::BadRequest(e)) => { - render_json_error(&e.to_string(), StatusCode::BAD_REQUEST) - } - Err(_) => { - render_json_error("Internal server error", StatusCode::INTERNAL_SERVER_ERROR) - } - } - } - - // download extension files from remote extension storage on demand - (&Method::POST, route) if route.starts_with("/extension_server/") => { - info!("serving {:?} POST request", route); - info!("req.uri {:?}", req.uri()); - - // don't even try to download extensions - // if no remote storage is configured - if compute.ext_remote_storage.is_none() { - info!("no extensions remote storage configured"); - let mut resp = Response::new(Body::from("no remote storage configured")); - *resp.status_mut() = StatusCode::INTERNAL_SERVER_ERROR; - return resp; - } - - let mut is_library = false; - if let Some(params) = req.uri().query() { - info!("serving {:?} POST request with params: {}", route, params); - if params == "is_library=true" { - is_library = true; - } else { - let mut resp = Response::new(Body::from("Wrong request parameters")); - *resp.status_mut() = StatusCode::BAD_REQUEST; - return resp; - } - } - let filename = route.split('/').last().unwrap().to_string(); - info!("serving /extension_server POST request, filename: {filename:?} is_library: {is_library}"); - - // get ext_name and path from spec - // don't lock compute_state for too long - let ext = { - let compute_state = compute.state.lock().unwrap(); - let pspec = compute_state.pspec.as_ref().expect("spec must be set"); - let spec = &pspec.spec; - - // debug only - info!("spec: {:?}", spec); - - let remote_extensions = match spec.remote_extensions.as_ref() { - Some(r) => r, - None => { - info!("no remote extensions spec was provided"); - let mut resp = Response::new(Body::from("no remote storage configured")); - *resp.status_mut() = StatusCode::INTERNAL_SERVER_ERROR; - return resp; - } - }; - - remote_extensions.get_ext( - &filename, - is_library, - &compute.build_tag, - &compute.pgversion, - ) - }; - - match ext { - Ok((ext_name, ext_path)) => { - match compute.download_extension(ext_name, ext_path).await { - Ok(_) => Response::new(Body::from("OK")), - Err(e) => { - error!("extension download failed: {}", e); - let mut resp = Response::new(Body::from(e.to_string())); - *resp.status_mut() = StatusCode::INTERNAL_SERVER_ERROR; - resp - } - } - } - Err(e) => { - warn!("extension download failed to find extension: {}", e); - let mut resp = Response::new(Body::from("failed to find file")); - 
*resp.status_mut() = StatusCode::INTERNAL_SERVER_ERROR; - resp - } - } - } - - // Return the `404 Not Found` for any other routes. - _ => { - let mut not_found = Response::new(Body::from("404 Not Found")); - *not_found.status_mut() = StatusCode::NOT_FOUND; - not_found - } - } -} - -async fn handle_configure_request( - req: Request, - compute: &Arc, -) -> Result { - if !compute.live_config_allowed { - return Err(( - "live configuration is not allowed for this compute node".to_string(), - StatusCode::PRECONDITION_FAILED, - )); - } - - let body_bytes = hyper::body::to_bytes(req.into_body()).await.unwrap(); - let spec_raw = String::from_utf8(body_bytes.to_vec()).unwrap(); - if let Ok(request) = serde_json::from_str::(&spec_raw) { - let spec = request.spec; - - let parsed_spec = match ParsedSpec::try_from(spec) { - Ok(ps) => ps, - Err(msg) => return Err((msg, StatusCode::BAD_REQUEST)), - }; - - // XXX: wrap state update under lock in code blocks. Otherwise, - // we will try to `Send` `mut state` into the spawned thread - // bellow, which will cause error: - // ``` - // error: future cannot be sent between threads safely - // ``` - { - let mut state = compute.state.lock().unwrap(); - if state.status != ComputeStatus::Empty && state.status != ComputeStatus::Running { - let msg = format!( - "invalid compute status for configuration request: {:?}", - state.status.clone() - ); - return Err((msg, StatusCode::PRECONDITION_FAILED)); - } - state.pspec = Some(parsed_spec); - state.set_status(ComputeStatus::ConfigurationPending, &compute.state_changed); - drop(state); - info!("set new spec and notified waiters"); - } - - // Spawn a blocking thread to wait for compute to become Running. - // This is needed to do not block the main pool of workers and - // be able to serve other requests while some particular request - // is waiting for compute to finish configuration. - let c = compute.clone(); - task::spawn_blocking(move || { - let mut state = c.state.lock().unwrap(); - while state.status != ComputeStatus::Running { - state = c.state_changed.wait(state).unwrap(); - info!( - "waiting for compute to become Running, current status: {:?}", - state.status - ); - - if state.status == ComputeStatus::Failed { - let err = state.error.as_ref().map_or("unknown error", |x| x); - let msg = format!("compute configuration failed: {:?}", err); - return Err((msg, StatusCode::INTERNAL_SERVER_ERROR)); - } - } - - Ok(()) - }) - .await - .unwrap()?; - - // Return current compute state if everything went well. 
- let state = compute.state.lock().unwrap().clone(); - let status_response = status_response_from_state(&state); - Ok(serde_json::to_string(&status_response).unwrap()) - } else { - Err(("invalid spec".to_string(), StatusCode::BAD_REQUEST)) - } -} - -fn render_json_error(e: &str, status: StatusCode) -> Response { - let error = GenericAPIError { - error: e.to_string(), - }; - Response::builder() - .status(status) - .header(CONTENT_TYPE, "application/json") - .body(Body::from(serde_json::to_string(&error).unwrap())) - .unwrap() -} - -fn render_json(body: Body) -> Response { - Response::builder() - .header(CONTENT_TYPE, "application/json") - .body(body) - .unwrap() -} - -fn render_plain(body: Body) -> Response { - Response::builder() - .header(CONTENT_TYPE, "text/plain") - .body(body) - .unwrap() -} - -async fn handle_terminate_request(compute: &Arc) -> Result<(), (String, StatusCode)> { - { - let mut state = compute.state.lock().unwrap(); - if state.status == ComputeStatus::Terminated { - return Ok(()); - } - if state.status != ComputeStatus::Empty && state.status != ComputeStatus::Running { - let msg = format!( - "invalid compute status for termination request: {}", - state.status - ); - return Err((msg, StatusCode::PRECONDITION_FAILED)); - } - state.set_status(ComputeStatus::TerminationPending, &compute.state_changed); - drop(state); - } - - forward_termination_signal(); - info!("sent signal and notified waiters"); - - // Spawn a blocking thread to wait for compute to become Terminated. - // This is needed to do not block the main pool of workers and - // be able to serve other requests while some particular request - // is waiting for compute to finish configuration. - let c = compute.clone(); - task::spawn_blocking(move || { - let mut state = c.state.lock().unwrap(); - while state.status != ComputeStatus::Terminated { - state = c.state_changed.wait(state).unwrap(); - info!( - "waiting for compute to become {}, current status: {:?}", - ComputeStatus::Terminated, - state.status - ); - } - - Ok(()) - }) - .await - .unwrap()?; - info!("terminated Postgres"); - Ok(()) -} - -// Main Hyper HTTP server function that runs it and blocks waiting on it forever. -#[tokio::main] -async fn serve(port: u16, state: Arc) { - // this usually binds to both IPv4 and IPv6 on linux - // see e.g. https://github.com/rust-lang/rust/pull/34440 - let addr = SocketAddr::new(IpAddr::from(Ipv6Addr::UNSPECIFIED), port); - - let make_service = make_service_fn(move |_conn| { - let state = state.clone(); - async move { - Ok::<_, Infallible>(service_fn(move |req: Request| { - let state = state.clone(); - async move { - Ok::<_, Infallible>( - // NOTE: We include the URI path in the string. It - // doesn't contain any variable parts or sensitive - // information in this API. - tracing_utils::http::tracing_handler( - req, - |req| routes(req, &state), - OtelName::UriPath, - ) - .await, - ) - } - })) - } - }); - - info!("starting HTTP server on {}", addr); - - let server = Server::bind(&addr).serve(make_service); - - // Run this server forever - if let Err(e) = server.await { - error!("server error: {}", e); - } -} - -/// Launch a separate Hyper HTTP API server thread and return its `JoinHandle`. -pub fn launch_http_server(port: u16, state: &Arc) -> Result> { - let state = Arc::clone(state); - - Ok(thread::Builder::new() - .name("http-endpoint".into()) - .spawn(move || serve(port, state))?) 
-} diff --git a/compute_tools/src/http/extract/json.rs b/compute_tools/src/http/extract/json.rs new file mode 100644 index 0000000000..41f13625ad --- /dev/null +++ b/compute_tools/src/http/extract/json.rs @@ -0,0 +1,48 @@ +use std::ops::{Deref, DerefMut}; + +use axum::{ + async_trait, + extract::{rejection::JsonRejection, FromRequest, Request}, +}; +use compute_api::responses::GenericAPIError; +use http::StatusCode; + +/// Custom `Json` extractor, so that we can format errors into +/// `JsonResponse`. +#[derive(Debug, Clone, Copy, Default)] +pub(crate) struct Json(pub T); + +#[async_trait] +impl FromRequest for Json +where + axum::Json: FromRequest, + S: Send + Sync, +{ + type Rejection = (StatusCode, axum::Json); + + async fn from_request(req: Request, state: &S) -> Result { + match axum::Json::::from_request(req, state).await { + Ok(value) => Ok(Self(value.0)), + Err(rejection) => Err(( + rejection.status(), + axum::Json(GenericAPIError { + error: rejection.body_text().to_lowercase(), + }), + )), + } + } +} + +impl Deref for Json { + type Target = T; + + fn deref(&self) -> &Self::Target { + &self.0 + } +} + +impl DerefMut for Json { + fn deref_mut(&mut self) -> &mut Self::Target { + &mut self.0 + } +} diff --git a/compute_tools/src/http/extract/mod.rs b/compute_tools/src/http/extract/mod.rs new file mode 100644 index 0000000000..1b690e444d --- /dev/null +++ b/compute_tools/src/http/extract/mod.rs @@ -0,0 +1,7 @@ +pub(crate) mod json; +pub(crate) mod path; +pub(crate) mod query; + +pub(crate) use json::Json; +pub(crate) use path::Path; +pub(crate) use query::Query; diff --git a/compute_tools/src/http/extract/path.rs b/compute_tools/src/http/extract/path.rs new file mode 100644 index 0000000000..95edc657f2 --- /dev/null +++ b/compute_tools/src/http/extract/path.rs @@ -0,0 +1,48 @@ +use std::ops::{Deref, DerefMut}; + +use axum::{ + async_trait, + extract::{rejection::PathRejection, FromRequestParts}, +}; +use compute_api::responses::GenericAPIError; +use http::{request::Parts, StatusCode}; + +/// Custom `Path` extractor, so that we can format errors into +/// `JsonResponse`. +#[derive(Debug, Clone, Copy, Default)] +pub(crate) struct Path(pub T); + +#[async_trait] +impl FromRequestParts for Path +where + axum::extract::Path: FromRequestParts, + S: Send + Sync, +{ + type Rejection = (StatusCode, axum::Json); + + async fn from_request_parts(parts: &mut Parts, state: &S) -> Result { + match axum::extract::Path::::from_request_parts(parts, state).await { + Ok(value) => Ok(Self(value.0)), + Err(rejection) => Err(( + rejection.status(), + axum::Json(GenericAPIError { + error: rejection.body_text().to_ascii_lowercase(), + }), + )), + } + } +} + +impl Deref for Path { + type Target = T; + + fn deref(&self) -> &Self::Target { + &self.0 + } +} + +impl DerefMut for Path { + fn deref_mut(&mut self) -> &mut Self::Target { + &mut self.0 + } +} diff --git a/compute_tools/src/http/extract/query.rs b/compute_tools/src/http/extract/query.rs new file mode 100644 index 0000000000..a1f1b0cef0 --- /dev/null +++ b/compute_tools/src/http/extract/query.rs @@ -0,0 +1,48 @@ +use std::ops::{Deref, DerefMut}; + +use axum::{ + async_trait, + extract::{rejection::QueryRejection, FromRequestParts}, +}; +use compute_api::responses::GenericAPIError; +use http::{request::Parts, StatusCode}; + +/// Custom `Query` extractor, so that we can format errors into +/// `JsonResponse`. 
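+/// Rejections are returned as `(StatusCode, axum::Json<GenericAPIError>)`, matching the `Json` and `Path` extractors.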
+#[derive(Debug, Clone, Copy, Default)] +pub(crate) struct Query(pub T); + +#[async_trait] +impl FromRequestParts for Query +where + axum::extract::Query: FromRequestParts, + S: Send + Sync, +{ + type Rejection = (StatusCode, axum::Json); + + async fn from_request_parts(parts: &mut Parts, state: &S) -> Result { + match axum::extract::Query::::from_request_parts(parts, state).await { + Ok(value) => Ok(Self(value.0)), + Err(rejection) => Err(( + rejection.status(), + axum::Json(GenericAPIError { + error: rejection.body_text().to_ascii_lowercase(), + }), + )), + } + } +} + +impl Deref for Query { + type Target = T; + + fn deref(&self) -> &Self::Target { + &self.0 + } +} + +impl DerefMut for Query { + fn deref_mut(&mut self) -> &mut Self::Target { + &mut self.0 + } +} diff --git a/compute_tools/src/http/mod.rs b/compute_tools/src/http/mod.rs index e5fdf85eed..a596bea504 100644 --- a/compute_tools/src/http/mod.rs +++ b/compute_tools/src/http/mod.rs @@ -1 +1,56 @@ -pub mod api; +use axum::{body::Body, response::Response}; +use compute_api::responses::{ComputeStatus, GenericAPIError}; +use http::{header::CONTENT_TYPE, StatusCode}; +use serde::Serialize; +use tracing::error; + +pub use server::launch_http_server; + +mod extract; +mod routes; +mod server; + +/// Convenience response builder for JSON responses +struct JsonResponse; + +impl JsonResponse { + /// Helper for actually creating a response + fn create_response(code: StatusCode, body: impl Serialize) -> Response { + Response::builder() + .status(code) + .header(CONTENT_TYPE.as_str(), "application/json") + .body(Body::from(serde_json::to_string(&body).unwrap())) + .unwrap() + } + + /// Create a successful error response + pub(self) fn success(code: StatusCode, body: impl Serialize) -> Response { + assert!({ + let code = code.as_u16(); + + (200..300).contains(&code) + }); + + Self::create_response(code, body) + } + + /// Create an error response + pub(self) fn error(code: StatusCode, error: impl ToString) -> Response { + assert!(code.as_u16() >= 400); + + let message = error.to_string(); + error!(message); + + Self::create_response(code, &GenericAPIError { error: message }) + } + + /// Create an error response related to the compute being in an invalid state + pub(self) fn invalid_status(status: ComputeStatus) -> Response { + Self::create_response( + StatusCode::PRECONDITION_FAILED, + &GenericAPIError { + error: format!("invalid compute status: {status}"), + }, + ) + } +} diff --git a/compute_tools/src/http/openapi_spec.yaml b/compute_tools/src/http/openapi_spec.yaml index 24a67cac71..50319cdd85 100644 --- a/compute_tools/src/http/openapi_spec.yaml +++ b/compute_tools/src/http/openapi_spec.yaml @@ -37,7 +37,7 @@ paths: schema: $ref: "#/components/schemas/ComputeMetrics" - /metrics + /metrics: get: tags: - Info diff --git a/compute_tools/src/http/routes/check_writability.rs b/compute_tools/src/http/routes/check_writability.rs new file mode 100644 index 0000000000..d7feb055e9 --- /dev/null +++ b/compute_tools/src/http/routes/check_writability.rs @@ -0,0 +1,20 @@ +use std::sync::Arc; + +use axum::{extract::State, response::Response}; +use compute_api::responses::ComputeStatus; +use http::StatusCode; + +use crate::{checker::check_writability, compute::ComputeNode, http::JsonResponse}; + +/// Check that the compute is currently running. 
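+/// If it is, run the write check via `check_writability` and return `true` on
+/// success; a failed check is reported as a 500 with the error message.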
+pub(in crate::http) async fn is_writable(State(compute): State<Arc<ComputeNode>>) -> Response {
+    let status = compute.get_status();
+    if status != ComputeStatus::Running {
+        return JsonResponse::invalid_status(status);
+    }
+
+    match check_writability(&compute).await {
+        Ok(_) => JsonResponse::success(StatusCode::OK, true),
+        Err(e) => JsonResponse::error(StatusCode::INTERNAL_SERVER_ERROR, e),
+    }
+}
diff --git a/compute_tools/src/http/routes/configure.rs b/compute_tools/src/http/routes/configure.rs
new file mode 100644
index 0000000000..2546cbc344
--- /dev/null
+++ b/compute_tools/src/http/routes/configure.rs
@@ -0,0 +1,91 @@
+use std::sync::Arc;
+
+use axum::{extract::State, response::Response};
+use compute_api::{
+    requests::ConfigurationRequest,
+    responses::{ComputeStatus, ComputeStatusResponse},
+};
+use http::StatusCode;
+use tokio::task;
+use tracing::info;
+
+use crate::{
+    compute::{ComputeNode, ParsedSpec},
+    http::{extract::Json, JsonResponse},
+};
+
+// Accept a spec in JSON format and request compute configuration. If anything
+// goes wrong after we set the compute status to `ConfigurationPending` and
+// update the compute state with the new spec, we basically leave the compute
+// in a potentially wrong state. That said, it's the control plane's
+// responsibility to watch the compute state after a reconfiguration request
+// and to do a clean restart in case of errors.
+pub(in crate::http) async fn configure(
+    State(compute): State<Arc<ComputeNode>>,
+    request: Json<ConfigurationRequest>,
+) -> Response {
+    if !compute.live_config_allowed {
+        return JsonResponse::error(
+            StatusCode::PRECONDITION_FAILED,
+            "live configuration is not allowed for this compute node".to_string(),
+        );
+    }
+
+    let pspec = match ParsedSpec::try_from(request.spec.clone()) {
+        Ok(p) => p,
+        Err(e) => return JsonResponse::error(StatusCode::BAD_REQUEST, e),
+    };
+
+    // XXX: wrap the state update under the lock in a code block. Otherwise, we
+    // will try to `Send` `mut state` into the spawned thread below, which will
+    // cause the following rustc error:
+    //
+    // error: future cannot be sent between threads safely
+    {
+        let mut state = compute.state.lock().unwrap();
+        if !matches!(state.status, ComputeStatus::Empty | ComputeStatus::Running) {
+            return JsonResponse::invalid_status(state.status);
+        }
+
+        state.pspec = Some(pspec);
+        state.set_status(ComputeStatus::ConfigurationPending, &compute.state_changed);
+        drop(state);
+    }
+
+    // Spawn a blocking thread to wait for the compute to become Running. This
+    // is needed so that we do not block the main pool of workers and can serve
+    // other requests while this particular request waits for the compute to
+    // finish configuration.
+    let c = compute.clone();
+    let completed = task::spawn_blocking(move || {
+        let mut state = c.state.lock().unwrap();
+        while state.status != ComputeStatus::Running {
+            state = c.state_changed.wait(state).unwrap();
+            info!(
+                "waiting for compute to become {}, current status: {}",
+                ComputeStatus::Running,
+                state.status
+            );
+
+            if state.status == ComputeStatus::Failed {
+                let err = state.error.as_ref().map_or("unknown error", |x| x);
+                let msg = format!("compute configuration failed: {:?}", err);
+                return Err(msg);
+            }
+        }
+
+        Ok(())
+    })
+    .await
+    .unwrap();
+
+    if let Err(e) = completed {
+        return JsonResponse::error(StatusCode::INTERNAL_SERVER_ERROR, e);
+    }
+
+    // Return current compute state if everything went well.
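+    // The state is cloned so that the lock is released before the response is
+    // serialized.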
+ let state = compute.state.lock().unwrap().clone(); + let body = ComputeStatusResponse::from(&state); + + JsonResponse::success(StatusCode::OK, body) +} diff --git a/compute_tools/src/http/routes/database_schema.rs b/compute_tools/src/http/routes/database_schema.rs new file mode 100644 index 0000000000..fd716272dc --- /dev/null +++ b/compute_tools/src/http/routes/database_schema.rs @@ -0,0 +1,34 @@ +use std::sync::Arc; + +use axum::{body::Body, extract::State, response::Response}; +use http::{header::CONTENT_TYPE, StatusCode}; +use serde::Deserialize; + +use crate::{ + catalog::{get_database_schema, SchemaDumpError}, + compute::ComputeNode, + http::{extract::Query, JsonResponse}, +}; + +#[derive(Debug, Clone, Deserialize)] +pub(in crate::http) struct DatabaseSchemaParams { + database: String, +} + +/// Get a schema dump of the requested database. +pub(in crate::http) async fn get_schema_dump( + params: Query, + State(compute): State>, +) -> Response { + match get_database_schema(&compute, ¶ms.database).await { + Ok(schema) => Response::builder() + .status(StatusCode::OK) + .header(CONTENT_TYPE.as_str(), "application/json") + .body(Body::from_stream(schema)) + .unwrap(), + Err(SchemaDumpError::DatabaseDoesNotExist) => { + JsonResponse::error(StatusCode::NOT_FOUND, SchemaDumpError::DatabaseDoesNotExist) + } + Err(e) => JsonResponse::error(StatusCode::INTERNAL_SERVER_ERROR, e), + } +} diff --git a/compute_tools/src/http/routes/dbs_and_roles.rs b/compute_tools/src/http/routes/dbs_and_roles.rs new file mode 100644 index 0000000000..4843c3fab4 --- /dev/null +++ b/compute_tools/src/http/routes/dbs_and_roles.rs @@ -0,0 +1,16 @@ +use std::sync::Arc; + +use axum::{extract::State, response::Response}; +use http::StatusCode; + +use crate::{catalog::get_dbs_and_roles, compute::ComputeNode, http::JsonResponse}; + +/// Get the databases and roles from the compute. +pub(in crate::http) async fn get_catalog_objects( + State(compute): State>, +) -> Response { + match get_dbs_and_roles(&compute).await { + Ok(catalog_objects) => JsonResponse::success(StatusCode::OK, catalog_objects), + Err(e) => JsonResponse::error(StatusCode::INTERNAL_SERVER_ERROR, e), + } +} diff --git a/compute_tools/src/http/routes/extension_server.rs b/compute_tools/src/http/routes/extension_server.rs new file mode 100644 index 0000000000..ee5bc675ba --- /dev/null +++ b/compute_tools/src/http/routes/extension_server.rs @@ -0,0 +1,67 @@ +use std::sync::Arc; + +use axum::{ + extract::State, + response::{IntoResponse, Response}, +}; +use http::StatusCode; +use serde::Deserialize; + +use crate::{ + compute::ComputeNode, + http::{ + extract::{Path, Query}, + JsonResponse, + }, +}; + +#[derive(Debug, Clone, Deserialize)] +pub(in crate::http) struct ExtensionServerParams { + is_library: Option, +} + +/// Download a remote extension. 
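+/// Responds with 412 if no remote extension storage is configured, and with 409
+/// if the spec carries no remote extension information.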
+pub(in crate::http) async fn download_extension( + Path(filename): Path, + params: Query, + State(compute): State>, +) -> Response { + // Don't even try to download extensions if no remote storage is configured + if compute.ext_remote_storage.is_none() { + return JsonResponse::error( + StatusCode::PRECONDITION_FAILED, + "remote storage is not configured", + ); + } + + let ext = { + let state = compute.state.lock().unwrap(); + let pspec = state.pspec.as_ref().unwrap(); + let spec = &pspec.spec; + + let remote_extensions = match spec.remote_extensions.as_ref() { + Some(r) => r, + None => { + return JsonResponse::error( + StatusCode::CONFLICT, + "information about remote extensions is unavailable", + ); + } + }; + + remote_extensions.get_ext( + &filename, + params.is_library.unwrap_or(false), + &compute.build_tag, + &compute.pgversion, + ) + }; + + match ext { + Ok((ext_name, ext_path)) => match compute.download_extension(ext_name, ext_path).await { + Ok(_) => StatusCode::OK.into_response(), + Err(e) => JsonResponse::error(StatusCode::INTERNAL_SERVER_ERROR, e), + }, + Err(e) => JsonResponse::error(StatusCode::NOT_FOUND, e), + } +} diff --git a/compute_tools/src/http/routes/extensions.rs b/compute_tools/src/http/routes/extensions.rs new file mode 100644 index 0000000000..1fc03b9109 --- /dev/null +++ b/compute_tools/src/http/routes/extensions.rs @@ -0,0 +1,45 @@ +use std::sync::Arc; + +use axum::{extract::State, response::Response}; +use compute_api::{ + requests::ExtensionInstallRequest, + responses::{ComputeStatus, ExtensionInstallResponse}, +}; +use http::StatusCode; + +use crate::{ + compute::ComputeNode, + http::{extract::Json, JsonResponse}, +}; + +/// Install a extension. +pub(in crate::http) async fn install_extension( + State(compute): State>, + request: Json, +) -> Response { + let status = compute.get_status(); + if status != ComputeStatus::Running { + return JsonResponse::invalid_status(status); + } + + match compute + .install_extension( + &request.extension, + &request.database, + request.version.to_string(), + ) + .await + { + Ok(version) => JsonResponse::success( + StatusCode::CREATED, + Some(ExtensionInstallResponse { + extension: request.extension.clone(), + version, + }), + ), + Err(e) => JsonResponse::error( + StatusCode::INTERNAL_SERVER_ERROR, + format!("failed to install extension: {e}"), + ), + } +} diff --git a/compute_tools/src/http/routes/failpoints.rs b/compute_tools/src/http/routes/failpoints.rs new file mode 100644 index 0000000000..2ec4511676 --- /dev/null +++ b/compute_tools/src/http/routes/failpoints.rs @@ -0,0 +1,35 @@ +use axum::response::{IntoResponse, Response}; +use http::StatusCode; +use tracing::info; +use utils::failpoint_support::{apply_failpoint, ConfigureFailpointsRequest}; + +use crate::http::{extract::Json, JsonResponse}; + +/// Configure failpoints for testing purposes. 
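+/// The route is only registered when compute_ctl is built with the `testing`
+/// feature (see `server.rs`), and the handler refuses to do anything if the
+/// binary was compiled without failpoints support.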
+pub(in crate::http) async fn configure_failpoints( + failpoints: Json, +) -> Response { + if !fail::has_failpoints() { + return JsonResponse::error( + StatusCode::PRECONDITION_FAILED, + "Cannot manage failpoints because neon was compiled without failpoints support", + ); + } + + for fp in &*failpoints { + info!("cfg failpoint: {} {}", fp.name, fp.actions); + + // We recognize one extra "action" that's not natively recognized + // by the failpoints crate: exit, to immediately kill the process + let cfg_result = apply_failpoint(&fp.name, &fp.actions); + + if let Err(e) = cfg_result { + return JsonResponse::error( + StatusCode::BAD_REQUEST, + format!("failed to configure failpoints: {e}"), + ); + } + } + + StatusCode::OK.into_response() +} diff --git a/compute_tools/src/http/routes/grants.rs b/compute_tools/src/http/routes/grants.rs new file mode 100644 index 0000000000..3f67f011e5 --- /dev/null +++ b/compute_tools/src/http/routes/grants.rs @@ -0,0 +1,48 @@ +use std::sync::Arc; + +use axum::{extract::State, response::Response}; +use compute_api::{ + requests::SetRoleGrantsRequest, + responses::{ComputeStatus, SetRoleGrantsResponse}, +}; +use http::StatusCode; + +use crate::{ + compute::ComputeNode, + http::{extract::Json, JsonResponse}, +}; + +/// Add grants for a role. +pub(in crate::http) async fn add_grant( + State(compute): State>, + request: Json, +) -> Response { + let status = compute.get_status(); + if status != ComputeStatus::Running { + return JsonResponse::invalid_status(status); + } + + match compute + .set_role_grants( + &request.database, + &request.schema, + &request.privileges, + &request.role, + ) + .await + { + Ok(()) => JsonResponse::success( + StatusCode::CREATED, + Some(SetRoleGrantsResponse { + database: request.database.clone(), + schema: request.schema.clone(), + role: request.role.clone(), + privileges: request.privileges.clone(), + }), + ), + Err(e) => JsonResponse::error( + StatusCode::INTERNAL_SERVER_ERROR, + format!("failed to grant role privileges to the schema: {e}"), + ), + } +} diff --git a/compute_tools/src/http/routes/info.rs b/compute_tools/src/http/routes/info.rs new file mode 100644 index 0000000000..32d6fea74c --- /dev/null +++ b/compute_tools/src/http/routes/info.rs @@ -0,0 +1,11 @@ +use axum::response::Response; +use compute_api::responses::InfoResponse; +use http::StatusCode; + +use crate::http::JsonResponse; + +/// Get information about the physical characteristics about the compute. +pub(in crate::http) async fn get_info() -> Response { + let num_cpus = num_cpus::get_physical(); + JsonResponse::success(StatusCode::OK, &InfoResponse { num_cpus }) +} diff --git a/compute_tools/src/http/routes/insights.rs b/compute_tools/src/http/routes/insights.rs new file mode 100644 index 0000000000..6b03a461c3 --- /dev/null +++ b/compute_tools/src/http/routes/insights.rs @@ -0,0 +1,18 @@ +use std::sync::Arc; + +use axum::{extract::State, response::Response}; +use compute_api::responses::ComputeStatus; +use http::StatusCode; + +use crate::{compute::ComputeNode, http::JsonResponse}; + +/// Collect current Postgres usage insights. 
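+/// The compute must be in the `Running` state, otherwise an invalid-status
+/// error is returned.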
+pub(in crate::http) async fn get_insights(State(compute): State>) -> Response { + let status = compute.get_status(); + if status != ComputeStatus::Running { + return JsonResponse::invalid_status(status); + } + + let insights = compute.collect_insights().await; + JsonResponse::success(StatusCode::OK, insights) +} diff --git a/compute_tools/src/http/routes/installed_extensions.rs b/compute_tools/src/http/routes/installed_extensions.rs new file mode 100644 index 0000000000..db74a6b195 --- /dev/null +++ b/compute_tools/src/http/routes/installed_extensions.rs @@ -0,0 +1,33 @@ +use std::sync::Arc; + +use axum::{extract::State, response::Response}; +use compute_api::responses::ComputeStatus; +use http::StatusCode; +use tokio::task; + +use crate::{compute::ComputeNode, http::JsonResponse, installed_extensions}; + +/// Get a list of installed extensions. +pub(in crate::http) async fn get_installed_extensions( + State(compute): State>, +) -> Response { + let status = compute.get_status(); + if status != ComputeStatus::Running { + return JsonResponse::invalid_status(status); + } + + let conf = compute.get_conn_conf(None); + let res = task::spawn_blocking(move || installed_extensions::get_installed_extensions(conf)) + .await + .unwrap(); + + match res { + Ok(installed_extensions) => { + JsonResponse::success(StatusCode::OK, Some(installed_extensions)) + } + Err(e) => JsonResponse::error( + StatusCode::INTERNAL_SERVER_ERROR, + format!("failed to get list of installed extensions: {e}"), + ), + } +} diff --git a/compute_tools/src/http/routes/metrics.rs b/compute_tools/src/http/routes/metrics.rs new file mode 100644 index 0000000000..40d71b5de7 --- /dev/null +++ b/compute_tools/src/http/routes/metrics.rs @@ -0,0 +1,32 @@ +use axum::{body::Body, response::Response}; +use http::header::CONTENT_TYPE; +use http::StatusCode; +use metrics::proto::MetricFamily; +use metrics::Encoder; +use metrics::TextEncoder; + +use crate::{http::JsonResponse, installed_extensions}; + +/// Expose Prometheus metrics. +pub(in crate::http) async fn get_metrics() -> Response { + // When we call TextEncoder::encode() below, it will immediately return an + // error if a metric family has no metrics, so we need to preemptively + // filter out metric families with no metrics. + let metrics = installed_extensions::collect() + .into_iter() + .filter(|m| !m.get_metric().is_empty()) + .collect::>(); + + let encoder = TextEncoder::new(); + let mut buffer = vec![]; + + if let Err(e) = encoder.encode(&metrics, &mut buffer) { + return JsonResponse::error(StatusCode::INTERNAL_SERVER_ERROR, e); + } + + Response::builder() + .status(StatusCode::OK) + .header(CONTENT_TYPE, encoder.format_type()) + .body(Body::from(buffer)) + .unwrap() +} diff --git a/compute_tools/src/http/routes/metrics_json.rs b/compute_tools/src/http/routes/metrics_json.rs new file mode 100644 index 0000000000..0709db5011 --- /dev/null +++ b/compute_tools/src/http/routes/metrics_json.rs @@ -0,0 +1,12 @@ +use std::sync::Arc; + +use axum::{extract::State, response::Response}; +use http::StatusCode; + +use crate::{compute::ComputeNode, http::JsonResponse}; + +/// Get startup metrics. 
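+/// Returns the metrics stored in the compute state as JSON; this is separate
+/// from the Prometheus text endpoint served at `/metrics`.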
+pub(in crate::http) async fn get_metrics(State(compute): State<Arc<ComputeNode>>) -> Response {
+    let metrics = compute.state.lock().unwrap().metrics.clone();
+    JsonResponse::success(StatusCode::OK, metrics)
+}
diff --git a/compute_tools/src/http/routes/mod.rs b/compute_tools/src/http/routes/mod.rs
new file mode 100644
index 0000000000..3efa1153ad
--- /dev/null
+++ b/compute_tools/src/http/routes/mod.rs
@@ -0,0 +1,38 @@
+use compute_api::responses::ComputeStatusResponse;
+
+use crate::compute::ComputeState;
+
+pub(in crate::http) mod check_writability;
+pub(in crate::http) mod configure;
+pub(in crate::http) mod database_schema;
+pub(in crate::http) mod dbs_and_roles;
+pub(in crate::http) mod extension_server;
+pub(in crate::http) mod extensions;
+pub(in crate::http) mod failpoints;
+pub(in crate::http) mod grants;
+pub(in crate::http) mod info;
+pub(in crate::http) mod insights;
+pub(in crate::http) mod installed_extensions;
+pub(in crate::http) mod metrics;
+pub(in crate::http) mod metrics_json;
+pub(in crate::http) mod status;
+pub(in crate::http) mod terminate;
+
+impl From<&ComputeState> for ComputeStatusResponse {
+    fn from(state: &ComputeState) -> Self {
+        ComputeStatusResponse {
+            start_time: state.start_time,
+            tenant: state
+                .pspec
+                .as_ref()
+                .map(|pspec| pspec.tenant_id.to_string()),
+            timeline: state
+                .pspec
+                .as_ref()
+                .map(|pspec| pspec.timeline_id.to_string()),
+            status: state.status,
+            last_active: state.last_active,
+            error: state.error.clone(),
+        }
+    }
+}
diff --git a/compute_tools/src/http/routes/status.rs b/compute_tools/src/http/routes/status.rs
new file mode 100644
index 0000000000..d64d53a58f
--- /dev/null
+++ b/compute_tools/src/http/routes/status.rs
@@ -0,0 +1,14 @@
+use std::{ops::Deref, sync::Arc};
+
+use axum::{extract::State, http::StatusCode, response::Response};
+use compute_api::responses::ComputeStatusResponse;
+
+use crate::{compute::ComputeNode, http::JsonResponse};
+
+/// Retrieve the state of the compute.
+pub(in crate::http) async fn get_status(State(compute): State<Arc<ComputeNode>>) -> Response {
+    let state = compute.state.lock().unwrap();
+    let body = ComputeStatusResponse::from(state.deref());
+
+    JsonResponse::success(StatusCode::OK, body)
+}
diff --git a/compute_tools/src/http/routes/terminate.rs b/compute_tools/src/http/routes/terminate.rs
new file mode 100644
index 0000000000..7acd84f236
--- /dev/null
+++ b/compute_tools/src/http/routes/terminate.rs
@@ -0,0 +1,58 @@
+use std::sync::Arc;
+
+use axum::{
+    extract::State,
+    response::{IntoResponse, Response},
+};
+use compute_api::responses::ComputeStatus;
+use http::StatusCode;
+use tokio::task;
+use tracing::info;
+
+use crate::{
+    compute::{forward_termination_signal, ComputeNode},
+    http::JsonResponse,
+};
+
+/// Terminate the compute.
+pub(in crate::http) async fn terminate(State(compute): State<Arc<ComputeNode>>) -> Response {
+    {
+        let mut state = compute.state.lock().unwrap();
+        if state.status == ComputeStatus::Terminated {
+            return StatusCode::CREATED.into_response();
+        }
+
+        if !matches!(state.status, ComputeStatus::Empty | ComputeStatus::Running) {
+            return JsonResponse::invalid_status(state.status);
+        }
+
+        state.set_status(ComputeStatus::TerminationPending, &compute.state_changed);
+        drop(state);
+    }
+
+    forward_termination_signal();
+    info!("sent signal and notified waiters");
+
+    // Spawn a blocking thread to wait for the compute to become Terminated.
+    // This is needed so that we do not block the main pool of workers and can
+    // still serve other requests while this one waits for the compute to
+    // finish termination.
+    let c = compute.clone();
+    task::spawn_blocking(move || {
+        let mut state = c.state.lock().unwrap();
+        while state.status != ComputeStatus::Terminated {
+            state = c.state_changed.wait(state).unwrap();
+            info!(
+                "waiting for compute to become {}, current status: {:?}",
+                ComputeStatus::Terminated,
+                state.status
+            );
+        }
+    })
+    .await
+    .unwrap();
+
+    info!("terminated Postgres");
+
+    StatusCode::OK.into_response()
+}
diff --git a/compute_tools/src/http/server.rs b/compute_tools/src/http/server.rs
new file mode 100644
index 0000000000..33d4b489a0
--- /dev/null
+++ b/compute_tools/src/http/server.rs
@@ -0,0 +1,165 @@
+use std::{
+    net::{IpAddr, Ipv6Addr, SocketAddr},
+    sync::{
+        atomic::{AtomicU64, Ordering},
+        Arc,
+    },
+    thread,
+    time::Duration,
+};
+
+use anyhow::Result;
+use axum::{
+    response::{IntoResponse, Response},
+    routing::{get, post},
+    Router,
+};
+use http::StatusCode;
+use tokio::net::TcpListener;
+use tower::ServiceBuilder;
+use tower_http::{
+    request_id::{MakeRequestId, PropagateRequestIdLayer, RequestId, SetRequestIdLayer},
+    trace::TraceLayer,
+};
+use tracing::{debug, error, info, Span};
+
+use super::routes::{
+    check_writability, configure, database_schema, dbs_and_roles, extension_server, extensions,
+    grants, info as info_route, insights, installed_extensions, metrics, metrics_json, status,
+    terminate,
+};
+use crate::compute::ComputeNode;
+
+async fn handle_404() -> Response {
+    StatusCode::NOT_FOUND.into_response()
+}
+
+#[derive(Clone, Default)]
+struct ComputeMakeRequestId(Arc<AtomicU64>);
+
+impl MakeRequestId for ComputeMakeRequestId {
+    fn make_request_id<B>(
+        &mut self,
+        _request: &http::Request<B>,
+    ) -> Option<RequestId> {
+        let request_id = self
+            .0
+            .fetch_add(1, Ordering::SeqCst)
+            .to_string()
+            .parse()
+            .unwrap();
+
+        Some(RequestId::new(request_id))
+    }
+}
+
+/// Run the HTTP server and wait on it forever.
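+/// `serve` is annotated with `#[tokio::main]`, so it brings up its own Tokio
+/// runtime; `launch_http_server` below runs it on a dedicated thread.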
+#[tokio::main] +async fn serve(port: u16, compute: Arc) { + const X_REQUEST_ID: &str = "x-request-id"; + + let mut app = Router::new() + .route("/check_writability", post(check_writability::is_writable)) + .route("/configure", post(configure::configure)) + .route("/database_schema", get(database_schema::get_schema_dump)) + .route("/dbs_and_roles", get(dbs_and_roles::get_catalog_objects)) + .route( + "/extension_server/*filename", + post(extension_server::download_extension), + ) + .route("/extensions", post(extensions::install_extension)) + .route("/grants", post(grants::add_grant)) + .route("/info", get(info_route::get_info)) + .route("/insights", get(insights::get_insights)) + .route( + "/installed_extensions", + get(installed_extensions::get_installed_extensions), + ) + .route("/metrics", get(metrics::get_metrics)) + .route("/metrics.json", get(metrics_json::get_metrics)) + .route("/status", get(status::get_status)) + .route("/terminate", post(terminate::terminate)) + .fallback(handle_404) + .layer( + ServiceBuilder::new() + .layer(SetRequestIdLayer::x_request_id( + ComputeMakeRequestId::default(), + )) + .layer( + TraceLayer::new_for_http() + .on_request(|request: &http::Request<_>, _span: &Span| { + let request_id = request + .headers() + .get(X_REQUEST_ID) + .unwrap() + .to_str() + .unwrap(); + + match request.uri().path() { + "/metrics" => { + debug!(%request_id, "{} {}", request.method(), request.uri()) + } + _ => info!(%request_id, "{} {}", request.method(), request.uri()), + }; + }) + .on_response( + |response: &http::Response<_>, latency: Duration, _span: &Span| { + let request_id = response + .headers() + .get(X_REQUEST_ID) + .unwrap() + .to_str() + .unwrap(); + + info!( + %request_id, + code = response.status().as_u16(), + latency = latency.as_millis() + ) + }, + ), + ) + .layer(PropagateRequestIdLayer::x_request_id()), + ) + .with_state(compute); + + // Add in any testing support + if cfg!(feature = "testing") { + use super::routes::failpoints; + + app = app.route("/failpoints", post(failpoints::configure_failpoints)) + } + + // This usually binds to both IPv4 and IPv6 on Linux, see + // https://github.com/rust-lang/rust/pull/34440 for more information + let addr = SocketAddr::new(IpAddr::from(Ipv6Addr::UNSPECIFIED), port); + let listener = match TcpListener::bind(&addr).await { + Ok(listener) => listener, + Err(e) => { + error!( + "failed to bind the compute_ctl HTTP server to port {}: {}", + port, e + ); + return; + } + }; + + if let Ok(local_addr) = listener.local_addr() { + info!("compute_ctl HTTP server listening on {}", local_addr); + } else { + info!("compute_ctl HTTP server listening on port {}", port); + } + + if let Err(e) = axum::serve(listener, app).await { + error!("compute_ctl HTTP server error: {}", e); + } +} + +/// Launch a separate HTTP server thread and return its `JoinHandle`. +pub fn launch_http_server(port: u16, state: &Arc) -> Result> { + let state = Arc::clone(state); + + Ok(thread::Builder::new() + .name("http-server".into()) + .spawn(move || serve(port, state))?) 
+} diff --git a/compute_tools/src/lib.rs b/compute_tools/src/lib.rs index ee4cf2dfa5..12fea4e61a 100644 --- a/compute_tools/src/lib.rs +++ b/compute_tools/src/lib.rs @@ -3,8 +3,6 @@ #![deny(unsafe_code)] #![deny(clippy::undocumented_unsafe_blocks)] -extern crate hyper0 as hyper; - pub mod checker; pub mod config; pub mod configurator; diff --git a/compute_tools/src/migration.rs b/compute_tools/src/migration.rs index 1f3de65806..45c33172f7 100644 --- a/compute_tools/src/migration.rs +++ b/compute_tools/src/migration.rs @@ -1,6 +1,6 @@ use anyhow::{Context, Result}; use fail::fail_point; -use postgres::Client; +use postgres::{Client, Transaction}; use tracing::info; /// Runs a series of migrations on a target database @@ -20,11 +20,9 @@ impl<'m> MigrationRunner<'m> { /// Get the current value neon_migration.migration_id fn get_migration_id(&mut self) -> Result { - let query = "SELECT id FROM neon_migration.migration_id"; let row = self .client - .query_one(query, &[]) - .context("run_migrations get migration_id")?; + .query_one("SELECT id FROM neon_migration.migration_id", &[])?; Ok(row.get::<&str, i64>("id")) } @@ -34,7 +32,7 @@ impl<'m> MigrationRunner<'m> { /// This function has a fail point called compute-migration, which can be /// used if you would like to fail the application of a series of migrations /// at some point. - fn update_migration_id(&mut self, migration_id: i64) -> Result<()> { + fn update_migration_id(txn: &mut Transaction, migration_id: i64) -> Result<()> { // We use this fail point in order to check that failing in the // middle of applying a series of migrations fails in an expected // manner @@ -55,12 +53,11 @@ impl<'m> MigrationRunner<'m> { } } - self.client - .query( - "UPDATE neon_migration.migration_id SET id = $1", - &[&migration_id], - ) - .context("run_migrations update id")?; + txn.query( + "UPDATE neon_migration.migration_id SET id = $1", + &[&migration_id], + ) + .with_context(|| format!("update neon_migration.migration_id to {migration_id}"))?; Ok(()) } @@ -81,53 +78,50 @@ impl<'m> MigrationRunner<'m> { Ok(()) } - /// Run the configrured set of migrations + /// Run an individual migration + fn run_migration(txn: &mut Transaction, migration_id: i64, migration: &str) -> Result<()> { + if migration.starts_with("-- SKIP") { + info!("Skipping migration id={}", migration_id); + + // Even though we are skipping the migration, updating the + // migration ID should help keep logic easy to understand when + // trying to understand the state of a cluster. + Self::update_migration_id(txn, migration_id)?; + } else { + info!("Running migration id={}:\n{}\n", migration_id, migration); + + txn.simple_query(migration) + .with_context(|| format!("apply migration {migration_id}"))?; + + Self::update_migration_id(txn, migration_id)?; + } + + Ok(()) + } + + /// Run the configured set of migrations pub fn run_migrations(mut self) -> Result<()> { - self.prepare_database()?; + self.prepare_database() + .context("prepare database to handle migrations")?; let mut current_migration = self.get_migration_id()? as usize; while current_migration < self.migrations.len() { - macro_rules! 
migration_id { - ($cm:expr) => { - ($cm + 1) as i64 - }; - } + // The index lags the migration ID by 1, so the current migration + // ID is also the next index + let migration_id = (current_migration + 1) as i64; - let migration = self.migrations[current_migration]; + let mut txn = self + .client + .transaction() + .with_context(|| format!("begin transaction for migration {migration_id}"))?; - if migration.starts_with("-- SKIP") { - info!("Skipping migration id={}", migration_id!(current_migration)); + Self::run_migration(&mut txn, migration_id, self.migrations[current_migration]) + .with_context(|| format!("running migration {migration_id}"))?; - // Even though we are skipping the migration, updating the - // migration ID should help keep logic easy to understand when - // trying to understand the state of a cluster. - self.update_migration_id(migration_id!(current_migration))?; - } else { - info!( - "Running migration id={}:\n{}\n", - migration_id!(current_migration), - migration - ); + txn.commit() + .with_context(|| format!("commit transaction for migration {migration_id}"))?; - self.client - .simple_query("BEGIN") - .context("begin migration")?; - - self.client.simple_query(migration).with_context(|| { - format!( - "run_migrations migration id={}", - migration_id!(current_migration) - ) - })?; - - self.update_migration_id(migration_id!(current_migration))?; - - self.client - .simple_query("COMMIT") - .context("commit migration")?; - - info!("Finished migration id={}", migration_id!(current_migration)); - } + info!("Finished migration id={}", migration_id); current_migration += 1; } diff --git a/compute_tools/src/spec_apply.rs b/compute_tools/src/spec_apply.rs index 7308d5d36e..7401de2e60 100644 --- a/compute_tools/src/spec_apply.rs +++ b/compute_tools/src/spec_apply.rs @@ -47,6 +47,7 @@ pub enum PerDatabasePhase { DeleteDBRoleReferences, ChangeSchemaPerms, HandleAnonExtension, + DropSubscriptionsForDeletedDatabases, } #[derive(Clone, Debug)] @@ -74,7 +75,7 @@ pub struct MutableApplyContext { pub dbs: HashMap, } -/// Appply the operations that belong to the given spec apply phase. +/// Apply the operations that belong to the given spec apply phase. /// /// Commands within a single phase are executed in order of Iterator yield. /// Commands of ApplySpecPhase::RunInEachDatabase will execute in the database @@ -326,13 +327,12 @@ async fn get_operations<'a>( // Use FORCE to drop database even if there are active connections. // We run this from `cloud_admin`, so it should have enough privileges. + // // NB: there could be other db states, which prevent us from dropping // the database. For example, if db is used by any active subscription // or replication slot. - // TODO: deal with it once we allow logical replication. Proper fix should - // involve returning an error code to the control plane, so it could - // figure out that this is a non-retryable error, return it to the user - // and fail operation permanently. + // Such cases are handled in the DropSubscriptionsForDeletedDatabases + // phase. We do all the cleanup before actually dropping the database. 
let drop_db_query: String = format!( "DROP DATABASE IF EXISTS {} WITH (FORCE)", &op.name.pg_quote() @@ -444,6 +444,30 @@ async fn get_operations<'a>( } ApplySpecPhase::RunInEachDatabase { db, subphase } => { match subphase { + PerDatabasePhase::DropSubscriptionsForDeletedDatabases => { + match &db { + DB::UserDB(db) => { + let drop_subscription_query: String = format!( + include_str!("sql/drop_subscription_for_drop_dbs.sql"), + datname_str = escape_literal(&db.name), + ); + + let operations = vec![Operation { + query: drop_subscription_query, + comment: Some(format!( + "optionally dropping subscriptions for DB {}", + db.name, + )), + }] + .into_iter(); + + Ok(Box::new(operations)) + } + // skip this cleanup for the system databases + // because users can't drop them + DB::SystemDB => Ok(Box::new(empty())), + } + } PerDatabasePhase::DeleteDBRoleReferences => { let ctx = ctx.read().await; @@ -474,7 +498,19 @@ async fn get_operations<'a>( ), comment: None, }, + // Revoke some potentially blocking privileges (Neon-specific currently) + Operation { + query: format!( + include_str!("sql/pre_drop_role_revoke_privileges.sql"), + role_name = quoted, + ), + comment: None, + }, // This now will only drop privileges of the role + // TODO: this is obviously not 100% true because of the above case, + // there could be still some privileges that are not revoked. Maybe this + // only drops privileges that were granted *by this* role, not *to this* role, + // but this has to be checked. Operation { query: format!("DROP OWNED BY {}", quoted), comment: None, diff --git a/compute_tools/src/sql/drop_subscription_for_drop_dbs.sql b/compute_tools/src/sql/drop_subscription_for_drop_dbs.sql new file mode 100644 index 0000000000..dfb925e48e --- /dev/null +++ b/compute_tools/src/sql/drop_subscription_for_drop_dbs.sql @@ -0,0 +1,11 @@ +DO $$ +DECLARE + subname TEXT; +BEGIN + FOR subname IN SELECT pg_subscription.subname FROM pg_subscription WHERE subdbid = (SELECT oid FROM pg_database WHERE datname = {datname_str}) LOOP + EXECUTE format('ALTER SUBSCRIPTION %I DISABLE;', subname); + EXECUTE format('ALTER SUBSCRIPTION %I SET (slot_name = NONE);', subname); + EXECUTE format('DROP SUBSCRIPTION %I;', subname); + END LOOP; +END; +$$; diff --git a/compute_tools/src/sql/pre_drop_role_revoke_privileges.sql b/compute_tools/src/sql/pre_drop_role_revoke_privileges.sql new file mode 100644 index 0000000000..cdaa7071d3 --- /dev/null +++ b/compute_tools/src/sql/pre_drop_role_revoke_privileges.sql @@ -0,0 +1,28 @@ +SET SESSION ROLE neon_superuser; + +DO $$ +DECLARE + schema TEXT; + revoke_query TEXT; +BEGIN + FOR schema IN + SELECT schema_name + FROM information_schema.schemata + -- So far, we only had issues with 'public' schema. Probably, because we do some additional grants, + -- e.g., make DB owner the owner of 'public' schema automatically (when created via API). + -- See https://github.com/neondatabase/cloud/issues/13582 for the context. + -- Still, keep the loop because i) it efficiently handles the case when there is no 'public' schema, + -- ii) it's easy to add more schemas to the list if needed. 
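+    -- Note that the REVOKE below is scoped with GRANTED BY neon_superuser, so
+    -- only privileges granted by neon_superuser are revoked.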
+ WHERE schema_name IN ('public') + LOOP + revoke_query := format( + 'REVOKE ALL PRIVILEGES ON ALL TABLES IN SCHEMA %I FROM {role_name} GRANTED BY neon_superuser;', + schema + ); + + EXECUTE revoke_query; + END LOOP; +END; +$$; + +RESET ROLE; diff --git a/control_plane/src/endpoint.rs b/control_plane/src/endpoint.rs index 5e47ec4811..b8027abf7c 100644 --- a/control_plane/src/endpoint.rs +++ b/control_plane/src/endpoint.rs @@ -62,7 +62,7 @@ use crate::local_env::LocalEnv; use crate::postgresql_conf::PostgresConf; use crate::storage_controller::StorageController; -use compute_api::responses::{ComputeState, ComputeStatus}; +use compute_api::responses::{ComputeStatus, ComputeStatusResponse}; use compute_api::spec::{Cluster, ComputeFeature, ComputeMode, ComputeSpec}; // contents of a endpoint.json file @@ -739,7 +739,7 @@ impl Endpoint { } // Call the /status HTTP API - pub async fn get_status(&self) -> Result { + pub async fn get_status(&self) -> Result { let client = reqwest::Client::new(); let response = client diff --git a/control_plane/storcon_cli/src/main.rs b/control_plane/storcon_cli/src/main.rs index 6ee1044c18..617b2cd1ba 100644 --- a/control_plane/storcon_cli/src/main.rs +++ b/control_plane/storcon_cli/src/main.rs @@ -1035,7 +1035,15 @@ async fn main() -> anyhow::Result<()> { resp.sort_by(|a, b| a.id.cmp(&b.id)); let mut table = comfy_table::Table::new(); - table.set_header(["Id", "Version", "Host", "Port", "Http Port", "AZ Id"]); + table.set_header([ + "Id", + "Version", + "Host", + "Port", + "Http Port", + "AZ Id", + "Scheduling", + ]); for sk in resp { table.add_row([ format!("{}", sk.id), @@ -1043,7 +1051,8 @@ async fn main() -> anyhow::Result<()> { sk.host, format!("{}", sk.port), format!("{}", sk.http_port), - sk.availability_zone_id.to_string(), + sk.availability_zone_id.clone(), + String::from(sk.scheduling_policy), ]); } println!("{table}"); diff --git a/libs/compute_api/src/responses.rs b/libs/compute_api/src/responses.rs index 0d65f6a38d..9ce605089b 100644 --- a/libs/compute_api/src/responses.rs +++ b/libs/compute_api/src/responses.rs @@ -15,6 +15,17 @@ pub struct GenericAPIError { pub error: String, } +#[derive(Debug, Clone, Serialize)] +pub struct InfoResponse { + pub num_cpus: usize, +} + +#[derive(Debug, Clone, Serialize)] +pub struct ExtensionInstallResponse { + pub extension: PgIdent, + pub version: ExtVersion, +} + /// Response of the /status API #[derive(Serialize, Debug, Deserialize)] #[serde(rename_all = "snake_case")] @@ -28,16 +39,6 @@ pub struct ComputeStatusResponse { pub error: Option, } -#[derive(Deserialize, Serialize)] -#[serde(rename_all = "snake_case")] -pub struct ComputeState { - pub status: ComputeStatus, - /// Timestamp of the last Postgres activity - #[serde(serialize_with = "rfc3339_serialize")] - pub last_active: Option>, - pub error: Option, -} - #[derive(Serialize, Clone, Copy, Debug, Deserialize, PartialEq, Eq)] #[serde(rename_all = "snake_case")] pub enum ComputeStatus { @@ -78,7 +79,7 @@ impl Display for ComputeStatus { } } -fn rfc3339_serialize(x: &Option>, s: S) -> Result +pub fn rfc3339_serialize(x: &Option>, s: S) -> Result where S: Serializer, { diff --git a/libs/pageserver_api/src/controller_api.rs b/libs/pageserver_api/src/controller_api.rs index faf11e487c..7eb3547183 100644 --- a/libs/pageserver_api/src/controller_api.rs +++ b/libs/pageserver_api/src/controller_api.rs @@ -320,6 +320,38 @@ impl From for String { } } +#[derive(Serialize, Deserialize, Clone, Copy, Eq, PartialEq, Debug)] +pub enum SkSchedulingPolicy { + Active, + 
Disabled, + Decomissioned, +} + +impl FromStr for SkSchedulingPolicy { + type Err = anyhow::Error; + + fn from_str(s: &str) -> Result { + Ok(match s { + "active" => Self::Active, + "disabled" => Self::Disabled, + "decomissioned" => Self::Decomissioned, + _ => return Err(anyhow::anyhow!("Unknown scheduling state '{s}'")), + }) + } +} + +impl From for String { + fn from(value: SkSchedulingPolicy) -> String { + use SkSchedulingPolicy::*; + match value { + Active => "active", + Disabled => "disabled", + Decomissioned => "decomissioned", + } + .to_string() + } +} + /// Controls how tenant shards are mapped to locations on pageservers, e.g. whether /// to create secondary locations. #[derive(Clone, Serialize, Deserialize, Debug, PartialEq, Eq)] @@ -387,6 +419,7 @@ pub struct SafekeeperDescribeResponse { pub port: i32, pub http_port: i32, pub availability_zone_id: String, + pub scheduling_policy: SkSchedulingPolicy, } #[cfg(test)] diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs index f3fc9fad76..39390d7647 100644 --- a/libs/pageserver_api/src/models.rs +++ b/libs/pageserver_api/src/models.rs @@ -1460,75 +1460,91 @@ impl TryFrom for PagestreamBeMessageTag { // interface allows sending both LSNs, and let the pageserver do the right thing. There was no // difference in the responses between V1 and V2. // -#[derive(Clone, Copy)] +// V3 version of protocol adds request ID to all requests. This request ID is also included in response +// as well as other fields from requests, which allows to verify that we receive response for our request. +// We copy fields from request to response to make checking more reliable: request ID is formed from process ID +// and local counter, so in principle there can be duplicated requests IDs if process PID is reused. 
+// +#[derive(Debug, PartialEq, Eq, Clone, Copy)] pub enum PagestreamProtocolVersion { V2, + V3, } -#[derive(Debug, PartialEq, Eq)] +pub type RequestId = u64; + +#[derive(Debug, PartialEq, Eq, Clone, Copy)] +pub struct PagestreamRequest { + pub reqid: RequestId, + pub request_lsn: Lsn, + pub not_modified_since: Lsn, +} + +#[derive(Debug, PartialEq, Eq, Clone, Copy)] pub struct PagestreamExistsRequest { - pub request_lsn: Lsn, - pub not_modified_since: Lsn, + pub hdr: PagestreamRequest, pub rel: RelTag, } -#[derive(Debug, PartialEq, Eq)] +#[derive(Debug, PartialEq, Eq, Clone, Copy)] pub struct PagestreamNblocksRequest { - pub request_lsn: Lsn, - pub not_modified_since: Lsn, + pub hdr: PagestreamRequest, pub rel: RelTag, } -#[derive(Debug, PartialEq, Eq)] +#[derive(Debug, PartialEq, Eq, Clone, Copy)] pub struct PagestreamGetPageRequest { - pub request_lsn: Lsn, - pub not_modified_since: Lsn, + pub hdr: PagestreamRequest, pub rel: RelTag, pub blkno: u32, } -#[derive(Debug, PartialEq, Eq)] +#[derive(Debug, PartialEq, Eq, Clone, Copy)] pub struct PagestreamDbSizeRequest { - pub request_lsn: Lsn, - pub not_modified_since: Lsn, + pub hdr: PagestreamRequest, pub dbnode: u32, } -#[derive(Debug, PartialEq, Eq)] +#[derive(Debug, PartialEq, Eq, Clone, Copy)] pub struct PagestreamGetSlruSegmentRequest { - pub request_lsn: Lsn, - pub not_modified_since: Lsn, + pub hdr: PagestreamRequest, pub kind: u8, pub segno: u32, } #[derive(Debug)] pub struct PagestreamExistsResponse { + pub req: PagestreamExistsRequest, pub exists: bool, } #[derive(Debug)] pub struct PagestreamNblocksResponse { + pub req: PagestreamNblocksRequest, pub n_blocks: u32, } #[derive(Debug)] pub struct PagestreamGetPageResponse { + pub req: PagestreamGetPageRequest, pub page: Bytes, } #[derive(Debug)] pub struct PagestreamGetSlruSegmentResponse { + pub req: PagestreamGetSlruSegmentRequest, pub segment: Bytes, } #[derive(Debug)] pub struct PagestreamErrorResponse { + pub req: PagestreamRequest, pub message: String, } #[derive(Debug)] pub struct PagestreamDbSizeResponse { + pub req: PagestreamDbSizeRequest, pub db_size: i64, } @@ -1545,15 +1561,16 @@ pub struct TenantHistorySize { impl PagestreamFeMessage { /// Serialize a compute -> pageserver message. This is currently only used in testing - /// tools. Always uses protocol version 2. + /// tools. Always uses protocol version 3. 
pub fn serialize(&self) -> Bytes { let mut bytes = BytesMut::new(); match self { Self::Exists(req) => { bytes.put_u8(0); - bytes.put_u64(req.request_lsn.0); - bytes.put_u64(req.not_modified_since.0); + bytes.put_u64(req.hdr.reqid); + bytes.put_u64(req.hdr.request_lsn.0); + bytes.put_u64(req.hdr.not_modified_since.0); bytes.put_u32(req.rel.spcnode); bytes.put_u32(req.rel.dbnode); bytes.put_u32(req.rel.relnode); @@ -1562,8 +1579,9 @@ impl PagestreamFeMessage { Self::Nblocks(req) => { bytes.put_u8(1); - bytes.put_u64(req.request_lsn.0); - bytes.put_u64(req.not_modified_since.0); + bytes.put_u64(req.hdr.reqid); + bytes.put_u64(req.hdr.request_lsn.0); + bytes.put_u64(req.hdr.not_modified_since.0); bytes.put_u32(req.rel.spcnode); bytes.put_u32(req.rel.dbnode); bytes.put_u32(req.rel.relnode); @@ -1572,8 +1590,9 @@ impl PagestreamFeMessage { Self::GetPage(req) => { bytes.put_u8(2); - bytes.put_u64(req.request_lsn.0); - bytes.put_u64(req.not_modified_since.0); + bytes.put_u64(req.hdr.reqid); + bytes.put_u64(req.hdr.request_lsn.0); + bytes.put_u64(req.hdr.not_modified_since.0); bytes.put_u32(req.rel.spcnode); bytes.put_u32(req.rel.dbnode); bytes.put_u32(req.rel.relnode); @@ -1583,15 +1602,17 @@ impl PagestreamFeMessage { Self::DbSize(req) => { bytes.put_u8(3); - bytes.put_u64(req.request_lsn.0); - bytes.put_u64(req.not_modified_since.0); + bytes.put_u64(req.hdr.reqid); + bytes.put_u64(req.hdr.request_lsn.0); + bytes.put_u64(req.hdr.not_modified_since.0); bytes.put_u32(req.dbnode); } Self::GetSlruSegment(req) => { bytes.put_u8(4); - bytes.put_u64(req.request_lsn.0); - bytes.put_u64(req.not_modified_since.0); + bytes.put_u64(req.hdr.reqid); + bytes.put_u64(req.hdr.request_lsn.0); + bytes.put_u64(req.hdr.not_modified_since.0); bytes.put_u8(req.kind); bytes.put_u32(req.segno); } @@ -1600,21 +1621,35 @@ impl PagestreamFeMessage { bytes.into() } - pub fn parse(body: &mut R) -> anyhow::Result { + pub fn parse( + body: &mut R, + protocol_version: PagestreamProtocolVersion, + ) -> anyhow::Result { // these correspond to the NeonMessageTag enum in pagestore_client.h // // TODO: consider using protobuf or serde bincode for less error prone // serialization. 
let msg_tag = body.read_u8()?; - - // these two fields are the same for every request type - let request_lsn = Lsn::from(body.read_u64::()?); - let not_modified_since = Lsn::from(body.read_u64::()?); + let (reqid, request_lsn, not_modified_since) = match protocol_version { + PagestreamProtocolVersion::V2 => ( + 0, + Lsn::from(body.read_u64::()?), + Lsn::from(body.read_u64::()?), + ), + PagestreamProtocolVersion::V3 => ( + body.read_u64::()?, + Lsn::from(body.read_u64::()?), + Lsn::from(body.read_u64::()?), + ), + }; match msg_tag { 0 => Ok(PagestreamFeMessage::Exists(PagestreamExistsRequest { - request_lsn, - not_modified_since, + hdr: PagestreamRequest { + reqid, + request_lsn, + not_modified_since, + }, rel: RelTag { spcnode: body.read_u32::()?, dbnode: body.read_u32::()?, @@ -1623,8 +1658,11 @@ impl PagestreamFeMessage { }, })), 1 => Ok(PagestreamFeMessage::Nblocks(PagestreamNblocksRequest { - request_lsn, - not_modified_since, + hdr: PagestreamRequest { + reqid, + request_lsn, + not_modified_since, + }, rel: RelTag { spcnode: body.read_u32::()?, dbnode: body.read_u32::()?, @@ -1633,8 +1671,11 @@ impl PagestreamFeMessage { }, })), 2 => Ok(PagestreamFeMessage::GetPage(PagestreamGetPageRequest { - request_lsn, - not_modified_since, + hdr: PagestreamRequest { + reqid, + request_lsn, + not_modified_since, + }, rel: RelTag { spcnode: body.read_u32::()?, dbnode: body.read_u32::()?, @@ -1644,14 +1685,20 @@ impl PagestreamFeMessage { blkno: body.read_u32::()?, })), 3 => Ok(PagestreamFeMessage::DbSize(PagestreamDbSizeRequest { - request_lsn, - not_modified_since, + hdr: PagestreamRequest { + reqid, + request_lsn, + not_modified_since, + }, dbnode: body.read_u32::()?, })), 4 => Ok(PagestreamFeMessage::GetSlruSegment( PagestreamGetSlruSegmentRequest { - request_lsn, - not_modified_since, + hdr: PagestreamRequest { + reqid, + request_lsn, + not_modified_since, + }, kind: body.read_u8()?, segno: body.read_u32::()?, }, @@ -1662,43 +1709,114 @@ impl PagestreamFeMessage { } impl PagestreamBeMessage { - pub fn serialize(&self) -> Bytes { + pub fn serialize(&self, protocol_version: PagestreamProtocolVersion) -> Bytes { let mut bytes = BytesMut::new(); use PagestreamBeMessageTag as Tag; - match self { - Self::Exists(resp) => { - bytes.put_u8(Tag::Exists as u8); - bytes.put_u8(resp.exists as u8); - } + match protocol_version { + PagestreamProtocolVersion::V2 => { + match self { + Self::Exists(resp) => { + bytes.put_u8(Tag::Exists as u8); + bytes.put_u8(resp.exists as u8); + } - Self::Nblocks(resp) => { - bytes.put_u8(Tag::Nblocks as u8); - bytes.put_u32(resp.n_blocks); - } + Self::Nblocks(resp) => { + bytes.put_u8(Tag::Nblocks as u8); + bytes.put_u32(resp.n_blocks); + } - Self::GetPage(resp) => { - bytes.put_u8(Tag::GetPage as u8); - bytes.put(&resp.page[..]); - } + Self::GetPage(resp) => { + bytes.put_u8(Tag::GetPage as u8); + bytes.put(&resp.page[..]) + } - Self::Error(resp) => { - bytes.put_u8(Tag::Error as u8); - bytes.put(resp.message.as_bytes()); - bytes.put_u8(0); // null terminator - } - Self::DbSize(resp) => { - bytes.put_u8(Tag::DbSize as u8); - bytes.put_i64(resp.db_size); - } + Self::Error(resp) => { + bytes.put_u8(Tag::Error as u8); + bytes.put(resp.message.as_bytes()); + bytes.put_u8(0); // null terminator + } + Self::DbSize(resp) => { + bytes.put_u8(Tag::DbSize as u8); + bytes.put_i64(resp.db_size); + } - Self::GetSlruSegment(resp) => { - bytes.put_u8(Tag::GetSlruSegment as u8); - bytes.put_u32((resp.segment.len() / BLCKSZ as usize) as u32); - bytes.put(&resp.segment[..]); + 
Self::GetSlruSegment(resp) => { + bytes.put_u8(Tag::GetSlruSegment as u8); + bytes.put_u32((resp.segment.len() / BLCKSZ as usize) as u32); + bytes.put(&resp.segment[..]); + } + } + } + PagestreamProtocolVersion::V3 => { + match self { + Self::Exists(resp) => { + bytes.put_u8(Tag::Exists as u8); + bytes.put_u64(resp.req.hdr.reqid); + bytes.put_u64(resp.req.hdr.request_lsn.0); + bytes.put_u64(resp.req.hdr.not_modified_since.0); + bytes.put_u32(resp.req.rel.spcnode); + bytes.put_u32(resp.req.rel.dbnode); + bytes.put_u32(resp.req.rel.relnode); + bytes.put_u8(resp.req.rel.forknum); + bytes.put_u8(resp.exists as u8); + } + + Self::Nblocks(resp) => { + bytes.put_u8(Tag::Nblocks as u8); + bytes.put_u64(resp.req.hdr.reqid); + bytes.put_u64(resp.req.hdr.request_lsn.0); + bytes.put_u64(resp.req.hdr.not_modified_since.0); + bytes.put_u32(resp.req.rel.spcnode); + bytes.put_u32(resp.req.rel.dbnode); + bytes.put_u32(resp.req.rel.relnode); + bytes.put_u8(resp.req.rel.forknum); + bytes.put_u32(resp.n_blocks); + } + + Self::GetPage(resp) => { + bytes.put_u8(Tag::GetPage as u8); + bytes.put_u64(resp.req.hdr.reqid); + bytes.put_u64(resp.req.hdr.request_lsn.0); + bytes.put_u64(resp.req.hdr.not_modified_since.0); + bytes.put_u32(resp.req.rel.spcnode); + bytes.put_u32(resp.req.rel.dbnode); + bytes.put_u32(resp.req.rel.relnode); + bytes.put_u8(resp.req.rel.forknum); + bytes.put_u32(resp.req.blkno); + bytes.put(&resp.page[..]) + } + + Self::Error(resp) => { + bytes.put_u8(Tag::Error as u8); + bytes.put_u64(resp.req.reqid); + bytes.put_u64(resp.req.request_lsn.0); + bytes.put_u64(resp.req.not_modified_since.0); + bytes.put(resp.message.as_bytes()); + bytes.put_u8(0); // null terminator + } + Self::DbSize(resp) => { + bytes.put_u8(Tag::DbSize as u8); + bytes.put_u64(resp.req.hdr.reqid); + bytes.put_u64(resp.req.hdr.request_lsn.0); + bytes.put_u64(resp.req.hdr.not_modified_since.0); + bytes.put_u32(resp.req.dbnode); + bytes.put_i64(resp.db_size); + } + + Self::GetSlruSegment(resp) => { + bytes.put_u8(Tag::GetSlruSegment as u8); + bytes.put_u64(resp.req.hdr.reqid); + bytes.put_u64(resp.req.hdr.request_lsn.0); + bytes.put_u64(resp.req.hdr.not_modified_since.0); + bytes.put_u8(resp.req.kind); + bytes.put_u32(resp.req.segno); + bytes.put_u32((resp.segment.len() / BLCKSZ as usize) as u32); + bytes.put(&resp.segment[..]); + } + } } } - bytes.into() } @@ -1710,38 +1828,131 @@ impl PagestreamBeMessage { let ok = match Tag::try_from(msg_tag).map_err(|tag: u8| anyhow::anyhow!("invalid tag {tag}"))? { Tag::Exists => { - let exists = buf.read_u8()?; + let reqid = buf.read_u64::()?; + let request_lsn = Lsn(buf.read_u64::()?); + let not_modified_since = Lsn(buf.read_u64::()?); + let rel = RelTag { + spcnode: buf.read_u32::()?, + dbnode: buf.read_u32::()?, + relnode: buf.read_u32::()?, + forknum: buf.read_u8()?, + }; + let exists = buf.read_u8()? 
!= 0; Self::Exists(PagestreamExistsResponse { - exists: exists != 0, + req: PagestreamExistsRequest { + hdr: PagestreamRequest { + reqid, + request_lsn, + not_modified_since, + }, + rel, + }, + exists, }) } Tag::Nblocks => { + let reqid = buf.read_u64::()?; + let request_lsn = Lsn(buf.read_u64::()?); + let not_modified_since = Lsn(buf.read_u64::()?); + let rel = RelTag { + spcnode: buf.read_u32::()?, + dbnode: buf.read_u32::()?, + relnode: buf.read_u32::()?, + forknum: buf.read_u8()?, + }; let n_blocks = buf.read_u32::()?; - Self::Nblocks(PagestreamNblocksResponse { n_blocks }) + Self::Nblocks(PagestreamNblocksResponse { + req: PagestreamNblocksRequest { + hdr: PagestreamRequest { + reqid, + request_lsn, + not_modified_since, + }, + rel, + }, + n_blocks, + }) } Tag::GetPage => { + let reqid = buf.read_u64::()?; + let request_lsn = Lsn(buf.read_u64::()?); + let not_modified_since = Lsn(buf.read_u64::()?); + let rel = RelTag { + spcnode: buf.read_u32::()?, + dbnode: buf.read_u32::()?, + relnode: buf.read_u32::()?, + forknum: buf.read_u8()?, + }; + let blkno = buf.read_u32::()?; let mut page = vec![0; 8192]; // TODO: use MaybeUninit buf.read_exact(&mut page)?; - PagestreamBeMessage::GetPage(PagestreamGetPageResponse { page: page.into() }) + Self::GetPage(PagestreamGetPageResponse { + req: PagestreamGetPageRequest { + hdr: PagestreamRequest { + reqid, + request_lsn, + not_modified_since, + }, + rel, + blkno, + }, + page: page.into(), + }) } Tag::Error => { + let reqid = buf.read_u64::()?; + let request_lsn = Lsn(buf.read_u64::()?); + let not_modified_since = Lsn(buf.read_u64::()?); let mut msg = Vec::new(); buf.read_until(0, &mut msg)?; let cstring = std::ffi::CString::from_vec_with_nul(msg)?; let rust_str = cstring.to_str()?; - PagestreamBeMessage::Error(PagestreamErrorResponse { + Self::Error(PagestreamErrorResponse { + req: PagestreamRequest { + reqid, + request_lsn, + not_modified_since, + }, message: rust_str.to_owned(), }) } Tag::DbSize => { + let reqid = buf.read_u64::()?; + let request_lsn = Lsn(buf.read_u64::()?); + let not_modified_since = Lsn(buf.read_u64::()?); + let dbnode = buf.read_u32::()?; let db_size = buf.read_i64::()?; - Self::DbSize(PagestreamDbSizeResponse { db_size }) + Self::DbSize(PagestreamDbSizeResponse { + req: PagestreamDbSizeRequest { + hdr: PagestreamRequest { + reqid, + request_lsn, + not_modified_since, + }, + dbnode, + }, + db_size, + }) } Tag::GetSlruSegment => { + let reqid = buf.read_u64::()?; + let request_lsn = Lsn(buf.read_u64::()?); + let not_modified_since = Lsn(buf.read_u64::()?); + let kind = buf.read_u8()?; + let segno = buf.read_u32::()?; let n_blocks = buf.read_u32::()?; let mut segment = vec![0; n_blocks as usize * BLCKSZ as usize]; buf.read_exact(&mut segment)?; Self::GetSlruSegment(PagestreamGetSlruSegmentResponse { + req: PagestreamGetSlruSegmentRequest { + hdr: PagestreamRequest { + reqid, + request_lsn, + not_modified_since, + }, + kind, + segno, + }, segment: segment.into(), }) } @@ -1780,8 +1991,11 @@ mod tests { // Test serialization/deserialization of PagestreamFeMessage let messages = vec![ PagestreamFeMessage::Exists(PagestreamExistsRequest { - request_lsn: Lsn(4), - not_modified_since: Lsn(3), + hdr: PagestreamRequest { + reqid: 0, + request_lsn: Lsn(4), + not_modified_since: Lsn(3), + }, rel: RelTag { forknum: 1, spcnode: 2, @@ -1790,8 +2004,11 @@ mod tests { }, }), PagestreamFeMessage::Nblocks(PagestreamNblocksRequest { - request_lsn: Lsn(4), - not_modified_since: Lsn(4), + hdr: PagestreamRequest { + reqid: 0, + request_lsn: Lsn(4), 
+ not_modified_since: Lsn(4), + }, rel: RelTag { forknum: 1, spcnode: 2, @@ -1800,8 +2017,11 @@ mod tests { }, }), PagestreamFeMessage::GetPage(PagestreamGetPageRequest { - request_lsn: Lsn(4), - not_modified_since: Lsn(3), + hdr: PagestreamRequest { + reqid: 0, + request_lsn: Lsn(4), + not_modified_since: Lsn(3), + }, rel: RelTag { forknum: 1, spcnode: 2, @@ -1811,14 +2031,19 @@ mod tests { blkno: 7, }), PagestreamFeMessage::DbSize(PagestreamDbSizeRequest { - request_lsn: Lsn(4), - not_modified_since: Lsn(3), + hdr: PagestreamRequest { + reqid: 0, + request_lsn: Lsn(4), + not_modified_since: Lsn(3), + }, dbnode: 7, }), ]; for msg in messages { let bytes = msg.serialize(); - let reconstructed = PagestreamFeMessage::parse(&mut bytes.reader()).unwrap(); + let reconstructed = + PagestreamFeMessage::parse(&mut bytes.reader(), PagestreamProtocolVersion::V3) + .unwrap(); assert!(msg == reconstructed); } } diff --git a/libs/remote_storage/src/config.rs b/libs/remote_storage/src/config.rs index dd49d4d5e7..49b1d9dc87 100644 --- a/libs/remote_storage/src/config.rs +++ b/libs/remote_storage/src/config.rs @@ -115,13 +115,15 @@ fn default_max_keys_per_list_response() -> Option { } fn default_azure_conn_pool_size() -> usize { - // Conservative default: no connection pooling. At time of writing this is the Azure - // SDK's default as well, due to historic reports of hard-to-reproduce issues + // By default, the Azure SDK does no connection pooling, due to historic reports of hard-to-reproduce issues // (https://github.com/hyperium/hyper/issues/2312) // // However, using connection pooling is important to avoid exhausting client ports when // doing huge numbers of requests (https://github.com/neondatabase/cloud/issues/20971) - 0 + // + // We therefore enable a modest pool size by default: this may be configured to zero if + // issues like the alleged upstream hyper issue appear. + 8 } impl Debug for S3Config { diff --git a/libs/tracing-utils/src/lib.rs b/libs/tracing-utils/src/lib.rs index c4aad53cdb..818d759eac 100644 --- a/libs/tracing-utils/src/lib.rs +++ b/libs/tracing-utils/src/lib.rs @@ -38,7 +38,6 @@ pub mod http; use opentelemetry::trace::TracerProvider; use opentelemetry::KeyValue; -use opentelemetry_sdk::Resource; use tracing::Subscriber; use tracing_subscriber::registry::LookupSpan; use tracing_subscriber::Layer; @@ -121,7 +120,10 @@ where S: Subscriber + for<'span> LookupSpan<'span>, { // Sets up exporter from the OTEL_EXPORTER_* environment variables. 
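+    // The pipeline-style setup used previously is replaced with the
+    // builder-based API from newer opentelemetry-otlp / opentelemetry_sdk
+    // releases.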
- let exporter = opentelemetry_otlp::new_exporter().http(); + let exporter = opentelemetry_otlp::SpanExporter::builder() + .with_http() + .build() + .expect("could not initialize opentelemetry exporter"); // TODO: opentelemetry::global::set_error_handler() with custom handler that // bypasses default tracing layers, but logs regular looking log @@ -132,17 +134,13 @@ where opentelemetry_sdk::propagation::TraceContextPropagator::new(), ); - let tracer = opentelemetry_otlp::new_pipeline() - .tracing() - .with_exporter(exporter) - .with_trace_config(opentelemetry_sdk::trace::Config::default().with_resource( - Resource::new(vec![KeyValue::new( - opentelemetry_semantic_conventions::resource::SERVICE_NAME, - service_name, - )]), - )) - .install_batch(opentelemetry_sdk::runtime::Tokio) - .expect("could not initialize opentelemetry exporter") + let tracer = opentelemetry_sdk::trace::TracerProvider::builder() + .with_batch_exporter(exporter, opentelemetry_sdk::runtime::Tokio) + .with_resource(opentelemetry_sdk::Resource::new(vec![KeyValue::new( + opentelemetry_semantic_conventions::resource::SERVICE_NAME, + service_name, + )])) + .build() .tracer("global"); tracing_opentelemetry::layer().with_tracer(tracer) diff --git a/libs/utils/Cargo.toml b/libs/utils/Cargo.toml index 02bf77760a..edb451a02c 100644 --- a/libs/utils/Cargo.toml +++ b/libs/utils/Cargo.toml @@ -26,6 +26,7 @@ git-version.workspace = true hex = { workspace = true, features = ["serde"] } humantime.workspace = true hyper0 = { workspace = true, features = ["full"] } +inferno.workspace = true itertools.workspace = true fail.workspace = true futures = { workspace = true } diff --git a/libs/utils/src/http/endpoint.rs b/libs/utils/src/http/endpoint.rs index 9b37b69939..4b4aa88d6b 100644 --- a/libs/utils/src/http/endpoint.rs +++ b/libs/utils/src/http/endpoint.rs @@ -417,6 +417,7 @@ pub async fn profile_heap_handler(req: Request) -> Result, enum Format { Jemalloc, Pprof, + Svg, } // Parameters. @@ -424,9 +425,24 @@ pub async fn profile_heap_handler(req: Request) -> Result, None => Format::Pprof, Some("jemalloc") => Format::Jemalloc, Some("pprof") => Format::Pprof, + Some("svg") => Format::Svg, Some(format) => return Err(ApiError::BadRequest(anyhow!("invalid format {format}"))), }; + // Functions and mappings to strip when symbolizing pprof profiles. If true, + // also remove child frames. + static STRIP_FUNCTIONS: Lazy> = Lazy::new(|| { + vec![ + (Regex::new("^__rust").unwrap(), false), + (Regex::new("^_start$").unwrap(), false), + (Regex::new("^irallocx_prof").unwrap(), true), + (Regex::new("^prof_alloc_prep").unwrap(), true), + (Regex::new("^std::rt::lang_start").unwrap(), false), + (Regex::new("^std::sys::backtrace::__rust").unwrap(), false), + ] + }); + const STRIP_MAPPINGS: &[&str] = &["libc", "libgcc", "pthread", "vdso"]; + // Obtain profiler handle. let mut prof_ctl = jemalloc_pprof::PROF_CTL .as_ref() @@ -464,24 +480,9 @@ pub async fn profile_heap_handler(req: Request) -> Result, // Symbolize the profile. // TODO: consider moving this upstream to jemalloc_pprof and avoiding the // serialization roundtrip. - static STRIP_FUNCTIONS: Lazy> = Lazy::new(|| { - // Functions to strip from profiles. If true, also remove child frames. 
- vec![ - (Regex::new("^__rust").unwrap(), false), - (Regex::new("^_start$").unwrap(), false), - (Regex::new("^irallocx_prof").unwrap(), true), - (Regex::new("^prof_alloc_prep").unwrap(), true), - (Regex::new("^std::rt::lang_start").unwrap(), false), - (Regex::new("^std::sys::backtrace::__rust").unwrap(), false), - ] - }); let profile = pprof::decode(&bytes)?; let profile = pprof::symbolize(profile)?; - let profile = pprof::strip_locations( - profile, - &["libc", "libgcc", "pthread", "vdso"], - &STRIP_FUNCTIONS, - ); + let profile = pprof::strip_locations(profile, STRIP_MAPPINGS, &STRIP_FUNCTIONS); pprof::encode(&profile) }) .await @@ -494,6 +495,27 @@ pub async fn profile_heap_handler(req: Request) -> Result, .body(Body::from(data)) .map_err(|err| ApiError::InternalServerError(err.into())) } + + Format::Svg => { + let body = tokio::task::spawn_blocking(move || { + let bytes = prof_ctl.dump_pprof()?; + let profile = pprof::decode(&bytes)?; + let profile = pprof::symbolize(profile)?; + let profile = pprof::strip_locations(profile, STRIP_MAPPINGS, &STRIP_FUNCTIONS); + let mut opts = inferno::flamegraph::Options::default(); + opts.title = "Heap inuse".to_string(); + opts.count_name = "bytes".to_string(); + pprof::flamegraph(profile, &mut opts) + }) + .await + .map_err(|join_err| ApiError::InternalServerError(join_err.into()))? + .map_err(ApiError::InternalServerError)?; + Response::builder() + .status(200) + .header(CONTENT_TYPE, "image/svg+xml") + .body(Body::from(body)) + .map_err(|err| ApiError::InternalServerError(err.into())) + } } } diff --git a/libs/utils/src/pprof.rs b/libs/utils/src/pprof.rs index 90910897bf..dd57f9ed4b 100644 --- a/libs/utils/src/pprof.rs +++ b/libs/utils/src/pprof.rs @@ -1,8 +1,9 @@ +use anyhow::bail; use flate2::write::{GzDecoder, GzEncoder}; use flate2::Compression; use itertools::Itertools as _; use once_cell::sync::Lazy; -use pprof::protos::{Function, Line, Message as _, Profile}; +use pprof::protos::{Function, Line, Location, Message as _, Profile}; use regex::Regex; use std::borrow::Cow; @@ -188,3 +189,59 @@ pub fn strip_locations( profile } + +/// Generates an SVG flamegraph from a symbolized pprof profile. +pub fn flamegraph( + profile: Profile, + opts: &mut inferno::flamegraph::Options, +) -> anyhow::Result> { + if profile.mapping.iter().any(|m| !m.has_functions) { + bail!("profile not symbolized"); + } + + // Index locations, functions, and strings. + let locations: HashMap = + profile.location.into_iter().map(|l| (l.id, l)).collect(); + let functions: HashMap = + profile.function.into_iter().map(|f| (f.id, f)).collect(); + let strings = profile.string_table; + + // Resolve stacks as function names, and sum sample values per stack. Also reverse the stack, + // since inferno expects it bottom-up. 
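// (Editor's note, not part of the original patch.) The code below produces inferno's
// "folded" stack format: one line per unique stack, frames joined by ';' from outermost
// frame to leaf, followed by a space and the summed sample value, e.g. (with made-up
// frame names):
//   main;pageserver::page_service::handler;alloc::raw_vec::finish_grow 4096
// inferno::flamegraph::from_lines() consumes exactly these lines at the end of this function.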
+ let mut stacks: HashMap, i64> = HashMap::new(); + for sample in profile.sample { + let mut stack = Vec::with_capacity(sample.location_id.len()); + for location in sample.location_id.into_iter().rev() { + let Some(location) = locations.get(&location) else { + bail!("missing location {location}"); + }; + for line in location.line.iter().rev() { + let Some(function) = functions.get(&line.function_id) else { + bail!("missing function {}", line.function_id); + }; + let Some(name) = strings.get(function.name as usize) else { + bail!("missing string {}", function.name); + }; + stack.push(name.as_str()); + } + } + let Some(&value) = sample.value.first() else { + bail!("missing value"); + }; + *stacks.entry(stack).or_default() += value; + } + + // Construct stack lines for inferno. + let lines = stacks + .into_iter() + .map(|(stack, value)| (stack.into_iter().join(";"), value)) + .map(|(stack, value)| format!("{stack} {value}")) + .sorted() + .collect_vec(); + + // Construct the flamegraph. + let mut bytes = Vec::new(); + let lines = lines.iter().map(|line| line.as_str()); + inferno::flamegraph::from_lines(opts, lines, &mut bytes)?; + Ok(bytes) +} diff --git a/libs/utils/src/sync/spsc_fold.rs b/libs/utils/src/sync/spsc_fold.rs index b44f766ef0..0cab291d51 100644 --- a/libs/utils/src/sync/spsc_fold.rs +++ b/libs/utils/src/sync/spsc_fold.rs @@ -96,7 +96,11 @@ impl Sender { } } State::SenderWaitsForReceiverToConsume(_data) => { - // Really, we shouldn't be polled until receiver has consumed and wakes us. + // SAFETY: send is single threaded due to `&mut self` requirement, + // therefore register is not concurrent. + unsafe { + self.state.wake_sender.register(cx.waker()); + } Poll::Pending } State::ReceiverGone => Poll::Ready(Err(SendError::ReceiverGone)), @@ -449,4 +453,38 @@ mod tests { let err = recv_task.await.unwrap().expect_err("should error"); assert!(matches!(err, RecvError::SenderGone)); } + + #[tokio::test(start_paused = true)] + async fn test_receiver_drop_while_waiting_for_receiver_to_consume_unblocks_sender() { + let (mut sender, receiver) = channel(); + + let state = receiver.state.clone(); + + sender.send((), |_, _| unreachable!()).await.unwrap(); + + assert!(matches!(&*state.value.lock().unwrap(), &State::HasData(_))); + + let unmergeable = sender.send((), |_, _| Err(())); + let mut unmergeable = std::pin::pin!(unmergeable); + tokio::select! { + _ = tokio::time::sleep(FOREVER) => {}, + _ = &mut unmergeable => { + panic!("unmergeable should not complete"); + }, + } + + assert!(matches!( + &*state.value.lock().unwrap(), + &State::SenderWaitsForReceiverToConsume(_) + )); + + drop(receiver); + + assert!(matches!( + &*state.value.lock().unwrap(), + &State::ReceiverGone + )); + + unmergeable.await.unwrap_err(); + } } diff --git a/libs/wal_decoder/src/models.rs b/libs/wal_decoder/src/models.rs index af22de5d95..6576dd0eba 100644 --- a/libs/wal_decoder/src/models.rs +++ b/libs/wal_decoder/src/models.rs @@ -95,6 +95,14 @@ impl InterpretedWalRecord { && self.metadata_record.is_none() && matches!(self.flush_uncommitted, FlushUncommittedRecords::No) } + + /// Checks if the WAL record is observed (i.e. 
contains only metadata + /// for observed values) + pub fn is_observed(&self) -> bool { + self.batch.is_observed() + && self.metadata_record.is_none() + && matches!(self.flush_uncommitted, FlushUncommittedRecords::No) + } } /// The interpreted part of the Postgres WAL record which requires metadata diff --git a/libs/wal_decoder/src/serialized_batch.rs b/libs/wal_decoder/src/serialized_batch.rs index 41294da7a0..af2b179e05 100644 --- a/libs/wal_decoder/src/serialized_batch.rs +++ b/libs/wal_decoder/src/serialized_batch.rs @@ -501,6 +501,11 @@ impl SerializedValueBatch { !self.has_data() && self.metadata.is_empty() } + /// Checks if the batch contains only observed values + pub fn is_observed(&self) -> bool { + !self.has_data() && !self.metadata.is_empty() + } + /// Checks if the batch contains data /// /// Note that if this returns false, it may still contain observed values or diff --git a/pageserver/client/src/page_service.rs b/pageserver/client/src/page_service.rs index f9507fc47a..207ec4166c 100644 --- a/pageserver/client/src/page_service.rs +++ b/pageserver/client/src/page_service.rs @@ -60,7 +60,7 @@ impl Client { ) -> anyhow::Result { let copy_both: tokio_postgres::CopyBothDuplex = self .client - .copy_both_simple(&format!("pagestream_v2 {tenant_id} {timeline_id}")) + .copy_both_simple(&format!("pagestream_v3 {tenant_id} {timeline_id}")) .await?; let Client { cancel_on_client_drop, diff --git a/pageserver/pagebench/src/cmd/getpage_latest_lsn.rs b/pageserver/pagebench/src/cmd/getpage_latest_lsn.rs index b2df01714d..9f3984f1bd 100644 --- a/pageserver/pagebench/src/cmd/getpage_latest_lsn.rs +++ b/pageserver/pagebench/src/cmd/getpage_latest_lsn.rs @@ -2,7 +2,7 @@ use anyhow::Context; use camino::Utf8PathBuf; use pageserver_api::key::Key; use pageserver_api::keyspace::KeySpaceAccum; -use pageserver_api::models::PagestreamGetPageRequest; +use pageserver_api::models::{PagestreamGetPageRequest, PagestreamRequest}; use pageserver_api::shard::TenantShardId; use tokio_util::sync::CancellationToken; @@ -322,12 +322,15 @@ async fn main_impl( .to_rel_block() .expect("we filter non-rel-block keys out above"); PagestreamGetPageRequest { - request_lsn: if rng.gen_bool(args.req_latest_probability) { - Lsn::MAX - } else { - r.timeline_lsn + hdr: PagestreamRequest { + reqid: 0, + request_lsn: if rng.gen_bool(args.req_latest_probability) { + Lsn::MAX + } else { + r.timeline_lsn + }, + not_modified_since: r.timeline_lsn, }, - not_modified_since: r.timeline_lsn, rel: rel_tag, blkno: block_no, } diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs index b92ff4ebf9..567a69da3b 100644 --- a/pageserver/src/bin/pageserver.rs +++ b/pageserver/src/bin/pageserver.rs @@ -53,12 +53,10 @@ project_build_tag!(BUILD_TAG); #[global_allocator] static GLOBAL: tikv_jemallocator::Jemalloc = tikv_jemallocator::Jemalloc; -// Configure jemalloc to sample allocations for profiles every 1 MB (1 << 20). -// TODO: disabled because concurrent CPU profiles cause seg faults. See: -// https://github.com/neondatabase/neon/issues/10225. -//#[allow(non_upper_case_globals)] -//#[export_name = "malloc_conf"] -//pub static malloc_conf: &[u8] = b"prof:true,prof_active:true,lg_prof_sample:20\0"; +/// Configure jemalloc to sample allocations for profiles every 1 MB (1 << 20). 
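/// (Editor's note, not part of the original patch: in jemalloc's malloc_conf option string,
/// `prof:true,prof_active:true` turns allocation sampling on from process start and
/// `lg_prof_sample:20` sets the average sampling interval to 2^20 bytes = 1 MiB, so the
/// heap profile handler above can serve data without a separate activation step.)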
+#[allow(non_upper_case_globals)] +#[export_name = "malloc_conf"] +pub static malloc_conf: &[u8] = b"prof:true,prof_active:true,lg_prof_sample:20\0"; const PID_FILE_NAME: &str = "pageserver.pid"; diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs index b4e20cb8b9..a313a64080 100644 --- a/pageserver/src/metrics.rs +++ b/pageserver/src/metrics.rs @@ -1854,6 +1854,7 @@ pub(crate) static LIVE_CONNECTIONS: Lazy = Lazy::new(|| { #[derive(Clone, Copy, enum_map::Enum, IntoStaticStr)] pub(crate) enum ComputeCommandKind { + PageStreamV3, PageStreamV2, Basebackup, Fullbackup, @@ -2337,13 +2338,15 @@ macro_rules! redo_bytes_histogram_count_buckets { pub(crate) struct WalIngestMetrics { pub(crate) bytes_received: IntCounter, pub(crate) records_received: IntCounter, + pub(crate) records_observed: IntCounter, pub(crate) records_committed: IntCounter, pub(crate) records_filtered: IntCounter, pub(crate) gap_blocks_zeroed_on_rel_extend: IntCounter, pub(crate) clear_vm_bits_unknown: IntCounterVec, } -pub(crate) static WAL_INGEST: Lazy = Lazy::new(|| WalIngestMetrics { +pub(crate) static WAL_INGEST: Lazy = Lazy::new(|| { + WalIngestMetrics { bytes_received: register_int_counter!( "pageserver_wal_ingest_bytes_received", "Bytes of WAL ingested from safekeepers", @@ -2354,6 +2357,11 @@ pub(crate) static WAL_INGEST: Lazy = Lazy::new(|| WalIngestMet "Number of WAL records received from safekeepers" ) .expect("failed to define a metric"), + records_observed: register_int_counter!( + "pageserver_wal_ingest_records_observed", + "Number of WAL records observed from safekeepers. These are metadata only records for shard 0." + ) + .expect("failed to define a metric"), records_committed: register_int_counter!( "pageserver_wal_ingest_records_committed", "Number of WAL records which resulted in writes to pageserver storage" @@ -2375,6 +2383,7 @@ pub(crate) static WAL_INGEST: Lazy = Lazy::new(|| WalIngestMet &["entity"], ) .expect("failed to define a metric"), +} }); pub(crate) static PAGESERVER_TIMELINE_WAL_RECORDS_RECEIVED: Lazy = Lazy::new(|| { diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index d00ec11a76..f6504bd3b5 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -17,7 +17,7 @@ use pageserver_api::models::{ PagestreamErrorResponse, PagestreamExistsRequest, PagestreamExistsResponse, PagestreamFeMessage, PagestreamGetPageRequest, PagestreamGetSlruSegmentRequest, PagestreamGetSlruSegmentResponse, PagestreamNblocksRequest, PagestreamNblocksResponse, - PagestreamProtocolVersion, + PagestreamProtocolVersion, PagestreamRequest, }; use pageserver_api::shard::TenantShardId; use postgres_backend::{ @@ -67,7 +67,7 @@ use crate::tenant::PageReconstructError; use crate::tenant::Timeline; use crate::{basebackup, timed_after_cancellation}; use pageserver_api::key::rel_block_to_key; -use pageserver_api::reltag::{BlockNumber, RelTag, SlruKind}; +use pageserver_api::reltag::SlruKind; use postgres_ffi::pg_constants::DEFAULTTABLESPACE_OID; use postgres_ffi::BLCKSZ; @@ -537,6 +537,23 @@ impl From for QueryError { } } +#[derive(thiserror::Error, Debug)] +struct BatchedPageStreamError { + req: PagestreamRequest, + err: PageStreamError, +} + +impl std::fmt::Display for BatchedPageStreamError { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + self.err.fmt(f) + } +} + +struct BatchedGetPageRequest { + req: PagestreamGetPageRequest, + timer: SmgrOpTimer, +} + enum BatchedFeMessage { Exists { span: Span, @@ -554,7 +571,7 @@ enum 
BatchedFeMessage { span: Span, shard: timeline::handle::Handle, effective_request_lsn: Lsn, - pages: smallvec::SmallVec<[(RelTag, BlockNumber, SmgrOpTimer); 1]>, + pages: smallvec::SmallVec<[BatchedGetPageRequest; 1]>, }, DbSize { span: Span, @@ -570,7 +587,7 @@ enum BatchedFeMessage { }, RespondError { span: Span, - error: PageStreamError, + error: BatchedPageStreamError, }, } @@ -595,12 +612,15 @@ impl BatchedFeMessage { BatchedFeMessage::GetPage { shard, pages, .. } => ( shard, pages.len(), - itertools::Either::Right(pages.iter_mut().map(|(_, _, timer)| timer)), + itertools::Either::Right(pages.iter_mut().map(|p| &mut p.timer)), ), BatchedFeMessage::RespondError { .. } => return Ok(()), }; let throttled = tokio::select! { throttled = shard.pagestream_throttle.throttle(tokens) => { throttled } + _ = shard.cancel.cancelled() => { + return Err(QueryError::Shutdown); + } _ = cancel.cancelled() => { return Err(QueryError::Shutdown); } @@ -654,6 +674,7 @@ impl PageServerHandler { ) } + #[allow(clippy::too_many_arguments)] async fn pagestream_read_message( pgb: &mut PostgresBackendReader, tenant_id: TenantId, @@ -661,6 +682,7 @@ impl PageServerHandler { timeline_handles: &mut TimelineHandles, cancel: &CancellationToken, ctx: &RequestContext, + protocol_version: PagestreamProtocolVersion, parent_span: Span, ) -> Result, QueryError> where @@ -695,11 +717,12 @@ impl PageServerHandler { fail::fail_point!("ps::handle-pagerequest-message"); // parse request - let neon_fe_msg = PagestreamFeMessage::parse(&mut copy_data_bytes.reader())?; + let neon_fe_msg = + PagestreamFeMessage::parse(&mut copy_data_bytes.reader(), protocol_version)?; let batched_msg = match neon_fe_msg { PagestreamFeMessage::Exists(req) => { - let span = tracing::info_span!(parent: parent_span, "handle_get_rel_exists_request", rel = %req.rel, req_lsn = %req.request_lsn); + let span = tracing::info_span!(parent: parent_span, "handle_get_rel_exists_request", rel = %req.rel, req_lsn = %req.hdr.request_lsn); let shard = timeline_handles .get(tenant_id, timeline_id, ShardSelector::Zero) .instrument(span.clone()) // sets `shard_id` field @@ -715,7 +738,7 @@ impl PageServerHandler { } } PagestreamFeMessage::Nblocks(req) => { - let span = tracing::info_span!(parent: parent_span, "handle_get_nblocks_request", rel = %req.rel, req_lsn = %req.request_lsn); + let span = tracing::info_span!(parent: parent_span, "handle_get_nblocks_request", rel = %req.rel, req_lsn = %req.hdr.request_lsn); let shard = timeline_handles .get(tenant_id, timeline_id, ShardSelector::Zero) .instrument(span.clone()) // sets `shard_id` field @@ -731,7 +754,7 @@ impl PageServerHandler { } } PagestreamFeMessage::DbSize(req) => { - let span = tracing::info_span!(parent: parent_span, "handle_db_size_request", dbnode = %req.dbnode, req_lsn = %req.request_lsn); + let span = tracing::info_span!(parent: parent_span, "handle_db_size_request", dbnode = %req.dbnode, req_lsn = %req.hdr.request_lsn); let shard = timeline_handles .get(tenant_id, timeline_id, ShardSelector::Zero) .instrument(span.clone()) // sets `shard_id` field @@ -747,7 +770,7 @@ impl PageServerHandler { } } PagestreamFeMessage::GetSlruSegment(req) => { - let span = tracing::info_span!(parent: parent_span, "handle_get_slru_segment_request", kind = %req.kind, segno = %req.segno, req_lsn = %req.request_lsn); + let span = tracing::info_span!(parent: parent_span, "handle_get_slru_segment_request", kind = %req.kind, segno = %req.segno, req_lsn = %req.hdr.request_lsn); let shard = timeline_handles .get(tenant_id, 
timeline_id, ShardSelector::Zero) .instrument(span.clone()) // sets `shard_id` field @@ -762,25 +785,23 @@ impl PageServerHandler { req, } } - PagestreamFeMessage::GetPage(PagestreamGetPageRequest { - request_lsn, - not_modified_since, - rel, - blkno, - }) => { - let span = tracing::info_span!(parent: parent_span, "handle_get_page_at_lsn_request_batched", req_lsn = %request_lsn); + PagestreamFeMessage::GetPage(req) => { + let span = tracing::info_span!(parent: parent_span, "handle_get_page_at_lsn_request_batched", req_lsn = %req.hdr.request_lsn); macro_rules! respond_error { ($error:expr) => {{ let error = BatchedFeMessage::RespondError { span, - error: $error, + error: BatchedPageStreamError { + req: req.hdr, + err: $error, + }, }; Ok(Some(error)) }}; } - let key = rel_block_to_key(rel, blkno); + let key = rel_block_to_key(req.rel, req.blkno); let shard = match timeline_handles .get(tenant_id, timeline_id, ShardSelector::Page(key)) .instrument(span.clone()) // sets `shard_id` field @@ -814,8 +835,8 @@ impl PageServerHandler { let effective_request_lsn = match Self::wait_or_get_last_lsn( &shard, - request_lsn, - not_modified_since, + req.hdr.request_lsn, + req.hdr.not_modified_since, &shard.get_latest_gc_cutoff_lsn(), ctx, ) @@ -831,7 +852,7 @@ impl PageServerHandler { span, shard, effective_request_lsn, - pages: smallvec::smallvec![(rel, blkno, timer)], + pages: smallvec::smallvec![BatchedGetPageRequest { req, timer }], } } }; @@ -910,6 +931,7 @@ impl PageServerHandler { pgb_writer: &mut PostgresBackend, batch: BatchedFeMessage, cancel: &CancellationToken, + protocol_version: PagestreamProtocolVersion, ctx: &RequestContext, ) -> Result<(), QueryError> where @@ -917,7 +939,7 @@ impl PageServerHandler { { // invoke handler function let (handler_results, span): ( - Vec>, + Vec>, _, ) = match batch { BatchedFeMessage::Exists { @@ -932,7 +954,8 @@ impl PageServerHandler { .handle_get_rel_exists_request(&shard, &req, ctx) .instrument(span.clone()) .await - .map(|msg| (msg, timer))], + .map(|msg| (msg, timer)) + .map_err(|err| BatchedPageStreamError { err, req: req.hdr })], span, ) } @@ -948,7 +971,8 @@ impl PageServerHandler { .handle_get_nblocks_request(&shard, &req, ctx) .instrument(span.clone()) .await - .map(|msg| (msg, timer))], + .map(|msg| (msg, timer)) + .map_err(|err| BatchedPageStreamError { err, req: req.hdr })], span, ) } @@ -990,7 +1014,8 @@ impl PageServerHandler { .handle_db_size_request(&shard, &req, ctx) .instrument(span.clone()) .await - .map(|msg| (msg, timer))], + .map(|msg| (msg, timer)) + .map_err(|err| BatchedPageStreamError { err, req: req.hdr })], span, ) } @@ -1006,7 +1031,8 @@ impl PageServerHandler { .handle_get_slru_segment_request(&shard, &req, ctx) .instrument(span.clone()) .await - .map(|msg| (msg, timer))], + .map(|msg| (msg, timer)) + .map_err(|err| BatchedPageStreamError { err, req: req.hdr })], span, ) } @@ -1022,7 +1048,7 @@ impl PageServerHandler { // Other handler errors are sent back as an error message and we stay in pagestream protocol. for handler_result in handler_results { let (response_msg, timer) = match handler_result { - Err(e) => match &e { + Err(e) => match &e.err { PageStreamError::Shutdown => { // If we fail to fulfil a request during shutdown, which may be _because_ of // shutdown, then do not send the error to the client. Instead just drop the @@ -1041,13 +1067,14 @@ impl PageServerHandler { // print the all details to the log with {:#}, but for the client the // error message is enough. 
Do not log if shutting down, as the anyhow::Error // here includes cancellation which is not an error. - let full = utils::error::report_compact_sources(&e); + let full = utils::error::report_compact_sources(&e.err); span.in_scope(|| { error!("error reading relation or page version: {full:#}") }); ( PagestreamBeMessage::Error(PagestreamErrorResponse { - message: e.to_string(), + req: e.req, + message: e.err.to_string(), }), None, // TODO: measure errors ) @@ -1060,7 +1087,9 @@ impl PageServerHandler { // marshal & transmit response message // - pgb_writer.write_message_noflush(&BeMessage::CopyData(&response_msg.serialize()))?; + pgb_writer.write_message_noflush(&BeMessage::CopyData( + &response_msg.serialize(protocol_version), + ))?; // We purposefully don't count flush time into the timer. // @@ -1123,7 +1152,7 @@ impl PageServerHandler { pgb: &mut PostgresBackend, tenant_id: TenantId, timeline_id: TimelineId, - _protocol_version: PagestreamProtocolVersion, + protocol_version: PagestreamProtocolVersion, ctx: RequestContext, ) -> Result<(), QueryError> where @@ -1163,6 +1192,7 @@ impl PageServerHandler { timeline_handles, request_span, pipelining_config, + protocol_version, &ctx, ) .await @@ -1175,6 +1205,7 @@ impl PageServerHandler { timeline_id, timeline_handles, request_span, + protocol_version, &ctx, ) .await @@ -1201,6 +1232,7 @@ impl PageServerHandler { timeline_id: TimelineId, mut timeline_handles: TimelineHandles, request_span: Span, + protocol_version: PagestreamProtocolVersion, ctx: &RequestContext, ) -> ( (PostgresBackendReader, TimelineHandles), @@ -1218,6 +1250,7 @@ impl PageServerHandler { &mut timeline_handles, &cancel, ctx, + protocol_version, request_span.clone(), ) .await; @@ -1238,7 +1271,7 @@ impl PageServerHandler { } let err = self - .pagesteam_handle_batched_message(pgb_writer, msg, &cancel, ctx) + .pagesteam_handle_batched_message(pgb_writer, msg, &cancel, protocol_version, ctx) .await; match err { Ok(()) => {} @@ -1261,6 +1294,7 @@ impl PageServerHandler { mut timeline_handles: TimelineHandles, request_span: Span, pipelining_config: PageServicePipeliningConfigPipelined, + protocol_version: PagestreamProtocolVersion, ctx: &RequestContext, ) -> ( (PostgresBackendReader, TimelineHandles), @@ -1358,6 +1392,7 @@ impl PageServerHandler { &mut timeline_handles, &cancel_batcher, &ctx, + protocol_version, request_span.clone(), ) .await; @@ -1403,8 +1438,14 @@ impl PageServerHandler { batch .throttle_and_record_start_processing(&self.cancel) .await?; - self.pagesteam_handle_batched_message(pgb_writer, batch, &cancel, &ctx) - .await?; + self.pagesteam_handle_batched_message( + pgb_writer, + batch, + &cancel, + protocol_version, + &ctx, + ) + .await?; } } }); @@ -1578,8 +1619,8 @@ impl PageServerHandler { let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn(); let lsn = Self::wait_or_get_last_lsn( timeline, - req.request_lsn, - req.not_modified_since, + req.hdr.request_lsn, + req.hdr.not_modified_since, &latest_gc_cutoff_lsn, ctx, ) @@ -1590,6 +1631,7 @@ impl PageServerHandler { .await?; Ok(PagestreamBeMessage::Exists(PagestreamExistsResponse { + req: *req, exists, })) } @@ -1604,8 +1646,8 @@ impl PageServerHandler { let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn(); let lsn = Self::wait_or_get_last_lsn( timeline, - req.request_lsn, - req.not_modified_since, + req.hdr.request_lsn, + req.hdr.not_modified_since, &latest_gc_cutoff_lsn, ctx, ) @@ -1616,6 +1658,7 @@ impl PageServerHandler { .await?; Ok(PagestreamBeMessage::Nblocks(PagestreamNblocksResponse { + 
req: *req, n_blocks, })) } @@ -1630,8 +1673,8 @@ impl PageServerHandler { let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn(); let lsn = Self::wait_or_get_last_lsn( timeline, - req.request_lsn, - req.not_modified_since, + req.hdr.request_lsn, + req.hdr.not_modified_since, &latest_gc_cutoff_lsn, ctx, ) @@ -1643,6 +1686,7 @@ impl PageServerHandler { let db_size = total_blocks as i64 * BLCKSZ as i64; Ok(PagestreamBeMessage::DbSize(PagestreamDbSizeResponse { + req: *req, db_size, })) } @@ -1652,9 +1696,9 @@ impl PageServerHandler { &mut self, timeline: &Timeline, effective_lsn: Lsn, - requests: smallvec::SmallVec<[(RelTag, BlockNumber, SmgrOpTimer); 1]>, + requests: smallvec::SmallVec<[BatchedGetPageRequest; 1]>, ctx: &RequestContext, - ) -> Vec> { + ) -> Vec> { debug_assert_current_span_has_tenant_and_timeline_id(); timeline @@ -1663,7 +1707,7 @@ impl PageServerHandler { let results = timeline .get_rel_page_at_lsn_batched( - requests.iter().map(|(reltag, blkno, _)| (reltag, blkno)), + requests.iter().map(|p| (&p.req.rel, &p.req.blkno)), effective_lsn, ctx, ) @@ -1675,16 +1719,20 @@ impl PageServerHandler { requests .into_iter() .zip(results.into_iter()) - .map(|((_, _, timer), res)| { + .map(|(req, res)| { res.map(|page| { ( PagestreamBeMessage::GetPage(models::PagestreamGetPageResponse { + req: req.req, page, }), - timer, + req.timer, ) }) - .map_err(PageStreamError::from) + .map_err(|e| BatchedPageStreamError { + err: PageStreamError::from(e), + req: req.req.hdr, + }) }), ) } @@ -1699,8 +1747,8 @@ impl PageServerHandler { let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn(); let lsn = Self::wait_or_get_last_lsn( timeline, - req.request_lsn, - req.not_modified_since, + req.hdr.request_lsn, + req.hdr.not_modified_since, &latest_gc_cutoff_lsn, ctx, ) @@ -1711,7 +1759,7 @@ impl PageServerHandler { let segment = timeline.get_slru_segment(kind, req.segno, lsn, ctx).await?; Ok(PagestreamBeMessage::GetSlruSegment( - PagestreamGetSlruSegmentResponse { segment }, + PagestreamGetSlruSegmentResponse { req: *req, segment }, )) } @@ -1906,6 +1954,7 @@ struct FullBackupCmd { struct PageStreamCmd { tenant_id: TenantId, timeline_id: TimelineId, + protocol_version: PagestreamProtocolVersion, } /// `lease lsn tenant timeline lsn` @@ -1926,7 +1975,7 @@ enum PageServiceCmd { } impl PageStreamCmd { - fn parse(query: &str) -> anyhow::Result { + fn parse(query: &str, protocol_version: PagestreamProtocolVersion) -> anyhow::Result { let parameters = query.split_whitespace().collect_vec(); if parameters.len() != 2 { bail!( @@ -1941,6 +1990,7 @@ impl PageStreamCmd { Ok(Self { tenant_id, timeline_id, + protocol_version, }) } } @@ -2078,7 +2128,14 @@ impl PageServiceCmd { bail!("cannot parse query: {query}") }; match cmd.to_ascii_lowercase().as_str() { - "pagestream_v2" => Ok(Self::PageStream(PageStreamCmd::parse(other)?)), + "pagestream_v2" => Ok(Self::PageStream(PageStreamCmd::parse( + other, + PagestreamProtocolVersion::V2, + )?)), + "pagestream_v3" => Ok(Self::PageStream(PageStreamCmd::parse( + other, + PagestreamProtocolVersion::V3, + )?)), "basebackup" => Ok(Self::BaseBackup(BaseBackupCmd::parse(other)?)), "fullbackup" => Ok(Self::FullBackup(FullBackupCmd::parse(other)?)), "lease" => { @@ -2160,25 +2217,21 @@ where PageServiceCmd::PageStream(PageStreamCmd { tenant_id, timeline_id, + protocol_version, }) => { tracing::Span::current() .record("tenant_id", field::display(tenant_id)) .record("timeline_id", field::display(timeline_id)); self.check_permission(Some(tenant_id))?; + let command_kind = 
match protocol_version { + PagestreamProtocolVersion::V2 => ComputeCommandKind::PageStreamV2, + PagestreamProtocolVersion::V3 => ComputeCommandKind::PageStreamV3, + }; + COMPUTE_COMMANDS_COUNTERS.for_command(command_kind).inc(); - COMPUTE_COMMANDS_COUNTERS - .for_command(ComputeCommandKind::PageStreamV2) - .inc(); - - self.handle_pagerequests( - pgb, - tenant_id, - timeline_id, - PagestreamProtocolVersion::V2, - ctx, - ) - .await?; + self.handle_pagerequests(pgb, tenant_id, timeline_id, protocol_version, ctx) + .await?; } PageServiceCmd::BaseBackup(BaseBackupCmd { tenant_id, @@ -2357,7 +2410,8 @@ mod tests { cmd, PageServiceCmd::PageStream(PageStreamCmd { tenant_id, - timeline_id + timeline_id, + protocol_version: PagestreamProtocolVersion::V2, }) ); let cmd = PageServiceCmd::parse(&format!("basebackup {tenant_id} {timeline_id}")).unwrap(); diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs index 14c7e0d2f8..b65fe6cf7c 100644 --- a/pageserver/src/pgdatadir_mapping.rs +++ b/pageserver/src/pgdatadir_mapping.rs @@ -627,7 +627,7 @@ impl Timeline { // cannot overflow, high and low are both smaller than u64::MAX / 2 let mid = (high + low) / 2; - let cmp = self + let cmp = match self .is_latest_commit_timestamp_ge_than( search_timestamp, Lsn(mid * 8), @@ -635,7 +635,16 @@ impl Timeline { &mut found_larger, ctx, ) - .await?; + .await + { + Ok(res) => res, + Err(PageReconstructError::MissingKey(e)) => { + warn!("Missing key while find_lsn_for_timestamp. Either we might have already garbage-collected that data or the key is really missing. Last error: {:#}", e); + // Return that we didn't find any requests smaller than the LSN, and logging the error. + return Ok(LsnForTimestamp::Past(min_lsn)); + } + Err(e) => return Err(e), + }; if cmp { high = mid; @@ -643,6 +652,7 @@ impl Timeline { low = mid + 1; } } + // If `found_smaller == true`, `low = t + 1` where `t` is the target LSN, // so the LSN of the last commit record before or at `search_timestamp`. // Remove one from `low` to get `t`. diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index e3dab2fc1d..8e61d09de7 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -48,6 +48,7 @@ use timeline::compaction::GcCompactJob; use timeline::compaction::ScheduledCompactionTask; use timeline::import_pgdata; use timeline::offload::offload_timeline; +use timeline::offload::OffloadError; use timeline::CompactFlags; use timeline::CompactOptions; use timeline::CompactionError; @@ -2039,7 +2040,7 @@ impl Tenant { ) -> Result, TimelineArchivalError> { info!("unoffloading timeline"); - // We activate the timeline below manually, so this must be called on an active timeline. + // We activate the timeline below manually, so this must be called on an active tenant. // We expect callers of this function to ensure this. match self.current_state() { TenantState::Activating { .. 
} @@ -3100,9 +3101,17 @@ impl Tenant { }; has_pending_task |= pending_task_left.unwrap_or(false); if pending_task_left == Some(false) && *can_offload { - offload_timeline(self, timeline) + pausable_failpoint!("before-timeline-auto-offload"); + match offload_timeline(self, timeline) .instrument(info_span!("offload_timeline", %timeline_id)) - .await?; + .await + { + Err(OffloadError::NotArchived) => { + // Ignore this, we likely raced with unarchival + Ok(()) + } + other => other, + }?; } } diff --git a/pageserver/src/tenant/remote_timeline_client.rs b/pageserver/src/tenant/remote_timeline_client.rs index b27ac3e933..813111245d 100644 --- a/pageserver/src/tenant/remote_timeline_client.rs +++ b/pageserver/src/tenant/remote_timeline_client.rs @@ -304,6 +304,15 @@ pub enum WaitCompletionError { #[derive(Debug, thiserror::Error)] #[error("Upload queue either in unexpected state or hasn't downloaded manifest yet")] pub struct UploadQueueNotReadyError; + +#[derive(Debug, thiserror::Error)] +pub enum ShutdownIfArchivedError { + #[error(transparent)] + NotInitialized(NotInitialized), + #[error("timeline is not archived")] + NotArchived, +} + /// Behavioral modes that enable seamless live migration. /// /// See docs/rfcs/028-pageserver-migration.md to understand how these fit in. @@ -816,6 +825,55 @@ impl RemoteTimelineClient { Ok(need_wait) } + /// Shuts the timeline client down, but only if the timeline is archived. + /// + /// This function and [`Self::schedule_index_upload_for_timeline_archival_state`] use the + /// same lock to prevent races between unarchival and offloading: unarchival requires the + /// upload queue to be initialized, and leaves behind an upload queue where either dirty + /// or clean has archived_at of `None`. offloading leaves behind an uninitialized upload + /// queue. + pub(crate) async fn shutdown_if_archived( + self: &Arc, + ) -> Result<(), ShutdownIfArchivedError> { + { + let mut guard = self.upload_queue.lock().unwrap(); + let upload_queue = guard + .initialized_mut() + .map_err(ShutdownIfArchivedError::NotInitialized)?; + + match ( + upload_queue.dirty.archived_at.is_none(), + upload_queue.clean.0.archived_at.is_none(), + ) { + // The expected case: the timeline is archived and we don't want to unarchive + (false, false) => {} + (true, false) => { + tracing::info!("can't shut down timeline: timeline slated for unarchival"); + return Err(ShutdownIfArchivedError::NotArchived); + } + (dirty_archived, true) => { + tracing::info!(%dirty_archived, "can't shut down timeline: timeline not archived in remote storage"); + return Err(ShutdownIfArchivedError::NotArchived); + } + } + + // Set the shutting_down flag while the guard from the archival check is held. + // This prevents a race with unarchival, as initialized_mut will not return + // an upload queue from this point. + // Also launch the queued tasks like shutdown() does. + if !upload_queue.shutting_down { + upload_queue.shutting_down = true; + upload_queue.queued_operations.push_back(UploadOp::Shutdown); + // this operation is not counted similar to Barrier + self.launch_queued_tasks(upload_queue); + } + } + + self.shutdown().await; + + Ok(()) + } + /// Launch an index-file upload operation in the background, setting `import_pgdata` field. 
pub(crate) fn schedule_index_upload_for_import_pgdata_state_update( self: &Arc, diff --git a/pageserver/src/tenant/timeline/delete.rs b/pageserver/src/tenant/timeline/delete.rs index 47a93b19d2..ae44af3fad 100644 --- a/pageserver/src/tenant/timeline/delete.rs +++ b/pageserver/src/tenant/timeline/delete.rs @@ -194,7 +194,9 @@ impl DeleteTimelineFlow { super::debug_assert_current_span_has_tenant_and_timeline_id(); let allow_offloaded_children = false; - let (timeline, mut guard) = Self::prepare(tenant, timeline_id, allow_offloaded_children)?; + let set_stopping = true; + let (timeline, mut guard) = + Self::prepare(tenant, timeline_id, allow_offloaded_children, set_stopping)?; guard.mark_in_progress()?; @@ -334,6 +336,7 @@ impl DeleteTimelineFlow { tenant: &Tenant, timeline_id: TimelineId, allow_offloaded_children: bool, + set_stopping: bool, ) -> Result<(TimelineOrOffloaded, DeletionGuard), DeleteTimelineError> { // Note the interaction between this guard and deletion guard. // Here we attempt to lock deletion guard when we're holding a lock on timelines. @@ -389,8 +392,10 @@ impl DeleteTimelineFlow { } }; - if let TimelineOrOffloaded::Timeline(timeline) = &timeline { - timeline.set_state(TimelineState::Stopping); + if set_stopping { + if let TimelineOrOffloaded::Timeline(timeline) = &timeline { + timeline.set_state(TimelineState::Stopping); + } } Ok((timeline, delete_lock_guard)) diff --git a/pageserver/src/tenant/timeline/offload.rs b/pageserver/src/tenant/timeline/offload.rs index 15628a9645..6c6b19e8b1 100644 --- a/pageserver/src/tenant/timeline/offload.rs +++ b/pageserver/src/tenant/timeline/offload.rs @@ -1,10 +1,11 @@ use std::sync::Arc; -use pageserver_api::models::TenantState; +use pageserver_api::models::{TenantState, TimelineState}; use super::delete::{delete_local_timeline_directory, DeleteTimelineFlow, DeletionGuard}; use super::Timeline; use crate::span::debug_assert_current_span_has_tenant_and_timeline_id; +use crate::tenant::remote_timeline_client::ShutdownIfArchivedError; use crate::tenant::{OffloadedTimeline, Tenant, TenantManifestError, TimelineOrOffloaded}; #[derive(thiserror::Error, Debug)] @@ -36,28 +37,29 @@ pub(crate) async fn offload_timeline( tracing::info!("offloading archived timeline"); let allow_offloaded_children = true; - let (timeline, guard) = - DeleteTimelineFlow::prepare(tenant, timeline.timeline_id, allow_offloaded_children) - .map_err(|e| OffloadError::Other(anyhow::anyhow!(e)))?; + let set_stopping = false; + let (timeline, guard) = DeleteTimelineFlow::prepare( + tenant, + timeline.timeline_id, + allow_offloaded_children, + set_stopping, + ) + .map_err(|e| OffloadError::Other(anyhow::anyhow!(e)))?; let TimelineOrOffloaded::Timeline(timeline) = timeline else { tracing::error!("timeline already offloaded, but given timeline object"); return Ok(()); }; - let is_archived = timeline.is_archived(); - match is_archived { - Some(true) => (), - Some(false) => { - tracing::warn!("tried offloading a non-archived timeline"); - return Err(OffloadError::NotArchived); - } - None => { - // This is legal: calls to this function can race with the timeline shutting down - tracing::info!("tried offloading a timeline whose remote storage is not initialized"); - return Err(OffloadError::Cancelled); + match timeline.remote_client.shutdown_if_archived().await { + Ok(()) => {} + Err(ShutdownIfArchivedError::NotInitialized(_)) => { + // Either the timeline is being deleted, the operation is being retried, or we are shutting down. 
+ // Don't return cancelled here to keep it idempotent. } + Err(ShutdownIfArchivedError::NotArchived) => return Err(OffloadError::NotArchived), } + timeline.set_state(TimelineState::Stopping); // Now that the Timeline is in Stopping state, request all the related tasks to shut down. timeline.shutdown(super::ShutdownMode::Reload).await; diff --git a/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs b/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs index 3f10eeda60..d74faa1af5 100644 --- a/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs +++ b/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs @@ -319,27 +319,11 @@ pub(super) async fn handle_walreceiver_connection( return Ok(()); } - async fn commit( - modification: &mut DatadirModification<'_>, - uncommitted: &mut u64, - filtered: &mut u64, - ctx: &RequestContext, - ) -> anyhow::Result<()> { - WAL_INGEST - .records_committed - .inc_by(*uncommitted - *filtered); - modification.commit(ctx).await?; - *uncommitted = 0; - *filtered = 0; - Ok(()) - } - let status_update = match replication_message { ReplicationMessage::RawInterpretedWalRecords(raw) => { WAL_INGEST.bytes_received.inc_by(raw.data().len() as u64); let mut uncommitted_records = 0; - let mut filtered_records = 0; // This is the end LSN of the raw WAL from which the records // were interpreted. @@ -380,31 +364,23 @@ pub(super) async fn handle_walreceiver_connection( if matches!(interpreted.flush_uncommitted, FlushUncommittedRecords::Yes) && uncommitted_records > 0 { - commit( - &mut modification, - &mut uncommitted_records, - &mut filtered_records, - &ctx, - ) - .await?; + modification.commit(&ctx).await?; + uncommitted_records = 0; } let local_next_record_lsn = interpreted.next_record_lsn; - let ingested = walingest + + if interpreted.is_observed() { + WAL_INGEST.records_observed.inc(); + } + + walingest .ingest_record(interpreted, &mut modification, &ctx) .await .with_context(|| { format!("could not ingest record at {local_next_record_lsn}") })?; - if !ingested { - tracing::debug!( - "ingest: filtered out record @ LSN {local_next_record_lsn}" - ); - WAL_INGEST.records_filtered.inc(); - filtered_records += 1; - } - uncommitted_records += 1; // FIXME: this cannot be made pausable_failpoint without fixing the @@ -418,13 +394,8 @@ pub(super) async fn handle_walreceiver_connection( || modification.approx_pending_bytes() > DatadirModification::MAX_PENDING_BYTES { - commit( - &mut modification, - &mut uncommitted_records, - &mut filtered_records, - &ctx, - ) - .await?; + modification.commit(&ctx).await?; + uncommitted_records = 0; } } @@ -442,13 +413,7 @@ pub(super) async fn handle_walreceiver_connection( if uncommitted_records > 0 || needs_last_record_lsn_advance { // Commit any uncommitted records - commit( - &mut modification, - &mut uncommitted_records, - &mut filtered_records, - &ctx, - ) - .await?; + modification.commit(&ctx).await?; } if !caught_up && streaming_lsn >= end_of_wal { @@ -469,6 +434,21 @@ pub(super) async fn handle_walreceiver_connection( } ReplicationMessage::XLogData(xlog_data) => { + async fn commit( + modification: &mut DatadirModification<'_>, + uncommitted: &mut u64, + filtered: &mut u64, + ctx: &RequestContext, + ) -> anyhow::Result<()> { + WAL_INGEST + .records_committed + .inc_by(*uncommitted - *filtered); + modification.commit(ctx).await?; + *uncommitted = 0; + *filtered = 0; + Ok(()) + } + // Pass the WAL data to the decoder, and see if we can decode // more records as a result. 
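// (Editor's note, not part of the original patch.) The commit() helper above was scoped
// to this XLogData arm because only this legacy decoding path still tracks filtered
// records; the interpreted-records branch earlier now calls modification.commit()
// directly and counts metadata-only records via WAL_INGEST.records_observed instead.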
let data = xlog_data.data(); diff --git a/pgxn/neon/libpagestore.c b/pgxn/neon/libpagestore.c index fa2a570ea8..769befb4e5 100644 --- a/pgxn/neon/libpagestore.c +++ b/pgxn/neon/libpagestore.c @@ -556,6 +556,9 @@ pageserver_connect(shardno_t shard_no, int elevel) switch (neon_protocol_version) { + case 3: + pagestream_query = psprintf("pagestream_v3 %s %s", neon_tenant, neon_timeline); + break; case 2: pagestream_query = psprintf("pagestream_v2 %s %s", neon_tenant, neon_timeline); break; @@ -1135,9 +1138,9 @@ pg_init_libpagestore(void) "Version of compute<->page server protocol", NULL, &neon_protocol_version, - 2, /* use protocol version 2 */ - 2, /* min */ - 2, /* max */ + 2, /* use protocol version 2 */ + 2, /* min */ + 3, /* max */ PGC_SU_BACKEND, 0, /* no flags required */ NULL, NULL, NULL); diff --git a/pgxn/neon/pagestore_client.h b/pgxn/neon/pagestore_client.h index f905e3b0fa..37bc4f7886 100644 --- a/pgxn/neon/pagestore_client.h +++ b/pgxn/neon/pagestore_client.h @@ -44,10 +44,15 @@ typedef enum T_NeonGetSlruSegmentResponse, } NeonMessageTag; +typedef uint64 NeonRequestId; + /* base struct for c-style inheritance */ typedef struct { NeonMessageTag tag; + NeonRequestId reqid; + XLogRecPtr lsn; + XLogRecPtr not_modified_since; } NeonMessage; #define messageTag(m) (((const NeonMessage *)(m))->tag) @@ -67,6 +72,7 @@ typedef enum { SLRU_MULTIXACT_OFFSETS } SlruKind; + /*-- * supertype of all the Neon*Request structs below. * @@ -87,37 +93,37 @@ typedef enum { * * These structs describe the V2 of these requests. (The old now-defunct V1 * protocol contained just one LSN and a boolean 'latest' flag.) + * + * V3 version of protocol adds request ID to all requests. This request ID is also included in response + * as well as other fields from requests, which allows to verify that we receive response for our request. + * We copy fields from request to response to make checking more reliable: request ID is formed from process ID + * and local counter, so in principle there can be duplicated requests IDs if process PID is reused. 
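 *
 * (Editor's note, not part of the original patch.) For reference, the resulting V3 wire
 * layout, as implemented by nm_pack_request() and nm_unpack_response() in pagestore_smgr.c, is:
 *   request:  tag(1) | reqid(8) | lsn(8) | not_modified_since(8) | request-specific fields
 *   response: tag(1) | reqid(8) | lsn(8) | not_modified_since(8) | echoed request fields | payload
 * For example, a GetPage response echoes spcnode, dbnode, relnode, forknum and blkno before
 * the BLCKSZ-byte page image, which lets the smgr cross-check each reply against the request
 * it issued.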
*/ -typedef struct -{ - NeonMessageTag tag; - XLogRecPtr lsn; - XLogRecPtr not_modified_since; -} NeonRequest; +typedef NeonMessage NeonRequest; typedef struct { - NeonRequest req; + NeonRequest hdr; NRelFileInfo rinfo; ForkNumber forknum; } NeonExistsRequest; typedef struct { - NeonRequest req; + NeonRequest hdr; NRelFileInfo rinfo; ForkNumber forknum; } NeonNblocksRequest; typedef struct { - NeonRequest req; + NeonRequest hdr; Oid dbNode; } NeonDbSizeRequest; typedef struct { - NeonRequest req; + NeonRequest hdr; NRelFileInfo rinfo; ForkNumber forknum; BlockNumber blkno; @@ -125,32 +131,29 @@ typedef struct typedef struct { - NeonRequest req; - SlruKind kind; - int segno; + NeonRequest hdr; + SlruKind kind; + int segno; } NeonGetSlruSegmentRequest; /* supertype of all the Neon*Response structs below */ -typedef struct -{ - NeonMessageTag tag; -} NeonResponse; +typedef NeonMessage NeonResponse; typedef struct { - NeonMessageTag tag; + NeonExistsRequest req; bool exists; } NeonExistsResponse; typedef struct { - NeonMessageTag tag; + NeonNblocksRequest req; uint32 n_blocks; } NeonNblocksResponse; typedef struct { - NeonMessageTag tag; + NeonGetPageRequest req; char page[FLEXIBLE_ARRAY_MEMBER]; } NeonGetPageResponse; @@ -158,21 +161,21 @@ typedef struct typedef struct { - NeonMessageTag tag; + NeonDbSizeRequest req; int64 db_size; } NeonDbSizeResponse; typedef struct { - NeonMessageTag tag; + NeonResponse req; char message[FLEXIBLE_ARRAY_MEMBER]; /* null-terminated error * message */ } NeonErrorResponse; typedef struct { - NeonMessageTag tag; - int n_blocks; + NeonGetSlruSegmentRequest req; + int n_blocks; char data[BLCKSZ * SLRU_PAGES_PER_SEGMENT]; } NeonGetSlruSegmentResponse; diff --git a/pgxn/neon/pagestore_smgr.c b/pgxn/neon/pagestore_smgr.c index b733807026..7a4c0ef487 100644 --- a/pgxn/neon/pagestore_smgr.c +++ b/pgxn/neon/pagestore_smgr.c @@ -120,6 +120,9 @@ static bool (*old_redo_read_buffer_filter) (XLogReaderState *record, uint8 block static BlockNumber neon_nblocks(SMgrRelation reln, ForkNumber forknum); +static uint32 local_request_counter; +#define GENERATE_REQUEST_ID() (((NeonRequestId)MyProcPid << 32) | ++local_request_counter) + /* * Prefetch implementation: * @@ -188,15 +191,11 @@ typedef struct PrefetchRequest uint8 status; /* see PrefetchStatus for valid values */ uint8 flags; /* see PrefetchRequestFlags */ neon_request_lsns request_lsns; + NeonRequestId reqid; NeonResponse *response; /* may be null */ uint64 my_ring_index; } PrefetchRequest; -StaticAssertDecl(sizeof(PrefetchRequest) == 64, - "We prefer to have a power-of-2 size for this struct. 
Please" - " try to find an alternative solution before reaching to" - " increase the expected size here"); - /* prefetch buffer lookup hash table */ typedef struct PrfHashEntry @@ -365,6 +364,7 @@ compact_prefetch_buffers(void) target_slot->shard_no = source_slot->shard_no; target_slot->status = source_slot->status; target_slot->response = source_slot->response; + target_slot->reqid = source_slot->reqid; target_slot->request_lsns = source_slot->request_lsns; target_slot->my_ring_index = empty_ring_index; @@ -798,7 +798,8 @@ prefetch_do_request(PrefetchRequest *slot, neon_request_lsns *force_request_lsns uint64 mySlotNo PG_USED_FOR_ASSERTS_ONLY = slot->my_ring_index; NeonGetPageRequest request = { - .req.tag = T_NeonGetPageRequest, + .hdr.tag = T_NeonGetPageRequest, + .hdr.reqid = GENERATE_REQUEST_ID(), /* lsn and not_modified_since are filled in below */ .rinfo = BufTagGetNRelFileInfo(slot->buftag), .forknum = slot->buftag.forkNum, @@ -807,14 +808,16 @@ prefetch_do_request(PrefetchRequest *slot, neon_request_lsns *force_request_lsns Assert(mySlotNo == MyPState->ring_unused); + slot->reqid = request.hdr.reqid; + if (force_request_lsns) slot->request_lsns = *force_request_lsns; else neon_get_request_lsns(BufTagGetNRelFileInfo(slot->buftag), slot->buftag.forkNum, slot->buftag.blockNum, &slot->request_lsns, 1, NULL); - request.req.lsn = slot->request_lsns.request_lsn; - request.req.not_modified_since = slot->request_lsns.not_modified_since; + request.hdr.lsn = slot->request_lsns.request_lsn; + request.hdr.not_modified_since = slot->request_lsns.not_modified_since; Assert(slot->response == NULL); Assert(slot->my_ring_index == MyPState->ring_unused); @@ -1102,6 +1105,12 @@ Retry: return min_ring_index; } +static bool +equal_requests(NeonRequest* a, NeonRequest* b) +{ + return a->reqid == b->reqid && a->lsn == b->lsn && a->not_modified_since == b->not_modified_since; +} + /* * Note: this function can get canceled and use a long jump to the next catch @@ -1184,6 +1193,10 @@ nm_pack_request(NeonRequest *msg) initStringInfo(&s); pq_sendbyte(&s, msg->tag); + if (neon_protocol_version >= 3) + { + pq_sendint64(&s, msg->reqid); + } pq_sendint64(&s, msg->lsn); pq_sendint64(&s, msg->not_modified_since); @@ -1261,8 +1274,16 @@ NeonResponse * nm_unpack_response(StringInfo s) { NeonMessageTag tag = pq_getmsgbyte(s); + NeonResponse resp_hdr = {0}; /* make valgrind happy */ NeonResponse *resp = NULL; + resp_hdr.tag = tag; + if (neon_protocol_version >= 3) + { + resp_hdr.reqid = pq_getmsgint64(s); + resp_hdr.lsn = pq_getmsgint64(s); + resp_hdr.not_modified_since = pq_getmsgint64(s); + } switch (tag) { /* pagestore -> pagestore_client */ @@ -1270,7 +1291,14 @@ nm_unpack_response(StringInfo s) { NeonExistsResponse *msg_resp = palloc0(sizeof(NeonExistsResponse)); - msg_resp->tag = tag; + if (neon_protocol_version >= 3) + { + NInfoGetSpcOid(msg_resp->req.rinfo) = pq_getmsgint(s, 4); + NInfoGetDbOid(msg_resp->req.rinfo) = pq_getmsgint(s, 4); + NInfoGetRelNumber(msg_resp->req.rinfo) = pq_getmsgint(s, 4); + msg_resp->req.forknum = pq_getmsgbyte(s); + } + msg_resp->req.hdr = resp_hdr; msg_resp->exists = pq_getmsgbyte(s); pq_getmsgend(s); @@ -1282,7 +1310,14 @@ nm_unpack_response(StringInfo s) { NeonNblocksResponse *msg_resp = palloc0(sizeof(NeonNblocksResponse)); - msg_resp->tag = tag; + if (neon_protocol_version >= 3) + { + NInfoGetSpcOid(msg_resp->req.rinfo) = pq_getmsgint(s, 4); + NInfoGetDbOid(msg_resp->req.rinfo) = pq_getmsgint(s, 4); + NInfoGetRelNumber(msg_resp->req.rinfo) = pq_getmsgint(s, 4); + 
msg_resp->req.forknum = pq_getmsgbyte(s); + } + msg_resp->req.hdr = resp_hdr; msg_resp->n_blocks = pq_getmsgint(s, 4); pq_getmsgend(s); @@ -1295,12 +1330,20 @@ nm_unpack_response(StringInfo s) NeonGetPageResponse *msg_resp; msg_resp = MemoryContextAllocZero(MyPState->bufctx, PS_GETPAGERESPONSE_SIZE); - msg_resp->tag = tag; + if (neon_protocol_version >= 3) + { + NInfoGetSpcOid(msg_resp->req.rinfo) = pq_getmsgint(s, 4); + NInfoGetDbOid(msg_resp->req.rinfo) = pq_getmsgint(s, 4); + NInfoGetRelNumber(msg_resp->req.rinfo) = pq_getmsgint(s, 4); + msg_resp->req.forknum = pq_getmsgbyte(s); + msg_resp->req.blkno = pq_getmsgint(s, 4); + } + msg_resp->req.hdr = resp_hdr; /* XXX: should be varlena */ memcpy(msg_resp->page, pq_getmsgbytes(s, BLCKSZ), BLCKSZ); pq_getmsgend(s); - Assert(msg_resp->tag == T_NeonGetPageResponse); + Assert(msg_resp->req.hdr.tag == T_NeonGetPageResponse); resp = (NeonResponse *) msg_resp; break; @@ -1310,7 +1353,11 @@ nm_unpack_response(StringInfo s) { NeonDbSizeResponse *msg_resp = palloc0(sizeof(NeonDbSizeResponse)); - msg_resp->tag = tag; + if (neon_protocol_version >= 3) + { + msg_resp->req.dbNode = pq_getmsgint(s, 4); + } + msg_resp->req.hdr = resp_hdr; msg_resp->db_size = pq_getmsgint64(s); pq_getmsgend(s); @@ -1328,7 +1375,7 @@ nm_unpack_response(StringInfo s) msglen = strlen(msgtext); msg_resp = palloc0(sizeof(NeonErrorResponse) + msglen + 1); - msg_resp->tag = tag; + msg_resp->req = resp_hdr; memcpy(msg_resp->message, msgtext, msglen + 1); pq_getmsgend(s); @@ -1339,9 +1386,17 @@ nm_unpack_response(StringInfo s) case T_NeonGetSlruSegmentResponse: { NeonGetSlruSegmentResponse *msg_resp; - int n_blocks = pq_getmsgint(s, 4); - msg_resp = palloc(sizeof(NeonGetSlruSegmentResponse)); - msg_resp->tag = tag; + int n_blocks; + msg_resp = palloc0(sizeof(NeonGetSlruSegmentResponse)); + + if (neon_protocol_version >= 3) + { + msg_resp->req.kind = pq_getmsgbyte(s); + msg_resp->req.segno = pq_getmsgint(s, 4); + } + msg_resp->req.hdr = resp_hdr; + + n_blocks = pq_getmsgint(s, 4); msg_resp->n_blocks = n_blocks; memcpy(msg_resp->data, pq_getmsgbytes(s, n_blocks * BLCKSZ), n_blocks * BLCKSZ); pq_getmsgend(s); @@ -1386,8 +1441,8 @@ nm_to_string(NeonMessage *msg) appendStringInfoString(&s, "{\"type\": \"NeonExistsRequest\""); appendStringInfo(&s, ", \"rinfo\": \"%u/%u/%u\"", RelFileInfoFmt(msg_req->rinfo)); appendStringInfo(&s, ", \"forknum\": %d", msg_req->forknum); - appendStringInfo(&s, ", \"lsn\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->req.lsn)); - appendStringInfo(&s, ", \"not_modified_since\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->req.not_modified_since)); + appendStringInfo(&s, ", \"lsn\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->hdr.lsn)); + appendStringInfo(&s, ", \"not_modified_since\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->hdr.not_modified_since)); appendStringInfoChar(&s, '}'); break; } @@ -1399,8 +1454,8 @@ nm_to_string(NeonMessage *msg) appendStringInfoString(&s, "{\"type\": \"NeonNblocksRequest\""); appendStringInfo(&s, ", \"rinfo\": \"%u/%u/%u\"", RelFileInfoFmt(msg_req->rinfo)); appendStringInfo(&s, ", \"forknum\": %d", msg_req->forknum); - appendStringInfo(&s, ", \"lsn\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->req.lsn)); - appendStringInfo(&s, ", \"not_modified_since\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->req.not_modified_since)); + appendStringInfo(&s, ", \"lsn\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->hdr.lsn)); + appendStringInfo(&s, ", \"not_modified_since\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->hdr.not_modified_since)); appendStringInfoChar(&s, '}'); break; } @@ -1413,8 
+1468,8 @@ nm_to_string(NeonMessage *msg) appendStringInfo(&s, ", \"rinfo\": \"%u/%u/%u\"", RelFileInfoFmt(msg_req->rinfo)); appendStringInfo(&s, ", \"forknum\": %d", msg_req->forknum); appendStringInfo(&s, ", \"blkno\": %u", msg_req->blkno); - appendStringInfo(&s, ", \"lsn\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->req.lsn)); - appendStringInfo(&s, ", \"not_modified_since\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->req.not_modified_since)); + appendStringInfo(&s, ", \"lsn\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->hdr.lsn)); + appendStringInfo(&s, ", \"not_modified_since\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->hdr.not_modified_since)); appendStringInfoChar(&s, '}'); break; } @@ -1424,8 +1479,8 @@ nm_to_string(NeonMessage *msg) appendStringInfoString(&s, "{\"type\": \"NeonDbSizeRequest\""); appendStringInfo(&s, ", \"dbnode\": \"%u\"", msg_req->dbNode); - appendStringInfo(&s, ", \"lsn\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->req.lsn)); - appendStringInfo(&s, ", \"not_modified_since\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->req.not_modified_since)); + appendStringInfo(&s, ", \"lsn\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->hdr.lsn)); + appendStringInfo(&s, ", \"not_modified_since\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->hdr.not_modified_since)); appendStringInfoChar(&s, '}'); break; } @@ -1436,8 +1491,8 @@ nm_to_string(NeonMessage *msg) appendStringInfoString(&s, "{\"type\": \"NeonGetSlruSegmentRequest\""); appendStringInfo(&s, ", \"kind\": %u", msg_req->kind); appendStringInfo(&s, ", \"segno\": %u", msg_req->segno); - appendStringInfo(&s, ", \"lsn\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->req.lsn)); - appendStringInfo(&s, ", \"not_modified_since\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->req.not_modified_since)); + appendStringInfo(&s, ", \"lsn\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->hdr.lsn)); + appendStringInfo(&s, ", \"not_modified_since\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->hdr.not_modified_since)); appendStringInfoChar(&s, '}'); break; } @@ -2312,39 +2367,64 @@ neon_exists(SMgrRelation reln, ForkNumber forkNum) REL_METADATA_PSEUDO_BLOCKNO, &request_lsns, 1, NULL); { NeonExistsRequest request = { - .req.tag = T_NeonExistsRequest, - .req.lsn = request_lsns.request_lsn, - .req.not_modified_since = request_lsns.not_modified_since, + .hdr.tag = T_NeonExistsRequest, + .hdr.reqid = GENERATE_REQUEST_ID(), + .hdr.lsn = request_lsns.request_lsn, + .hdr.not_modified_since = request_lsns.not_modified_since, .rinfo = InfoFromSMgrRel(reln), .forknum = forkNum }; resp = page_server_request(&request); + + switch (resp->tag) + { + case T_NeonExistsResponse: + { + NeonExistsResponse* exists_resp = (NeonExistsResponse *) resp; + if (neon_protocol_version >= 3) + { + if (!equal_requests(resp, &request.hdr) || + !RelFileInfoEquals(exists_resp->req.rinfo, request.rinfo) || + exists_resp->req.forknum != request.forknum) + { + NEON_PANIC_CONNECTION_STATE(-1, PANIC, + "Unexpect response {reqid=%lx,lsn=%X/%08X, since=%X/%08X, rel=%u/%u/%u.%u} to exits request {reqid=%lx,lsn=%X/%08X, since=%X/%08X, rel=%u/%u/%u.%u}", + resp->reqid, LSN_FORMAT_ARGS(resp->lsn), LSN_FORMAT_ARGS(resp->not_modified_since), RelFileInfoFmt(exists_resp->req.rinfo), exists_resp->req.forknum, + request.hdr.reqid, LSN_FORMAT_ARGS(request.hdr.lsn), LSN_FORMAT_ARGS(request.hdr.not_modified_since), RelFileInfoFmt(request.rinfo), request.forknum); + } + } + exists = exists_resp->exists; + break; + } + case T_NeonErrorResponse: + if (neon_protocol_version >= 3) + { + if (!equal_requests(resp, &request.hdr)) + { + elog(WARNING, NEON_TAG "Error message 
{reqid=%lx,lsn=%X/%08X, since=%X/%08X} doesn't match exists request {reqid=%lx,lsn=%X/%08X, since=%X/%08X}", + resp->reqid, LSN_FORMAT_ARGS(resp->lsn), LSN_FORMAT_ARGS(resp->not_modified_since), + request.hdr.reqid, LSN_FORMAT_ARGS(request.hdr.lsn), LSN_FORMAT_ARGS(request.hdr.not_modified_since)); + } + } + ereport(ERROR, + (errcode(ERRCODE_IO_ERROR), + errmsg(NEON_TAG "[reqid %lx] could not read relation existence of rel %u/%u/%u.%u from page server at lsn %X/%08X", + resp->reqid, + RelFileInfoFmt(InfoFromSMgrRel(reln)), + forkNum, + LSN_FORMAT_ARGS(request_lsns.effective_request_lsn)), + errdetail("page server returned error: %s", + ((NeonErrorResponse *) resp)->message))); + break; + + default: + NEON_PANIC_CONNECTION_STATE(-1, PANIC, + "Expected Exists (0x%02x) or Error (0x%02x) response to ExistsRequest, but got 0x%02x", + T_NeonExistsResponse, T_NeonErrorResponse, resp->tag); + } + pfree(resp); } - - switch (resp->tag) - { - case T_NeonExistsResponse: - exists = ((NeonExistsResponse *) resp)->exists; - break; - - case T_NeonErrorResponse: - ereport(ERROR, - (errcode(ERRCODE_IO_ERROR), - errmsg(NEON_TAG "could not read relation existence of rel %u/%u/%u.%u from page server at lsn %X/%08X", - RelFileInfoFmt(InfoFromSMgrRel(reln)), - forkNum, - LSN_FORMAT_ARGS(request_lsns.effective_request_lsn)), - errdetail("page server returned error: %s", - ((NeonErrorResponse *) resp)->message))); - break; - - default: - NEON_PANIC_CONNECTION_STATE(-1, PANIC, - "Expected Exists (0x%02x) or Error (0x%02x) response to ExistsRequest, but got 0x%02x", - T_NeonExistsResponse, T_NeonErrorResponse, resp->tag); - } - pfree(resp); return exists; } @@ -2952,15 +3032,43 @@ Retry: switch (resp->tag) { case T_NeonGetPageResponse: - memcpy(buffer, ((NeonGetPageResponse *) resp)->page, BLCKSZ); + { + NeonGetPageResponse* getpage_resp = (NeonGetPageResponse *) resp; + if (neon_protocol_version >= 3) + { + if (resp->reqid != slot->reqid || + resp->lsn != slot->request_lsns.request_lsn || + resp->not_modified_since != slot->request_lsns.not_modified_since || + !RelFileInfoEquals(getpage_resp->req.rinfo, rinfo) || + getpage_resp->req.forknum != forkNum || + getpage_resp->req.blkno != base_blockno + i) + { + NEON_PANIC_CONNECTION_STATE(-1, PANIC, + "Unexpect response {reqid=%lx,lsn=%X/%08X, since=%X/%08X, rel=%u/%u/%u.%u, block=%u} to get page request {reqid=%lx,lsn=%X/%08X, since=%X/%08X, rel=%u/%u/%u.%u, block=%u}", + resp->reqid, LSN_FORMAT_ARGS(resp->lsn), LSN_FORMAT_ARGS(resp->not_modified_since), RelFileInfoFmt(getpage_resp->req.rinfo), getpage_resp->req.forknum, getpage_resp->req.blkno, + slot->reqid, LSN_FORMAT_ARGS(slot->request_lsns.request_lsn), LSN_FORMAT_ARGS(slot->request_lsns.not_modified_since), RelFileInfoFmt(rinfo), forkNum, base_blockno + i); + } + } + memcpy(buffer, getpage_resp->page, BLCKSZ); lfc_write(rinfo, forkNum, blockno, buffer); break; - + } case T_NeonErrorResponse: + if (neon_protocol_version >= 3) + { + if (resp->reqid != slot->reqid || + resp->lsn != slot->request_lsns.request_lsn || + resp->not_modified_since != slot->request_lsns.not_modified_since) + { + elog(WARNING, NEON_TAG "Error message {reqid=%lx,lsn=%X/%08X, since=%X/%08X} doesn't match get relsize request {reqid=%lx,lsn=%X/%08X, since=%X/%08X}", + resp->reqid, LSN_FORMAT_ARGS(resp->lsn), LSN_FORMAT_ARGS(resp->not_modified_since), + slot->reqid, LSN_FORMAT_ARGS(slot->request_lsns.request_lsn), LSN_FORMAT_ARGS(slot->request_lsns.not_modified_since)); + } + } ereport(ERROR, (errcode(ERRCODE_IO_ERROR), - errmsg(NEON_TAG "[shard 
%d] could not read block %u in rel %u/%u/%u.%u from page server at lsn %X/%08X", - slot->shard_no, blockno, RelFileInfoFmt(rinfo), + errmsg(NEON_TAG "[shard %d, reqid %lx] could not read block %u in rel %u/%u/%u.%u from page server at lsn %X/%08X", + slot->shard_no, resp->reqid, blockno, RelFileInfoFmt(rinfo), forkNum, LSN_FORMAT_ARGS(reqlsns->effective_request_lsn)), errdetail("page server returned error: %s", ((NeonErrorResponse *) resp)->message))); @@ -3443,47 +3551,72 @@ neon_nblocks(SMgrRelation reln, ForkNumber forknum) { NeonNblocksRequest request = { - .req.tag = T_NeonNblocksRequest, - .req.lsn = request_lsns.request_lsn, - .req.not_modified_since = request_lsns.not_modified_since, + .hdr.tag = T_NeonNblocksRequest, + .hdr.reqid = GENERATE_REQUEST_ID(), + .hdr.lsn = request_lsns.request_lsn, + .hdr.not_modified_since = request_lsns.not_modified_since, .rinfo = InfoFromSMgrRel(reln), .forknum = forknum, }; resp = page_server_request(&request); + + switch (resp->tag) + { + case T_NeonNblocksResponse: + { + NeonNblocksResponse * relsize_resp = (NeonNblocksResponse *) resp; + if (neon_protocol_version >= 3) + { + if (!equal_requests(resp, &request.hdr) || + !RelFileInfoEquals(relsize_resp->req.rinfo, request.rinfo) || + relsize_resp->req.forknum != forknum) + { + NEON_PANIC_CONNECTION_STATE(-1, PANIC, + "Unexpect response {reqid=%lx,lsn=%X/%08X, since=%X/%08X, rel=%u/%u/%u.%u} to get relsize request {reqid=%lx,lsn=%X/%08X, since=%X/%08X, rel=%u/%u/%u.%u}", + resp->reqid, LSN_FORMAT_ARGS(resp->lsn), LSN_FORMAT_ARGS(resp->not_modified_since), RelFileInfoFmt(relsize_resp->req.rinfo), relsize_resp->req.forknum, + request.hdr.reqid, LSN_FORMAT_ARGS(request.hdr.lsn), LSN_FORMAT_ARGS(request.hdr.not_modified_since), RelFileInfoFmt(request.rinfo), forknum); + } + } + n_blocks = relsize_resp->n_blocks; + break; + } + case T_NeonErrorResponse: + if (neon_protocol_version >= 3) + { + if (!equal_requests(resp, &request.hdr)) + { + elog(WARNING, NEON_TAG "Error message {reqid=%lx,lsn=%X/%08X, since=%X/%08X} doesn't match get relsize request {reqid=%lx,lsn=%X/%08X, since=%X/%08X}", + resp->reqid, LSN_FORMAT_ARGS(resp->lsn), LSN_FORMAT_ARGS(resp->not_modified_since), + request.hdr.reqid, LSN_FORMAT_ARGS(request.hdr.lsn), LSN_FORMAT_ARGS(request.hdr.not_modified_since)); + } + } + ereport(ERROR, + (errcode(ERRCODE_IO_ERROR), + errmsg(NEON_TAG "[reqid %lx] could not read relation size of rel %u/%u/%u.%u from page server at lsn %X/%08X", + resp->reqid, + RelFileInfoFmt(InfoFromSMgrRel(reln)), + forknum, + LSN_FORMAT_ARGS(request_lsns.effective_request_lsn)), + errdetail("page server returned error: %s", + ((NeonErrorResponse *) resp)->message))); + break; + + default: + NEON_PANIC_CONNECTION_STATE(-1, PANIC, + "Expected Nblocks (0x%02x) or Error (0x%02x) response to NblocksRequest, but got 0x%02x", + T_NeonNblocksResponse, T_NeonErrorResponse, resp->tag); + } + update_cached_relsize(InfoFromSMgrRel(reln), forknum, n_blocks); + + neon_log(SmgrTrace, "neon_nblocks: rel %u/%u/%u fork %u (request LSN %X/%08X): %u blocks", + RelFileInfoFmt(InfoFromSMgrRel(reln)), + forknum, + LSN_FORMAT_ARGS(request_lsns.effective_request_lsn), + n_blocks); + + pfree(resp); } - - switch (resp->tag) - { - case T_NeonNblocksResponse: - n_blocks = ((NeonNblocksResponse *) resp)->n_blocks; - break; - - case T_NeonErrorResponse: - ereport(ERROR, - (errcode(ERRCODE_IO_ERROR), - errmsg(NEON_TAG "could not read relation size of rel %u/%u/%u.%u from page server at lsn %X/%08X", - RelFileInfoFmt(InfoFromSMgrRel(reln)), - forknum, - 
LSN_FORMAT_ARGS(request_lsns.effective_request_lsn)), - errdetail("page server returned error: %s", - ((NeonErrorResponse *) resp)->message))); - break; - - default: - NEON_PANIC_CONNECTION_STATE(-1, PANIC, - "Expected Nblocks (0x%02x) or Error (0x%02x) response to NblocksRequest, but got 0x%02x", - T_NeonNblocksResponse, T_NeonErrorResponse, resp->tag); - } - update_cached_relsize(InfoFromSMgrRel(reln), forknum, n_blocks); - - neon_log(SmgrTrace, "neon_nblocks: rel %u/%u/%u fork %u (request LSN %X/%08X): %u blocks", - RelFileInfoFmt(InfoFromSMgrRel(reln)), - forknum, - LSN_FORMAT_ARGS(request_lsns.effective_request_lsn), - n_blocks); - - pfree(resp); return n_blocks; } @@ -3503,40 +3636,64 @@ neon_dbsize(Oid dbNode) { NeonDbSizeRequest request = { - .req.tag = T_NeonDbSizeRequest, - .req.lsn = request_lsns.request_lsn, - .req.not_modified_since = request_lsns.not_modified_since, + .hdr.tag = T_NeonDbSizeRequest, + .hdr.reqid = GENERATE_REQUEST_ID(), + .hdr.lsn = request_lsns.request_lsn, + .hdr.not_modified_since = request_lsns.not_modified_since, .dbNode = dbNode, }; resp = page_server_request(&request); + + switch (resp->tag) + { + case T_NeonDbSizeResponse: + { + NeonDbSizeResponse* dbsize_resp = (NeonDbSizeResponse *) resp; + if (neon_protocol_version >= 3) + { + if (!equal_requests(resp, &request.hdr) || + dbsize_resp->req.dbNode != dbNode) + { + NEON_PANIC_CONNECTION_STATE(-1, PANIC, + "Unexpect response {reqid=%lx,lsn=%X/%08X, since=%X/%08X, dbNode=%u} to get DB size request {reqid=%lx,lsn=%X/%08X, since=%X/%08X, dbNode=%u}", + resp->reqid, LSN_FORMAT_ARGS(resp->lsn), LSN_FORMAT_ARGS(resp->not_modified_since), dbsize_resp->req.dbNode, + request.hdr.reqid, LSN_FORMAT_ARGS(request.hdr.lsn), LSN_FORMAT_ARGS(request.hdr.not_modified_since), dbNode); + } + } + db_size = dbsize_resp->db_size; + break; + } + case T_NeonErrorResponse: + if (neon_protocol_version >= 3) + { + if (!equal_requests(resp, &request.hdr)) + { + elog(WARNING, NEON_TAG "Error message {reqid=%lx,lsn=%X/%08X, since=%X/%08X} doesn't match get DB size request {reqid=%lx,lsn=%X/%08X, since=%X/%08X}", + resp->reqid, LSN_FORMAT_ARGS(resp->lsn), LSN_FORMAT_ARGS(resp->not_modified_since), + request.hdr.reqid, LSN_FORMAT_ARGS(request.hdr.lsn), LSN_FORMAT_ARGS(request.hdr.not_modified_since)); + } + } + ereport(ERROR, + (errcode(ERRCODE_IO_ERROR), + errmsg(NEON_TAG "[reqid %lx] could not read db size of db %u from page server at lsn %X/%08X", + resp->reqid, + dbNode, LSN_FORMAT_ARGS(request_lsns.effective_request_lsn)), + errdetail("page server returned error: %s", + ((NeonErrorResponse *) resp)->message))); + break; + + default: + NEON_PANIC_CONNECTION_STATE(-1, PANIC, + "Expected DbSize (0x%02x) or Error (0x%02x) response to DbSizeRequest, but got 0x%02x", + T_NeonDbSizeResponse, T_NeonErrorResponse, resp->tag); + } + + neon_log(SmgrTrace, "neon_dbsize: db %u (request LSN %X/%08X): %ld bytes", + dbNode, LSN_FORMAT_ARGS(request_lsns.effective_request_lsn), db_size); + + pfree(resp); } - - switch (resp->tag) - { - case T_NeonDbSizeResponse: - db_size = ((NeonDbSizeResponse *) resp)->db_size; - break; - - case T_NeonErrorResponse: - ereport(ERROR, - (errcode(ERRCODE_IO_ERROR), - errmsg(NEON_TAG "could not read db size of db %u from page server at lsn %X/%08X", - dbNode, LSN_FORMAT_ARGS(request_lsns.effective_request_lsn)), - errdetail("page server returned error: %s", - ((NeonErrorResponse *) resp)->message))); - break; - - default: - NEON_PANIC_CONNECTION_STATE(-1, PANIC, - "Expected DbSize (0x%02x) or Error (0x%02x) response 
to DbSizeRequest, but got 0x%02x", - T_NeonDbSizeResponse, T_NeonErrorResponse, resp->tag); - } - - neon_log(SmgrTrace, "neon_dbsize: db %u (request LSN %X/%08X): %ld bytes", - dbNode, LSN_FORMAT_ARGS(request_lsns.effective_request_lsn), db_size); - - pfree(resp); return db_size; } @@ -3868,16 +4025,17 @@ neon_read_slru_segment(SMgrRelation reln, const char* path, int segno, void* buf return -1; request = (NeonGetSlruSegmentRequest) { - .req.tag = T_NeonGetSlruSegmentRequest, - .req.lsn = request_lsn, - .req.not_modified_since = not_modified_since, + .hdr.tag = T_NeonGetSlruSegmentRequest, + .hdr.reqid = GENERATE_REQUEST_ID(), + .hdr.lsn = request_lsn, + .hdr.not_modified_since = not_modified_since, .kind = kind, .segno = segno }; do { - while (!page_server->send(shard_no, &request.req) || !page_server->flush(shard_no)); + while (!page_server->send(shard_no, &request.hdr) || !page_server->flush(shard_no)); consume_prefetch_responses(); @@ -3887,14 +4045,38 @@ neon_read_slru_segment(SMgrRelation reln, const char* path, int segno, void* buf switch (resp->tag) { case T_NeonGetSlruSegmentResponse: - n_blocks = ((NeonGetSlruSegmentResponse *) resp)->n_blocks; - memcpy(buffer, ((NeonGetSlruSegmentResponse *) resp)->data, n_blocks*BLCKSZ); + { + NeonGetSlruSegmentResponse* slru_resp = (NeonGetSlruSegmentResponse *) resp; + if (neon_protocol_version >= 3) + { + if (!equal_requests(resp, &request.hdr) || + slru_resp->req.kind != kind || + slru_resp->req.segno != segno) + { + NEON_PANIC_CONNECTION_STATE(-1, PANIC, + "Unexpect response {reqid=%lx,lsn=%X/%08X, since=%X/%08X, kind=%u, segno=%u} to get SLRU segment request {reqid=%lx,lsn=%X/%08X, since=%X/%08X, kind=%u, segno=%u}", + resp->reqid, LSN_FORMAT_ARGS(resp->lsn), LSN_FORMAT_ARGS(resp->not_modified_since), slru_resp->req.kind, slru_resp->req.segno, + request.hdr.reqid, LSN_FORMAT_ARGS(request.hdr.lsn), LSN_FORMAT_ARGS(request.hdr.not_modified_since), kind, segno); + } + } + n_blocks = slru_resp->n_blocks; + memcpy(buffer, slru_resp->data, n_blocks*BLCKSZ); break; - + } case T_NeonErrorResponse: + if (neon_protocol_version >= 3) + { + if (!equal_requests(resp, &request.hdr)) + { + elog(WARNING, NEON_TAG "Error message {reqid=%lx,lsn=%X/%08X, since=%X/%08X} doesn't match get SLRU segment request {reqid=%lx,lsn=%X/%08X, since=%X/%08X}", + resp->reqid, LSN_FORMAT_ARGS(resp->lsn), LSN_FORMAT_ARGS(resp->not_modified_since), + request.hdr.reqid, LSN_FORMAT_ARGS(request.hdr.lsn), LSN_FORMAT_ARGS(request.hdr.not_modified_since)); + } + } ereport(ERROR, (errcode(ERRCODE_IO_ERROR), - errmsg(NEON_TAG "could not read SLRU %d segment %d at lsn %X/%08X", + errmsg(NEON_TAG "[reqid %lx] could not read SLRU %d segment %d at lsn %X/%08X", + resp->reqid, kind, segno, LSN_FORMAT_ARGS(request_lsn)), @@ -4033,8 +4215,9 @@ neon_extend_rel_size(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blkno, NeonResponse *response; NeonNblocksResponse *nbresponse; NeonNblocksRequest request = { - .req = (NeonRequest) { + .hdr = (NeonRequest) { .tag = T_NeonNblocksRequest, + .reqid = GENERATE_REQUEST_ID(), .lsn = end_recptr, .not_modified_since = end_recptr, }, diff --git a/poetry.lock b/poetry.lock index 5f15223dca..2cd2bc6383 100644 --- a/poetry.lock +++ b/poetry.lock @@ -2028,13 +2028,13 @@ openapi-schema-validator = ">=0.4.2,<0.5.0" [[package]] name = "packaging" -version = "23.0" +version = "24.2" description = "Core utilities for Python packages" optional = false -python-versions = ">=3.7" +python-versions = ">=3.8" files = [ - {file = 
"packaging-23.0-py3-none-any.whl", hash = "sha256:714ac14496c3e68c99c29b00845f7a2b85f3bb6f1078fd9f72fd20f0570002b2"}, - {file = "packaging-23.0.tar.gz", hash = "sha256:b6ad297f8907de0fa2fe1ccbd26fdaf387f5f47c7275fedf8cce89f99446cf97"}, + {file = "packaging-24.2-py3-none-any.whl", hash = "sha256:09abb1bccd265c01f4a3aa3f7a7db064b36514d2cba19a2f694fe6150451a759"}, + {file = "packaging-24.2.tar.gz", hash = "sha256:c228a6dc5e932d346bc5739379109d49e8853dd8223571c7c5b55260edc0b97f"}, ] [[package]] diff --git a/proxy/src/auth/backend/console_redirect.rs b/proxy/src/auth/backend/console_redirect.rs index c3de77b352..1cbf91d3ae 100644 --- a/proxy/src/auth/backend/console_redirect.rs +++ b/proxy/src/auth/backend/console_redirect.rs @@ -1,3 +1,5 @@ +use std::fmt; + use async_trait::async_trait; use postgres_client::config::SslMode; use pq_proto::BeMessage as Be; @@ -5,15 +7,19 @@ use thiserror::Error; use tokio::io::{AsyncRead, AsyncWrite}; use tracing::{info, info_span}; -use super::ComputeCredentialKeys; +use super::{ComputeCredentialKeys, ControlPlaneApi}; +use crate::auth::backend::{BackendIpAllowlist, ComputeUserInfo}; use crate::auth::IpPattern; use crate::cache::Cached; use crate::config::AuthenticationConfig; use crate::context::RequestContext; +use crate::control_plane::client::cplane_proxy_v1; use crate::control_plane::{self, CachedNodeInfo, NodeInfo}; use crate::error::{ReportableError, UserFacingError}; use crate::proxy::connect_compute::ComputeConnectBackend; +use crate::proxy::NeonOptions; use crate::stream::PqStream; +use crate::types::RoleName; use crate::{auth, compute, waiters}; #[derive(Debug, Error)] @@ -31,6 +37,13 @@ pub(crate) enum ConsoleRedirectError { #[derive(Debug)] pub struct ConsoleRedirectBackend { console_uri: reqwest::Url, + api: cplane_proxy_v1::NeonControlPlaneClient, +} + +impl fmt::Debug for cplane_proxy_v1::NeonControlPlaneClient { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!(f, "NeonControlPlaneClient") + } } impl UserFacingError for ConsoleRedirectError { @@ -71,9 +84,24 @@ pub(crate) fn new_psql_session_id() -> String { hex::encode(rand::random::<[u8; 8]>()) } +#[async_trait] +impl BackendIpAllowlist for ConsoleRedirectBackend { + async fn get_allowed_ips( + &self, + ctx: &RequestContext, + user_info: &ComputeUserInfo, + ) -> auth::Result> { + self.api + .get_allowed_ips_and_secret(ctx, user_info) + .await + .map(|(ips, _)| ips.as_ref().clone()) + .map_err(|e| e.into()) + } +} + impl ConsoleRedirectBackend { - pub fn new(console_uri: reqwest::Url) -> Self { - Self { console_uri } + pub fn new(console_uri: reqwest::Url, api: cplane_proxy_v1::NeonControlPlaneClient) -> Self { + Self { console_uri, api } } pub(crate) async fn authenticate( @@ -81,10 +109,16 @@ impl ConsoleRedirectBackend { ctx: &RequestContext, auth_config: &'static AuthenticationConfig, client: &mut PqStream, - ) -> auth::Result<(ConsoleRedirectNodeInfo, Option>)> { + ) -> auth::Result<( + ConsoleRedirectNodeInfo, + ComputeUserInfo, + Option>, + )> { authenticate(ctx, auth_config, &self.console_uri, client) .await - .map(|(node_info, ip_allowlist)| (ConsoleRedirectNodeInfo(node_info), ip_allowlist)) + .map(|(node_info, user_info, ip_allowlist)| { + (ConsoleRedirectNodeInfo(node_info), user_info, ip_allowlist) + }) } } @@ -109,7 +143,7 @@ async fn authenticate( auth_config: &'static AuthenticationConfig, link_uri: &reqwest::Url, client: &mut PqStream, -) -> auth::Result<(NodeInfo, Option>)> { +) -> auth::Result<(NodeInfo, ComputeUserInfo, Option>)> { 
ctx.set_auth_method(crate::context::AuthMethod::ConsoleRedirect); // registering waiter can fail if we get unlucky with rng. @@ -164,8 +198,15 @@ async fn authenticate( let mut config = compute::ConnCfg::new(db_info.host.to_string(), db_info.port); config.dbname(&db_info.dbname).user(&db_info.user); + let user: RoleName = db_info.user.into(); + let user_info = ComputeUserInfo { + endpoint: db_info.aux.endpoint_id.as_str().into(), + user: user.clone(), + options: NeonOptions::default(), + }; + ctx.set_dbname(db_info.dbname.into()); - ctx.set_user(db_info.user.into()); + ctx.set_user(user); ctx.set_project(db_info.aux.clone()); info!("woken up a compute node"); @@ -188,6 +229,7 @@ async fn authenticate( config, aux: db_info.aux, }, + user_info, db_info.allowed_ips, )) } diff --git a/proxy/src/auth/backend/mod.rs b/proxy/src/auth/backend/mod.rs index 0c9a7f7825..de48be2952 100644 --- a/proxy/src/auth/backend/mod.rs +++ b/proxy/src/auth/backend/mod.rs @@ -16,7 +16,9 @@ use tokio::io::{AsyncRead, AsyncWrite}; use tracing::{debug, info, warn}; use crate::auth::credentials::check_peer_addr_is_in_list; -use crate::auth::{self, validate_password_and_exchange, AuthError, ComputeUserInfoMaybeEndpoint}; +use crate::auth::{ + self, validate_password_and_exchange, AuthError, ComputeUserInfoMaybeEndpoint, IpPattern, +}; use crate::cache::Cached; use crate::config::AuthenticationConfig; use crate::context::RequestContext; @@ -131,7 +133,7 @@ pub(crate) struct ComputeUserInfoNoEndpoint { pub(crate) options: NeonOptions, } -#[derive(Debug, Clone)] +#[derive(Debug, Clone, Default)] pub(crate) struct ComputeUserInfo { pub(crate) endpoint: EndpointId, pub(crate) user: RoleName, @@ -244,6 +246,15 @@ impl AuthenticationConfig { } } +#[async_trait::async_trait] +pub(crate) trait BackendIpAllowlist { + async fn get_allowed_ips( + &self, + ctx: &RequestContext, + user_info: &ComputeUserInfo, + ) -> auth::Result>; +} + /// True to its name, this function encapsulates our current auth trade-offs. /// Here, we choose the appropriate auth flow based on circumstances. /// @@ -256,7 +267,7 @@ async fn auth_quirks( allow_cleartext: bool, config: &'static AuthenticationConfig, endpoint_rate_limiter: Arc, -) -> auth::Result { +) -> auth::Result<(ComputeCredentials, Option>)> { // If there's no project so far, that entails that client doesn't // support SNI or other means of passing the endpoint (project) name. // We now expect to see a very specific payload in the place of password. @@ -315,7 +326,7 @@ async fn auth_quirks( ) .await { - Ok(keys) => Ok(keys), + Ok(keys) => Ok((keys, Some(allowed_ips.as_ref().clone()))), Err(e) => { if e.is_password_failed() { // The password could have been changed, so we invalidate the cache. 
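The backend changes above make authentication hand back the endpoint's IP allowlist alongside the compute credentials, and introduce a BackendIpAllowlist trait so other code paths (notably query cancellation) can fetch the allowlist on demand. The snippet below is a minimal, self-contained Rust sketch of that shape, not the proxy's actual API: the names Allowlist, StaticAllowlist and peer_is_allowed are illustrative, the real trait is async and returns Vec<IpPattern> rather than plain addresses, and treating an empty list as "no restriction" is an assumption of this sketch.

use std::collections::HashMap;
use std::net::IpAddr;

// Hypothetical stand-in for the control-plane lookup.
trait Allowlist {
    fn allowed_ips(&self, endpoint: &str) -> Vec<IpAddr>;
}

struct StaticAllowlist {
    per_endpoint: HashMap<String, Vec<IpAddr>>,
}

impl Allowlist for StaticAllowlist {
    fn allowed_ips(&self, endpoint: &str) -> Vec<IpAddr> {
        self.per_endpoint.get(endpoint).cloned().unwrap_or_default()
    }
}

// Authentication returns both the credentials and the allowlist, so a later
// stage (e.g. handling a cancel request) can re-check the peer address.
fn peer_is_allowed(backend: &impl Allowlist, endpoint: &str, peer: IpAddr) -> bool {
    let ips = backend.allowed_ips(endpoint);
    ips.is_empty() || ips.contains(&peer) // empty list = unrestricted (assumption)
}

fn main() {
    let backend = StaticAllowlist {
        per_endpoint: HashMap::from([(
            "my-endpoint".to_string(),
            vec!["10.0.0.1".parse::<IpAddr>().unwrap()],
        )]),
    };
    assert!(peer_is_allowed(&backend, "my-endpoint", "10.0.0.1".parse().unwrap()));
    assert!(!peer_is_allowed(&backend, "my-endpoint", "192.0.2.7".parse().unwrap()));
}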
@@ -385,7 +396,7 @@ impl<'a> Backend<'a, ComputeUserInfoMaybeEndpoint> { allow_cleartext: bool, config: &'static AuthenticationConfig, endpoint_rate_limiter: Arc, - ) -> auth::Result> { + ) -> auth::Result<(Backend<'a, ComputeCredentials>, Option>)> { let res = match self { Self::ControlPlane(api, user_info) => { debug!( @@ -394,7 +405,7 @@ impl<'a> Backend<'a, ComputeUserInfoMaybeEndpoint> { "performing authentication using the console" ); - let credentials = auth_quirks( + let (credentials, ip_allowlist) = auth_quirks( ctx, &*api, user_info, @@ -404,7 +415,7 @@ impl<'a> Backend<'a, ComputeUserInfoMaybeEndpoint> { endpoint_rate_limiter, ) .await?; - Backend::ControlPlane(api, credentials) + Ok((Backend::ControlPlane(api, credentials), ip_allowlist)) } Self::Local(_) => { return Err(auth::AuthError::bad_auth_method("invalid for local proxy")) @@ -413,7 +424,7 @@ impl<'a> Backend<'a, ComputeUserInfoMaybeEndpoint> { // TODO: replace with some metric info!("user successfully authenticated"); - Ok(res) + res } } @@ -441,6 +452,24 @@ impl Backend<'_, ComputeUserInfo> { } } +#[async_trait::async_trait] +impl BackendIpAllowlist for Backend<'_, ()> { + async fn get_allowed_ips( + &self, + ctx: &RequestContext, + user_info: &ComputeUserInfo, + ) -> auth::Result> { + let auth_data = match self { + Self::ControlPlane(api, ()) => api.get_allowed_ips_and_secret(ctx, user_info).await, + Self::Local(_) => Ok((Cached::new_uncached(Arc::new(vec![])), None)), + }; + + auth_data + .map(|(ips, _)| ips.as_ref().clone()) + .map_err(|e| e.into()) + } +} + #[async_trait::async_trait] impl ComputeConnectBackend for Backend<'_, ComputeCredentials> { async fn wake_compute( @@ -786,7 +815,7 @@ mod tests { .await .unwrap(); - assert_eq!(creds.info.endpoint, "my-endpoint"); + assert_eq!(creds.0.info.endpoint, "my-endpoint"); handle.await.unwrap(); } diff --git a/proxy/src/bin/proxy.rs b/proxy/src/bin/proxy.rs index 3b122d771c..70b50436bf 100644 --- a/proxy/src/bin/proxy.rs +++ b/proxy/src/bin/proxy.rs @@ -744,9 +744,59 @@ fn build_auth_backend( } AuthBackendType::ConsoleRedirect => { - let url = args.uri.parse()?; - let backend = ConsoleRedirectBackend::new(url); + let wake_compute_cache_config: CacheOptions = args.wake_compute_cache.parse()?; + let project_info_cache_config: ProjectInfoCacheOptions = + args.project_info_cache.parse()?; + let endpoint_cache_config: config::EndpointCacheConfig = + args.endpoint_cache_config.parse()?; + info!("Using NodeInfoCache (wake_compute) with options={wake_compute_cache_config:?}"); + info!( + "Using AllowedIpsCache (wake_compute) with options={project_info_cache_config:?}" + ); + info!("Using EndpointCacheConfig with options={endpoint_cache_config:?}"); + let caches = Box::leak(Box::new(control_plane::caches::ApiCaches::new( + wake_compute_cache_config, + project_info_cache_config, + endpoint_cache_config, + ))); + + let config::ConcurrencyLockOptions { + shards, + limiter, + epoch, + timeout, + } = args.wake_compute_lock.parse()?; + info!(?limiter, shards, ?epoch, "Using NodeLocks (wake_compute)"); + let locks = Box::leak(Box::new(control_plane::locks::ApiLocks::new( + "wake_compute_lock", + limiter, + shards, + timeout, + epoch, + &Metrics::get().wake_compute_lock, + )?)); + + let url = args.uri.clone().parse()?; + let ep_url: proxy::url::ApiUrl = args.auth_endpoint.parse()?; + let endpoint = http::Endpoint::new(ep_url, http::new_client()); + let mut wake_compute_rps_limit = args.wake_compute_limit.clone(); + RateBucketInfo::validate(&mut wake_compute_rps_limit)?; + let 
wake_compute_endpoint_rate_limiter = + Arc::new(WakeComputeRateLimiter::new(wake_compute_rps_limit)); + + // Since we use only get_allowed_ips_and_secret() wake_compute_endpoint_rate_limiter + // and locks are not used in ConsoleRedirectBackend, + // but they are required by the NeonControlPlaneClient + let api = control_plane::client::cplane_proxy_v1::NeonControlPlaneClient::new( + endpoint, + args.control_plane_token.clone(), + caches, + locks, + wake_compute_endpoint_rate_limiter, + ); + + let backend = ConsoleRedirectBackend::new(url, api); let config = Box::leak(Box::new(backend)); Ok(Either::Right(config)) diff --git a/proxy/src/cancellation.rs b/proxy/src/cancellation.rs index df618cf242..a96c43f2ce 100644 --- a/proxy/src/cancellation.rs +++ b/proxy/src/cancellation.rs @@ -12,8 +12,10 @@ use tokio::sync::Mutex; use tracing::{debug, info}; use uuid::Uuid; -use crate::auth::{check_peer_addr_is_in_list, IpPattern}; +use crate::auth::backend::{BackendIpAllowlist, ComputeUserInfo}; +use crate::auth::{check_peer_addr_is_in_list, AuthError, IpPattern}; use crate::config::ComputeConfig; +use crate::context::RequestContext; use crate::error::ReportableError; use crate::ext::LockExt; use crate::metrics::{CancellationRequest, CancellationSource, Metrics}; @@ -56,6 +58,9 @@ pub(crate) enum CancelError { #[error("IP is not allowed")] IpNotAllowed, + + #[error("Authentication backend error")] + AuthError(#[from] AuthError), } impl ReportableError for CancelError { @@ -68,6 +73,7 @@ impl ReportableError for CancelError { CancelError::Postgres(_) => crate::error::ErrorKind::Compute, CancelError::RateLimit => crate::error::ErrorKind::RateLimit, CancelError::IpNotAllowed => crate::error::ErrorKind::User, + CancelError::AuthError(_) => crate::error::ErrorKind::ControlPlane, } } } @@ -102,10 +108,7 @@ impl CancellationHandler
<P>
{ } } - /// Try to cancel a running query for the corresponding connection. - /// If the cancellation key is not found, it will be published to Redis. - /// check_allowed - if true, check if the IP is allowed to cancel the query - /// return Result primarily for tests + /// Cancelling only in notification, will be removed pub(crate) async fn cancel_session( &self, key: CancelKeyData, @@ -134,7 +137,8 @@ impl CancellationHandler
<P>
{ } // NB: we should immediately release the lock after cloning the token. - let Some(cancel_closure) = self.map.get(&key).and_then(|x| x.clone()) else { + let cancel_state = self.map.get(&key).and_then(|x| x.clone()); + let Some(cancel_closure) = cancel_state else { tracing::warn!("query cancellation key not found: {key}"); Metrics::get() .proxy @@ -185,6 +189,96 @@ impl CancellationHandler
<P>
{ cancel_closure.try_cancel_query(self.compute_config).await } + /// Try to cancel a running query for the corresponding connection. + /// If the cancellation key is not found, it will be published to Redis. + /// check_allowed - if true, check if the IP is allowed to cancel the query. + /// Will fetch IP allowlist internally. + /// + /// return Result primarily for tests + pub(crate) async fn cancel_session_auth( + &self, + key: CancelKeyData, + ctx: RequestContext, + check_allowed: bool, + auth_backend: &T, + ) -> Result<(), CancelError> { + // TODO: check for unspecified address is only for backward compatibility, should be removed + if !ctx.peer_addr().is_unspecified() { + let subnet_key = match ctx.peer_addr() { + IpAddr::V4(ip) => IpNet::V4(Ipv4Net::new_assert(ip, 24).trunc()), // use defaut mask here + IpAddr::V6(ip) => IpNet::V6(Ipv6Net::new_assert(ip, 64).trunc()), + }; + if !self.limiter.lock_propagate_poison().check(subnet_key, 1) { + // log only the subnet part of the IP address to know which subnet is rate limited + tracing::warn!("Rate limit exceeded. Skipping cancellation message, {subnet_key}"); + Metrics::get() + .proxy + .cancellation_requests_total + .inc(CancellationRequest { + source: self.from, + kind: crate::metrics::CancellationOutcome::RateLimitExceeded, + }); + return Err(CancelError::RateLimit); + } + } + + // NB: we should immediately release the lock after cloning the token. + let cancel_state = self.map.get(&key).and_then(|x| x.clone()); + let Some(cancel_closure) = cancel_state else { + tracing::warn!("query cancellation key not found: {key}"); + Metrics::get() + .proxy + .cancellation_requests_total + .inc(CancellationRequest { + source: self.from, + kind: crate::metrics::CancellationOutcome::NotFound, + }); + + if ctx.session_id() == Uuid::nil() { + // was already published, do not publish it again + return Ok(()); + } + + match self + .client + .try_publish(key, ctx.session_id(), ctx.peer_addr()) + .await + { + Ok(()) => {} // do nothing + Err(e) => { + // log it here since cancel_session could be spawned in a task + tracing::error!("failed to publish cancellation key: {key}, error: {e}"); + return Err(CancelError::IO(std::io::Error::new( + std::io::ErrorKind::Other, + e.to_string(), + ))); + } + } + return Ok(()); + }; + + let ip_allowlist = auth_backend + .get_allowed_ips(&ctx, &cancel_closure.user_info) + .await + .map_err(CancelError::AuthError)?; + + if check_allowed && !check_peer_addr_is_in_list(&ctx.peer_addr(), &ip_allowlist) { + // log it here since cancel_session could be spawned in a task + tracing::warn!("IP is not allowed to cancel the query: {key}"); + return Err(CancelError::IpNotAllowed); + } + + Metrics::get() + .proxy + .cancellation_requests_total + .inc(CancellationRequest { + source: self.from, + kind: crate::metrics::CancellationOutcome::Found, + }); + info!("cancelling query per user's request using key {key}"); + cancel_closure.try_cancel_query(self.compute_config).await + } + #[cfg(test)] fn contains(&self, session: &Session
<P>
) -> bool { self.map.contains_key(&session.key) @@ -248,6 +342,7 @@ pub struct CancelClosure { cancel_token: CancelToken, ip_allowlist: Vec, hostname: String, // for pg_sni router + user_info: ComputeUserInfo, } impl CancelClosure { @@ -256,12 +351,14 @@ impl CancelClosure { cancel_token: CancelToken, ip_allowlist: Vec, hostname: String, + user_info: ComputeUserInfo, ) -> Self { Self { socket_addr, cancel_token, ip_allowlist, hostname, + user_info, } } /// Cancels the query running on user's compute node. @@ -288,6 +385,8 @@ impl CancelClosure { debug!("query was cancelled"); Ok(()) } + + /// Obsolete (will be removed after moving CancelMap to Redis), only for notifications pub(crate) fn set_ip_allowlist(&mut self, ip_allowlist: Vec) { self.ip_allowlist = ip_allowlist; } diff --git a/proxy/src/compute.rs b/proxy/src/compute.rs index 89de6692ad..aff796bbab 100644 --- a/proxy/src/compute.rs +++ b/proxy/src/compute.rs @@ -13,6 +13,7 @@ use thiserror::Error; use tokio::net::TcpStream; use tracing::{debug, error, info, warn}; +use crate::auth::backend::ComputeUserInfo; use crate::auth::parse_endpoint_param; use crate::cancellation::CancelClosure; use crate::config::ComputeConfig; @@ -250,6 +251,7 @@ impl ConnCfg { ctx: &RequestContext, aux: MetricsAuxInfo, config: &ComputeConfig, + user_info: ComputeUserInfo, ) -> Result { let pause = ctx.latency_timer_pause(crate::metrics::Waiting::Compute); let (socket_addr, stream, host) = self.connect_raw(config.timeout).await?; @@ -294,8 +296,9 @@ impl ConnCfg { process_id, secret_key, }, - vec![], + vec![], // TODO: deprecated, will be removed host.to_string(), + user_info, ); let connection = PostgresConnection { diff --git a/proxy/src/console_redirect_proxy.rs b/proxy/src/console_redirect_proxy.rs index 25a549039c..0c6755063f 100644 --- a/proxy/src/console_redirect_proxy.rs +++ b/proxy/src/console_redirect_proxy.rs @@ -159,6 +159,7 @@ pub(crate) async fn handle_client( let request_gauge = metrics.connection_requests.guard(proto); let tls = config.tls_config.as_ref(); + let record_handshake_error = !ctx.has_private_peer_addr(); let pause = ctx.latency_timer_pause(crate::metrics::Waiting::Client); let do_handshake = handshake(ctx, stream, tls, record_handshake_error); @@ -171,23 +172,20 @@ pub(crate) async fn handle_client( // spawn a task to cancel the session, but don't wait for it cancellations.spawn({ let cancellation_handler_clone = Arc::clone(&cancellation_handler); - let session_id = ctx.session_id(); - let peer_ip = ctx.peer_addr(); - let cancel_span = tracing::span!(parent: None, tracing::Level::INFO, "cancel_session", session_id = ?session_id); + let ctx = ctx.clone(); + let cancel_span = tracing::span!(parent: None, tracing::Level::INFO, "cancel_session", session_id = ?ctx.session_id()); cancel_span.follows_from(tracing::Span::current()); async move { - drop( - cancellation_handler_clone - .cancel_session( - cancel_key_data, - session_id, - peer_ip, - config.authentication_config.ip_allowlist_check_enabled, - ) - .instrument(cancel_span) - .await, - ); - } + cancellation_handler_clone + .cancel_session_auth( + cancel_key_data, + ctx, + config.authentication_config.ip_allowlist_check_enabled, + backend, + ) + .await + .inspect_err(|e | debug!(error = ?e, "cancel_session failed")).ok(); + }.instrument(cancel_span) }); return Ok(None); @@ -197,7 +195,7 @@ pub(crate) async fn handle_client( ctx.set_db_options(params.clone()); - let (user_info, ip_allowlist) = match backend + let (node_info, user_info, ip_allowlist) = match backend 
.authenticate(ctx, &config.authentication_config, &mut stream) .await { @@ -210,11 +208,12 @@ pub(crate) async fn handle_client( let mut node = connect_to_compute( ctx, &TcpMechanism { + user_info, params_compat: true, params: ¶ms, locks: &config.connect_compute_locks, }, - &user_info, + &node_info, config.wake_compute_retry_config, &config.connect_to_compute, ) diff --git a/proxy/src/control_plane/client/cplane_proxy_v1.rs b/proxy/src/control_plane/client/cplane_proxy_v1.rs index 00038a6ac6..ece03156d1 100644 --- a/proxy/src/control_plane/client/cplane_proxy_v1.rs +++ b/proxy/src/control_plane/client/cplane_proxy_v1.rs @@ -29,7 +29,7 @@ use crate::rate_limiter::WakeComputeRateLimiter; use crate::types::{EndpointCacheKey, EndpointId}; use crate::{compute, http, scram}; -const X_REQUEST_ID: HeaderName = HeaderName::from_static("x-request-id"); +pub(crate) const X_REQUEST_ID: HeaderName = HeaderName::from_static("x-request-id"); #[derive(Clone)] pub struct NeonControlPlaneClient { @@ -78,15 +78,30 @@ impl NeonControlPlaneClient { info!("endpoint is not valid, skipping the request"); return Ok(AuthInfo::default()); } - let request_id = ctx.session_id().to_string(); - let application_name = ctx.console_application_name(); + self.do_get_auth_req(user_info, &ctx.session_id(), Some(ctx)) + .await + } + + async fn do_get_auth_req( + &self, + user_info: &ComputeUserInfo, + session_id: &uuid::Uuid, + ctx: Option<&RequestContext>, + ) -> Result { + let request_id: String = session_id.to_string(); + let application_name = if let Some(ctx) = ctx { + ctx.console_application_name() + } else { + "auth_cancellation".to_string() + }; + async { let request = self .endpoint .get_path("get_endpoint_access_control") .header(X_REQUEST_ID, &request_id) .header(AUTHORIZATION, format!("Bearer {}", &self.jwt)) - .query(&[("session_id", ctx.session_id())]) + .query(&[("session_id", session_id)]) .query(&[ ("application_name", application_name.as_str()), ("endpointish", user_info.endpoint.as_str()), @@ -96,9 +111,16 @@ impl NeonControlPlaneClient { debug!(url = request.url().as_str(), "sending http request"); let start = Instant::now(); - let pause = ctx.latency_timer_pause(crate::metrics::Waiting::Cplane); - let response = self.endpoint.execute(request).await?; - drop(pause); + let response = match ctx { + Some(ctx) => { + let pause = ctx.latency_timer_pause(crate::metrics::Waiting::Cplane); + let rsp = self.endpoint.execute(request).await; + drop(pause); + rsp? 
+ } + None => self.endpoint.execute(request).await?, + }; + info!(duration = ?start.elapsed(), "received http response"); let body = match parse_body::(response).await { Ok(body) => body, diff --git a/proxy/src/control_plane/mod.rs b/proxy/src/control_plane/mod.rs index c65041df0e..1dca26d686 100644 --- a/proxy/src/control_plane/mod.rs +++ b/proxy/src/control_plane/mod.rs @@ -74,8 +74,11 @@ impl NodeInfo { &self, ctx: &RequestContext, config: &ComputeConfig, + user_info: ComputeUserInfo, ) -> Result { - self.config.connect(ctx, self.aux.clone(), config).await + self.config + .connect(ctx, self.aux.clone(), config, user_info) + .await } pub(crate) fn reuse_settings(&mut self, other: Self) { diff --git a/proxy/src/proxy/connect_compute.rs b/proxy/src/proxy/connect_compute.rs index 8a80494860..dd145e6bb2 100644 --- a/proxy/src/proxy/connect_compute.rs +++ b/proxy/src/proxy/connect_compute.rs @@ -4,7 +4,7 @@ use tokio::time; use tracing::{debug, info, warn}; use super::retry::ShouldRetryWakeCompute; -use crate::auth::backend::ComputeCredentialKeys; +use crate::auth::backend::{ComputeCredentialKeys, ComputeUserInfo}; use crate::compute::{self, PostgresConnection, COULD_NOT_CONNECT}; use crate::config::{ComputeConfig, RetryConfig}; use crate::context::RequestContext; @@ -71,6 +71,8 @@ pub(crate) struct TcpMechanism<'a> { /// connect_to_compute concurrency lock pub(crate) locks: &'static ApiLocks, + + pub(crate) user_info: ComputeUserInfo, } #[async_trait] @@ -88,7 +90,7 @@ impl ConnectMechanism for TcpMechanism<'_> { ) -> Result { let host = node_info.config.get_host(); let permit = self.locks.get_permit(&host).await?; - permit.release_result(node_info.connect(ctx, config).await) + permit.release_result(node_info.connect(ctx, config, self.user_info.clone()).await) } fn update_connect_config(&self, config: &mut compute::ConnCfg) { diff --git a/proxy/src/proxy/mod.rs b/proxy/src/proxy/mod.rs index 3926c56fec..63f93f0a91 100644 --- a/proxy/src/proxy/mod.rs +++ b/proxy/src/proxy/mod.rs @@ -273,23 +273,20 @@ pub(crate) async fn handle_client( // spawn a task to cancel the session, but don't wait for it cancellations.spawn({ let cancellation_handler_clone = Arc::clone(&cancellation_handler); - let session_id = ctx.session_id(); - let peer_ip = ctx.peer_addr(); - let cancel_span = tracing::span!(parent: None, tracing::Level::INFO, "cancel_session", session_id = ?session_id); + let ctx = ctx.clone(); + let cancel_span = tracing::span!(parent: None, tracing::Level::INFO, "cancel_session", session_id = ?ctx.session_id()); cancel_span.follows_from(tracing::Span::current()); async move { - drop( - cancellation_handler_clone - .cancel_session( - cancel_key_data, - session_id, - peer_ip, - config.authentication_config.ip_allowlist_check_enabled, - ) - .instrument(cancel_span) - .await, - ); - } + cancellation_handler_clone + .cancel_session_auth( + cancel_key_data, + ctx, + config.authentication_config.ip_allowlist_check_enabled, + auth_backend, + ) + .await + .inspect_err(|e | debug!(error = ?e, "cancel_session failed")).ok(); + }.instrument(cancel_span) }); return Ok(None); @@ -315,7 +312,7 @@ pub(crate) async fn handle_client( }; let user = user_info.get_user().to_owned(); - let user_info = match user_info + let (user_info, ip_allowlist) = match user_info .authenticate( ctx, &mut stream, @@ -335,16 +332,19 @@ pub(crate) async fn handle_client( } }; - let params_compat = match &user_info { - auth::Backend::ControlPlane(_, info) => { - info.info.options.get(NeonOptions::PARAMS_COMPAT).is_some() - } - 
auth::Backend::Local(_) => false, + let compute_user_info = match &user_info { + auth::Backend::ControlPlane(_, info) => &info.info, + auth::Backend::Local(_) => unreachable!("local proxy does not run tcp proxy service"), }; + let params_compat = compute_user_info + .options + .get(NeonOptions::PARAMS_COMPAT) + .is_some(); let mut node = connect_to_compute( ctx, &TcpMechanism { + user_info: compute_user_info.clone(), params_compat, params: ¶ms, locks: &config.connect_compute_locks, @@ -356,6 +356,8 @@ pub(crate) async fn handle_client( .or_else(|e| stream.throw_error(e)) .await?; + node.cancel_closure + .set_ip_allowlist(ip_allowlist.unwrap_or_default()); let session = cancellation_handler.get_session(); prepare_client_connection(&node, &session, &mut stream).await?; diff --git a/proxy/src/redis/notifications.rs b/proxy/src/redis/notifications.rs index 4383d6be2c..63cdf6176c 100644 --- a/proxy/src/redis/notifications.rs +++ b/proxy/src/redis/notifications.rs @@ -37,7 +37,6 @@ struct NotificationHeader<'a> { #[derive(Clone, Debug, Serialize, Deserialize, Eq, PartialEq)] #[serde(tag = "topic", content = "data")] -// Message to contributors: Make sure to align these topic names with the list below. pub(crate) enum Notification { #[serde( rename = "/allowed_ips_updated", @@ -74,21 +73,13 @@ pub(crate) enum Notification { PasswordUpdate { password_update: PasswordUpdate }, #[serde(rename = "/cancel_session")] Cancel(CancelSession), -} -/// Returns true if the topic name given is a known topic that we can deserialize and action on. -/// Returns false otherwise. -fn known_topic(s: &str) -> bool { - // Message to contributors: Make sure to align these topic names with the enum above. - matches!( - s, - "/allowed_ips_updated" - | "/block_public_or_vpc_access_updated" - | "/allowed_vpc_endpoints_updated_for_org" - | "/allowed_vpc_endpoints_updated_for_projects" - | "/password_updated" - | "/cancel_session" - ) + #[serde( + other, + deserialize_with = "deserialize_unknown_topic", + skip_serializing + )] + UnknownTopic, } #[derive(Clone, Debug, Serialize, Deserialize, Eq, PartialEq)] @@ -136,6 +127,15 @@ where serde_json::from_str(&s).map_err(::custom) } +// https://github.com/serde-rs/serde/issues/1714 +fn deserialize_unknown_topic<'de, D>(deserializer: D) -> Result<(), D::Error> +where + D: serde::Deserializer<'de>, +{ + deserializer.deserialize_any(serde::de::IgnoredAny)?; + Ok(()) +} + struct MessageHandler { cache: Arc, cancellation_handler: Arc>, @@ -178,32 +178,29 @@ impl MessageHandler { let payload: String = msg.get_payload()?; tracing::debug!(?payload, "received a message payload"); - // For better error handling, we first parse the payload to extract the topic. - // If there's a topic we don't support, we can handle that error more gracefully. - let header: NotificationHeader = match serde_json::from_str(&payload) { - Ok(msg) => msg, - Err(e) => { - Metrics::get().proxy.redis_errors_total.inc(RedisErrors { - channel: msg.get_channel_name(), - }); - tracing::error!("broken message: {e}"); + let msg: Notification = match serde_json::from_str(&payload) { + Ok(Notification::UnknownTopic) => { + match serde_json::from_str::(&payload) { + // don't update the metric for redis errors if it's just a topic we don't know about. 
+ Ok(header) => tracing::warn!(topic = header.topic, "unknown topic"), + Err(e) => { + Metrics::get().proxy.redis_errors_total.inc(RedisErrors { + channel: msg.get_channel_name(), + }); + tracing::error!("broken message: {e}"); + } + }; return Ok(()); } - }; - - if !known_topic(header.topic) { - // don't update the metric for redis errors if it's just a topic we don't know about. - tracing::warn!(topic = header.topic, "unknown topic"); - return Ok(()); - } - - let msg: Notification = match serde_json::from_str(&payload) { Ok(msg) => msg, Err(e) => { Metrics::get().proxy.redis_errors_total.inc(RedisErrors { channel: msg.get_channel_name(), }); - tracing::error!(topic = header.topic, "broken message: {e}"); + match serde_json::from_str::(&payload) { + Ok(header) => tracing::error!(topic = header.topic, "broken message: {e}"), + Err(_) => tracing::error!("broken message: {e}"), + }; return Ok(()); } }; @@ -278,6 +275,8 @@ impl MessageHandler { invalidate_cache(cache, msg); }); } + + Notification::UnknownTopic => unreachable!(), } Ok(()) @@ -304,6 +303,7 @@ fn invalidate_cache(cache: Arc, msg: Notification) { Notification::AllowedVpcEndpointsUpdatedForProjects { .. } => { // https://github.com/neondatabase/neon/pull/10073 } + Notification::UnknownTopic => unreachable!(), } } @@ -471,4 +471,30 @@ mod tests { Ok(()) } + + #[test] + fn parse_unknown_topic() -> anyhow::Result<()> { + let with_data = json!({ + "type": "message", + "topic": "/doesnotexist", + "data": { + "payload": "ignored" + }, + "extra_fields": "something" + }) + .to_string(); + let result: Notification = serde_json::from_str(&with_data)?; + assert_eq!(result, Notification::UnknownTopic); + + let without_data = json!({ + "type": "message", + "topic": "/doesnotexist", + "extra_fields": "something" + }) + .to_string(); + let result: Notification = serde_json::from_str(&without_data)?; + assert_eq!(result, Notification::UnknownTopic); + + Ok(()) + } } diff --git a/rust-toolchain.toml b/rust-toolchain.toml index f0661a32e0..06746d3e1d 100644 --- a/rust-toolchain.toml +++ b/rust-toolchain.toml @@ -1,5 +1,5 @@ [toolchain] -channel = "1.83.0" +channel = "1.84.0" profile = "default" # The default profile includes rustc, rust-std, cargo, rust-docs, rustfmt and clippy. # https://rust-lang.github.io/rustup/concepts/profiles.html diff --git a/safekeeper/spec/MCProposerAcceptorReconfig.tla b/safekeeper/spec/MCProposerAcceptorReconfig.tla new file mode 100644 index 0000000000..a4b25e383a --- /dev/null +++ b/safekeeper/spec/MCProposerAcceptorReconfig.tla @@ -0,0 +1,41 @@ +---- MODULE MCProposerAcceptorReconfig ---- +EXTENDS TLC, ProposerAcceptorReconfig + +\* Augments the spec with model checking constraints. + +\* It slightly duplicates MCProposerAcceptorStatic, but we can't EXTENDS it +\* because it EXTENDS ProposerAcceptorStatic in turn. The duplication isn't big +\* anyway. + +\* For model checking. +CONSTANTS + max_entries, \* model constraint: max log entries acceptor/proposer can hold + max_term, \* model constraint: max allowed term + max_generation \* mode constraint: max config generation + +ASSUME max_entries \in Nat /\ max_term \in Nat /\ max_generation \in Nat + +\* Model space constraint. +StateConstraint == /\ \A p \in proposers: + /\ prop_state[p].term <= max_term + /\ Len(prop_state[p].wal) <= max_entries + /\ conf_store.generation <= max_generation + +\* Sets of proposers and acceptors and symmetric because we don't take any +\* actions depending on some concrete proposer/acceptor (like IF p = p1 THEN +\* ...) 
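The Notification change shown above (proxy/src/redis/notifications.rs) drops the hand-maintained known_topic list in favour of a catch-all UnknownTopic variant: #[serde(other)] combined with a deserialize_with helper that feeds the payload into serde::de::IgnoredAny lets messages with unknown topics, with or without a data field, deserialize cleanly instead of being counted as broken (see the serde issue #1714 reference above). Below is a condensed, standalone sketch of the same pattern; the Event, Cancel and Unknown names are illustrative, and serde plus serde_json are assumed as dependencies.

use serde::{Deserialize, Serialize};

#[derive(Debug, Deserialize, Serialize, PartialEq)]
#[serde(tag = "topic", content = "data")]
enum Event {
    #[serde(rename = "/cancel_session")]
    Cancel { session_id: String },
    // Matches any other topic; the helper below swallows whatever `data`
    // payload the unknown topic carries.
    #[serde(other, deserialize_with = "ignore_payload", skip_serializing)]
    Unknown,
}

fn ignore_payload<'de, D>(deserializer: D) -> Result<(), D::Error>
where
    D: serde::Deserializer<'de>,
{
    deserializer.deserialize_any(serde::de::IgnoredAny)?;
    Ok(())
}

fn main() {
    let known = r#"{"topic": "/cancel_session", "data": {"session_id": "abc"}}"#;
    let unknown = r#"{"topic": "/does_not_exist", "data": {"anything": 1}}"#;
    assert_eq!(
        serde_json::from_str::<Event>(known).unwrap(),
        Event::Cancel { session_id: "abc".into() }
    );
    assert_eq!(serde_json::from_str::<Event>(unknown).unwrap(), Event::Unknown);
}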
+ProposerAcceptorSymmetry == Permutations(proposers) \union Permutations(acceptors) + +\* enforce order of the vars in the error trace with ALIAS +\* Note that ALIAS is supported only since version 1.8.0 which is pre-release +\* as of writing this. +Alias == [ + prop_state |-> prop_state, + prop_conf |-> prop_conf, + acc_state |-> acc_state, + acc_conf |-> acc_conf, + committed |-> committed, + conf_store |-> conf_store + ] + +==== diff --git a/safekeeper/spec/MCProposerAcceptorStatic.tla b/safekeeper/spec/MCProposerAcceptorStatic.tla index be3d99c697..b4eca1965a 100644 --- a/safekeeper/spec/MCProposerAcceptorStatic.tla +++ b/safekeeper/spec/MCProposerAcceptorStatic.tla @@ -3,6 +3,9 @@ EXTENDS TLC, ProposerAcceptorStatic \* Augments the spec with model checking constraints. +\* Note that MCProposerAcceptorReconfig duplicates it and might need to +\* be updated as well. + \* For model checking. CONSTANTS max_entries, \* model constraint: max log entries acceptor/proposer can hold diff --git a/safekeeper/spec/ProposerAcceptorReconfig.tla b/safekeeper/spec/ProposerAcceptorReconfig.tla new file mode 100644 index 0000000000..78de231a39 --- /dev/null +++ b/safekeeper/spec/ProposerAcceptorReconfig.tla @@ -0,0 +1,350 @@ +---- MODULE ProposerAcceptorReconfig ---- + +(* + Spec for https://github.com/neondatabase/neon/blob/538e2312a617c65d489d391892c70b2e4d7407b5/docs/rfcs/035-safekeeper-dynamic-membership-change.md + + Simplifications: + - The ones inherited from ProposerAcceptorStatic. + - We don't model transient state of the configuration change driver process + (storage controller in the implementation). Its actions StartChange and FinishChange + are taken based on the persistent state of safekeepers and conf store. The + justification for that is the following: once new configuration n is + created (e.g with StartChange or FinishChange), any old configuration + change driver working on older conf < n will never be able to commit + it to the conf store because it is protected by CAS. The + propagation of these older confs is still possible though, and + spec allows to do it through acceptors. + Plus the model is already pretty huge. + - Previous point also means that the FinishChange action is + based only on the current state of safekeepers, not from + the past. That's ok because while individual + acceptor may go down, + quorum one never does. So the FinishChange + condition which collects max of the quorum may get + only more strict over time. + + The invariants expectedly break if any of FinishChange + required conditions are removed. +*) + +EXTENDS Integers, Sequences, FiniteSets, TLC + +VARIABLES + \* state which is the same in the static spec + prop_state, + acc_state, + committed, + elected_history, + \* reconfiguration only state + prop_conf, \* prop_conf[p] is current configuration of proposer p + acc_conf, \* acc_conf[a] is current configuration of acceptor a + conf_store \* configuration in the configuration store. + +CONSTANT + acceptors, + proposers + +CONSTANT NULL + +\* Import ProposerAcceptorStatic under PAS. +\* +\* Note that all vars and consts are named the same and thus substituted +\* implicitly. 
+PAS == INSTANCE ProposerAcceptorStatic + +\******************************************************************************** +\* Helpers +\******************************************************************************** + +\******************************************************************************** +\* Type assertion +\******************************************************************************** + +\* Is c a valid config? +IsConfig(c) == + /\ DOMAIN c = {"generation", "members", "newMembers"} + \* Unique id of the configuration. + /\ c.generation \in Nat + /\ c.members \in SUBSET acceptors + \* newMembers is NULL when it is not a joint conf. + /\ \/ c.newMembers = NULL + \/ c.newMembers \in SUBSET acceptors + +TypeOk == + /\ PAS!TypeOk + /\ \A p \in proposers: IsConfig(prop_conf[p]) + /\ \A a \in acceptors: IsConfig(acc_conf[a]) + /\ IsConfig(conf_store) + +\******************************************************************************** +\* Initial +\******************************************************************************** + +Init == + /\ PAS!Init + /\ \E init_members \in SUBSET acceptors: + LET init_conf == [generation |-> 1, members |-> init_members, newMembers |-> NULL] IN + \* refer to RestartProposer why it is not NULL + /\ prop_conf = [p \in proposers |-> init_conf] + /\ acc_conf = [a \in acceptors |-> init_conf] + /\ conf_store = init_conf + \* We could start with anything, but to reduce state space state with + \* the most reasonable total acceptors - 1 conf size, which e.g. + \* makes basic {a1} -> {a2} change in {a1, a2} acceptors and {a1, a2, + \* a3} -> {a2, a3, a4} in {a1, a2, a3, a4} acceptors models even in + \* the smallest models with single change. + /\ Cardinality(init_members) = Cardinality(acceptors) - 1 + +\******************************************************************************** +\* Actions +\******************************************************************************** + +\* Proposer p loses all state, restarting. In the static spec we bump restarted +\* proposer term to max of some quorum + 1 which is a minimal term which can win +\* election. With reconfigurations it's harder to calculate such a term, so keep +\* it simple and take random acceptor one + 1. +\* +\* Also make proposer to adopt configuration of another random acceptor. In the +\* impl proposer starts with NULL configuration until handshake with first +\* acceptor. Removing this NULL special case makes the spec a bit simpler. +RestartProposer(p) == + /\ \E a \in acceptors: PAS!RestartProposerWithTerm(p, acc_state[a].term + 1) + /\ \E a \in acceptors: prop_conf' = [prop_conf EXCEPT ![p] = acc_conf[a]] + /\ UNCHANGED <> + +\* Acceptor a immediately votes for proposer p. +Vote(p, a) == + \* Configuration must be the same. + /\ prop_conf[p].generation = acc_conf[a].generation + \* And a is expected be a member of it. This is likely redundant as long as + \* becoming leader checks membership (though vote also contributes to max + \* calculation). + /\ \/ a \in prop_conf[p].members + \/ (prop_conf[p].newMembers /= NULL) /\ (a \in prop_conf[p].newMembers) + /\ PAS!Vote(p, a) + /\ UNCHANGED <> + +\* Proposer p gets elected. +BecomeLeader(p) == + /\ prop_state[p].state = "campaign" + \* Votes must form quorum in both sets (if the newMembers exists). 
+ /\ PAS!FormsQuorum(DOMAIN prop_state[p].votes, prop_conf[p].members) + /\ \/ prop_conf[p].newMembers = NULL + \* TLA+ disjunction evaluation doesn't short-circuit for a good reason: + \* https://groups.google.com/g/tlaplus/c/U6tOJ4dsjVM/m/UdOznPCVBwAJ + \* so repeat the null check. + \/ (prop_conf[p].newMembers /= NULL) /\ (PAS!FormsQuorum(DOMAIN prop_state[p].votes, prop_conf[p].newMembers)) + \* DoBecomeLeader will copy WAL of the highest voter to proposer's WAL, so + \* ensure its conf is still the same. In the impl WAL fetching also has to + \* check the configuration. + /\ prop_conf[p].generation = acc_conf[PAS!MaxVoteAcc(p)].generation + /\ \A a \in DOMAIN prop_state[p].votes: prop_conf[p].generation = acc_conf[a].generation + /\ PAS!DoBecomeLeader(p) + /\ UNCHANGED <> + +UpdateTerm(p, a) == + /\ PAS!UpdateTerm(p, a) + /\ UNCHANGED <> + +TruncateWal(p, a) == + /\ prop_state[p].state = "leader" + \* Configuration must be the same. + /\ prop_conf[p].generation = acc_conf[a].generation + /\ PAS!TruncateWal(p, a) + /\ UNCHANGED <> + +NewEntry(p) == + /\ PAS!NewEntry(p) + /\ UNCHANGED <> + +AppendEntry(p, a) == + /\ prop_state[p].state = "leader" + \* Configuration must be the same. + /\ prop_conf[p].generation = acc_conf[a].generation + \* And a is member of it. Ignoring this likely wouldn't hurt, but not useful + \* either. + /\ \/ a \in prop_conf[p].members + \/ (prop_conf[p].newMembers /= NULL) /\ (a \in prop_conf[p].newMembers) + /\ PAS!AppendEntry(p, a) + /\ UNCHANGED <> + +\* see PAS!CommitEntries for comments. +CommitEntries(p) == + /\ prop_state[p].state = "leader" + /\ \E q1 \in PAS!AllMinQuorums(prop_conf[p].members): + LET q1_commit_lsn == PAS!QuorumCommitLsn(p, q1) IN + \* Configuration must be the same. + /\ \A a \in q1: prop_conf[p].generation = acc_conf[a].generation + /\ q1_commit_lsn /= NULL + \* We must collect acks from both quorums, if newMembers is present. + /\ IF prop_conf[p].newMembers = NULL THEN + PAS!DoCommitEntries(p, q1_commit_lsn) + ELSE + \E q2 \in PAS!AllMinQuorums(prop_conf[p].newMembers): + LET q2_commit_lsn == PAS!QuorumCommitLsn(p, q2) IN + \* Configuration must be the same. + /\ \A a \in q1: prop_conf[p].generation = acc_conf[a].generation + /\ q2_commit_lsn /= NULL + /\ PAS!DoCommitEntries(p, PAS!Min(q1_commit_lsn, q2_commit_lsn)) + /\ UNCHANGED <> + +\* Proposer p adopts higher conf c from conf store or from some acceptor. +ProposerSwitchConf(p) == + /\ \E c \in ({conf_store} \union {acc_conf[a]: a \in acceptors}): + \* p's conf is lower than c. + /\ (c.generation > prop_conf[p].generation) + \* We allow to bump conf without restart only when wp is already elected. + \* If it isn't, the votes it has already collected are from the previous + \* configuration and can't be used. + \* + \* So if proposer is in 'campaign' in the impl we would restart preserving + \* conf and increasing term. In the spec this transition is already covered + \* by more a generic RestartProposer, so we don't specify it here. + /\ prop_state[p].state = "leader" + /\ prop_conf' = [prop_conf EXCEPT ![p] = c] + /\ UNCHANGED <> + +\* Do CAS on the conf store, starting change into the new_members conf. +StartChange(new_members) == + \* Possible only if we don't already have the change in progress. + /\ conf_store.newMembers = NULL + \* Not necessary, but reduces space a bit. 
+ /\ new_members /= conf_store.members + /\ conf_store' = [generation |-> conf_store.generation + 1, members |-> conf_store.members, newMembers |-> new_members] + /\ UNCHANGED <> + +\* Acceptor's last_log_term. +AccLastLogTerm(acc) == + PAS!LastLogTerm(PAS!AcceptorTermHistory(acc)) + +\* Do CAS on the conf store, transferring joint conf into the newMembers only. +FinishChange == + \* have joint conf + /\ conf_store.newMembers /= NULL + \* The conditions for finishing the change are: + /\ \E qo \in PAS!AllMinQuorums(conf_store.members): + \* 1) Old majority must be aware of the joint conf. + \* Note: generally the driver can't know current acceptor + \* generation, it can only know that it once had been the + \* expected one, but it might have advanced since then. + \* But as explained at the top of the file if acceptor gen + \* advanced, FinishChange will never be able to complete + \* due to CAS anyway. We use strict equality here because + \* that's what makes sense conceptually (old driver should + \* abandon its attempt if it observes that conf has advanced). + /\ \A a \in qo: conf_store.generation = acc_conf[a].generation + \* 2) New member set must have log synced, i.e. some its majority needs + \* to have at least as high as max of some + \* old majority. + \* 3) Term must be synced, i.e. some majority of the new set must + \* have term >= than max term of some old majority. + \* This ensures that two leaders are never elected with the same + \* term even after config change (which would be bad unless we treat + \* generation as a part of term which we don't). + \* 4) A majority of the new set must be aware of the joint conf. + \* This allows to safely destoy acceptor state if it is not a + \* member of its current conf (which is useful for cleanup after + \* migration as well as for aborts). + /\ LET sync_pos == PAS!MaxTermLsn({[term |-> AccLastLogTerm(a), lsn |-> PAS!FlushLsn(a)]: a \in qo}) + sync_term == PAS!Maximum({acc_state[a].term: a \in qo}) + IN + \E qn \in PAS!AllMinQuorums(conf_store.newMembers): + \A a \in qn: + /\ PAS!TermLsnGE([term |-> AccLastLogTerm(a), lsn |-> PAS!FlushLsn(a)], sync_pos) + /\ acc_state[a].term >= sync_term + \* The same note as above about strict equality applies here. + /\ conf_store.generation = acc_conf[a].generation + /\ conf_store' = [generation |-> conf_store.generation + 1, members |-> conf_store.newMembers, newMembers |-> NULL] + /\ UNCHANGED <> + +\* Do CAS on the conf store, aborting the change in progress. +AbortChange == + \* have joint conf + /\ conf_store.newMembers /= NULL + /\ conf_store' = [generation |-> conf_store.generation + 1, members |-> conf_store.members, newMembers |-> NULL] + /\ UNCHANGED <> + +\* Acceptor a switches to higher configuration from the conf store +\* or from some proposer. +AccSwitchConf(a) == + /\ \E c \in ({conf_store} \union {prop_conf[p]: p \in proposers}): + /\ acc_conf[a].generation < c.generation + /\ acc_conf' = [acc_conf EXCEPT ![a] = c] + /\ UNCHANGED <> + +\* Nuke all acceptor state if it is not a member of its current conf. Models +\* cleanup after migration/abort. +AccReset(a) == + /\ \/ (acc_conf[a].newMembers = NULL) /\ (a \notin acc_conf[a].members) + \/ (acc_conf[a].newMembers /= NULL) /\ (a \notin (acc_conf[a].members \union acc_conf[a].newMembers)) + /\ acc_state' = [acc_state EXCEPT ![a] = PAS!InitAcc] + \* Set nextSendLsn to `a` to NULL everywhere. 
nextSendLsn serves as a mark + \* that elected proposer performed TruncateWal on the acceptor, which isn't + \* true anymore after state reset. In the impl local deletion is expected to + \* terminate all existing connections. + /\ prop_state' = [p \in proposers |-> [prop_state[p] EXCEPT !.nextSendLsn[a] = NULL]] + /\ UNCHANGED <> + +\******************************************************************************* +\* Final spec +\******************************************************************************* + +Next == + \/ \E p \in proposers: RestartProposer(p) + \/ \E p \in proposers: \E a \in acceptors: Vote(p, a) + \/ \E p \in proposers: BecomeLeader(p) + \/ \E p \in proposers: \E a \in acceptors: UpdateTerm(p, a) + \/ \E p \in proposers: \E a \in acceptors: TruncateWal(p, a) + \/ \E p \in proposers: NewEntry(p) + \/ \E p \in proposers: \E a \in acceptors: AppendEntry(p, a) + \/ \E p \in proposers: CommitEntries(p) + \/ \E new_members \in SUBSET acceptors: StartChange(new_members) + \/ FinishChange + \/ AbortChange + \/ \E p \in proposers: ProposerSwitchConf(p) + \/ \E a \in acceptors: AccSwitchConf(a) + \/ \E a \in acceptors: AccReset(a) + +Spec == Init /\ [][Next]_<> + +\******************************************************************************** +\* Invariants +\******************************************************************************** + +AllConfs == + {conf_store} \union {prop_conf[p]: p \in proposers} \union {acc_conf[a]: a \in acceptors} + +\* Fairly trivial (given the conf store) invariant that different configurations +\* with the same generation are never issued. +ConfigSafety == + \A c1, c2 \in AllConfs: + (c1.generation = c2.generation) => (c1 = c2) + +ElectionSafety == PAS!ElectionSafety + +ElectionSafetyFull == PAS!ElectionSafetyFull + +LogIsMonotonic == PAS!LogIsMonotonic + +LogSafety == PAS!LogSafety + +\******************************************************************************** +\* Invariants which don't need to hold, but useful for playing/debugging. +\******************************************************************************** + +\* Check that we ever switch into non joint conf. +MaxAccConf == ~ \E a \in acceptors: + /\ acc_conf[a].generation = 3 + /\ acc_conf[a].newMembers /= NULL + +CommittedNotTruncated == PAS!CommittedNotTruncated + +MaxTerm == PAS!MaxTerm + +MaxStoreConf == conf_store.generation <= 1 + +MaxAccWalLen == PAS!MaxAccWalLen + +MaxCommitLsn == PAS!MaxCommitLsn + +==== diff --git a/safekeeper/spec/ProposerAcceptorStatic.tla b/safekeeper/spec/ProposerAcceptorStatic.tla index b2d2f005db..fab085bc2e 100644 --- a/safekeeper/spec/ProposerAcceptorStatic.tla +++ b/safekeeper/spec/ProposerAcceptorStatic.tla @@ -18,7 +18,7 @@ \* - old WAL is immediately copied to proposer on its election, without on-demand fetch later. \* Some ideas how to break it to play around to get a feeling: -\* - replace Quorums with BadQuorums. +\* - replace Quorum with BadQuorum. \* - remove 'don't commit entries from previous terms separately' rule in \* CommitEntries and observe figure 8 from the raft paper. \* With p2a3t4l4 32 steps error was found in 1h on 80 cores. @@ -69,16 +69,26 @@ Upsert(f, k, v, l(_)) == \***************** -NumAccs == Cardinality(acceptors) +\* Does set of acceptors `acc_set` form the quorum in the member set `members`? +\* Acceptors not from `members` are excluded (matters only for reconfig). +FormsQuorum(acc_set, members) == + Cardinality(acc_set \intersect members) >= (Cardinality(members) \div 2 + 1) -\* does acc_set form the quorum? 
-Quorum(acc_set) == Cardinality(acc_set) >= (NumAccs \div 2 + 1) -\* all quorums of acceptors -Quorums == {subset \in SUBSET acceptors: Quorum(subset)} +\* Like FormsQuorum, but for minimal quorum. +FormsMinQuorum(acc_set, members) == + Cardinality(acc_set \intersect members) = (Cardinality(members) \div 2 + 1) -\* For substituting Quorums and seeing what happens. -BadQuorum(acc_set) == Cardinality(acc_set) >= (NumAccs \div 2) -BadQuorums == {subset \in SUBSET acceptors: BadQuorum(subset)} +\* All sets of acceptors forming minimal quorums in the member set `members`. +AllQuorums(members) == {subset \in SUBSET members: FormsQuorum(subset, members)} +AllMinQuorums(members) == {subset \in SUBSET acceptors: FormsMinQuorum(subset, members)} + +\* For substituting Quorum and seeing what happens. +FormsBadQuorum(acc_set, members) == + Cardinality(acc_set \intersect members) >= (Cardinality(members) \div 2) +FormsMinBadQuorum(acc_set, members) == + Cardinality(acc_set \intersect members) = (Cardinality(members) \div 2) +AllBadQuorums(members) == {subset \in SUBSET acceptors: FormsBadQuorum(subset, members)} +AllMinBadQuorums(members) == {subset \in SUBSET acceptors: FormsMinBadQuorum(subset, members)} \* flushLsn (end of WAL, i.e. index of next entry) of acceptor a. FlushLsn(a) == Len(acc_state[a].wal) + 1 @@ -135,10 +145,11 @@ TypeOk == /\ IsWal(prop_state[p].wal) \* Map of acceptor -> next lsn to send. It is set when truncate_wal is \* done so sending entries is allowed only after that. In the impl TCP - \* ensures this ordering. + \* ensures this ordering. We use NULL instead of missing value to use + \* EXCEPT in AccReset. /\ \A a \in DOMAIN prop_state[p].nextSendLsn: /\ a \in acceptors - /\ prop_state[p].nextSendLsn[a] \in Lsns + /\ prop_state[p].nextSendLsn[a] \in Lsns \union {NULL} /\ \A a \in acceptors: /\ DOMAIN acc_state[a] = {"term", "termHistory", "wal"} /\ acc_state[a].term \in Terms @@ -167,6 +178,19 @@ TypeOk == \* Initial \******************************************************************************** +InitAcc == + [ + \* There will be no leader in zero term, 1 is the first + \* real. + term |-> 0, + \* Again, leader in term 0 doesn't exist, but we initialize + \* term histories with it to always have common point in + \* them. Lsn is 1 because TLA+ sequences are indexed from 1 + \* (we don't want to truncate WAL out of range). + termHistory |-> << [term |-> 0, lsn |-> 1] >>, + wal |-> << >> + ] + Init == /\ prop_state = [p \in proposers |-> [ state |-> "campaign", @@ -174,19 +198,9 @@ Init == votes |-> EmptyF, termHistory |-> << >>, wal |-> << >>, - nextSendLsn |-> EmptyF + nextSendLsn |-> [a \in acceptors |-> NULL] ]] - /\ acc_state = [a \in acceptors |-> [ - \* There will be no leader in zero term, 1 is the first - \* real. - term |-> 0, - \* Again, leader in term 0 doesn't exist, but we initialize - \* term histories with it to always have common point in - \* them. Lsn is 1 because TLA+ sequences are indexed from 1 - \* (we don't want to truncate WAL out of range). - termHistory |-> << [term |-> 0, lsn |-> 1] >>, - wal |-> << >> - ]] + /\ acc_state = [a \in acceptors |-> InitAcc] /\ committed = {} /\ elected_history = EmptyF @@ -195,23 +209,35 @@ Init == \* Actions \******************************************************************************** -\* Proposer loses all state. 
+RestartProposerWithTerm(p, new_term) ==
+  /\ prop_state' = [prop_state EXCEPT ![p].state = "campaign",
+                                      ![p].term = new_term,
+                                      ![p].votes = EmptyF,
+                                      ![p].termHistory = << >>,
+                                      ![p].wal = << >>,
+                                      ![p].nextSendLsn = [a \in acceptors |-> NULL]]
+  /\ UNCHANGED <>
+
+\* Proposer p loses all state, restarting.
 \* For simplicity (and to reduct state space), we assume it immediately gets
 \* current state from quorum q of acceptors determining the term he will request
 \* to vote for.
-RestartProposer(p, q) ==
-  /\ Quorum(q)
-  /\ LET new_term == Maximum({acc_state[a].term : a \in q}) + 1 IN
-     /\ prop_state' = [prop_state EXCEPT ![p].state = "campaign",
-                                         ![p].term = new_term,
-                                         ![p].votes = EmptyF,
-                                         ![p].termHistory = << >>,
-                                         ![p].wal = << >>,
-                                         ![p].nextSendLsn = EmptyF]
-  /\ UNCHANGED <>
+RestartProposer(p) ==
+  \E q \in AllQuorums(acceptors):
+    LET new_term == Maximum({acc_state[a].term : a \in q}) + 1 IN
+    RestartProposerWithTerm(p, new_term)

 \* Term history of acceptor a's WAL: the one saved truncated to contain only <=
-\* local FlushLsn entries.
+\* local FlushLsn entries. Note that FlushLsn is the end LSN of the last entry
+\* (and the begin LSN of the next). The mental model for the non-strict comparison
+\* is that once a proposer is elected it immediately writes a log record of zero
+\* length. This allows the leader to commit the existing log without writing any
+\* new entries. For example, assume an acceptor has WAL
+\*   1.1, 1.2
+\* written by a prop with term 1; its current <last_log_term, flush_lsn>
+\* is <1, 3>. Now a prop with term 2 and max vote from this acc is elected.
+\* Once TruncateWAL is done, <last_log_term, flush_lsn> becomes <2, 3>
+\* without any new records explicitly written.
 AcceptorTermHistory(a) ==
   SelectSeq(acc_state[a].termHistory, LAMBDA th_entry: th_entry.lsn <= FlushLsn(a))
@@ -230,35 +256,52 @@ Vote(p, a) ==
 \* Get lastLogTerm from term history th.
 LastLogTerm(th) == th[Len(th)].term

+\* Compares <term, lsn> pairs: returns true if tl1 >= tl2.
+TermLsnGE(tl1, tl2) ==
+  /\ tl1.term >= tl2.term
+  /\ (tl1.term = tl2.term => tl1.lsn >= tl2.lsn)
+
+\* Choose the max <term, lsn> pair in a non-empty set of them.
+MaxTermLsn(term_lsn_set) ==
+  CHOOSE max_tl \in term_lsn_set: \A tl \in term_lsn_set: TermLsnGE(max_tl, tl)
+
+\* Find the acceptor with the highest vote in proposer p's votes.
+MaxVoteAcc(p) ==
+  CHOOSE a \in DOMAIN prop_state[p].votes:
+    LET a_vote == prop_state[p].votes[a]
+        a_vote_term_lsn == [term |-> LastLogTerm(a_vote.termHistory), lsn |-> a_vote.flushLsn]
+        vote_term_lsns == {[term |-> LastLogTerm(v.termHistory), lsn |-> v.flushLsn]: v \in Range(prop_state[p].votes)}
+    IN
+      a_vote_term_lsn = MaxTermLsn(vote_term_lsns)
+
+\* Workhorse for BecomeLeader.
+\* Assumes the check that prop_state[p].votes forms a quorum has been done *outside*.
+DoBecomeLeader(p) ==
+  LET
+    \* Find acceptor with the highest vote.
+    max_vote_acc == MaxVoteAcc(p)
+    max_vote == prop_state[p].votes[max_vote_acc]
+    prop_th == Append(max_vote.termHistory, [term |-> prop_state[p].term, lsn |-> max_vote.flushLsn])
+  IN
+  \* We copy all log preceding proposer's term from the max vote node so
+  \* make sure it is still on one term with us. This is a model
+  \* simplification which can be removed, in impl we fetch WAL on demand
+  \* from safekeeper which has it later. Note though that in case of on
+  \* demand fetch we must check on donor not only term match, but that
+  \* truncate_wal had already been done (if it is not max_vote_acc).
+ /\ acc_state[max_vote_acc].term = prop_state[p].term + /\ prop_state' = [prop_state EXCEPT ![p].state = "leader", + ![p].termHistory = prop_th, + ![p].wal = acc_state[max_vote_acc].wal + ] + /\ elected_history' = Upsert(elected_history, prop_state[p].term, 1, LAMBDA c: c + 1) + /\ UNCHANGED <> + \* Proposer p gets elected. BecomeLeader(p) == /\ prop_state[p].state = "campaign" - /\ Quorum(DOMAIN prop_state[p].votes) - /\ LET - \* Find acceptor with the highest vote. - max_vote_acc == - CHOOSE a \in DOMAIN prop_state[p].votes: - LET v == prop_state[p].votes[a] - IN \A v2 \in Range(prop_state[p].votes): - /\ LastLogTerm(v.termHistory) >= LastLogTerm(v2.termHistory) - /\ (LastLogTerm(v.termHistory) = LastLogTerm(v2.termHistory) => v.flushLsn >= v2.flushLsn) - max_vote == prop_state[p].votes[max_vote_acc] - prop_th == Append(max_vote.termHistory, [term |-> prop_state[p].term, lsn |-> max_vote.flushLsn]) - IN - \* We copy all log preceding proposer's term from the max vote node so - \* make sure it is still on one term with us. This is a model - \* simplification which can be removed, in impl we fetch WAL on demand - \* from safekeeper which has it later. Note though that in case of on - \* demand fetch we must check on donor not only term match, but that - \* truncate_wal had already been done (if it is not max_vote_acc). - /\ acc_state[max_vote_acc].term = prop_state[p].term - /\ prop_state' = [prop_state EXCEPT ![p].state = "leader", - ![p].termHistory = prop_th, - ![p].wal = acc_state[max_vote_acc].wal - ] - /\ elected_history' = Upsert(elected_history, prop_state[p].term, 1, LAMBDA c: c + 1) - /\ UNCHANGED <> - + /\ FormsQuorum(DOMAIN prop_state[p].votes, acceptors) + /\ DoBecomeLeader(p) \* Acceptor a learns about elected proposer p's term. In impl it matches to \* VoteRequest/VoteResponse exchange when leader is already elected and is not @@ -287,10 +330,11 @@ FindHighestCommonPoint(prop_th, acc_th, acc_flush_lsn) == IN [term |-> last_common_term, lsn |-> Min(acc_common_term_end, prop_common_term_end)] -\* Elected proposer p immediately truncates WAL (and term history) of acceptor a -\* before starting streaming. Establishes nextSendLsn for a. +\* Elected proposer p immediately truncates WAL (and sets term history) of +\* acceptor a before starting streaming. Establishes nextSendLsn for a. \* -\* In impl this happens at each reconnection, here we also allow to do it multiple times. +\* In impl this happens at each reconnection, here we also allow to do it +\* multiple times. TruncateWal(p, a) == /\ prop_state[p].state = "leader" /\ acc_state[a].term = prop_state[p].term @@ -321,8 +365,8 @@ NewEntry(p) == AppendEntry(p, a) == /\ prop_state[p].state = "leader" /\ acc_state[a].term = prop_state[p].term - /\ a \in DOMAIN prop_state[p].nextSendLsn \* did TruncateWal - /\ prop_state[p].nextSendLsn[a] <= Len(prop_state[p].wal) \* have smth to send + /\ prop_state[p].nextSendLsn[a] /= NULL \* did TruncateWal + /\ prop_state[p].nextSendLsn[a] <= Len(prop_state[p].wal) \* have smth to send /\ LET send_lsn == prop_state[p].nextSendLsn[a] entry == prop_state[p].wal[send_lsn] @@ -337,41 +381,65 @@ AppendEntry(p, a) == PropStartLsn(p) == IF prop_state[p].state = "leader" THEN prop_state[p].termHistory[Len(prop_state[p].termHistory)].lsn ELSE NULL -\* Proposer p commits all entries it can using quorum q. Note that unlike -\* will62794/logless-reconfig this allows to commit entries from previous terms -\* (when conditions for that are met). 
-CommitEntries(p, q) == - /\ prop_state[p].state = "leader" - /\ \A a \in q: +\* LSN which can be committed by proposer p using min quorum q (check that q +\* forms quorum must have been done outside). NULL if there is none. +QuorumCommitLsn(p, q) == + IF + /\ prop_state[p].state = "leader" + /\ \A a \in q: + \* Without explicit responses to appends this ensures that append + \* up to FlushLsn has been accepted. /\ acc_state[a].term = prop_state[p].term \* nextSendLsn existence means TruncateWal has happened, it ensures \* acceptor's WAL (and FlushLsn) are from proper proposer's history. \* Alternatively we could compare LastLogTerm here, but that's closer to \* what we do in the impl (we check flushLsn in AppendResponse, but \* AppendRequest is processed only if HandleElected handling was good). - /\ a \in DOMAIN prop_state[p].nextSendLsn - \* Now find the LSN present on all the quorum. - /\ LET quorum_lsn == Minimum({FlushLsn(a): a \in q}) IN - \* This is the basic Raft rule of not committing entries from previous - \* terms except along with current term entry (commit them only when - \* quorum recovers, i.e. last_log_term on it reaches leader's term). - /\ quorum_lsn >= PropStartLsn(p) - /\ committed' = committed \cup {[term |-> prop_state[p].wal[lsn], lsn |-> lsn]: lsn \in 1..(quorum_lsn - 1)} - /\ UNCHANGED <> + /\ prop_state[p].nextSendLsn[a] /= NULL + THEN + \* Now find the LSN present on all the quorum. + LET quorum_lsn == Minimum({FlushLsn(a): a \in q}) IN + \* This is the basic Raft rule of not committing entries from previous + \* terms except along with current term entry (commit them only when + \* quorum recovers, i.e. last_log_term on it reaches leader's term). + IF quorum_lsn >= PropStartLsn(p) THEN + quorum_lsn + ELSE + NULL + ELSE + NULL + +\* Commit all entries on proposer p with record lsn < commit_lsn. +DoCommitEntries(p, commit_lsn) == + /\ committed' = committed \cup {[term |-> prop_state[p].wal[lsn], lsn |-> lsn]: lsn \in 1..(commit_lsn - 1)} + /\ UNCHANGED <> + +\* Proposer p commits all entries it can using some quorum. Note that unlike +\* will62794/logless-reconfig this allows to commit entries from previous terms +\* (when conditions for that are met). +CommitEntries(p) == + /\ prop_state[p].state = "leader" + \* Using min quorums here is better because 1) QuorumCommitLsn for + \* simplicity checks min across all accs in q. 2) it probably makes + \* evaluation faster. + /\ \E q \in AllMinQuorums(acceptors): + LET commit_lsn == QuorumCommitLsn(p, q) IN + /\ commit_lsn /= NULL + /\ DoCommitEntries(p, commit_lsn) \******************************************************************************* \* Final spec \******************************************************************************* Next == - \/ \E q \in Quorums: \E p \in proposers: RestartProposer(p, q) + \/ \E p \in proposers: RestartProposer(p) \/ \E p \in proposers: \E a \in acceptors: Vote(p, a) \/ \E p \in proposers: BecomeLeader(p) \/ \E p \in proposers: \E a \in acceptors: UpdateTerm(p, a) \/ \E p \in proposers: \E a \in acceptors: TruncateWal(p, a) \/ \E p \in proposers: NewEntry(p) \/ \E p \in proposers: \E a \in acceptors: AppendEntry(p, a) - \/ \E q \in Quorums: \E p \in proposers: CommitEntries(p, q) + \/ \E p \in proposers: CommitEntries(p) Spec == Init /\ [][Next]_<> diff --git a/safekeeper/spec/modelcheck.sh b/safekeeper/spec/modelcheck.sh index 21ead7dad8..0084a8c638 100755 --- a/safekeeper/spec/modelcheck.sh +++ b/safekeeper/spec/modelcheck.sh @@ -2,6 +2,7 @@ # Usage: ./modelcheck.sh , e.g. 
# ./modelcheck.sh models/MCProposerAcceptorStatic_p2_a3_t3_l3.cfg MCProposerAcceptorStatic.tla +# ./modelcheck.sh models/MCProposerAcceptorReconfig_p2_a3_t3_l3_c3.cfg MCProposerAcceptorReconfig.tla CONFIG=$1 SPEC=$2 @@ -12,6 +13,7 @@ mkdir -p "tlc-results" CONFIG_FILE=$(basename -- "$CONFIG") outfilename="$SPEC-${CONFIG_FILE}-$(date --utc +%Y-%m-%d--%H-%M-%S)".log outfile="tlc-results/$outfilename" +echo "saving results to $outfile" touch $outfile # Save some info about the run. @@ -45,5 +47,6 @@ echo "" >> $outfile # https://docs.tlapl.us/codebase:architecture#fingerprint_sets_fpsets # # Add -simulate to run in infinite simulation mode. +# -coverage 1 is useful for profiling (check how many times actions are taken). java -Xmx$MEM -XX:MaxDirectMemorySize=$MEM -XX:+UseParallelGC -Dtlc2.tool.fp.FPSet.impl=tlc2.tool.fp.OffHeapDiskFPSet \ -cp "${TOOLSPATH}" tlc2.TLC $SPEC -config $CONFIG -workers auto -gzip | tee -a $outfile diff --git a/safekeeper/spec/models/MCProposerAcceptorReconfig_p2_a2_t2_l2_c3.cfg b/safekeeper/spec/models/MCProposerAcceptorReconfig_p2_a2_t2_l2_c3.cfg new file mode 100644 index 0000000000..8d34751083 --- /dev/null +++ b/safekeeper/spec/models/MCProposerAcceptorReconfig_p2_a2_t2_l2_c3.cfg @@ -0,0 +1,21 @@ +CONSTANTS +NULL = NULL +proposers = {p1, p2} +acceptors = {a1, a2} +max_term = 2 +max_entries = 2 +max_generation = 3 +SPECIFICATION Spec +CONSTRAINT StateConstraint +INVARIANT +TypeOk +ConfigSafety +ElectionSafetyFull +LogIsMonotonic +LogSafety +\* As its comment explains generally it is not expected to hold, but +\* in such small model it is true. +CommittedNotTruncated +SYMMETRY ProposerAcceptorSymmetry +CHECK_DEADLOCK FALSE +ALIAS Alias diff --git a/safekeeper/spec/models/MCProposerAcceptorReconfig_p2_a2_t2_l2_c5.cfg b/safekeeper/spec/models/MCProposerAcceptorReconfig_p2_a2_t2_l2_c5.cfg new file mode 100644 index 0000000000..eb7e0768ff --- /dev/null +++ b/safekeeper/spec/models/MCProposerAcceptorReconfig_p2_a2_t2_l2_c5.cfg @@ -0,0 +1,19 @@ +CONSTANTS +NULL = NULL +proposers = {p1, p2} +acceptors = {a1, a2} +max_term = 2 +max_entries = 2 +max_generation = 5 +SPECIFICATION Spec +CONSTRAINT StateConstraint +INVARIANT +TypeOk +ConfigSafety +ElectionSafetyFull +LogIsMonotonic +LogSafety +CommittedNotTruncated +SYMMETRY ProposerAcceptorSymmetry +CHECK_DEADLOCK FALSE +ALIAS Alias diff --git a/safekeeper/spec/models/MCProposerAcceptorReconfig_p2_a3_t2_l2_c3.cfg b/safekeeper/spec/models/MCProposerAcceptorReconfig_p2_a3_t2_l2_c3.cfg new file mode 100644 index 0000000000..b5fae13880 --- /dev/null +++ b/safekeeper/spec/models/MCProposerAcceptorReconfig_p2_a3_t2_l2_c3.cfg @@ -0,0 +1,20 @@ +CONSTANTS +NULL = NULL +proposers = {p1, p2} +acceptors = {a1, a2, a3} +max_term = 2 +max_entries = 2 +max_generation = 3 +SPECIFICATION Spec +CONSTRAINT StateConstraint +INVARIANT +TypeOk +ConfigSafety +ElectionSafetyFull +LogIsMonotonic +LogSafety +CommittedNotTruncated +SYMMETRY ProposerAcceptorSymmetry +CHECK_DEADLOCK FALSE +ALIAS Alias + diff --git a/safekeeper/spec/models/MCProposerAcceptorReconfig_p2_a4_t2_l2_c3.cfg b/safekeeper/spec/models/MCProposerAcceptorReconfig_p2_a4_t2_l2_c3.cfg new file mode 100644 index 0000000000..71af9fa367 --- /dev/null +++ b/safekeeper/spec/models/MCProposerAcceptorReconfig_p2_a4_t2_l2_c3.cfg @@ -0,0 +1,19 @@ +CONSTANTS +NULL = NULL +proposers = {p1, p2} +acceptors = {a1, a2, a3, a4} +max_term = 2 +max_entries = 2 +max_generation = 3 +SPECIFICATION Spec +CONSTRAINT StateConstraint +INVARIANT +TypeOk +ElectionSafetyFull +LogIsMonotonic +LogSafety 
+CommittedNotTruncated +SYMMETRY ProposerAcceptorSymmetry +CHECK_DEADLOCK FALSE +ALIAS Alias + diff --git a/safekeeper/spec/remove_interm_progress.awk b/safekeeper/spec/remove_interm_progress.awk new file mode 100644 index 0000000000..6203f6fa4f --- /dev/null +++ b/safekeeper/spec/remove_interm_progress.awk @@ -0,0 +1,25 @@ +# Print all lines, but thin out lines starting with Progress: +# leave only first and last 5 ones in the beginning, and only 1 of 1440 +# of others (once a day). +# Also remove checkpointing logs. +{ + lines[NR] = $0 +} +$0 ~ /^Progress/ { + ++pcount +} +END { + progress_idx = 0 + for (i = 1; i <= NR; i++) { + if (lines[i] ~ /^Progress/) { + if (progress_idx < 5 || progress_idx >= pcount - 5 || progress_idx % 1440 == 0) { + print lines[i] + } + progress_idx++ + } + else if (lines[i] ~ /^Checkpointing/) {} + else { + print lines[i] + } + } +} \ No newline at end of file diff --git a/safekeeper/spec/remove_interm_progress.sh b/safekeeper/spec/remove_interm_progress.sh new file mode 100755 index 0000000000..a8724a2b92 --- /dev/null +++ b/safekeeper/spec/remove_interm_progress.sh @@ -0,0 +1,3 @@ +#!/bin/bash + +awk -f remove_interm_progress.awk $1 > $1.thin \ No newline at end of file diff --git a/safekeeper/spec/tlc-results/MCProposerAcceptorReconfig.tla-MCProposerAcceptorReconfig_p2_a2_t2_l2_c3.cfg-2024-12-11--04-24-12.log b/safekeeper/spec/tlc-results/MCProposerAcceptorReconfig.tla-MCProposerAcceptorReconfig_p2_a2_t2_l2_c3.cfg-2024-12-11--04-24-12.log new file mode 100644 index 0000000000..8aac9eb58c --- /dev/null +++ b/safekeeper/spec/tlc-results/MCProposerAcceptorReconfig.tla-MCProposerAcceptorReconfig_p2_a2_t2_l2_c3.cfg-2024-12-11--04-24-12.log @@ -0,0 +1,65 @@ +git revision: 9e386917a +Platform: Linux neon-dev-arm64-1 6.8.0-49-generic #49-Ubuntu SMP PREEMPT_DYNAMIC Sun Nov 3 21:21:58 UTC 2024 aarch64 aarch64 aarch64 GNU/Linux +CPU Info Linux: Neoverse-N1 +CPU Cores Linux: 80 +CPU Info Mac: +CPU Cores Mac: +Spec: MCProposerAcceptorReconfig.tla +Config: models/MCProposerAcceptorReconfig_p2_a2_t2_l2_c3.cfg +---- +CONSTANTS +NULL = NULL +proposers = {p1, p2} +acceptors = {a1, a2} +max_term = 2 +max_entries = 2 +max_generation = 3 +SPECIFICATION Spec +CONSTRAINT StateConstraint +INVARIANT +TypeOk +ElectionSafetyFull +LogIsMonotonic +LogSafety +\* CommittedNotTruncated +SYMMETRY ProposerAcceptorSymmetry +CHECK_DEADLOCK FALSE +ALIAS Alias + +---- + +TLC2 Version 2.20 of Day Month 20?? (rev: f68cb71) +Running breadth-first search Model-Checking with fp 99 and seed -9189733667206762985 with 35 workers on 80 cores with 27307MB heap and 30720MB offheap memory [pid: 391272] (Linux 6.8.0-49-generic aarch64, Ubuntu 21.0.5 x86_64, OffHeapDiskFPSet, DiskStateQueue). 
+Parsing file /home/arseny/neon2/safekeeper/spec/MCProposerAcceptorReconfig.tla +Parsing file /tmp/tlc-3211535543066978921/TLC.tla (jar:file:/home/arseny/tla2tools.jar!/tla2sany/StandardModules/TLC.tla) +Parsing file /home/arseny/neon2/safekeeper/spec/ProposerAcceptorReconfig.tla +Parsing file /tmp/tlc-3211535543066978921/_TLCTrace.tla (jar:file:/home/arseny/tla2tools.jar!/tla2sany/StandardModules/_TLCTrace.tla) +Parsing file /tmp/tlc-3211535543066978921/Integers.tla (jar:file:/home/arseny/tla2tools.jar!/tla2sany/StandardModules/Integers.tla) +Parsing file /tmp/tlc-3211535543066978921/Sequences.tla (jar:file:/home/arseny/tla2tools.jar!/tla2sany/StandardModules/Sequences.tla) +Parsing file /tmp/tlc-3211535543066978921/FiniteSets.tla (jar:file:/home/arseny/tla2tools.jar!/tla2sany/StandardModules/FiniteSets.tla) +Parsing file /tmp/tlc-3211535543066978921/Naturals.tla (jar:file:/home/arseny/tla2tools.jar!/tla2sany/StandardModules/Naturals.tla) +Parsing file /home/arseny/neon2/safekeeper/spec/ProposerAcceptorStatic.tla +Parsing file /tmp/tlc-3211535543066978921/TLCExt.tla (jar:file:/home/arseny/tla2tools.jar!/tla2sany/StandardModules/TLCExt.tla) +Semantic processing of module Naturals +Semantic processing of module Sequences +Semantic processing of module FiniteSets +Semantic processing of module TLC +Semantic processing of module Integers +Semantic processing of module ProposerAcceptorStatic +Semantic processing of module ProposerAcceptorReconfig +Semantic processing of module TLCExt +Semantic processing of module _TLCTrace +Semantic processing of module MCProposerAcceptorReconfig +Starting... (2024-12-11 04:24:13) +Computing initial states... +Finished computing initial states: 2 states generated, with 1 of them distinct at 2024-12-11 04:24:15. +Progress(16) at 2024-12-11 04:24:18: 1,427,589 states generated (1,427,589 s/min), 142,472 distinct states found (142,472 ds/min), 47,162 states left on queue. +Model checking completed. No error has been found. + Estimates of the probability that TLC did not check all reachable states + because two distinct states had the same fingerprint: + calculated (optimistic): val = 1.0E-6 + based on the actual fingerprints: val = 4.2E-8 +17746857 states generated, 1121659 distinct states found, 0 states left on queue. +The depth of the complete state graph search is 37. +The average outdegree of the complete state graph is 1 (minimum is 0, the maximum 9 and the 95th percentile is 3). 
+Finished in 33s at (2024-12-11 04:24:46) diff --git a/safekeeper/spec/tlc-results/MCProposerAcceptorReconfig.tla-MCProposerAcceptorReconfig_p2_a2_t2_l2_c5.cfg-2024-12-11--04-26-11.log b/safekeeper/spec/tlc-results/MCProposerAcceptorReconfig.tla-MCProposerAcceptorReconfig_p2_a2_t2_l2_c5.cfg-2024-12-11--04-26-11.log new file mode 100644 index 0000000000..40e7611ae3 --- /dev/null +++ b/safekeeper/spec/tlc-results/MCProposerAcceptorReconfig.tla-MCProposerAcceptorReconfig_p2_a2_t2_l2_c5.cfg-2024-12-11--04-26-11.log @@ -0,0 +1,64 @@ +git revision: 9e386917a +Platform: Linux neon-dev-arm64-1 6.8.0-49-generic #49-Ubuntu SMP PREEMPT_DYNAMIC Sun Nov 3 21:21:58 UTC 2024 aarch64 aarch64 aarch64 GNU/Linux +CPU Info Linux: Neoverse-N1 +CPU Cores Linux: 80 +CPU Info Mac: +CPU Cores Mac: +Spec: MCProposerAcceptorReconfig.tla +Config: models/MCProposerAcceptorReconfig_p2_a2_t2_l2_c5.cfg +---- +CONSTANTS +NULL = NULL +proposers = {p1, p2} +acceptors = {a1, a2} +max_term = 2 +max_entries = 2 +max_generation = 5 +SPECIFICATION Spec +CONSTRAINT StateConstraint +INVARIANT +TypeOk +ElectionSafetyFull +LogIsMonotonic +LogSafety +\* CommittedNotTruncated +SYMMETRY ProposerAcceptorSymmetry +CHECK_DEADLOCK FALSE +ALIAS Alias + +---- + +TLC2 Version 2.20 of Day Month 20?? (rev: f68cb71) +Running breadth-first search Model-Checking with fp 114 and seed -8099467489737745861 with 35 workers on 80 cores with 27307MB heap and 30720MB offheap memory [pid: 392020] (Linux 6.8.0-49-generic aarch64, Ubuntu 21.0.5 x86_64, OffHeapDiskFPSet, DiskStateQueue). +Parsing file /home/arseny/neon2/safekeeper/spec/MCProposerAcceptorReconfig.tla +Parsing file /tmp/tlc-11757875725969857497/TLC.tla (jar:file:/home/arseny/tla2tools.jar!/tla2sany/StandardModules/TLC.tla) +Parsing file /home/arseny/neon2/safekeeper/spec/ProposerAcceptorReconfig.tla +Parsing file /tmp/tlc-11757875725969857497/_TLCTrace.tla (jar:file:/home/arseny/tla2tools.jar!/tla2sany/StandardModules/_TLCTrace.tla) +Parsing file /tmp/tlc-11757875725969857497/Integers.tla (jar:file:/home/arseny/tla2tools.jar!/tla2sany/StandardModules/Integers.tla) +Parsing file /tmp/tlc-11757875725969857497/Sequences.tla (jar:file:/home/arseny/tla2tools.jar!/tla2sany/StandardModules/Sequences.tla) +Parsing file /tmp/tlc-11757875725969857497/FiniteSets.tla (jar:file:/home/arseny/tla2tools.jar!/tla2sany/StandardModules/FiniteSets.tla) +Parsing file /tmp/tlc-11757875725969857497/Naturals.tla (jar:file:/home/arseny/tla2tools.jar!/tla2sany/StandardModules/Naturals.tla) +Parsing file /home/arseny/neon2/safekeeper/spec/ProposerAcceptorStatic.tla +Parsing file /tmp/tlc-11757875725969857497/TLCExt.tla (jar:file:/home/arseny/tla2tools.jar!/tla2sany/StandardModules/TLCExt.tla) +Semantic processing of module Naturals +Semantic processing of module Sequences +Semantic processing of module FiniteSets +Semantic processing of module TLC +Semantic processing of module Integers +Semantic processing of module ProposerAcceptorStatic +Semantic processing of module ProposerAcceptorReconfig +Semantic processing of module TLCExt +Semantic processing of module _TLCTrace +Semantic processing of module MCProposerAcceptorReconfig +Starting... (2024-12-11 04:26:12) +Computing initial states... +Finished computing initial states: 2 states generated, with 1 of them distinct at 2024-12-11 04:26:14. +Progress(14) at 2024-12-11 04:26:17: 1,519,385 states generated (1,519,385 s/min), 231,263 distinct states found (231,263 ds/min), 121,410 states left on queue. 
+Progress(20) at 2024-12-11 04:27:17: 42,757,204 states generated (41,237,819 s/min), 4,198,386 distinct states found (3,967,123 ds/min), 1,308,109 states left on queue. +Progress(22) at 2024-12-11 04:28:17: 83,613,929 states generated (40,856,725 s/min), 7,499,873 distinct states found (3,301,487 ds/min), 1,929,464 states left on queue. +Progress(23) at 2024-12-11 04:29:17: 124,086,758 states generated (40,472,829 s/min), 10,569,712 distinct states found (3,069,839 ds/min), 2,386,988 states left on queue. +Progress(24) at 2024-12-11 04:30:17: 163,412,538 states generated (39,325,780 s/min), 13,314,303 distinct states found (2,744,591 ds/min), 2,610,637 states left on queue. +Progress(25) at 2024-12-11 04:31:17: 202,643,708 states generated (39,231,170 s/min), 15,960,583 distinct states found (2,646,280 ds/min), 2,759,681 states left on queue. +Progress(26) at 2024-12-11 04:32:17: 240,681,633 states generated (38,037,925 s/min), 18,443,440 distinct states found (2,482,857 ds/min), 2,852,177 states left on queue. +Progress(27) at 2024-12-11 04:33:17: 278,559,134 states generated (37,877,501 s/min), 20,878,067 distinct states found (2,434,627 ds/min), 2,904,400 states left on queue. +Progress(28) at 2024-12-11 04:34:17: 316,699,911 states generated (38,140,777 s/min), 23,212,229 distinct states found (2,334,162 ds/min), 2,864,969 states left on queue. diff --git a/safekeeper/src/bin/safekeeper.rs b/safekeeper/src/bin/safekeeper.rs index e0ba38d638..13f6e34575 100644 --- a/safekeeper/src/bin/safekeeper.rs +++ b/safekeeper/src/bin/safekeeper.rs @@ -51,12 +51,10 @@ use utils::{ #[global_allocator] static GLOBAL: tikv_jemallocator::Jemalloc = tikv_jemallocator::Jemalloc; -// Configure jemalloc to sample allocations for profiles every 1 MB (1 << 20). -// TODO: disabled because concurrent CPU profiles cause seg faults. See: -// https://github.com/neondatabase/neon/issues/10225. -//#[allow(non_upper_case_globals)] -//#[export_name = "malloc_conf"] -//pub static malloc_conf: &[u8] = b"prof:true,prof_active:true,lg_prof_sample:20\0"; +/// Configure jemalloc to sample allocations for profiles every 1 MB (1 << 20). +#[allow(non_upper_case_globals)] +#[export_name = "malloc_conf"] +pub static malloc_conf: &[u8] = b"prof:true,prof_active:true,lg_prof_sample:20\0"; const PID_FILE_NAME: &str = "safekeeper.pid"; const ID_FILE_NAME: &str = "safekeeper.id"; diff --git a/storage_controller/client/src/control_api.rs b/storage_controller/client/src/control_api.rs index a981b5020e..f8a2790769 100644 --- a/storage_controller/client/src/control_api.rs +++ b/storage_controller/client/src/control_api.rs @@ -1,7 +1,6 @@ use pageserver_client::mgmt_api::{self, ResponseErrorMessageExt}; use reqwest::{Method, Url}; use serde::{de::DeserializeOwned, Serialize}; -use std::str::FromStr; pub struct Client { base_url: Url, @@ -31,16 +30,11 @@ impl Client { RQ: Serialize + Sized, RS: DeserializeOwned + Sized, { - // The configured URL has the /upcall path prefix for pageservers to use: we will strip that out - // for general purpose API access. 
- let url = Url::from_str(&format!( - "http://{}:{}/{path}", - self.base_url.host_str().unwrap(), - self.base_url.port().unwrap() - )) - .unwrap(); - - let mut builder = self.client.request(method, url); + let request_path = self + .base_url + .join(&path) + .expect("Failed to build request path"); + let mut builder = self.client.request(method, request_path); if let Some(body) = body { builder = builder.json(&body) } diff --git a/storage_controller/migrations/2024-12-12-212515_safekeepers_scheduling_policy/down.sql b/storage_controller/migrations/2024-12-12-212515_safekeepers_scheduling_policy/down.sql new file mode 100644 index 0000000000..e26bff798f --- /dev/null +++ b/storage_controller/migrations/2024-12-12-212515_safekeepers_scheduling_policy/down.sql @@ -0,0 +1 @@ +ALTER TABLE safekeepers DROP scheduling_policy; diff --git a/storage_controller/migrations/2024-12-12-212515_safekeepers_scheduling_policy/up.sql b/storage_controller/migrations/2024-12-12-212515_safekeepers_scheduling_policy/up.sql new file mode 100644 index 0000000000..d83cc6cc46 --- /dev/null +++ b/storage_controller/migrations/2024-12-12-212515_safekeepers_scheduling_policy/up.sql @@ -0,0 +1 @@ +ALTER TABLE safekeepers ADD scheduling_policy VARCHAR NOT NULL DEFAULT 'disabled'; diff --git a/storage_controller/src/http.rs b/storage_controller/src/http.rs index 24fd4c341a..5385e4ee0b 100644 --- a/storage_controller/src/http.rs +++ b/storage_controller/src/http.rs @@ -3,7 +3,7 @@ use crate::metrics::{ HttpRequestLatencyLabelGroup, HttpRequestStatusLabelGroup, PageserverRequestLabelGroup, METRICS_REGISTRY, }; -use crate::persistence::SafekeeperPersistence; +use crate::persistence::SafekeeperUpsert; use crate::reconciler::ReconcileError; use crate::service::{LeadershipStatus, Service, RECONCILE_TIMEOUT, STARTUP_RECONCILE_TIMEOUT}; use anyhow::Context; @@ -1249,7 +1249,7 @@ async fn handle_get_safekeeper(req: Request) -> Result, Api async fn handle_upsert_safekeeper(mut req: Request) -> Result, ApiError> { check_permissions(&req, Scope::Infra)?; - let body = json_request::(&mut req).await?; + let body = json_request::(&mut req).await?; let id = parse_request_param::(&req, "id")?; if id != body.id { diff --git a/storage_controller/src/id_lock_map.rs b/storage_controller/src/id_lock_map.rs index fcd3eb57e2..2d8b674f86 100644 --- a/storage_controller/src/id_lock_map.rs +++ b/storage_controller/src/id_lock_map.rs @@ -112,6 +112,14 @@ where } } + pub(crate) fn try_exclusive(&self, key: T, operation: I) -> Option> { + let mut locked = self.entities.lock().unwrap(); + let entry = locked.entry(key).or_default().clone(); + let mut guard = TracingExclusiveGuard::new(entry.try_write_owned().ok()?); + *guard.guard = Some(operation); + Some(guard) + } + /// Rather than building a lock guard that re-takes the [`Self::entities`] lock, we just do /// periodic housekeeping to avoid the map growing indefinitely pub(crate) fn housekeeping(&self) { diff --git a/storage_controller/src/persistence.rs b/storage_controller/src/persistence.rs index cc377e606e..cebf3e9594 100644 --- a/storage_controller/src/persistence.rs +++ b/storage_controller/src/persistence.rs @@ -13,6 +13,7 @@ use pageserver_api::controller_api::AvailabilityZone; use pageserver_api::controller_api::MetadataHealthRecord; use pageserver_api::controller_api::SafekeeperDescribeResponse; use pageserver_api::controller_api::ShardSchedulingPolicy; +use pageserver_api::controller_api::SkSchedulingPolicy; use pageserver_api::controller_api::{NodeSchedulingPolicy, PlacementPolicy}; use 
pageserver_api::models::TenantConfig; use pageserver_api::shard::ShardConfigError; @@ -97,6 +98,7 @@ pub(crate) enum DatabaseOperation { TenantGenerations, ShardGenerations, ListTenantShards, + LoadTenant, InsertTenantShards, UpdateTenantShard, DeleteTenant, @@ -330,11 +332,40 @@ impl Persistence { /// At startup, load the high level state for shards, such as their config + policy. This will /// be enriched at runtime with state discovered on pageservers. - pub(crate) async fn list_tenant_shards(&self) -> DatabaseResult> { + /// + /// We exclude shards configured to be detached. During startup, if we see any attached locations + /// for such shards, they will automatically be detached as 'orphans'. + pub(crate) async fn load_active_tenant_shards( + &self, + ) -> DatabaseResult> { + use crate::schema::tenant_shards::dsl::*; self.with_measured_conn( DatabaseOperation::ListTenantShards, move |conn| -> DatabaseResult<_> { - Ok(crate::schema::tenant_shards::table.load::(conn)?) + let query = tenant_shards.filter( + placement_policy.ne(serde_json::to_string(&PlacementPolicy::Detached).unwrap()), + ); + let result = query.load::(conn)?; + + Ok(result) + }, + ) + .await + } + + /// When restoring a previously detached tenant into memory, load it from the database + pub(crate) async fn load_tenant( + &self, + filter_tenant_id: TenantId, + ) -> DatabaseResult> { + use crate::schema::tenant_shards::dsl::*; + self.with_measured_conn( + DatabaseOperation::LoadTenant, + move |conn| -> DatabaseResult<_> { + let query = tenant_shards.filter(tenant_id.eq(filter_tenant_id.to_string())); + let result = query.load::(conn)?; + + Ok(result) }, ) .await @@ -1045,12 +1076,14 @@ impl Persistence { pub(crate) async fn safekeeper_upsert( &self, - record: SafekeeperPersistence, + record: SafekeeperUpsert, ) -> Result<(), DatabaseError> { use crate::schema::safekeepers::dsl::*; self.with_conn(move |conn| -> DatabaseResult<()> { - let bind = record.as_insert_or_update(); + let bind = record + .as_insert_or_update() + .map_err(|e| DatabaseError::Logical(format!("{e}")))?; let inserted_updated = diesel::insert_into(safekeepers) .values(&bind) @@ -1213,6 +1246,7 @@ pub(crate) struct ControllerPersistence { pub(crate) started_at: chrono::DateTime, } +// What we store in the database #[derive(Serialize, Deserialize, Queryable, Selectable, Eq, PartialEq, Debug, Clone)] #[diesel(table_name = crate::schema::safekeepers)] pub(crate) struct SafekeeperPersistence { @@ -1227,11 +1261,51 @@ pub(crate) struct SafekeeperPersistence { pub(crate) active: bool, pub(crate) http_port: i32, pub(crate) availability_zone_id: String, + pub(crate) scheduling_policy: String, } impl SafekeeperPersistence { - fn as_insert_or_update(&self) -> InsertUpdateSafekeeper<'_> { - InsertUpdateSafekeeper { + pub(crate) fn as_describe_response(&self) -> Result { + let scheduling_policy = + SkSchedulingPolicy::from_str(&self.scheduling_policy).map_err(|e| { + DatabaseError::Logical(format!("can't construct SkSchedulingPolicy: {e:?}")) + })?; + // omit the `active` flag on purpose: it is deprecated. 
+ Ok(SafekeeperDescribeResponse { + id: NodeId(self.id as u64), + region_id: self.region_id.clone(), + version: self.version, + host: self.host.clone(), + port: self.port, + http_port: self.http_port, + availability_zone_id: self.availability_zone_id.clone(), + scheduling_policy, + }) + } +} + +/// What we expect from the upsert http api +#[derive(Serialize, Deserialize, Eq, PartialEq, Debug, Clone)] +pub(crate) struct SafekeeperUpsert { + pub(crate) id: i64, + pub(crate) region_id: String, + /// 1 is special, it means just created (not currently posted to storcon). + /// Zero or negative is not really expected. + /// Otherwise the number from `release-$(number_of_commits_on_branch)` tag. + pub(crate) version: i64, + pub(crate) host: String, + pub(crate) port: i32, + pub(crate) active: bool, + pub(crate) http_port: i32, + pub(crate) availability_zone_id: String, +} + +impl SafekeeperUpsert { + fn as_insert_or_update(&self) -> anyhow::Result> { + if self.version < 0 { + anyhow::bail!("negative version: {}", self.version); + } + Ok(InsertUpdateSafekeeper { id: self.id, region_id: &self.region_id, version: self.version, @@ -1240,19 +1314,9 @@ impl SafekeeperPersistence { active: self.active, http_port: self.http_port, availability_zone_id: &self.availability_zone_id, - } - } - pub(crate) fn as_describe_response(&self) -> SafekeeperDescribeResponse { - // omit the `active` flag on purpose: it is deprecated. - SafekeeperDescribeResponse { - id: NodeId(self.id as u64), - region_id: self.region_id.clone(), - version: self.version, - host: self.host.clone(), - port: self.port, - http_port: self.http_port, - availability_zone_id: self.availability_zone_id.clone(), - } + // None means a wish to not update this column. We expose abilities to update it via other means. + scheduling_policy: None, + }) } } @@ -1267,4 +1331,5 @@ struct InsertUpdateSafekeeper<'a> { active: bool, http_port: i32, availability_zone_id: &'a str, + scheduling_policy: Option<&'a str>, } diff --git a/storage_controller/src/reconciler.rs b/storage_controller/src/reconciler.rs index 475f91eff4..e0a854fff7 100644 --- a/storage_controller/src/reconciler.rs +++ b/storage_controller/src/reconciler.rs @@ -14,7 +14,6 @@ use std::sync::Arc; use std::time::{Duration, Instant}; use tokio_util::sync::CancellationToken; use utils::backoff::exponential_backoff; -use utils::failpoint_support; use utils::generation::Generation; use utils::id::{NodeId, TimelineId}; use utils::lsn::Lsn; @@ -212,11 +211,12 @@ impl Reconciler { lazy: bool, ) -> Result<(), ReconcileError> { if !node.is_available() && config.mode == LocationConfigMode::Detached { - // Attempts to detach from offline nodes may be imitated without doing I/O: a node which is offline - // will get fully reconciled wrt the shard's intent state when it is reactivated, irrespective of - // what we put into `observed`, in [`crate::service::Service::node_activate_reconcile`] - tracing::info!("Node {node} is unavailable during detach: proceeding anyway, it will be detached on next activation"); - self.observed.locations.remove(&node.get_id()); + // [`crate::service::Service::node_activate_reconcile`] will update the observed state + // when the node comes back online. At that point, the intent and observed states will + // be mismatched and a background reconciliation will detach. 
+ tracing::info!( + "Node {node} is unavailable during detach: proceeding anyway, it will be detached via background reconciliation" + ); return Ok(()); } @@ -749,6 +749,8 @@ impl Reconciler { }; if increment_generation { + pausable_failpoint!("reconciler-pre-increment-generation"); + let generation = self .persistence .increment_generation(self.tenant_shard_id, node.get_id()) @@ -824,7 +826,7 @@ impl Reconciler { .handle_detach(self.tenant_shard_id, self.shard.stripe_size); } - failpoint_support::sleep_millis_async!("sleep-on-reconcile-epilogue"); + pausable_failpoint!("reconciler-epilogue"); Ok(()) } diff --git a/storage_controller/src/schema.rs b/storage_controller/src/schema.rs index 9e005ab932..44c91619ab 100644 --- a/storage_controller/src/schema.rs +++ b/storage_controller/src/schema.rs @@ -39,6 +39,7 @@ diesel::table! { active -> Bool, http_port -> Int4, availability_zone_id -> Text, + scheduling_policy -> Varchar, } } diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs index 222cb9fdd4..265b2798d2 100644 --- a/storage_controller/src/service.rs +++ b/storage_controller/src/service.rs @@ -83,6 +83,7 @@ use utils::{ generation::Generation, http::error::ApiError, id::{NodeId, TenantId, TimelineId}, + pausable_failpoint, sync::gate::Gate, }; @@ -154,6 +155,7 @@ enum TenantOperations { TimelineArchivalConfig, TimelineDetachAncestor, TimelineGcBlockUnblock, + DropDetached, } #[derive(Clone, strum_macros::Display)] @@ -415,8 +417,8 @@ pub struct Service { /// Queue of tenants who are waiting for concurrency limits to permit them to reconcile /// Send into this queue to promptly attempt to reconcile this shard next time units are available. /// - /// Note that this state logically lives inside ServiceInner, but carrying Sender here makes the code simpler - /// by avoiding needing a &mut ref to something inside the ServiceInner. This could be optimized to + /// Note that this state logically lives inside ServiceState, but carrying Sender here makes the code simpler + /// by avoiding needing a &mut ref to something inside the ServiceState. This could be optimized to /// use a VecDeque instead of a channel to reduce synchronization overhead, at the cost of some code complexity. delayed_reconcile_tx: tokio::sync::mpsc::Sender, @@ -1024,6 +1026,8 @@ impl Service { ) .await; + pausable_failpoint!("heartbeat-pre-node-state-configure"); + // This is the code path for geniune availability transitions (i.e node // goes unavailable and/or comes back online). let res = self @@ -1043,6 +1047,9 @@ impl Service { // on a snapshot of the nodes. tracing::info!("Node {} was not found after heartbeat round", node_id); } + Err(ApiError::ShuttingDown) => { + // No-op: we're shutting down, no need to try and update any nodes' statuses + } Err(err) => { // Transition to active involves reconciling: if a node responds to a heartbeat then // becomes unavailable again, we may get an error here. @@ -1162,6 +1169,20 @@ impl Service { } } + // If we just finished detaching all shards for a tenant, it might be time to drop it from memory. + if tenant.policy == PlacementPolicy::Detached { + // We may only drop a tenant from memory while holding the exclusive lock on the tenant ID: this protects us + // from concurrent execution wrt a request handler that might expect the tenant to remain in memory for the + // duration of the request. 
+ let guard = self.tenant_op_locks.try_exclusive( + tenant.tenant_shard_id.tenant_id, + TenantOperations::DropDetached, + ); + if let Some(guard) = guard { + self.maybe_drop_tenant(tenant.tenant_shard_id.tenant_id, &mut locked, &guard); + } + } + // Maybe some other work can proceed now that this job finished. if self.reconciler_concurrency.available_permits() > 0 { while let Ok(tenant_shard_id) = locked.delayed_reconcile_rx.try_recv() { @@ -1291,7 +1312,7 @@ impl Service { .set(nodes.len() as i64); tracing::info!("Loading shards from database..."); - let mut tenant_shard_persistence = persistence.list_tenant_shards().await?; + let mut tenant_shard_persistence = persistence.load_active_tenant_shards().await?; tracing::info!( "Loaded {} shards from database.", tenant_shard_persistence.len() @@ -1543,8 +1564,14 @@ impl Service { // the pageserver API (not via this service), we will auto-create any missing tenant // shards with default state. let insert = { - let locked = self.inner.write().unwrap(); - !locked.tenants.contains_key(&attach_req.tenant_shard_id) + match self + .maybe_load_tenant(attach_req.tenant_shard_id.tenant_id, &_tenant_lock) + .await + { + Ok(_) => false, + Err(ApiError::NotFound(_)) => true, + Err(e) => return Err(e.into()), + } }; if insert { @@ -2436,6 +2463,99 @@ impl Service { } } + /// For APIs that might act on tenants with [`PlacementPolicy::Detached`], first check if + /// the tenant is present in memory. If not, load it from the database. If it is found + /// in neither location, return a NotFound error. + /// + /// Caller must demonstrate they hold a lock guard, as otherwise two callers might try and load + /// it at the same time, or we might race with [`Self::maybe_drop_tenant`] + async fn maybe_load_tenant( + &self, + tenant_id: TenantId, + _guard: &TracingExclusiveGuard, + ) -> Result<(), ApiError> { + let present_in_memory = { + let locked = self.inner.read().unwrap(); + locked + .tenants + .range(TenantShardId::tenant_range(tenant_id)) + .next() + .is_some() + }; + + if present_in_memory { + return Ok(()); + } + + let tenant_shards = self.persistence.load_tenant(tenant_id).await?; + if tenant_shards.is_empty() { + return Err(ApiError::NotFound( + anyhow::anyhow!("Tenant {} not found", tenant_id).into(), + )); + } + + // TODO: choose a fresh AZ to use for this tenant when un-detaching: there definitely isn't a running + // compute, so no benefit to making AZ sticky across detaches. + + let mut locked = self.inner.write().unwrap(); + tracing::info!( + "Loaded {} shards for tenant {}", + tenant_shards.len(), + tenant_id + ); + + locked.tenants.extend(tenant_shards.into_iter().map(|p| { + let intent = IntentState::new(); + let shard = + TenantShard::from_persistent(p, intent).expect("Corrupt shard row in database"); + + // Sanity check: when loading on-demand, we should always be loaded something Detached + debug_assert!(shard.policy == PlacementPolicy::Detached); + if shard.policy != PlacementPolicy::Detached { + tracing::error!( + "Tenant shard {} loaded on-demand, but has non-Detached policy {:?}", + shard.tenant_shard_id, + shard.policy + ); + } + + (shard.tenant_shard_id, shard) + })); + + Ok(()) + } + + /// If all shards for a tenant are detached, and in a fully quiescent state (no observed locations on pageservers), + /// and have no reconciler running, then we can drop the tenant from memory. It will be reloaded on-demand + /// if we are asked to attach it again (see [`Self::maybe_load_tenant`]). 
+ /// + /// Caller must demonstrate they hold a lock guard, as otherwise it is unsafe to drop a tenant from + /// memory while some other function might assume it continues to exist while not holding the lock on Self::inner. + fn maybe_drop_tenant( + &self, + tenant_id: TenantId, + locked: &mut std::sync::RwLockWriteGuard, + _guard: &TracingExclusiveGuard, + ) { + let mut tenant_shards = locked.tenants.range(TenantShardId::tenant_range(tenant_id)); + if tenant_shards.all(|(_id, shard)| { + shard.policy == PlacementPolicy::Detached + && shard.reconciler.is_none() + && shard.observed.is_empty() + }) { + let keys = locked + .tenants + .range(TenantShardId::tenant_range(tenant_id)) + .map(|(id, _)| id) + .copied() + .collect::>(); + for key in keys { + tracing::info!("Dropping detached tenant shard {} from memory", key); + locked.tenants.remove(&key); + } + } + } + /// This API is used by the cloud control plane to migrate unsharded tenants that it created /// directly with pageservers into this service. /// @@ -2462,14 +2582,26 @@ impl Service { ) .await; - if !tenant_shard_id.is_unsharded() { + let tenant_id = if !tenant_shard_id.is_unsharded() { return Err(ApiError::BadRequest(anyhow::anyhow!( "This API is for importing single-sharded or unsharded tenants" ))); - } + } else { + tenant_shard_id.tenant_id + }; + + // In case we are waking up a Detached tenant + match self.maybe_load_tenant(tenant_id, &_tenant_lock).await { + Ok(()) | Err(ApiError::NotFound(_)) => { + // This is a creation or an update + } + Err(e) => { + return Err(e); + } + }; // First check if this is a creation or an update - let create_or_update = self.tenant_location_config_prepare(tenant_shard_id.tenant_id, req); + let create_or_update = self.tenant_location_config_prepare(tenant_id, req); let mut result = TenantLocationConfigResponse { shards: Vec::new(), @@ -2492,6 +2624,7 @@ impl Service { // Persist updates // Ordering: write to the database before applying changes in-memory, so that // we will not appear time-travel backwards on a restart. + let mut schedule_context = ScheduleContext::default(); for ShardUpdate { tenant_shard_id, @@ -2596,6 +2729,8 @@ impl Service { let tenant_id = req.tenant_id; let patch = req.config; + self.maybe_load_tenant(tenant_id, &_tenant_lock).await?; + let base = { let locked = self.inner.read().unwrap(); let shards = locked @@ -2640,19 +2775,7 @@ impl Service { ) .await; - let tenant_exists = { - let locked = self.inner.read().unwrap(); - let mut r = locked - .tenants - .range(TenantShardId::tenant_range(req.tenant_id)); - r.next().is_some() - }; - - if !tenant_exists { - return Err(ApiError::NotFound( - anyhow::anyhow!("Tenant {} not found", req.tenant_id).into(), - )); - } + self.maybe_load_tenant(req.tenant_id, &_tenant_lock).await?; self.set_tenant_config_and_reconcile(req.tenant_id, req.config) .await @@ -2945,6 +3068,8 @@ impl Service { let _tenant_lock = trace_exclusive_lock(&self.tenant_op_locks, tenant_id, TenantOperations::Delete).await; + self.maybe_load_tenant(tenant_id, &_tenant_lock).await?; + // Detach all shards. This also deletes local pageserver shard data. 
let (detach_waiters, node) = { let mut detach_waiters = Vec::new(); @@ -3064,6 +3189,8 @@ impl Service { ) .await; + self.maybe_load_tenant(tenant_id, &_tenant_lock).await?; + failpoint_support::sleep_millis_async!("tenant-update-policy-exclusive-lock"); let TenantPolicyRequest { @@ -5146,11 +5273,13 @@ impl Service { ))); } - let mut shards = self.persistence.list_tenant_shards().await?; - shards.sort_by_key(|tsp| (tsp.tenant_id.clone(), tsp.shard_number, tsp.shard_count)); + let mut persistent_shards = self.persistence.load_active_tenant_shards().await?; + persistent_shards + .sort_by_key(|tsp| (tsp.tenant_id.clone(), tsp.shard_number, tsp.shard_count)); + expect_shards.sort_by_key(|tsp| (tsp.tenant_id.clone(), tsp.shard_number, tsp.shard_count)); - if shards != expect_shards { + if persistent_shards != expect_shards { tracing::error!("Consistency check failed on shards."); tracing::error!( "Shards in memory: {}", @@ -5159,7 +5288,7 @@ impl Service { ); tracing::error!( "Shards in database: {}", - serde_json::to_string(&shards) + serde_json::to_string(&persistent_shards) .map_err(|e| ApiError::InternalServerError(e.into()))? ); return Err(ApiError::InternalServerError(anyhow::anyhow!( @@ -6115,6 +6244,10 @@ impl Service { let mut pending_reconciles = 0; let mut az_violations = 0; + // If we find any tenants to drop from memory, stash them to offload after + // we're done traversing the map of tenants. + let mut drop_detached_tenants = Vec::new(); + let mut reconciles_spawned = 0; for shard in tenants.values_mut() { // Accumulate scheduling statistics @@ -6148,6 +6281,25 @@ impl Service { // Shard wanted to reconcile but for some reason couldn't. pending_reconciles += 1; } + + // If this tenant is detached, try dropping it from memory. This is usually done + // proactively in [`Self::process_results`], but we do it here to handle the edge + // case where a reconcile completes while someone else is holding an op lock for the tenant. + if shard.tenant_shard_id.shard_number == ShardNumber(0) + && shard.policy == PlacementPolicy::Detached + { + if let Some(guard) = self.tenant_op_locks.try_exclusive( + shard.tenant_shard_id.tenant_id, + TenantOperations::DropDetached, + ) { + drop_detached_tenants.push((shard.tenant_shard_id.tenant_id, guard)); + } + } + } + + // Process any deferred tenant drops + for (tenant_id, guard) in drop_detached_tenants { + self.maybe_drop_tenant(tenant_id, &mut locked, &guard); } metrics::METRICS_REGISTRY @@ -7198,13 +7350,12 @@ impl Service { pub(crate) async fn safekeepers_list( &self, ) -> Result, DatabaseError> { - Ok(self - .persistence + self.persistence .list_safekeepers() .await? 
.into_iter() .map(|v| v.as_describe_response()) - .collect::>()) + .collect::, _>>() } pub(crate) async fn get_safekeeper( @@ -7214,12 +7365,12 @@ impl Service { self.persistence .safekeeper_get(id) .await - .map(|v| v.as_describe_response()) + .and_then(|v| v.as_describe_response()) } pub(crate) async fn upsert_safekeeper( &self, - record: crate::persistence::SafekeeperPersistence, + record: crate::persistence::SafekeeperUpsert, ) -> Result<(), DatabaseError> { self.persistence.safekeeper_upsert(record).await } diff --git a/storage_controller/src/tenant_shard.rs b/storage_controller/src/tenant_shard.rs index cba579e8a7..c17989a316 100644 --- a/storage_controller/src/tenant_shard.rs +++ b/storage_controller/src/tenant_shard.rs @@ -465,6 +465,10 @@ impl ObservedState { locations: HashMap::new(), } } + + pub(crate) fn is_empty(&self) -> bool { + self.locations.is_empty() + } } impl TenantShard { diff --git a/storage_scrubber/src/checks.rs b/storage_scrubber/src/checks.rs index 32c86052ef..b42709868b 100644 --- a/storage_scrubber/src/checks.rs +++ b/storage_scrubber/src/checks.rs @@ -1,4 +1,5 @@ use std::collections::{HashMap, HashSet}; +use std::time::SystemTime; use itertools::Itertools; use pageserver::tenant::checks::check_valid_layermap; @@ -88,9 +89,14 @@ pub(crate) async fn branch_cleanup_and_check_errors( match s3_data.blob_data { BlobDataParseResult::Parsed { index_part, - index_part_generation: _index_part_generation, - s3_layers: _s3_layers, + index_part_generation: _, + s3_layers: _, + index_part_last_modified_time, + index_part_snapshot_time, } => { + // Ignore missing file error if index_part downloaded is different from the one when listing the layer files. + let ignore_error = index_part_snapshot_time < index_part_last_modified_time + && !cfg!(debug_assertions); if !IndexPart::KNOWN_VERSIONS.contains(&index_part.version()) { result .errors @@ -171,7 +177,7 @@ pub(crate) async fn branch_cleanup_and_check_errors( is_l0, ); - if is_l0 { + if is_l0 || ignore_error { result.warnings.push(msg); } else { result.errors.push(msg); @@ -308,6 +314,8 @@ pub(crate) enum BlobDataParseResult { Parsed { index_part: Box, index_part_generation: Generation, + index_part_last_modified_time: SystemTime, + index_part_snapshot_time: SystemTime, s3_layers: HashSet<(LayerName, Generation)>, }, /// The remains of an uncleanly deleted Timeline or aborted timeline creation(e.g. an initdb archive only, or some layer without an index) @@ -484,9 +492,9 @@ async fn list_timeline_blobs_impl( } if let Some(index_part_object_key) = index_part_object.as_ref() { - let index_part_bytes = + let (index_part_bytes, index_part_last_modified_time) = match download_object_with_retries(remote_client, &index_part_object_key.key).await { - Ok(index_part_bytes) => index_part_bytes, + Ok(data) => data, Err(e) => { // It is possible that the branch gets deleted in-between we list the objects // and we download the index part file. 
@@ -500,7 +508,7 @@ async fn list_timeline_blobs_impl( )); } }; - + let index_part_snapshot_time = index_part_object_key.last_modified; match serde_json::from_slice(&index_part_bytes) { Ok(index_part) => { return Ok(ListTimelineBlobsResult::Ready(RemoteTimelineBlobData { @@ -508,6 +516,8 @@ async fn list_timeline_blobs_impl( index_part: Box::new(index_part), index_part_generation, s3_layers, + index_part_last_modified_time, + index_part_snapshot_time, }, unused_index_keys: index_part_keys, unknown_keys, @@ -625,7 +635,7 @@ pub(crate) async fn list_tenant_manifests( let manifest_bytes = match download_object_with_retries(remote_client, &latest_listing_object.key).await { - Ok(bytes) => bytes, + Ok((bytes, _)) => bytes, Err(e) => { // It is possible that the tenant gets deleted in-between we list the objects // and we download the manifest file. diff --git a/storage_scrubber/src/lib.rs b/storage_scrubber/src/lib.rs index be526daaf0..224235098c 100644 --- a/storage_scrubber/src/lib.rs +++ b/storage_scrubber/src/lib.rs @@ -13,7 +13,7 @@ pub mod tenant_snapshot; use std::env; use std::fmt::Display; use std::sync::Arc; -use std::time::Duration; +use std::time::{Duration, SystemTime}; use anyhow::Context; use aws_config::retry::{RetryConfigBuilder, RetryMode}; @@ -509,10 +509,11 @@ async fn list_objects_with_retries( panic!("MAX_RETRIES is not allowed to be 0"); } +/// Returns content, last modified time async fn download_object_with_retries( remote_client: &GenericRemoteStorage, key: &RemotePath, -) -> anyhow::Result<Vec<u8>> { +) -> anyhow::Result<(Vec<u8>, SystemTime)> { let cancel = CancellationToken::new(); for trial in 0..MAX_RETRIES { let mut buf = Vec::new(); @@ -535,7 +536,7 @@ async fn download_object_with_retries( { Ok(bytes_read) => { tracing::debug!("Downloaded {bytes_read} bytes for object {key}"); - return Ok(buf); + return Ok((buf, download.last_modified)); } Err(e) => { error!("Failed to stream object body for key {key}: {e}"); diff --git a/storage_scrubber/src/pageserver_physical_gc.rs b/storage_scrubber/src/pageserver_physical_gc.rs index d19b8a5f91..a997373375 100644 --- a/storage_scrubber/src/pageserver_physical_gc.rs +++ b/storage_scrubber/src/pageserver_physical_gc.rs @@ -450,6 +450,8 @@ async fn gc_ancestor( index_part: _, index_part_generation: _, s3_layers, + index_part_last_modified_time: _, + index_part_snapshot_time: _, } => s3_layers, BlobDataParseResult::Relic => { // Post-deletion tenant location: don't try and GC it. @@ -586,7 +588,9 @@ async fn gc_timeline( BlobDataParseResult::Parsed { index_part, index_part_generation, - s3_layers: _s3_layers, + s3_layers: _, + index_part_last_modified_time: _, + index_part_snapshot_time: _, } => (index_part, *index_part_generation, data.unused_index_keys), BlobDataParseResult::Relic => { // Post-deletion tenant location: don't try and GC it.
diff --git a/storage_scrubber/src/scan_pageserver_metadata.rs b/storage_scrubber/src/scan_pageserver_metadata.rs index c8de6e46b3..a31fb5b242 100644 --- a/storage_scrubber/src/scan_pageserver_metadata.rs +++ b/storage_scrubber/src/scan_pageserver_metadata.rs @@ -47,6 +47,8 @@ impl MetadataSummary { index_part, index_part_generation: _, s3_layers: _, + index_part_last_modified_time: _, + index_part_snapshot_time: _, } = &data.blob_data { *self @@ -195,7 +197,9 @@ pub async fn scan_pageserver_metadata( if let BlobDataParseResult::Parsed { index_part, index_part_generation, - s3_layers: _s3_layers, + s3_layers: _, + index_part_last_modified_time: _, + index_part_snapshot_time: _, } = &data.blob_data { if index_part.deleted_at.is_some() { @@ -318,9 +322,11 @@ pub async fn scan_pageserver_metadata( match &data.blob_data { BlobDataParseResult::Parsed { - index_part: _index_part, + index_part: _, index_part_generation: _index_part_generation, s3_layers, + index_part_last_modified_time: _, + index_part_snapshot_time: _, } => { tenant_objects.push(ttid, s3_layers.clone()); } diff --git a/storage_scrubber/src/tenant_snapshot.rs b/storage_scrubber/src/tenant_snapshot.rs index 39e0b5c9b4..60e79fb859 100644 --- a/storage_scrubber/src/tenant_snapshot.rs +++ b/storage_scrubber/src/tenant_snapshot.rs @@ -268,6 +268,8 @@ impl SnapshotDownloader { index_part, index_part_generation, s3_layers: _, + index_part_last_modified_time: _, + index_part_snapshot_time: _, } => { self.download_timeline( ttid, diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 8fd9eec8ce..e22e452a52 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -2521,6 +2521,7 @@ class NeonPageserver(PgProtocol, LogUtils): self, extra_env_vars: dict[str, str] | None = None, timeout_in_seconds: int | None = None, + await_active: bool = True, ) -> Self: """ Start the page server. 
@@ -2547,8 +2548,10 @@ class NeonPageserver(PgProtocol, LogUtils): ) self.running = True - if self.env.storage_controller.running and self.env.storage_controller.node_registered( - self.id + if ( + await_active + and self.env.storage_controller.running + and self.env.storage_controller.node_registered(self.id) ): self.env.storage_controller.poll_node_status( self.id, PageserverAvailability.ACTIVE, None, max_attempts=200, backoff=0.1 @@ -4930,13 +4933,30 @@ def check_restored_datadir_content( assert (mismatch, error) == ([], []) -def logical_replication_sync(subscriber: PgProtocol, publisher: PgProtocol) -> Lsn: +def logical_replication_sync( + subscriber: PgProtocol, + publisher: PgProtocol, + sub_dbname: str | None = None, + pub_dbname: str | None = None, +) -> Lsn: """Wait logical replication subscriber to sync with publisher.""" - publisher_lsn = Lsn(publisher.safe_psql("SELECT pg_current_wal_flush_lsn()")[0][0]) + if pub_dbname is not None: + publisher_lsn = Lsn( + publisher.safe_psql("SELECT pg_current_wal_flush_lsn()", dbname=pub_dbname)[0][0] + ) + else: + publisher_lsn = Lsn(publisher.safe_psql("SELECT pg_current_wal_flush_lsn()")[0][0]) + while True: - res = subscriber.safe_psql("select latest_end_lsn from pg_catalog.pg_stat_subscription")[0][ - 0 - ] + if sub_dbname is not None: + res = subscriber.safe_psql( + "select latest_end_lsn from pg_catalog.pg_stat_subscription", dbname=sub_dbname + )[0][0] + else: + res = subscriber.safe_psql( + "select latest_end_lsn from pg_catalog.pg_stat_subscription" + )[0][0] + if res: log.info(f"subscriber_lsn={res}") subscriber_lsn = Lsn(res) diff --git a/test_runner/regress/test_parallel_copy.py b/test_runner/performance/test_parallel_copy.py similarity index 100% rename from test_runner/regress/test_parallel_copy.py rename to test_runner/performance/test_parallel_copy.py diff --git a/test_runner/regress/test_compatibility.py b/test_runner/regress/test_compatibility.py index ba7305148f..a6eaaf6c4c 100644 --- a/test_runner/regress/test_compatibility.py +++ b/test_runner/regress/test_compatibility.py @@ -141,11 +141,18 @@ def test_create_snapshot( neon_env_builder.num_safekeepers = 3 neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.LOCAL_FS) - env = neon_env_builder.init_start() + env = neon_env_builder.init_start( + initial_tenant_conf={ + # Miniature layers to enable generating non-trivial layer map without writing lots of data + "checkpoint_distance": f"{128 * 1024}", + "compaction_threshold": "1", + "compaction_target_size": f"{128 * 1024}", + } + ) endpoint = env.endpoints.create_start("main") - pg_bin.run_capture(["pgbench", "--initialize", "--scale=10", endpoint.connstr()]) - pg_bin.run_capture(["pgbench", "--time=60", "--progress=2", endpoint.connstr()]) + pg_bin.run_capture(["pgbench", "--initialize", "--scale=1", endpoint.connstr()]) + pg_bin.run_capture(["pgbench", "--time=30", "--progress=2", endpoint.connstr()]) pg_bin.run_capture( ["pg_dumpall", f"--dbname={endpoint.connstr()}", f"--file={test_output_dir / 'dump.sql'}"] ) @@ -157,7 +164,9 @@ def test_create_snapshot( pageserver_http = env.pageserver.http_client() flush_ep_to_pageserver(env, endpoint, tenant_id, timeline_id) - pageserver_http.timeline_checkpoint(tenant_id, timeline_id, wait_until_uploaded=True) + pageserver_http.timeline_checkpoint( + tenant_id, timeline_id, wait_until_uploaded=True, force_image_layer_creation=True + ) env.endpoints.stop_all() for sk in env.safekeepers: diff --git a/test_runner/regress/test_compute_catalog.py 
b/test_runner/regress/test_compute_catalog.py index b3719a45ed..f0878b2631 100644 --- a/test_runner/regress/test_compute_catalog.py +++ b/test_runner/regress/test_compute_catalog.py @@ -1,7 +1,9 @@ from __future__ import annotations +import logging + import requests -from fixtures.neon_fixtures import NeonEnv +from fixtures.neon_fixtures import NeonEnv, logical_replication_sync TEST_DB_NAMES = [ { @@ -136,3 +138,230 @@ def test_compute_create_databases(neon_simple_env: NeonEnv): assert curr_db is not None assert len(curr_db) == 1 assert curr_db[0] == db["name"] + + +def test_dropdb_with_subscription(neon_simple_env: NeonEnv): + """ + Test that compute_ctl can drop a database that has a logical replication subscription. + """ + env = neon_simple_env + + # Create and start endpoint so that neon_local put all the generated + # stuff into the spec.json file. + endpoint = env.endpoints.create_start("main") + + TEST_DB_NAMES = [ + { + "name": "neondb", + "owner": "cloud_admin", + }, + { + "name": "subscriber_db", + "owner": "cloud_admin", + }, + { + "name": "publisher_db", + "owner": "cloud_admin", + }, + ] + + # Update the spec.json file to create the databases + # and reconfigure the endpoint to apply the changes. + endpoint.respec_deep( + **{ + "skip_pg_catalog_updates": False, + "cluster": { + "databases": TEST_DB_NAMES, + }, + } + ) + endpoint.reconfigure() + + # connect to the publisher_db and create a publication + with endpoint.cursor(dbname="publisher_db") as cursor: + cursor.execute("CREATE PUBLICATION mypub FOR ALL TABLES") + cursor.execute("select pg_catalog.pg_create_logical_replication_slot('mysub', 'pgoutput');") + cursor.execute("CREATE TABLE t(a int)") + cursor.execute("INSERT INTO t VALUES (1)") + + # connect to the subscriber_db and create a subscription + # Note that we need to create subscription with + connstr = endpoint.connstr(dbname="publisher_db").replace("'", "''") + with endpoint.cursor(dbname="subscriber_db") as cursor: + cursor.execute("CREATE TABLE t(a int)") + cursor.execute( + f"CREATE SUBSCRIPTION mysub CONNECTION '{connstr}' PUBLICATION mypub WITH (create_slot = false) " + ) + + # wait for the subscription to be active + logical_replication_sync( + endpoint, endpoint, sub_dbname="subscriber_db", pub_dbname="publisher_db" + ) + + # Check that replication is working + with endpoint.cursor(dbname="subscriber_db") as cursor: + cursor.execute("SELECT * FROM t") + rows = cursor.fetchall() + assert len(rows) == 1 + assert rows[0][0] == 1 + + # drop the subscriber_db from the list + TEST_DB_NAMES_NEW = [ + { + "name": "neondb", + "owner": "cloud_admin", + }, + { + "name": "publisher_db", + "owner": "cloud_admin", + }, + ] + # Update the spec.json file to drop the database + # and reconfigure the endpoint to apply the changes. 
+ endpoint.respec_deep( + **{ + "skip_pg_catalog_updates": False, + "cluster": { + "databases": TEST_DB_NAMES_NEW, + }, + "delta_operations": [ + {"action": "delete_db", "name": "subscriber_db"}, + # also test the case when we try to delete a non-existent database + # shouldn't happen in normal operation, + # but can occur when failed operations are retried + {"action": "delete_db", "name": "nonexistent_db"}, + ], + } + ) + + logging.info("Reconfiguring the endpoint to drop the subscriber_db") + endpoint.reconfigure() + + # Check that the subscriber_db is dropped + with endpoint.cursor() as cursor: + cursor.execute("SELECT datname FROM pg_database WHERE datname = %s", ("subscriber_db",)) + catalog_db = cursor.fetchone() + assert catalog_db is None + + # Check that we can still connect to the publisher_db + with endpoint.cursor(dbname="publisher_db") as cursor: + cursor.execute("SELECT * FROM current_database()") + curr_db = cursor.fetchone() + assert curr_db is not None + assert len(curr_db) == 1 + assert curr_db[0] == "publisher_db" + + +def test_compute_drop_role(neon_simple_env: NeonEnv): + """ + Test that compute_ctl can drop a role even if it has some depending objects + like permissions in one of the databases. + Reproduction test for https://github.com/neondatabase/cloud/issues/13582 + """ + env = neon_simple_env + TEST_DB_NAME = "db_with_permissions" + + endpoint = env.endpoints.create_start("main") + + endpoint.respec_deep( + **{ + "skip_pg_catalog_updates": False, + "cluster": { + "roles": [ + { + # We need to create role via compute_ctl, because in this case it will receive + # additional grants equivalent to our real environment, so we can repro some + # issues. + "name": "neon", + # Some autocomplete-suggested hash, no specific meaning. + "encrypted_password": "SCRAM-SHA-256$4096:hBT22QjqpydQWqEulorfXA==$miBogcoj68JWYdsNB5PW1X6PjSLBEcNuctuhtGkb4PY=:hxk2gxkwxGo6P7GCtfpMlhA9zwHvPMsCz+NQf2HfvWk=", + "options": [], + }, + ], + "databases": [ + { + "name": TEST_DB_NAME, + "owner": "neon", + }, + ], + }, + } + ) + endpoint.reconfigure() + + with endpoint.cursor(dbname=TEST_DB_NAME) as cursor: + # Create table and view as `cloud_admin`. This is the case when, for example, + # PostGIS extensions creates tables in `public` schema. + cursor.execute("create table test_table (id int)") + cursor.execute("create view test_view as select * from test_table") + + with endpoint.cursor(dbname=TEST_DB_NAME, user="neon") as cursor: + cursor.execute("create role readonly") + # We (`compute_ctl`) make 'neon' the owner of schema 'public' in the owned database. + # Postgres has all sorts of permissions and grants that we may not handle well, + # but this is the shortest repro grant for the issue + # https://github.com/neondatabase/cloud/issues/13582 + cursor.execute("grant select on all tables in schema public to readonly") + + # Check that role was created + with endpoint.cursor() as cursor: + cursor.execute("SELECT rolname FROM pg_roles WHERE rolname = 'readonly'") + role = cursor.fetchone() + assert role is not None + + # Confirm that we actually have some permissions for 'readonly' role + # that may block our ability to drop the role. 
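Background for the check that follows and for the drop itself: Postgres refuses `DROP ROLE` while the role still owns objects or holds privileges in any database, so the grants created above are exactly the kind of dependency that must be cleared first, typically with `DROP OWNED BY`, which also revokes the role's privileges. A conceptual sketch of that cleanup is below; it is not a claim about how `compute_ctl` actually implements role deletion, and the helper name is invented:

def drop_role_with_dependencies(endpoint, role, dbnames):
    # Hypothetical helper: clear per-database dependencies, then drop the role once.
    for dbname in dbnames:
        with endpoint.cursor(dbname=dbname) as cur:
            # DROP OWNED drops objects owned by the role in this database and
            # revokes privileges granted to it here.
            cur.execute(f'DROP OWNED BY "{role}"')
    with endpoint.cursor() as cur:
        cur.execute(f'DROP ROLE IF EXISTS "{role}"')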
+ with endpoint.cursor(dbname=TEST_DB_NAME) as cursor: + cursor.execute( + "select grantor from information_schema.role_table_grants where grantee = 'readonly'" + ) + res = cursor.fetchall() + assert len(res) == 2, f"Expected 2 table grants, got {len(res)}" + for row in res: + assert row[0] == "neon_superuser" + + # Drop role via compute_ctl + endpoint.respec_deep( + **{ + "skip_pg_catalog_updates": False, + "delta_operations": [ + { + "action": "delete_role", + "name": "readonly", + }, + ], + } + ) + endpoint.reconfigure() + + # Check that role is dropped + with endpoint.cursor() as cursor: + cursor.execute("SELECT rolname FROM pg_roles WHERE rolname = 'readonly'") + role = cursor.fetchone() + assert role is None + + # + # Drop schema 'public' and check that we can still drop the role + # + with endpoint.cursor(dbname=TEST_DB_NAME) as cursor: + cursor.execute("create role readonly2") + cursor.execute("grant select on all tables in schema public to readonly2") + cursor.execute("drop schema public cascade") + + endpoint.respec_deep( + **{ + "skip_pg_catalog_updates": False, + "delta_operations": [ + { + "action": "delete_role", + "name": "readonly2", + }, + ], + } + ) + endpoint.reconfigure() + + with endpoint.cursor() as cursor: + cursor.execute("SELECT rolname FROM pg_roles WHERE rolname = 'readonly2'") + role = cursor.fetchone() + assert role is None diff --git a/test_runner/regress/test_lfc_resize.py b/test_runner/regress/test_lfc_resize.py index 377b0fb4d4..8762e6525b 100644 --- a/test_runner/regress/test_lfc_resize.py +++ b/test_runner/regress/test_lfc_resize.py @@ -30,7 +30,7 @@ def test_lfc_resize(neon_simple_env: NeonEnv, pg_bin: PgBin): ], ) n_resize = 10 - scale = 100 + scale = 20 def run_pgbench(connstr: str): log.info(f"Start a pgbench workload on pg {connstr}") @@ -46,17 +46,36 @@ def test_lfc_resize(neon_simple_env: NeonEnv, pg_bin: PgBin): conn = endpoint.connect() cur = conn.cursor() + def get_lfc_size() -> tuple[int, int]: + lfc_file_path = endpoint.lfc_path() + lfc_file_size = lfc_file_path.stat().st_size + res = subprocess.run( + ["ls", "-sk", lfc_file_path], check=True, text=True, capture_output=True + ) + lfc_file_blocks = re.findall("([0-9A-F]+)", res.stdout)[0] + log.info(f"Size of LFC file {lfc_file_size}, blocks {lfc_file_blocks}") + + return (lfc_file_size, lfc_file_blocks) + # For as long as pgbench is running, twiddle the LFC size once a second. # Note that we launch this immediately, already while the "pgbench -i" # initialization step is still running. That's quite a different workload # than the actual pgbench benchamark run, so this gives us coverage of both. while thread.is_alive(): - size = random.randint(1, 512) + # Vary the LFC size randomly within a range above what we will later + # decrease it to. This should ensure that the final size decrease + # is really doing something. + size = random.randint(192, 512) cur.execute(f"alter system set neon.file_cache_size_limit='{size}MB'") cur.execute("select pg_reload_conf()") time.sleep(1) + thread.join() + # Before shrinking the cache, check that it really is large now + (lfc_file_size, lfc_file_blocks) = get_lfc_size() + assert int(lfc_file_blocks) > 128 * 1024 + # At the end, set it at 100 MB, and perform a final check that the disk usage # of the file is in that ballbark. 
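The reason `get_lfc_size()` reports both `stat().st_size` and the `ls -sk` block count is that the LFC file can be sparse: after the cache shrinks, the apparent size may stay large while the space actually allocated on disk goes down, and the final check below is about the latter. If one wanted the allocated size without shelling out to `ls`, a sketch using POSIX `st_blocks` (512-byte units) could look like this; it is illustrative, not part of the fixtures:

import os

def lfc_allocated_kb(path):
    st = os.stat(path)
    # st_blocks counts 512-byte units actually allocated, which matches what
    # `ls -sk` reports in KiB; st_size is only the apparent (possibly sparse) length.
    return st.st_blocks * 512 // 1024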
# @@ -66,13 +85,7 @@ def test_lfc_resize(neon_simple_env: NeonEnv, pg_bin: PgBin): cur.execute("select pg_reload_conf()") nretries = 10 while True: - lfc_file_path = endpoint.lfc_path() - lfc_file_size = lfc_file_path.stat().st_size - res = subprocess.run( - ["ls", "-sk", lfc_file_path], check=True, text=True, capture_output=True - ) - lfc_file_blocks = re.findall("([0-9A-F]+)", res.stdout)[0] - log.info(f"Size of LFC file {lfc_file_size}, blocks {lfc_file_blocks}") + (lfc_file_size, lfc_file_blocks) = get_lfc_size() assert lfc_file_size <= 512 * 1024 * 1024 if int(lfc_file_blocks) <= 128 * 1024 or nretries == 0: diff --git a/test_runner/regress/test_local_file_cache.py b/test_runner/regress/test_local_file_cache.py index 94c630ffcf..21c9e97a42 100644 --- a/test_runner/regress/test_local_file_cache.py +++ b/test_runner/regress/test_local_file_cache.py @@ -29,8 +29,8 @@ def test_local_file_cache_unlink(neon_env_builder: NeonEnvBuilder): cur = endpoint.connect().cursor() stop = threading.Event() - n_rows = 100000 - n_threads = 20 + n_rows = 10000 + n_threads = 5 n_updates_per_connection = 1000 cur.execute("CREATE TABLE lfctest (id int4 PRIMARY KEY, n int) WITH (fillfactor=10)") diff --git a/test_runner/regress/test_sharding.py b/test_runner/regress/test_sharding.py index 4c381b563f..673904a1cd 100644 --- a/test_runner/regress/test_sharding.py +++ b/test_runner/regress/test_sharding.py @@ -561,11 +561,17 @@ def test_sharding_split_smoke( workload.write_rows(256) # Note which pageservers initially hold a shard after tenant creation - pre_split_pageserver_ids = [loc["node_id"] for loc in env.storage_controller.locate(tenant_id)] - log.info("Pre-split pageservers: {pre_split_pageserver_ids}") + pre_split_pageserver_ids = dict() + for loc in env.storage_controller.locate(tenant_id): + shard_no = TenantShardId.parse(loc["shard_id"]).shard_number + pre_split_pageserver_ids[loc["node_id"]] = shard_no + log.info(f"Pre-split pageservers: {pre_split_pageserver_ids}") # For pageservers holding a shard, validate their ingest statistics # reflect a proper splitting of the WAL. + + observed_on_shard_zero = 0 + received_on_non_zero_shard = 0 for pageserver in env.pageservers: if pageserver.id not in pre_split_pageserver_ids: continue @@ -573,28 +579,38 @@ def test_sharding_split_smoke( metrics = pageserver.http_client().get_metrics_values( [ "pageserver_wal_ingest_records_received_total", - "pageserver_wal_ingest_records_committed_total", - "pageserver_wal_ingest_records_filtered_total", + "pageserver_wal_ingest_records_observed_total", ] ) log.info(f"Pageserver {pageserver.id} metrics: {metrics}") - # Not everything received was committed - assert ( - metrics["pageserver_wal_ingest_records_received_total"] - > metrics["pageserver_wal_ingest_records_committed_total"] - ) + received = metrics["pageserver_wal_ingest_records_received_total"] + observed = metrics["pageserver_wal_ingest_records_observed_total"] - # Something was committed - assert metrics["pageserver_wal_ingest_records_committed_total"] > 0 + shard_number: int | None = pre_split_pageserver_ids.get(pageserver.id, None) + if shard_number is None: + assert received == 0 + assert observed == 0 + elif shard_number == 0: + # Shard 0 receives its own records and observes records of other shards + # for relation size tracking. + assert observed > 0 + assert received > 0 + observed_on_shard_zero = int(observed) + else: + # Non zero shards do not observe any records, but only receive their own. 
+ assert observed == 0 + assert received > 0 + received_on_non_zero_shard += int(received) - # Counts are self consistent - assert ( - metrics["pageserver_wal_ingest_records_received_total"] - == metrics["pageserver_wal_ingest_records_committed_total"] - + metrics["pageserver_wal_ingest_records_filtered_total"] - ) + # Some records are sent to multiple shards and some shard 0 records include both value observations + # and other metadata. Hence, we do a sanity check below that shard 0 observes the majority of records + # received by other shards. + assert ( + observed_on_shard_zero <= received_on_non_zero_shard + and observed_on_shard_zero >= received_on_non_zero_shard // 2 + ) # TODO: validate that shards have different sizes @@ -633,7 +649,7 @@ def test_sharding_split_smoke( # We should have split into 8 shards, on the same 4 pageservers we started on. assert len(post_split_pageserver_ids) == split_shard_count assert len(set(post_split_pageserver_ids)) == shard_count - assert set(post_split_pageserver_ids) == set(pre_split_pageserver_ids) + assert set(post_split_pageserver_ids) == set(pre_split_pageserver_ids.keys()) # The old parent shards should no longer exist on disk assert not shards_on_disk(old_shard_ids) @@ -739,7 +755,7 @@ def test_sharding_split_smoke( # all the pageservers that originally held an attached shard should still hold one, otherwise # it would indicate that we had done some unnecessary migration. log.info(f"attached: {attached}") - for ps_id in pre_split_pageserver_ids: + for ps_id in pre_split_pageserver_ids.keys(): log.info("Pre-split pageserver {ps_id} should still hold an attached location") assert ps_id in attached diff --git a/test_runner/regress/test_storage_controller.py b/test_runner/regress/test_storage_controller.py index 7062c35e05..da6d5b8622 100644 --- a/test_runner/regress/test_storage_controller.py +++ b/test_runner/regress/test_storage_controller.py @@ -17,6 +17,7 @@ from fixtures.compute_reconfigure import ComputeReconfigure from fixtures.log_helper import log from fixtures.neon_fixtures import ( DEFAULT_AZ_ID, + LogCursor, NeonEnv, NeonEnvBuilder, NeonPageserver, @@ -2406,7 +2407,14 @@ def test_storage_controller_step_down(neon_env_builder: NeonEnvBuilder): env.storage_controller.tenant_create(tid) env.storage_controller.reconcile_until_idle() - env.storage_controller.configure_failpoints(("sleep-on-reconcile-epilogue", "return(10000)")) + env.storage_controller.configure_failpoints(("reconciler-epilogue", "pause")) + + def unpause_failpoint(): + time.sleep(2) + env.storage_controller.configure_failpoints(("reconciler-epilogue", "off")) + + thread = threading.Thread(target=unpause_failpoint) + thread.start() # Make a change to the tenant config to trigger a slow reconcile virtual_ps_http = PageserverHttpClient(env.storage_controller_port, lambda: True) @@ -2421,6 +2429,8 @@ def test_storage_controller_step_down(neon_env_builder: NeonEnvBuilder): observed_state = env.storage_controller.step_down() log.info(f"Storage controller stepped down with {observed_state=}") + thread.join() + # Validate that we waited for the slow reconcile to complete # and updated the observed state in the storcon before stepping down. 
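The pause/release choreography used above (pause a failpoint, release it from a helper thread after a delay, then join the thread) could be factored into a small context manager. The sketch below is only an illustration of that refactoring, not a helper the fixtures currently provide:

import threading
import time
from contextlib import contextmanager

@contextmanager
def failpoint_paused(storage_controller, name, release_after=2.0):
    def release():
        time.sleep(release_after)
        storage_controller.configure_failpoints((name, "off"))

    storage_controller.configure_failpoints((name, "pause"))
    releaser = threading.Thread(target=release)
    releaser.start()
    try:
        yield
    finally:
        releaser.join()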
node_id = str(env.pageserver.id) @@ -3009,7 +3019,7 @@ def test_safekeeper_deployment_time_update(neon_env_builder: NeonEnvBuilder): def eq_safekeeper_records(a: dict[str, Any], b: dict[str, Any]) -> bool: compared = [dict(a), dict(b)] - masked_keys = ["created_at", "updated_at", "active"] + masked_keys = ["created_at", "updated_at", "active", "scheduling_policy"] for d in compared: # keep deleting these in case we are comparing the body as it will be uploaded by real scripts @@ -3289,8 +3299,221 @@ def test_storage_controller_detached_stopped( "generation": None, }, ) - + env.storage_controller.reconcile_until_idle() env.storage_controller.consistency_check() # Confirm the detach happened assert env.pageserver.http_client().tenant_list_locations()["tenant_shards"] == [] + + +@run_only_on_default_postgres("Postgres version makes no difference here") +def test_storage_controller_detach_lifecycle( + neon_env_builder: NeonEnvBuilder, +): + """ + Test that detached tenants are handled properly through their lifecycle: getting dropped + from memory when detached, then getting loaded back on-demand. + """ + + remote_storage_kind = s3_storage() + neon_env_builder.enable_pageserver_remote_storage(remote_storage_kind) + + neon_env_builder.num_pageservers = 1 + + env = neon_env_builder.init_configs() + env.start() + virtual_ps_http = PageserverHttpClient(env.storage_controller_port, lambda: True) + + tenant_id = TenantId.generate() + timeline_id = TimelineId.generate() + env.storage_controller.tenant_create( + tenant_id, + shard_count=1, + ) + virtual_ps_http.timeline_create(PgVersion.NOT_SET, tenant_id, timeline_id) + + remote_prefix = "/".join( + ( + "tenants", + str(tenant_id), + ) + ) + # We will later check data is gone after deletion, so as a control check that it is present to begin with + assert_prefix_not_empty( + neon_env_builder.pageserver_remote_storage, + prefix=remote_prefix, + ) + + assert len(env.pageserver.http_client().tenant_list_locations()["tenant_shards"]) == 1 + assert len(env.storage_controller.tenant_list()) == 1 + + # Detach the tenant + virtual_ps_http.tenant_location_conf( + tenant_id, + { + "mode": "Detached", + "secondary_conf": None, + "tenant_conf": {}, + "generation": None, + }, + ) + # Ensure reconciles are done (the one we do inline in location_conf is advisory and if it takes too long that API just succeeds anyway) + env.storage_controller.reconcile_until_idle() + env.storage_controller.consistency_check() + + # Confirm the detach happened on pageserver + assert env.pageserver.http_client().tenant_list_locations()["tenant_shards"] == [] + # Confirm the tenant is not in memory on the controller + assert env.storage_controller.tenant_list() == [] + + # The detached tenant does not get loaded into memory across a controller restart + env.storage_controller.stop() + env.storage_controller.start() + assert env.storage_controller.tenant_list() == [] + env.storage_controller.consistency_check() + + # The detached tenant can be re-attached + virtual_ps_http.tenant_location_conf( + tenant_id, + { + "mode": "AttachedSingle", + "secondary_conf": None, + "tenant_conf": {}, + "generation": None, + }, + ) + assert len(env.pageserver.http_client().tenant_list_locations()["tenant_shards"]) == 1 + assert len(env.storage_controller.tenant_list()) == 1 + env.storage_controller.consistency_check() + + # Detach it again before doing deletion + virtual_ps_http.tenant_location_conf( + tenant_id, + { + "mode": "Detached", + "secondary_conf": None, + "tenant_conf": {}, + "generation": 
None, + }, + ) + env.storage_controller.reconcile_until_idle() + env.storage_controller.consistency_check() + + # A detached tenant can be deleted + virtual_ps_http.tenant_delete(tenant_id) + + # Such deletions really work (empty remote storage) + assert_prefix_empty( + neon_env_builder.pageserver_remote_storage, + prefix=remote_prefix, + ) + + +@run_only_on_default_postgres("Postgres version makes no difference here") +def test_storage_controller_node_flap_detach_race( + neon_env_builder: NeonEnvBuilder, +): + """ + Reproducer for https://github.com/neondatabase/neon/issues/10253. + + When a node's availability flaps, the reconciliations spawned by the node + going offline may race with the reconciliation done when then node comes + back online. + """ + neon_env_builder.num_pageservers = 4 + + env = neon_env_builder.init_configs() + env.start() + + tenant_id = TenantId.generate() + env.storage_controller.tenant_create( + tenant_id, + shard_count=2, + ) + env.storage_controller.reconcile_until_idle() + + stopped_nodes = [s["node_id"] for s in env.storage_controller.locate(tenant_id)] + + def has_hit_failpoint(failpoint: str, offset: LogCursor | None = None) -> LogCursor: + res = env.storage_controller.log_contains(f"at failpoint {failpoint}", offset=offset) + assert res + return res[1] + + # Stop the nodes which host attached shards. + # This will trigger reconciliations which pause before incrmenenting the generation, + # and, more importantly, updating the `generation_pageserver` of the shards. + env.storage_controller.configure_failpoints(("reconciler-pre-increment-generation", "pause")) + for node_id in stopped_nodes: + env.get_pageserver(node_id).stop(immediate=True) + + def failure_handled() -> LogCursor: + stop_offset = None + + for node_id in stopped_nodes: + res = env.storage_controller.log_contains(f"node {node_id} going offline") + assert res + stop_offset = res[1] + + assert stop_offset + return stop_offset + + offset = wait_until(failure_handled) + + # Now restart the nodes and make them pause before marking themselves as available + # or running the activation reconciliation. + env.storage_controller.configure_failpoints(("heartbeat-pre-node-state-configure", "pause")) + + for node_id in stopped_nodes: + env.get_pageserver(node_id).start(await_active=False) + + offset = wait_until( + lambda: has_hit_failpoint("heartbeat-pre-node-state-configure", offset=offset) + ) + + # The nodes have restarted and are waiting to perform activaction reconciliation. + # Unpause the initial reconciliation triggered by the nodes going offline. + # It will attempt to detach from the old location, but notice that the old location + # is not yet available, and then stop before processing the results of the reconciliation. + env.storage_controller.configure_failpoints(("reconciler-epilogue", "pause")) + env.storage_controller.configure_failpoints(("reconciler-pre-increment-generation", "off")) + + offset = wait_until(lambda: has_hit_failpoint("reconciler-epilogue", offset=offset)) + + # Let the nodes perform activation reconciliation while still holding up processing the result + # from the initial reconcile triggered by going offline. 
+ env.storage_controller.configure_failpoints(("heartbeat-pre-node-state-configure", "off")) + + def activate_reconciliation_done(): + for node_id in stopped_nodes: + assert env.storage_controller.log_contains( + f"Node {node_id} transition to active", offset=offset + ) + + wait_until(activate_reconciliation_done) + + # Finally, allow the initial reconcile to finish up. + env.storage_controller.configure_failpoints(("reconciler-epilogue", "off")) + + # Give things a chance to settle and validate that no stale locations exist + env.storage_controller.reconcile_until_idle() + + def validate_locations(): + shard_locations = defaultdict(list) + for ps in env.pageservers: + locations = ps.http_client().tenant_list_locations()["tenant_shards"] + for loc in locations: + shard_locations[loc[0]].append( + {"generation": loc[1]["generation"], "mode": loc[1]["mode"], "node": ps.id} + ) + + log.info(f"Shard locations: {shard_locations}") + + attached_locations = { + k: list(filter(lambda loc: loc["mode"] == "AttachedSingle", v)) + for k, v in shard_locations.items() + } + + for shard, locs in attached_locations.items(): + assert len(locs) == 1, f"{shard} has {len(locs)} attached locations" + + wait_until(validate_locations, timeout=10) diff --git a/test_runner/regress/test_timeline_archive.py b/test_runner/regress/test_timeline_archive.py index 9b3a48add9..bec8270582 100644 --- a/test_runner/regress/test_timeline_archive.py +++ b/test_runner/regress/test_timeline_archive.py @@ -959,3 +959,103 @@ def test_timeline_offload_generations(neon_env_builder: NeonEnvBuilder): assert gc_summary["remote_storage_errors"] == 0 assert gc_summary["indices_deleted"] > 0 assert gc_summary["tenant_manifests_deleted"] > 0 + + +@pytest.mark.parametrize("end_with_offloaded", [False, True]) +def test_timeline_offload_race_unarchive( + neon_env_builder: NeonEnvBuilder, end_with_offloaded: bool +): + """ + Ensure that unarchive and timeline offload don't race each other + """ + # Regression test for issue https://github.com/neondatabase/neon/issues/10220 + # (automatic) timeline offloading defaults to false for now + neon_env_builder.pageserver_config_override = "timeline_offloading = true" + + failpoint = "before-timeline-auto-offload" + + env = neon_env_builder.init_start() + ps_http = env.pageserver.http_client() + + # Turn off gc and compaction loops: we want to issue them manually for better reliability + tenant_id, initial_timeline_id = env.create_tenant( + conf={ + "gc_period": "0s", + "compaction_period": "1s", + } + ) + + # Create a branch + leaf_timeline_id = env.create_branch("test_ancestor_branch_archive", tenant_id) + + # write some stuff to the leaf + with env.endpoints.create_start( + "test_ancestor_branch_archive", tenant_id=tenant_id + ) as endpoint: + endpoint.safe_psql_many( + [ + "CREATE TABLE foo(key serial primary key, t text default 'data_content')", + "INSERT INTO foo SELECT FROM generate_series(1,1000)", + ] + ) + sum = endpoint.safe_psql("SELECT sum(key) from foo where key % 7 = 1") + + ps_http.configure_failpoints((failpoint, "pause")) + + ps_http.timeline_archival_config( + tenant_id, + leaf_timeline_id, + state=TimelineArchivalState.ARCHIVED, + ) + leaf_detail = ps_http.timeline_detail( + tenant_id, + leaf_timeline_id, + ) + assert leaf_detail["is_archived"] is True + + # The actual race: get the compaction task to right before + # offloading the timeline and attempt to unarchive it + wait_until(lambda: env.pageserver.assert_log_contains(f"at failpoint {failpoint}")) + + # This unarchival should 
go through + ps_http.timeline_archival_config( + tenant_id, + leaf_timeline_id, + state=TimelineArchivalState.UNARCHIVED, + ) + + def timeline_offloaded_api(timeline_id: TimelineId) -> bool: + # TODO add a proper API to check if a timeline has been offloaded or not + return not any( + timeline["timeline_id"] == str(timeline_id) + for timeline in ps_http.timeline_list(tenant_id=tenant_id) + ) + + def leaf_offloaded(): + assert timeline_offloaded_api(leaf_timeline_id) + + # Ensure that we've hit the failed offload attempt + ps_http.configure_failpoints((failpoint, "off")) + wait_until( + lambda: env.pageserver.assert_log_contains( + f".*compaction_loop.*offload_timeline.*{leaf_timeline_id}.*can't shut down timeline.*" + ) + ) + + with env.endpoints.create_start( + "test_ancestor_branch_archive", tenant_id=tenant_id + ) as endpoint: + sum_again = endpoint.safe_psql("SELECT sum(key) from foo where key % 7 = 1") + assert sum == sum_again + + if end_with_offloaded: + # Ensure that offloading still works after all of this + ps_http.timeline_archival_config( + tenant_id, + leaf_timeline_id, + state=TimelineArchivalState.ARCHIVED, + ) + wait_until(leaf_offloaded) + else: + # Test that deletion of leaf timeline works + ps_http.timeline_delete(tenant_id, leaf_timeline_id) diff --git a/workspace_hack/Cargo.toml b/workspace_hack/Cargo.toml index 33bdc25785..0ffeeead18 100644 --- a/workspace_hack/Cargo.toml +++ b/workspace_hack/Cargo.toml @@ -91,7 +91,8 @@ tokio-stream = { version = "0.1", features = ["net"] } tokio-util = { version = "0.7", features = ["codec", "compat", "io", "rt"] } toml_edit = { version = "0.22", features = ["serde"] } tonic = { version = "0.12", features = ["tls-roots"] } -tower = { version = "0.4", default-features = false, features = ["balance", "buffer", "limit", "log", "util"] } +tower-9fbad63c4bcf4a8f = { package = "tower", version = "0.4", default-features = false, features = ["balance", "buffer", "limit", "util"] } +tower-d8f496e17d97b5cb = { package = "tower", version = "0.5", default-features = false, features = ["log", "make", "util"] } tracing = { version = "0.1", features = ["log"] } tracing-core = { version = "0.1" } url = { version = "2", features = ["serde"] }