diff --git a/.github/actionlint.yml b/.github/actionlint.yml index 4ad8a7b460..1b602883c5 100644 --- a/.github/actionlint.yml +++ b/.github/actionlint.yml @@ -7,6 +7,13 @@ self-hosted-runner: - small-arm64 - us-east-2 config-variables: + - AZURE_DEV_CLIENT_ID + - AZURE_DEV_REGISTRY_NAME + - AZURE_DEV_SUBSCRIPTION_ID + - AZURE_PROD_CLIENT_ID + - AZURE_PROD_REGISTRY_NAME + - AZURE_PROD_SUBSCRIPTION_ID + - AZURE_TENANT_ID - BENCHMARK_PROJECT_ID_PUB - BENCHMARK_PROJECT_ID_SUB - REMOTE_STORAGE_AZURE_CONTAINER diff --git a/.github/workflows/_push-to-acr.yml b/.github/workflows/_push-to-acr.yml new file mode 100644 index 0000000000..415b3d9cc6 --- /dev/null +++ b/.github/workflows/_push-to-acr.yml @@ -0,0 +1,56 @@ +name: Push images to ACR +on: + workflow_call: + inputs: + client_id: + description: Client ID of Azure managed identity or Entra app + required: true + type: string + image_tag: + description: Tag for the container image + required: true + type: string + images: + description: Images to push + required: true + type: string + registry_name: + description: Name of the container registry + required: true + type: string + subscription_id: + description: Azure subscription ID + required: true + type: string + tenant_id: + description: Azure tenant ID + required: true + type: string + +jobs: + push-to-acr: + runs-on: ubuntu-22.04 + permissions: + contents: read # This is required for actions/checkout + id-token: write # This is required for Azure Login to work. + + steps: + - name: Azure login + uses: azure/login@6c251865b4e6290e7b78be643ea2d005bc51f69a # @v2.1.1 + with: + client-id: ${{ inputs.client_id }} + subscription-id: ${{ inputs.subscription_id }} + tenant-id: ${{ inputs.tenant_id }} + + - name: Login to ACR + run: | + az acr login --name=${{ inputs.registry_name }} + + - name: Copy docker images to ACR ${{ inputs.registry_name }} + run: | + images='${{ inputs.images }}' + for image in ${images}; do + docker buildx imagetools create \ + -t ${{ inputs.registry_name }}.azurecr.io/neondatabase/${image}:${{ inputs.image_tag }} \ + neondatabase/${image}:${{ inputs.image_tag }} + done diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index ee5fd1b0c6..4bb9e5cb66 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -794,9 +794,6 @@ jobs: docker compose -f ./docker-compose/docker-compose.yml down promote-images: - permissions: - contents: read # This is required for actions/checkout - id-token: write # This is required for Azure Login to work. 
needs: [ check-permissions, tag, test-images, vm-compute-node-image ] runs-on: ubuntu-22.04 @@ -823,28 +820,6 @@ jobs: neondatabase/vm-compute-node-${version}:${{ needs.tag.outputs.build-tag }} done - - name: Azure login - if: github.ref_name == 'main' - uses: azure/login@6c251865b4e6290e7b78be643ea2d005bc51f69a # @v2.1.1 - with: - client-id: ${{ secrets.AZURE_DEV_CLIENT_ID }} - tenant-id: ${{ secrets.AZURE_TENANT_ID }} - subscription-id: ${{ secrets.AZURE_DEV_SUBSCRIPTION_ID }} - - - name: Login to ACR - if: github.ref_name == 'main' - run: | - az acr login --name=neoneastus2 - - - name: Copy docker images to ACR-dev - if: github.ref_name == 'main' - run: | - for image in neon compute-tools {vm-,}compute-node-{v14,v15,v16}; do - docker buildx imagetools create \ - -t neoneastus2.azurecr.io/neondatabase/${image}:${{ needs.tag.outputs.build-tag }} \ - neondatabase/${image}:${{ needs.tag.outputs.build-tag }} - done - - name: Add latest tag to images if: github.ref_name == 'main' run: | @@ -882,6 +857,30 @@ jobs: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/${image}:${{ needs.tag.outputs.build-tag }} done + push-to-acr-dev: + if: github.ref_name == 'main' + needs: [ tag, promote-images ] + uses: ./.github/workflows/_push-to-acr.yml + with: + client_id: ${{ vars.AZURE_DEV_CLIENT_ID }} + image_tag: ${{ needs.tag.outputs.build-tag }} + images: neon compute-tools vm-compute-node-v14 vm-compute-node-v15 vm-compute-node-v16 compute-node-v14 compute-node-v15 compute-node-v16 + registry_name: ${{ vars.AZURE_DEV_REGISTRY_NAME }} + subscription_id: ${{ vars.AZURE_DEV_SUBSCRIPTION_ID }} + tenant_id: ${{ vars.AZURE_TENANT_ID }} + + push-to-acr-prod: + if: github.ref_name == 'release'|| github.ref_name == 'release-proxy' + needs: [ tag, promote-images ] + uses: ./.github/workflows/_push-to-acr.yml + with: + client_id: ${{ vars.AZURE_PROD_CLIENT_ID }} + image_tag: ${{ needs.tag.outputs.build-tag }} + images: neon compute-tools vm-compute-node-v14 vm-compute-node-v15 vm-compute-node-v16 compute-node-v14 compute-node-v15 compute-node-v16 + registry_name: ${{ vars.AZURE_PROD_REGISTRY_NAME }} + subscription_id: ${{ vars.AZURE_PROD_SUBSCRIPTION_ID }} + tenant_id: ${{ vars.AZURE_TENANT_ID }} + trigger-custom-extensions-build-and-wait: needs: [ check-permissions, tag ] runs-on: ubuntu-22.04 @@ -957,8 +956,8 @@ jobs: exit 1 deploy: - needs: [ check-permissions, promote-images, tag, build-and-test-locally, trigger-custom-extensions-build-and-wait ] - if: github.ref_name == 'main' || github.ref_name == 'release'|| github.ref_name == 'release-proxy' + needs: [ check-permissions, promote-images, tag, build-and-test-locally, trigger-custom-extensions-build-and-wait, push-to-acr-dev, push-to-acr-prod ] + if: (github.ref_name == 'main' || github.ref_name == 'release' || github.ref_name == 'release-proxy') && !failure() && !cancelled() runs-on: [ self-hosted, small ] container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/ansible:latest diff --git a/.github/workflows/label-for-external-users.yml b/.github/workflows/label-for-external-users.yml index 585d118dfb..b7cbc06a73 100644 --- a/.github/workflows/label-for-external-users.yml +++ b/.github/workflows/label-for-external-users.yml @@ -7,6 +7,11 @@ on: pull_request_target: types: - opened + workflow_dispatch: + inputs: + github-actor: + description: 'GitHub username. If empty, the username of the current user will be used' + required: false # No permission for GITHUB_TOKEN by default; the **minimal required** set of permissions should be granted in each job. 
permissions: {} @@ -26,12 +31,31 @@ jobs: id: check-user env: GH_TOKEN: ${{ secrets.CI_ACCESS_TOKEN }} + ACTOR: ${{ inputs.github-actor || github.actor }} run: | - if gh api -H "Accept: application/vnd.github+json" -H "X-GitHub-Api-Version: 2022-11-28" "/orgs/${GITHUB_REPOSITORY_OWNER}/members/${GITHUB_ACTOR}"; then - is_member=true - else - is_member=false - fi + expected_error="User does not exist or is not a member of the organization" + output_file=output.txt + + for i in $(seq 1 10); do + if gh api "/orgs/${GITHUB_REPOSITORY_OWNER}/members/${ACTOR}" \ + -H "Accept: application/vnd.github+json" \ + -H "X-GitHub-Api-Version: 2022-11-28" > ${output_file}; then + + is_member=true + break + elif grep -q "${expected_error}" ${output_file}; then + is_member=false + break + elif [ $i -eq 10 ]; then + title="Failed to get memmbership status for ${ACTOR}" + message="The latest GitHub API error message: '$(cat ${output_file})'" + echo "::error file=.github/workflows/label-for-external-users.yml,title=${title}::${message}" + + exit 1 + fi + + sleep 1 + done echo "is-member=${is_member}" | tee -a ${GITHUB_OUTPUT} diff --git a/Cargo.lock b/Cargo.lock index 91917d5351..3ca6acbc3e 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -915,25 +915,22 @@ dependencies = [ [[package]] name = "bindgen" -version = "0.65.1" +version = "0.70.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cfdf7b466f9a4903edc73f95d6d2bcd5baf8ae620638762244d3f60143643cc5" +checksum = "f49d8fed880d473ea71efb9bf597651e77201bdd4893efe54c9e5d65ae04ce6f" dependencies = [ - "bitflags 1.3.2", + "bitflags 2.4.1", "cexpr", "clang-sys", - "lazy_static", - "lazycell", + "itertools 0.12.1", "log", - "peeking_take_while", - "prettyplease 0.2.6", + "prettyplease 0.2.17", "proc-macro2", "quote", "regex", "rustc-hash", "shlex", "syn 2.0.52", - "which", ] [[package]] @@ -1192,9 +1189,9 @@ dependencies = [ [[package]] name = "comfy-table" -version = "6.1.4" +version = "7.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6e7b787b0dc42e8111badfdbe4c3059158ccb2db8780352fa1b01e8ccf45cc4d" +checksum = "b34115915337defe99b2aff5c2ce6771e5fbc4079f4b506301f5cf394c8452f7" dependencies = [ "crossterm", "strum", @@ -1249,7 +1246,7 @@ dependencies = [ "tokio-postgres", "tokio-stream", "tokio-util", - "toml_edit 0.19.10", + "toml_edit", "tracing", "tracing-opentelemetry", "tracing-subscriber", @@ -1363,8 +1360,8 @@ dependencies = [ "tokio", "tokio-postgres", "tokio-util", - "toml 0.7.4", - "toml_edit 0.19.10", + "toml", + "toml_edit", "tracing", "url", "utils", @@ -1488,25 +1485,22 @@ checksum = "248e3bacc7dc6baa3b21e405ee045c3047101a49145e7e9eca583ab4c2ca5345" [[package]] name = "crossterm" -version = "0.25.0" +version = "0.27.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e64e6c0fbe2c17357405f7c758c1ef960fce08bdfb2c03d88d2a18d7e09c4b67" +checksum = "f476fe445d41c9e991fd07515a6f463074b782242ccf4a5b7b1d1012e70824df" dependencies = [ - "bitflags 1.3.2", + "bitflags 2.4.1", "crossterm_winapi", "libc", - "mio", "parking_lot 0.12.1", - "signal-hook", - "signal-hook-mio", "winapi", ] [[package]] name = "crossterm_winapi" -version = "0.9.0" +version = "0.9.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2ae1b35a484aa10e07fe0638d02301c5ad24de82d310ccbd2f3693da5f09bf1c" +checksum = "acdd7c62a3665c7f6830a51635d9ac9b23ed385797f70a83bb8bafe9c572ab2b" dependencies = [ "winapi", ] @@ -2949,12 +2943,6 @@ dependencies = [ "spin 0.5.2", ] -[[package]] 
-name = "lazycell" -version = "1.3.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "830d08ce1d1d941e6b30645f1a0eb5643013d835ce3779a5fc208261dbe10f55" - [[package]] name = "libc" version = "0.2.150" @@ -3153,7 +3141,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "fd01039851e82f8799046eabbb354056283fb265c8ec0996af940f4e85a380ff" dependencies = [ "serde", - "toml 0.8.14", + "toml", ] [[package]] @@ -3669,7 +3657,7 @@ dependencies = [ "thiserror", "tokio", "tokio-util", - "toml_edit 0.19.10", + "toml_edit", "utils", "workspace_hack", ] @@ -3756,7 +3744,7 @@ dependencies = [ "tokio-stream", "tokio-tar", "tokio-util", - "toml_edit 0.19.10", + "toml_edit", "tracing", "twox-hash", "url", @@ -3919,8 +3907,9 @@ dependencies = [ [[package]] name = "parquet" -version = "51.0.0" -source = "git+https://github.com/apache/arrow-rs?branch=master#2534976a564be3d2d56312dc88fb1b6ed4cef829" +version = "53.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f0fbf928021131daaa57d334ca8e3904fe9ae22f73c56244fc7db9b04eedc3d8" dependencies = [ "ahash", "bytes", @@ -3939,8 +3928,9 @@ dependencies = [ [[package]] name = "parquet_derive" -version = "51.0.0" -source = "git+https://github.com/apache/arrow-rs?branch=master#2534976a564be3d2d56312dc88fb1b6ed4cef829" +version = "53.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "86e9fcfae007533a06b580429a3f7e07cb833ec8aa37c041c16563e7918f057e" dependencies = [ "parquet", "proc-macro2", @@ -3977,12 +3967,6 @@ dependencies = [ "sha2", ] -[[package]] -name = "peeking_take_while" -version = "0.1.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "19b17cddbe7ec3f8bc800887bab5e717348c95ea2ca0b1bf0837fb964dc67099" - [[package]] name = "pem" version = "3.0.3" @@ -4136,7 +4120,7 @@ dependencies = [ [[package]] name = "postgres" version = "0.19.4" -source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#20031d7a9ee1addeae6e0968e3899ae6bf01cee2" +source = "git+https://github.com/neondatabase/rust-postgres.git?rev=20031d7a9ee1addeae6e0968e3899ae6bf01cee2#20031d7a9ee1addeae6e0968e3899ae6bf01cee2" dependencies = [ "bytes", "fallible-iterator", @@ -4149,7 +4133,7 @@ dependencies = [ [[package]] name = "postgres-protocol" version = "0.6.4" -source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#20031d7a9ee1addeae6e0968e3899ae6bf01cee2" +source = "git+https://github.com/neondatabase/rust-postgres.git?rev=20031d7a9ee1addeae6e0968e3899ae6bf01cee2#20031d7a9ee1addeae6e0968e3899ae6bf01cee2" dependencies = [ "base64 0.20.0", "byteorder", @@ -4168,7 +4152,7 @@ dependencies = [ [[package]] name = "postgres-types" version = "0.2.4" -source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#20031d7a9ee1addeae6e0968e3899ae6bf01cee2" +source = "git+https://github.com/neondatabase/rust-postgres.git?rev=20031d7a9ee1addeae6e0968e3899ae6bf01cee2#20031d7a9ee1addeae6e0968e3899ae6bf01cee2" dependencies = [ "bytes", "fallible-iterator", @@ -4280,9 +4264,9 @@ dependencies = [ [[package]] name = "prettyplease" -version = "0.2.6" +version = "0.2.17" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3b69d39aab54d069e7f2fe8cb970493e7834601ca2d8c65fd7bbd183578080d1" +checksum = "8d3928fb5db768cb86f891ff014f0144589297e3c6a1aba6ed7cecfdace270c7" dependencies = [ "proc-macro2", "syn 2.0.52", @@ -4827,7 +4811,7 @@ dependencies = [ "tokio", "tokio-stream", "tokio-util", - 
"toml_edit 0.19.10", + "toml_edit", "tracing", "utils", ] @@ -5337,7 +5321,7 @@ dependencies = [ "tokio-stream", "tokio-tar", "tokio-util", - "toml_edit 0.19.10", + "toml_edit", "tracing", "tracing-subscriber", "url", @@ -5746,17 +5730,6 @@ dependencies = [ "signal-hook-registry", ] -[[package]] -name = "signal-hook-mio" -version = "0.2.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "29ad2e15f37ec9a6cc544097b78a1ec90001e9f71b81338ca39f430adaca99af" -dependencies = [ - "libc", - "mio", - "signal-hook", -] - [[package]] name = "signal-hook-registry" version = "1.4.1" @@ -6069,21 +6042,21 @@ checksum = "73473c0e59e6d5812c5dfe2a064a6444949f089e20eec9a2e5506596494e4623" [[package]] name = "strum" -version = "0.24.1" +version = "0.26.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "063e6045c0e62079840579a7e47a355ae92f60eb74daaf156fb1e84ba164e63f" +checksum = "8fec0f0aef304996cf250b31b5a10dee7980c85da9d759361292b8bca5a18f06" [[package]] name = "strum_macros" -version = "0.24.3" +version = "0.26.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1e385be0d24f186b4ce2f9982191e7101bb737312ad61c1f2f984f34bcf85d59" +checksum = "4c6bee85a5a24955dc440386795aa378cd9cf82acd5f764469152d2270e581be" dependencies = [ - "heck 0.4.1", + "heck 0.5.0", "proc-macro2", "quote", "rustversion", - "syn 1.0.109", + "syn 2.0.52", ] [[package]] @@ -6094,8 +6067,9 @@ checksum = "81cdd64d312baedb58e21336b31bc043b77e01cc99033ce76ef539f78e965ebc" [[package]] name = "svg_fmt" -version = "0.4.2" -source = "git+https://github.com/nical/rust_debug?rev=28a7d96eecff2f28e75b1ea09f2d499a60d0e3b4#28a7d96eecff2f28e75b1ea09f2d499a60d0e3b4" +version = "0.4.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "20e16a0f46cf5fd675563ef54f26e83e20f2366bcf027bcb3cc3ed2b98aaf2ca" [[package]] name = "syn" @@ -6423,7 +6397,7 @@ dependencies = [ [[package]] name = "tokio-postgres" version = "0.7.7" -source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#20031d7a9ee1addeae6e0968e3899ae6bf01cee2" +source = "git+https://github.com/neondatabase/rust-postgres.git?rev=20031d7a9ee1addeae6e0968e3899ae6bf01cee2#20031d7a9ee1addeae6e0968e3899ae6bf01cee2" dependencies = [ "async-trait", "byteorder", @@ -6534,18 +6508,6 @@ dependencies = [ "tracing", ] -[[package]] -name = "toml" -version = "0.7.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d6135d499e69981f9ff0ef2167955a5333c35e36f6937d382974566b3d5b94ec" -dependencies = [ - "serde", - "serde_spanned", - "toml_datetime", - "toml_edit 0.19.10", -] - [[package]] name = "toml" version = "0.8.14" @@ -6555,7 +6517,7 @@ dependencies = [ "serde", "serde_spanned", "toml_datetime", - "toml_edit 0.22.14", + "toml_edit", ] [[package]] @@ -6567,19 +6529,6 @@ dependencies = [ "serde", ] -[[package]] -name = "toml_edit" -version = "0.19.10" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2380d56e8670370eee6566b0bfd4265f65b3f432e8c6d85623f728d4fa31f739" -dependencies = [ - "indexmap 1.9.3", - "serde", - "serde_spanned", - "toml_datetime", - "winnow 0.4.6", -] - [[package]] name = "toml_edit" version = "0.22.14" @@ -6590,7 +6539,7 @@ dependencies = [ "serde", "serde_spanned", "toml_datetime", - "winnow 0.6.13", + "winnow", ] [[package]] @@ -7003,7 +6952,7 @@ dependencies = [ "tokio-stream", "tokio-tar", "tokio-util", - "toml_edit 0.19.10", + "toml_edit", "tracing", "tracing-error", "tracing-subscriber", @@ -7549,15 
+7498,6 @@ version = "0.52.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "32b752e52a2da0ddfbdbcc6fceadfeede4c939ed16d13e648833a61dfb611ed8" -[[package]] -name = "winnow" -version = "0.4.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "61de7bac303dc551fe038e2b3cef0f571087a47571ea6e79a87692ac99b99699" -dependencies = [ - "memchr", -] - [[package]] name = "winnow" version = "0.6.13" @@ -7627,6 +7567,7 @@ dependencies = [ "hyper 0.14.26", "indexmap 1.9.3", "itertools 0.10.5", + "itertools 0.12.1", "lazy_static", "libc", "log", @@ -7664,6 +7605,7 @@ dependencies = [ "tokio", "tokio-rustls 0.24.0", "tokio-util", + "toml_edit", "tonic", "tower", "tracing", diff --git a/Cargo.toml b/Cargo.toml index 4fea3e8d80..fd1d4e016c 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -64,7 +64,7 @@ aws-types = "1.2.0" axum = { version = "0.6.20", features = ["ws"] } base64 = "0.13.0" bincode = "1.3" -bindgen = "0.65" +bindgen = "0.70" bit_field = "0.10.2" bstr = "1.0" byteorder = "1.4" @@ -73,7 +73,7 @@ camino = "1.1.6" cfg-if = "1.0.0" chrono = { version = "0.4", default-features = false, features = ["clock"] } clap = { version = "4.0", features = ["derive"] } -comfy-table = "6.1" +comfy-table = "7.1" const_format = "0.2" crc32c = "0.6" crossbeam-deque = "0.8.5" @@ -123,8 +123,8 @@ opentelemetry = "0.20.0" opentelemetry-otlp = { version = "0.13.0", default-features=false, features = ["http-proto", "trace", "http", "reqwest-client"] } opentelemetry-semantic-conventions = "0.12.0" parking_lot = "0.12" -parquet = { version = "51.0.0", default-features = false, features = ["zstd"] } -parquet_derive = "51.0.0" +parquet = { version = "53", default-features = false, features = ["zstd"] } +parquet_derive = "53" pbkdf2 = { version = "0.12.1", features = ["simple", "std"] } pin-project-lite = "0.2" procfs = "0.16" @@ -158,11 +158,10 @@ signal-hook = "0.3" smallvec = "1.11" smol_str = { version = "0.2.0", features = ["serde"] } socket2 = "0.5" -strum = "0.24" -strum_macros = "0.24" +strum = "0.26" +strum_macros = "0.26" "subtle" = "2.5.0" -# Our PR https://github.com/nical/rust_debug/pull/4 has been merged but no new version released yet -svg_fmt = { git = "https://github.com/nical/rust_debug", rev = "28a7d96eecff2f28e75b1ea09f2d499a60d0e3b4" } +svg_fmt = "0.4.3" sync_wrapper = "0.1.2" tar = "0.4" task-local-extensions = "0.1.4" @@ -178,8 +177,8 @@ tokio-rustls = "0.25" tokio-stream = "0.1" tokio-tar = "0.3" tokio-util = { version = "0.7.10", features = ["io", "rt"] } -toml = "0.7" -toml_edit = "0.19" +toml = "0.8" +toml_edit = "0.22" tonic = {version = "0.9", features = ["tls", "tls-roots"]} tower-service = "0.3.2" tracing = "0.1" @@ -202,10 +201,21 @@ env_logger = "0.10" log = "0.4" ## Libraries from neondatabase/ git forks, ideally with changes to be upstreamed -postgres = { git = "https://github.com/neondatabase/rust-postgres.git", branch="neon" } -postgres-protocol = { git = "https://github.com/neondatabase/rust-postgres.git", branch="neon" } -postgres-types = { git = "https://github.com/neondatabase/rust-postgres.git", branch="neon" } -tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", branch="neon" } + +# We want to use the 'neon' branch for these, but there's currently one +# incompatible change on the branch. 
See: +# +# - PR #8076 which contained changes that depended on the new changes in +# the rust-postgres crate, and +# - PR #8654 which reverted those changes and made the code in proxy incompatible +# with the tip of the 'neon' branch again. +# +# When those proxy changes are re-applied (see PR #8747), we can switch using +# the tip of the 'neon' branch again. +postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev = "20031d7a9ee1addeae6e0968e3899ae6bf01cee2" } +postgres-protocol = { git = "https://github.com/neondatabase/rust-postgres.git", rev = "20031d7a9ee1addeae6e0968e3899ae6bf01cee2" } +postgres-types = { git = "https://github.com/neondatabase/rust-postgres.git", rev = "20031d7a9ee1addeae6e0968e3899ae6bf01cee2" } +tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev = "20031d7a9ee1addeae6e0968e3899ae6bf01cee2" } ## Local libraries compute_api = { version = "0.1", path = "./libs/compute_api/" } @@ -242,11 +252,7 @@ tonic-build = "0.9" [patch.crates-io] # Needed to get `tokio-postgres-rustls` to depend on our fork. -tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", branch="neon" } - -# bug fixes for UUID -parquet = { git = "https://github.com/apache/arrow-rs", branch = "master" } -parquet_derive = { git = "https://github.com/apache/arrow-rs", branch = "master" } +tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev = "20031d7a9ee1addeae6e0968e3899ae6bf01cee2" } ################# Binary contents sections diff --git a/Dockerfile b/Dockerfile index d3d12330c6..1efedfa9bc 100644 --- a/Dockerfile +++ b/Dockerfile @@ -87,6 +87,7 @@ RUN mkdir -p /data/.neon/ && \ "pg_distrib_dir='/usr/local/'\n" \ "listen_pg_addr='0.0.0.0:6400'\n" \ "listen_http_addr='0.0.0.0:9898'\n" \ + "availability_zone='local'\n" \ > /data/.neon/pageserver.toml && \ chown -R neon:neon /data/.neon diff --git a/Dockerfile.build-tools b/Dockerfile.build-tools index d6beb61369..c4209c7a12 100644 --- a/Dockerfile.build-tools +++ b/Dockerfile.build-tools @@ -192,7 +192,7 @@ WORKDIR /home/nonroot # Rust # Please keep the version of llvm (installed above) in sync with rust llvm (`rustc --version --verbose | grep LLVM`) -ENV RUSTC_VERSION=1.80.1 +ENV RUSTC_VERSION=1.81.0 ENV RUSTUP_HOME="/home/nonroot/.rustup" ENV PATH="/home/nonroot/.cargo/bin:${PATH}" ARG RUSTFILT_VERSION=0.2.1 @@ -207,7 +207,7 @@ RUN curl -sSO https://static.rust-lang.org/rustup/dist/$(uname -m)-unknown-linux export PATH="$HOME/.cargo/bin:$PATH" && \ . "$HOME/.cargo/env" && \ cargo --version && rustup --version && \ - rustup component add llvm-tools-preview rustfmt clippy && \ + rustup component add llvm-tools rustfmt clippy && \ cargo install rustfilt --version ${RUSTFILT_VERSION} && \ cargo install cargo-hakari --version ${CARGO_HAKARI_VERSION} && \ cargo install cargo-deny --locked --version ${CARGO_DENY_VERSION} && \ diff --git a/README.md b/README.md index 735edef0fc..b54956f773 100644 --- a/README.md +++ b/README.md @@ -64,6 +64,12 @@ brew install protobuf openssl flex bison icu4c pkg-config echo 'export PATH="$(brew --prefix openssl)/bin:$PATH"' >> ~/.zshrc ``` +If you get errors about missing `m4` you may have to install it manually: +``` +brew install m4 +brew link --force m4 +``` + 2. 
[Install Rust](https://www.rust-lang.org/tools/install) ``` # recommended approach from https://www.rust-lang.org/tools/install diff --git a/compute_tools/src/pg_helpers.rs b/compute_tools/src/pg_helpers.rs index 863fa9468f..b2dc265864 100644 --- a/compute_tools/src/pg_helpers.rs +++ b/compute_tools/src/pg_helpers.rs @@ -22,9 +22,10 @@ use compute_api::spec::{Database, GenericOption, GenericOptions, PgIdent, Role}; const POSTGRES_WAIT_TIMEOUT: Duration = Duration::from_millis(60 * 1000); // milliseconds -/// Escape a string for including it in a SQL literal. Wrapping the result -/// with `E'{}'` or `'{}'` is not required, as it returns a ready-to-use -/// SQL string literal, e.g. `'db'''` or `E'db\\'`. +/// Escape a string for including it in a SQL literal. +/// +/// Wrapping the result with `E'{}'` or `'{}'` is not required, +/// as it returns a ready-to-use SQL string literal, e.g. `'db'''` or `E'db\\'`. /// See /// for the original implementation. pub fn escape_literal(s: &str) -> String { diff --git a/control_plane/src/pageserver.rs b/control_plane/src/pageserver.rs index 31777eb7a5..33ca70af96 100644 --- a/control_plane/src/pageserver.rs +++ b/control_plane/src/pageserver.rs @@ -75,14 +75,14 @@ impl PageServerNode { } } - fn pageserver_make_identity_toml(&self, node_id: NodeId) -> toml_edit::Document { - toml_edit::Document::from_str(&format!("id={node_id}")).unwrap() + fn pageserver_make_identity_toml(&self, node_id: NodeId) -> toml_edit::DocumentMut { + toml_edit::DocumentMut::from_str(&format!("id={node_id}")).unwrap() } fn pageserver_init_make_toml( &self, conf: NeonLocalInitPageserverConf, - ) -> anyhow::Result { + ) -> anyhow::Result { assert_eq!(&PageServerConf::from(&conf), &self.conf, "during neon_local init, we derive the runtime state of ps conf (self.conf) from the --config flag fully"); // TODO(christian): instead of what we do here, create a pageserver_api::config::ConfigToml (PR #7656) @@ -137,9 +137,9 @@ impl PageServerNode { // Turn `overrides` into a toml document. // TODO: above code is legacy code, it should be refactored to use toml_edit directly. 
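The `control_plane` changes around this point (and the other `toml_edit` call sites in this diff) track the `toml_edit` 0.19 → 0.22 upgrade from Cargo.toml, where `Document` was renamed to `DocumentMut`. A minimal sketch of the same parse-and-merge pattern, assuming `toml_edit` 0.22 and `anyhow`; the function and variable names are illustrative, not the `neon_local` code:

```rust
use std::str::FromStr;

/// Merge TOML override fragments into a single document; later fragments
/// win on key collisions, mirroring how the override fragments are applied
/// in the hunk below.
fn merge_overrides(fragments: &[&str]) -> anyhow::Result<toml_edit::DocumentMut> {
    let mut config = toml_edit::DocumentMut::new();
    for fragment in fragments {
        let parsed = toml_edit::DocumentMut::from_str(fragment)?;
        for (key, item) in parsed.iter() {
            // `DocumentMut` derefs to `Table`, so iter/insert work as before.
            config.insert(key, item.clone());
        }
    }
    Ok(config)
}

fn main() -> anyhow::Result<()> {
    let doc = merge_overrides(&["id = 10", "listen_pg_addr = '127.0.0.1:6400'"])?;
    print!("{doc}");
    Ok(())
}
```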
- let mut config_toml = toml_edit::Document::new(); + let mut config_toml = toml_edit::DocumentMut::new(); for fragment_str in overrides { - let fragment = toml_edit::Document::from_str(&fragment_str) + let fragment = toml_edit::DocumentMut::from_str(&fragment_str) .expect("all fragments in `overrides` are valid toml documents, this function controls that"); for (key, item) in fragment.iter() { config_toml.insert(key, item.clone()); diff --git a/control_plane/storcon_cli/src/main.rs b/control_plane/storcon_cli/src/main.rs index 5cce6cf3ae..651fcda8db 100644 --- a/control_plane/storcon_cli/src/main.rs +++ b/control_plane/storcon_cli/src/main.rs @@ -4,8 +4,8 @@ use std::{str::FromStr, time::Duration}; use clap::{Parser, Subcommand}; use pageserver_api::{ controller_api::{ - NodeAvailabilityWrapper, NodeDescribeResponse, ShardSchedulingPolicy, TenantCreateRequest, - TenantDescribeResponse, TenantPolicyRequest, + NodeAvailabilityWrapper, NodeDescribeResponse, NodeShardResponse, ShardSchedulingPolicy, + TenantCreateRequest, TenantDescribeResponse, TenantPolicyRequest, }, models::{ EvictionPolicy, EvictionPolicyLayerAccessThreshold, LocationConfigSecondary, @@ -80,7 +80,10 @@ enum Command { /// List nodes known to the storage controller Nodes {}, /// List tenants known to the storage controller - Tenants {}, + Tenants { + /// If this field is set, it will list the tenants on a specific node + node_id: Option, + }, /// Create a new tenant in the storage controller, and by extension on pageservers. TenantCreate { #[arg(long)] @@ -336,7 +339,7 @@ async fn main() -> anyhow::Result<()> { listen_pg_port, listen_http_addr, listen_http_port, - availability_zone_id: Some(availability_zone_id), + availability_zone_id, }), ) .await?; @@ -403,7 +406,41 @@ async fn main() -> anyhow::Result<()> { ) .await?; } - Command::Tenants {} => { + Command::Tenants { + node_id: Some(node_id), + } => { + let describe_response = storcon_client + .dispatch::<(), NodeShardResponse>( + Method::GET, + format!("control/v1/node/{node_id}/shards"), + None, + ) + .await?; + let shards = describe_response.shards; + let mut table = comfy_table::Table::new(); + table.set_header([ + "Shard", + "Intended Primary/Secondary", + "Observed Primary/Secondary", + ]); + for shard in shards { + table.add_row([ + format!("{}", shard.tenant_shard_id), + match shard.is_intended_secondary { + None => "".to_string(), + Some(true) => "Secondary".to_string(), + Some(false) => "Primary".to_string(), + }, + match shard.is_observed_secondary { + None => "".to_string(), + Some(true) => "Secondary".to_string(), + Some(false) => "Primary".to_string(), + }, + ]); + } + println!("{table}"); + } + Command::Tenants { node_id: None } => { let mut resp = storcon_client .dispatch::<(), Vec>( Method::GET, diff --git a/libs/metrics/src/lib.rs b/libs/metrics/src/lib.rs index df000cd0fb..cd4526c089 100644 --- a/libs/metrics/src/lib.rs +++ b/libs/metrics/src/lib.rs @@ -68,6 +68,7 @@ macro_rules! register_uint_gauge { static INTERNAL_REGISTRY: Lazy = Lazy::new(Registry::new); /// Register a collector in the internal registry. MUST be called before the first call to `gather()`. +/// /// Otherwise, we can have a deadlock in the `gather()` call, trying to register a new collector /// while holding the lock. 
pub fn register_internal(c: Box) -> prometheus::Result<()> { diff --git a/libs/pageserver_api/src/config.rs b/libs/pageserver_api/src/config.rs index b2662c562a..1194ee93ef 100644 --- a/libs/pageserver_api/src/config.rs +++ b/libs/pageserver_api/src/config.rs @@ -104,7 +104,9 @@ pub struct ConfigToml { pub image_compression: ImageCompressionAlgorithm, pub ephemeral_bytes_per_memory_kb: usize, pub l0_flush: Option, - pub compact_level0_phase1_value_access: CompactL0Phase1ValueAccess, + #[serde(skip_serializing)] + // TODO(https://github.com/neondatabase/neon/issues/8184): remove after this field is removed from all pageserver.toml's + pub compact_level0_phase1_value_access: serde::de::IgnoredAny, pub virtual_file_direct_io: crate::models::virtual_file::DirectIoMode, pub io_buffer_alignment: usize, } @@ -209,43 +211,6 @@ pub enum GetImpl { #[serde(transparent)] pub struct MaxVectoredReadBytes(pub NonZeroUsize); -#[derive(Debug, PartialEq, Eq, Clone, serde::Deserialize, serde::Serialize)] -#[serde(tag = "mode", rename_all = "kebab-case", deny_unknown_fields)] -pub enum CompactL0Phase1ValueAccess { - /// The old way. - PageCachedBlobIo, - /// The new way. - StreamingKmerge { - /// If set, we run both the old way and the new way, validate that - /// they are identical (=> [`CompactL0BypassPageCacheValidation`]), - /// and if the validation fails, - /// - in tests: fail them with a panic or - /// - in prod, log a rate-limited warning and use the old way's results. - /// - /// If not set, we only run the new way and trust its results. - validate: Option, - }, -} - -/// See [`CompactL0Phase1ValueAccess::StreamingKmerge`]. -#[derive(Debug, PartialEq, Eq, Clone, serde::Deserialize, serde::Serialize)] -#[serde(rename_all = "kebab-case")] -pub enum CompactL0BypassPageCacheValidation { - /// Validate that the series of (key, lsn) pairs are the same. - KeyLsn, - /// Validate that the entire output of old and new way is identical. - KeyLsnValue, -} - -impl Default for CompactL0Phase1ValueAccess { - fn default() -> Self { - CompactL0Phase1ValueAccess::StreamingKmerge { - // TODO(https://github.com/neondatabase/neon/issues/8184): change to None once confident - validate: Some(CompactL0BypassPageCacheValidation::KeyLsnValue), - } - } -} - /// A tenant's calcuated configuration, which is the result of merging a /// tenant's TenantConfOpt with the global TenantConf from PageServerConf. 
/// @@ -452,7 +417,7 @@ impl Default for ConfigToml { image_compression: (DEFAULT_IMAGE_COMPRESSION), ephemeral_bytes_per_memory_kb: (DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB), l0_flush: None, - compact_level0_phase1_value_access: CompactL0Phase1ValueAccess::default(), + compact_level0_phase1_value_access: Default::default(), virtual_file_direct_io: crate::models::virtual_file::DirectIoMode::default(), io_buffer_alignment: DEFAULT_IO_BUFFER_ALIGNMENT, diff --git a/libs/pageserver_api/src/controller_api.rs b/libs/pageserver_api/src/controller_api.rs index 345abd69b6..40b7dbbbc2 100644 --- a/libs/pageserver_api/src/controller_api.rs +++ b/libs/pageserver_api/src/controller_api.rs @@ -1,4 +1,4 @@ -use std::collections::HashSet; +use std::collections::{HashMap, HashSet}; use std::str::FromStr; use std::time::{Duration, Instant}; @@ -57,7 +57,7 @@ pub struct NodeRegisterRequest { pub listen_http_addr: String, pub listen_http_port: u16, - pub availability_zone_id: Option<String>, + pub availability_zone_id: String, } #[derive(Serialize, Deserialize)] @@ -74,6 +74,17 @@ pub struct TenantPolicyRequest { pub scheduling: Option<ShardSchedulingPolicy>, } +#[derive(Serialize, Deserialize)] +pub struct ShardsPreferredAzsRequest { + #[serde(flatten)] + pub preferred_az_ids: HashMap<TenantShardId, String>, +} + +#[derive(Serialize, Deserialize)] +pub struct ShardsPreferredAzsResponse { + pub updated: Vec<TenantShardId>, +} + #[derive(Serialize, Deserialize, Debug)] pub struct TenantLocateResponseShard { pub shard_id: TenantShardId, @@ -101,6 +112,21 @@ pub struct TenantDescribeResponse { pub config: TenantConfig, } +#[derive(Serialize, Deserialize, Debug)] +pub struct NodeShardResponse { + pub node_id: NodeId, + pub shards: Vec<NodeShard>, +} + +#[derive(Serialize, Deserialize, Debug)] +pub struct NodeShard { + pub tenant_shard_id: TenantShardId, + /// Whether the shard is observed secondary on a specific node. True = yes, False = no, None = not on this node. + pub is_observed_secondary: Option<bool>, + /// Whether the shard is intended to be a secondary on a specific node. True = yes, False = no, None = not on this node. + pub is_intended_secondary: Option<bool>, +} + #[derive(Serialize, Deserialize)] pub struct NodeDescribeResponse { pub id: NodeId, @@ -132,8 +158,12 @@ pub struct TenantDescribeResponseShard { pub is_splitting: bool, pub scheduling_policy: ShardSchedulingPolicy, + + pub preferred_az_id: Option<String>, } +/// Migration request for a given tenant shard to a given node. +/// /// Explicitly migrating a particular shard is a low level operation /// TODO: higher level "Reschedule tenant" operation where the request /// specifies some constraints, e.g.
asking it to get off particular node(s) diff --git a/libs/pageserver_api/src/key.rs b/libs/pageserver_api/src/key.rs index 77d744e4da..8929ccb41d 100644 --- a/libs/pageserver_api/src/key.rs +++ b/libs/pageserver_api/src/key.rs @@ -263,15 +263,6 @@ impl Key { field5: u8::MAX, field6: u32::MAX, }; - /// A key slightly smaller than [`Key::MAX`] for use in layer key ranges to avoid them to be confused with L0 layers - pub const NON_L0_MAX: Key = Key { - field1: u8::MAX, - field2: u32::MAX, - field3: u32::MAX, - field4: u32::MAX, - field5: u8::MAX, - field6: u32::MAX - 1, - }; pub fn from_hex(s: &str) -> Result { if s.len() != 36 { diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs index d13d04eb1b..45e84baa1f 100644 --- a/libs/pageserver_api/src/models.rs +++ b/libs/pageserver_api/src/models.rs @@ -62,7 +62,7 @@ use bytes::{Buf, BufMut, Bytes, BytesMut}; serde::Serialize, serde::Deserialize, strum_macros::Display, - strum_macros::EnumVariantNames, + strum_macros::VariantNames, strum_macros::AsRefStr, strum_macros::IntoStaticStr, )] @@ -305,8 +305,10 @@ pub struct TenantConfig { pub lsn_lease_length_for_ts: Option, } -/// The policy for the aux file storage. It can be switched through `switch_aux_file_policy` -/// tenant config. When the first aux file written, the policy will be persisted in the +/// The policy for the aux file storage. +/// +/// It can be switched through `switch_aux_file_policy` tenant config. +/// When the first aux file written, the policy will be persisted in the /// `index_part.json` file and has a limited migration path. /// /// Currently, we only allow the following migration path: @@ -896,7 +898,9 @@ pub struct WalRedoManagerStatus { pub process: Option, } -/// The progress of a secondary tenant is mostly useful when doing a long running download: e.g. initiating +/// The progress of a secondary tenant. +/// +/// It is mostly useful when doing a long running download: e.g. initiating /// a download job, timing out while waiting for it to run, and then inspecting this status to understand /// what's happening. #[derive(Default, Debug, Serialize, Deserialize, Clone)] diff --git a/libs/postgres_backend/src/lib.rs b/libs/postgres_backend/src/lib.rs index 7c7c6535b3..8ea4b93fb1 100644 --- a/libs/postgres_backend/src/lib.rs +++ b/libs/postgres_backend/src/lib.rs @@ -69,8 +69,10 @@ impl QueryError { } /// Returns true if the given error is a normal consequence of a network issue, -/// or the client closing the connection. These errors can happen during normal -/// operations, and don't indicate a bug in our code. +/// or the client closing the connection. +/// +/// These errors can happen during normal operations, +/// and don't indicate a bug in our code. pub fn is_expected_io_error(e: &io::Error) -> bool { use io::ErrorKind::*; matches!( @@ -79,17 +81,16 @@ pub fn is_expected_io_error(e: &io::Error) -> bool { ) } -#[async_trait::async_trait] pub trait Handler { /// Handle single query. /// postgres_backend will issue ReadyForQuery after calling this (this /// might be not what we want after CopyData streaming, but currently we don't /// care). It will also flush out the output buffer. - async fn process_query( + fn process_query( &mut self, pgb: &mut PostgresBackend, query_string: &str, - ) -> Result<(), QueryError>; + ) -> impl Future>; /// Called on startup packet receival, allows to process params. 
/// diff --git a/libs/postgres_backend/tests/simple_select.rs b/libs/postgres_backend/tests/simple_select.rs index 7ec85f0dbe..900083ea7f 100644 --- a/libs/postgres_backend/tests/simple_select.rs +++ b/libs/postgres_backend/tests/simple_select.rs @@ -23,7 +23,6 @@ async fn make_tcp_pair() -> (TcpStream, TcpStream) { struct TestHandler {} -#[async_trait::async_trait] impl Handler for TestHandler { // return single col 'hey' for any query async fn process_query( diff --git a/libs/postgres_connection/src/lib.rs b/libs/postgres_connection/src/lib.rs index 9f57f3d507..ddf9f7b610 100644 --- a/libs/postgres_connection/src/lib.rs +++ b/libs/postgres_connection/src/lib.rs @@ -7,6 +7,7 @@ use std::fmt; use url::Host; /// Parses a string of format either `host:port` or `host` into a corresponding pair. +/// /// The `host` part should be a correct `url::Host`, while `port` (if present) should be /// a valid decimal u16 of digits only. pub fn parse_host_port>(host_port: S) -> Result<(Host, Option), anyhow::Error> { diff --git a/libs/postgres_ffi/build.rs b/libs/postgres_ffi/build.rs index 370d9e9a6f..a346390f3d 100644 --- a/libs/postgres_ffi/build.rs +++ b/libs/postgres_ffi/build.rs @@ -14,7 +14,7 @@ impl ParseCallbacks for PostgresFfiCallbacks { fn include_file(&self, filename: &str) { // This does the equivalent of passing bindgen::CargoCallbacks // to the builder .parse_callbacks() method. - let cargo_callbacks = bindgen::CargoCallbacks; + let cargo_callbacks = bindgen::CargoCallbacks::new(); cargo_callbacks.include_file(filename) } @@ -121,6 +121,7 @@ fn main() -> anyhow::Result<()> { .allowlist_type("XLogPageHeaderData") .allowlist_type("XLogLongPageHeaderData") .allowlist_var("XLOG_PAGE_MAGIC") + .allowlist_var("PG_MAJORVERSION_NUM") .allowlist_var("PG_CONTROL_FILE_SIZE") .allowlist_var("PG_CONTROLFILEDATA_OFFSETOF_CRC") .allowlist_type("PageHeaderData") diff --git a/libs/postgres_ffi/src/lib.rs b/libs/postgres_ffi/src/lib.rs index 9acb105e9b..f18e0c603b 100644 --- a/libs/postgres_ffi/src/lib.rs +++ b/libs/postgres_ffi/src/lib.rs @@ -44,6 +44,9 @@ macro_rules! postgres_ffi { // Re-export some symbols from bindings pub use bindings::DBState_DB_SHUTDOWNED; pub use bindings::{CheckPoint, ControlFileData, XLogRecord}; + + pub const ZERO_CHECKPOINT: bytes::Bytes = + bytes::Bytes::from_static(&[0u8; xlog_utils::SIZEOF_CHECKPOINT]); } }; } @@ -106,6 +109,107 @@ macro_rules! dispatch_pgversion { }; } +#[macro_export] +macro_rules! enum_pgversion_dispatch { + ($name:expr, $typ:ident, $bind:ident, $code:block) => { + enum_pgversion_dispatch!( + name = $name, + bind = $bind, + typ = $typ, + code = $code, + pgversions = [ + V14 : v14, + V15 : v15, + V16 : v16, + ] + ) + }; + (name = $name:expr, + bind = $bind:ident, + typ = $typ:ident, + code = $code:block, + pgversions = [$($variant:ident : $md:ident),+ $(,)?]) => { + match $name { + $( + self::$typ::$variant($bind) => { + use $crate::$md as pgv; + $code + } + ),+, + } + }; +} + +#[macro_export] +macro_rules! 
enum_pgversion { + {$name:ident, pgv :: $t:ident} => { + enum_pgversion!{ + name = $name, + typ = $t, + pgversions = [ + V14 : v14, + V15 : v15, + V16 : v16, + ] + } + }; + {$name:ident, pgv :: $p:ident :: $t:ident} => { + enum_pgversion!{ + name = $name, + path = $p, + typ = $t, + pgversions = [ + V14 : v14, + V15 : v15, + V16 : v16, + ] + } + }; + {name = $name:ident, + typ = $t:ident, + pgversions = [$($variant:ident : $md:ident),+ $(,)?]} => { + pub enum $name { + $($variant ( $crate::$md::$t )),+ + } + impl self::$name { + pub fn pg_version(&self) -> u32 { + enum_pgversion_dispatch!(self, $name, _ign, { + pgv::bindings::PG_MAJORVERSION_NUM + }) + } + } + $( + impl Into for $crate::$md::$t { + fn into(self) -> self::$name { + self::$name::$variant (self) + } + } + )+ + }; + {name = $name:ident, + path = $p:ident, + typ = $t:ident, + pgversions = [$($variant:ident : $md:ident),+ $(,)?]} => { + pub enum $name { + $($variant ($crate::$md::$p::$t)),+ + } + impl $name { + pub fn pg_version(&self) -> u32 { + enum_pgversion_dispatch!(self, $name, _ign, { + pgv::bindings::PG_MAJORVERSION_NUM + }) + } + } + $( + impl Into<$name> for $crate::$md::$p::$t { + fn into(self) -> $name { + $name::$variant (self) + } + } + )+ + }; +} + pub mod pg_constants; pub mod relfile_utils; diff --git a/libs/remote_storage/src/config.rs b/libs/remote_storage/src/config.rs index f819a1572a..d0e92411da 100644 --- a/libs/remote_storage/src/config.rs +++ b/libs/remote_storage/src/config.rs @@ -185,7 +185,7 @@ mod tests { use super::*; fn parse(input: &str) -> anyhow::Result { - let toml = input.parse::().unwrap(); + let toml = input.parse::().unwrap(); RemoteStorageConfig::from_toml(toml.as_item()) } diff --git a/libs/remote_storage/src/lib.rs b/libs/remote_storage/src/lib.rs index cc1d3e0ae4..b5b69c9faf 100644 --- a/libs/remote_storage/src/lib.rs +++ b/libs/remote_storage/src/lib.rs @@ -45,6 +45,8 @@ pub use azure_core::Etag; pub use error::{DownloadError, TimeTravelError, TimeoutOrCancel}; +/// Default concurrency limit for S3 operations +/// /// Currently, sync happens with AWS S3, that has two limits on requests per second: /// ~200 RPS for IAM services /// @@ -300,7 +302,9 @@ pub trait RemoteStorage: Send + Sync + 'static { ) -> Result<(), TimeTravelError>; } -/// DownloadStream is sensitive to the timeout and cancellation used with the original +/// Data part of an ongoing [`Download`]. +/// +/// `DownloadStream` is sensitive to the timeout and cancellation used with the original /// [`RemoteStorage::download`] request. The type yields `std::io::Result` to be compatible /// with `tokio::io::copy_buf`. 
// This has 'static because safekeepers do not use cancellation tokens (yet) diff --git a/libs/safekeeper_api/src/models.rs b/libs/safekeeper_api/src/models.rs index 2fbc333075..28666d197a 100644 --- a/libs/safekeeper_api/src/models.rs +++ b/libs/safekeeper_api/src/models.rs @@ -60,3 +60,16 @@ pub struct TimelineCopyRequest { pub target_timeline_id: TimelineId, pub until_lsn: Lsn, } + +#[derive(Debug, Clone, Deserialize, Serialize)] +pub struct TimelineTermBumpRequest { + /// bump to + pub term: Option, +} + +#[derive(Debug, Clone, Deserialize, Serialize)] +pub struct TimelineTermBumpResponse { + // before the request + pub previous_term: u64, + pub current_term: u64, +} diff --git a/libs/tenant_size_model/src/lib.rs b/libs/tenant_size_model/src/lib.rs index a3e12cf0e3..974a498404 100644 --- a/libs/tenant_size_model/src/lib.rs +++ b/libs/tenant_size_model/src/lib.rs @@ -5,9 +5,10 @@ mod calculation; pub mod svg; -/// StorageModel is the input to the synthetic size calculation. It represents -/// a tree of timelines, with just the information that's needed for the -/// calculation. This doesn't track timeline names or where each timeline +/// StorageModel is the input to the synthetic size calculation. +/// +/// It represents a tree of timelines, with just the information that's needed +/// for the calculation. This doesn't track timeline names or where each timeline /// begins and ends, for example. Instead, it consists of "points of interest" /// on the timelines. A point of interest could be the timeline start or end point, /// the oldest point on a timeline that needs to be retained because of PITR diff --git a/libs/utils/src/circuit_breaker.rs b/libs/utils/src/circuit_breaker.rs index 720ea39d4f..e1ddfd8650 100644 --- a/libs/utils/src/circuit_breaker.rs +++ b/libs/utils/src/circuit_breaker.rs @@ -5,8 +5,10 @@ use std::{ use metrics::IntCounter; -/// Circuit breakers are for operations that are expensive and fallible: if they fail repeatedly, -/// we will stop attempting them for some period of time, to avoid denial-of-service from retries, and +/// Circuit breakers are for operations that are expensive and fallible. +/// +/// If a circuit breaker fails repeatedly, we will stop attempting it for some +/// period of time, to avoid denial-of-service from retries, and /// to mitigate the log spam from repeated failures. pub struct CircuitBreaker { /// An identifier that enables us to log useful errors when a circuit is broken diff --git a/libs/utils/src/crashsafe.rs b/libs/utils/src/crashsafe.rs index 756b19138c..b97c6c7a45 100644 --- a/libs/utils/src/crashsafe.rs +++ b/libs/utils/src/crashsafe.rs @@ -1,3 +1,4 @@ +use std::os::fd::AsRawFd; use std::{ borrow::Cow, fs::{self, File}, @@ -203,6 +204,27 @@ pub fn overwrite( Ok(()) } +/// Syncs the filesystem for the given file descriptor. +#[cfg_attr(target_os = "macos", allow(unused_variables))] +pub fn syncfs(fd: impl AsRawFd) -> anyhow::Result<()> { + // Linux guarantees durability for syncfs. + // POSIX doesn't have syncfs, and further does not actually guarantee durability of sync(). + #[cfg(target_os = "linux")] + { + use anyhow::Context; + nix::unistd::syncfs(fd.as_raw_fd()).context("syncfs")?; + } + #[cfg(target_os = "macos")] + { + // macOS is not a production platform for Neon, don't even bother. 
+ } + #[cfg(not(any(target_os = "linux", target_os = "macos")))] + { + compile_error!("Unsupported OS"); + } + Ok(()) +} + #[cfg(test)] mod tests { diff --git a/libs/utils/src/id.rs b/libs/utils/src/id.rs index db468e3054..2cda899b15 100644 --- a/libs/utils/src/id.rs +++ b/libs/utils/src/id.rs @@ -249,8 +249,10 @@ macro_rules! id_newtype { }; } -/// Neon timeline IDs are different from PostgreSQL timeline -/// IDs. They serve a similar purpose though: they differentiate +/// Neon timeline ID. +/// +/// They are different from PostgreSQL timeline +/// IDs, but serve a similar purpose: they differentiate /// between different "histories" of the same cluster. However, /// PostgreSQL timeline IDs are a bit cumbersome, because they are only /// 32-bits wide, and they must be in ascending order in any given diff --git a/libs/utils/src/lock_file.rs b/libs/utils/src/lock_file.rs index 59c66ca757..3a2ed3e830 100644 --- a/libs/utils/src/lock_file.rs +++ b/libs/utils/src/lock_file.rs @@ -100,7 +100,9 @@ pub enum LockFileRead { } /// Open & try to lock the lock file at the given `path`, returning a [handle][`LockFileRead`] to -/// inspect its content. It is not an `Err(...)` if the file does not exist or is already locked. +/// inspect its content. +/// +/// It is not an `Err(...)` if the file does not exist or is already locked. /// Check the [`LockFileRead`] variants for details. pub fn read_and_hold_lock_file(path: &Utf8Path) -> anyhow::Result { let res = fs::OpenOptions::new().read(true).open(path); diff --git a/libs/utils/src/logging.rs b/libs/utils/src/logging.rs index 71af43a4da..e205d60d74 100644 --- a/libs/utils/src/logging.rs +++ b/libs/utils/src/logging.rs @@ -3,11 +3,9 @@ use std::str::FromStr; use anyhow::Context; use metrics::{IntCounter, IntCounterVec}; use once_cell::sync::Lazy; -use strum_macros::{EnumString, EnumVariantNames}; +use strum_macros::{EnumString, VariantNames}; -#[derive( - EnumString, strum_macros::Display, EnumVariantNames, Eq, PartialEq, Debug, Clone, Copy, -)] +#[derive(EnumString, strum_macros::Display, VariantNames, Eq, PartialEq, Debug, Clone, Copy)] #[strum(serialize_all = "snake_case")] pub enum LogFormat { Plain, @@ -190,7 +188,7 @@ impl Drop for TracingPanicHookGuard { } /// Named symbol for our panic hook, which logs the panic. -fn tracing_panic_hook(info: &std::panic::PanicInfo) { +fn tracing_panic_hook(info: &std::panic::PanicHookInfo) { // following rust 1.66.1 std implementation: // https://github.com/rust-lang/rust/blob/90743e7298aca107ddaa0c202a4d3604e29bfeb6/library/std/src/panicking.rs#L235-L288 let location = info.location(); diff --git a/libs/utils/src/pageserver_feedback.rs b/libs/utils/src/pageserver_feedback.rs index 3ddfa44f41..dede65e699 100644 --- a/libs/utils/src/pageserver_feedback.rs +++ b/libs/utils/src/pageserver_feedback.rs @@ -8,6 +8,7 @@ use tracing::{trace, warn}; use crate::lsn::Lsn; /// Feedback pageserver sends to safekeeper and safekeeper resends to compute. +/// /// Serialized in custom flexible key/value format. In replication protocol, it /// is marked with NEON_STATUS_UPDATE_TAG_BYTE to differentiate from postgres /// Standby status update / Hot standby feedback messages. diff --git a/libs/utils/src/poison.rs b/libs/utils/src/poison.rs index 27378c69fc..c3e2fba20c 100644 --- a/libs/utils/src/poison.rs +++ b/libs/utils/src/poison.rs @@ -65,6 +65,8 @@ impl Poison { } } +/// Armed pointer to a [`Poison`]. +/// /// Use [`Self::data`] and [`Self::data_mut`] to access the wrapped state. 
/// Once modifications are done, use [`Self::disarm`]. /// If [`Guard`] gets dropped instead of calling [`Self::disarm`], the state is poisoned diff --git a/libs/utils/src/shard.rs b/libs/utils/src/shard.rs index f6b430657e..d146010b41 100644 --- a/libs/utils/src/shard.rs +++ b/libs/utils/src/shard.rs @@ -13,10 +13,11 @@ pub struct ShardNumber(pub u8); #[derive(Ord, PartialOrd, Eq, PartialEq, Clone, Copy, Serialize, Deserialize, Debug, Hash)] pub struct ShardCount(pub u8); -/// Combination of ShardNumber and ShardCount. For use within the context of a particular tenant, -/// when we need to know which shard we're dealing with, but do not need to know the full -/// ShardIdentity (because we won't be doing any page->shard mapping), and do not need to know -/// the fully qualified TenantShardId. +/// Combination of ShardNumber and ShardCount. +/// +/// For use within the context of a particular tenant, when we need to know which shard we're +/// dealing with, but do not need to know the full ShardIdentity (because we won't be doing +/// any page->shard mapping), and do not need to know the fully qualified TenantShardId. #[derive(Eq, PartialEq, PartialOrd, Ord, Clone, Copy, Hash)] pub struct ShardIndex { pub shard_number: ShardNumber, diff --git a/libs/utils/src/simple_rcu.rs b/libs/utils/src/simple_rcu.rs index ecc5353be3..01750b2aef 100644 --- a/libs/utils/src/simple_rcu.rs +++ b/libs/utils/src/simple_rcu.rs @@ -49,12 +49,11 @@ use std::sync::{RwLock, RwLockWriteGuard}; use tokio::sync::watch; -/// /// Rcu allows multiple readers to read and hold onto a value without blocking -/// (for very long). Storing to the Rcu updates the value, making new readers -/// immediately see the new value, but it also waits for all current readers to -/// finish. +/// (for very long). /// +/// Storing to the Rcu updates the value, making new readers immediately see +/// the new value, but it also waits for all current readers to finish. pub struct Rcu { inner: RwLock>, } diff --git a/libs/utils/src/sync/heavier_once_cell.rs b/libs/utils/src/sync/heavier_once_cell.rs index 1abd3d9861..dc711fb028 100644 --- a/libs/utils/src/sync/heavier_once_cell.rs +++ b/libs/utils/src/sync/heavier_once_cell.rs @@ -5,7 +5,9 @@ use std::sync::{ use tokio::sync::Semaphore; /// Custom design like [`tokio::sync::OnceCell`] but using [`OwnedSemaphorePermit`] instead of -/// `SemaphorePermit`, allowing use of `take` which does not require holding an outer mutex guard +/// `SemaphorePermit`. +/// +/// Allows use of `take` which does not require holding an outer mutex guard /// for the duration of initialization. /// /// Has no unsafe, builds upon [`tokio::sync::Semaphore`] and [`std::sync::Mutex`]. diff --git a/libs/utils/src/toml_edit_ext.rs b/libs/utils/src/toml_edit_ext.rs index ab5f7bdd95..1359e27b77 100644 --- a/libs/utils/src/toml_edit_ext.rs +++ b/libs/utils/src/toml_edit_ext.rs @@ -10,7 +10,7 @@ pub fn deserialize_item(item: &toml_edit::Item) -> Result where T: serde::de::DeserializeOwned, { - let document: toml_edit::Document = match item { + let document: toml_edit::DocumentMut = match item { toml_edit::Item::Table(toml) => toml.clone().into(), toml_edit::Item::Value(toml_edit::Value::InlineTable(toml)) => { toml.clone().into_table().into() diff --git a/libs/utils/src/vec_map.rs b/libs/utils/src/vec_map.rs index 18b2af14f1..5f0028bacd 100644 --- a/libs/utils/src/vec_map.rs +++ b/libs/utils/src/vec_map.rs @@ -7,6 +7,7 @@ pub enum VecMapOrdering { } /// Ordered map datastructure implemented in a Vec. 
+/// /// Append only - can only add keys that are larger than the /// current max key. /// Ordering can be adjusted using [`VecMapOrdering`] diff --git a/libs/utils/src/yielding_loop.rs b/libs/utils/src/yielding_loop.rs index 963279eb4c..68274f0631 100644 --- a/libs/utils/src/yielding_loop.rs +++ b/libs/utils/src/yielding_loop.rs @@ -6,9 +6,10 @@ pub enum YieldingLoopError { Cancelled, } -/// Helper for long synchronous loops, e.g. over all tenants in the system. Periodically -/// yields to avoid blocking the executor, and after resuming checks the provided -/// cancellation token to drop out promptly on shutdown. +/// Helper for long synchronous loops, e.g. over all tenants in the system. +/// +/// Periodically yields to avoid blocking the executor, and after resuming +/// checks the provided cancellation token to drop out promptly on shutdown. #[inline(always)] pub async fn yielding_loop( interval: usize, @@ -23,7 +24,7 @@ where for (i, item) in iter.enumerate() { visitor(item); - if i + 1 % interval == 0 { + if (i + 1) % interval == 0 { tokio::task::yield_now().await; if cancel.is_cancelled() { return Err(YieldingLoopError::Cancelled); diff --git a/libs/walproposer/build.rs b/libs/walproposer/build.rs index 7bb077062b..28547f52bf 100644 --- a/libs/walproposer/build.rs +++ b/libs/walproposer/build.rs @@ -4,7 +4,6 @@ use std::{env, path::PathBuf, process::Command}; use anyhow::{anyhow, Context}; -use bindgen::CargoCallbacks; fn main() -> anyhow::Result<()> { // Tell cargo to invalidate the built crate whenever the wrapper changes @@ -64,16 +63,25 @@ fn main() -> anyhow::Result<()> { .map_err(|s| anyhow!("Bad postgres server path {s:?}"))? }; + let unwind_abi_functions = [ + "log_internal", + "recovery_download", + "start_streaming", + "finish_sync_safekeepers", + "wait_event_set", + "WalProposerStart", + ]; + // The bindgen::Builder is the main entry point // to bindgen, and lets you build up options for // the resulting bindings. - let bindings = bindgen::Builder::default() + let mut builder = bindgen::Builder::default() // The input header we would like to generate // bindings for. .header("bindgen_deps.h") // Tell cargo to invalidate the built crate whenever any of the // included header files changed. - .parse_callbacks(Box::new(CargoCallbacks)) + .parse_callbacks(Box::new(bindgen::CargoCallbacks::new())) .allowlist_type("WalProposer") .allowlist_type("WalProposerConfig") .allowlist_type("walproposer_api") @@ -105,7 +113,12 @@ fn main() -> anyhow::Result<()> { .allowlist_var("WL_SOCKET_MASK") .clang_arg("-DWALPROPOSER_LIB") .clang_arg(format!("-I{pgxn_neon}")) - .clang_arg(format!("-I{inc_server_path}")) + .clang_arg(format!("-I{inc_server_path}")); + + for name in unwind_abi_functions { + builder = builder.override_abi(bindgen::Abi::CUnwind, name); + } + let bindings = builder // Finish the builder and generate the bindings. .generate() // Unwrap the Result and panic on failure. 
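The `override_abi(bindgen::Abi::CUnwind, ...)` loop above and the `extern "C-unwind"` changes to api_bindings.rs that follow go together: with the toolchain bump to Rust 1.81.0 earlier in this diff (Dockerfile.build-tools), a panic escaping a plain `extern "C"` function aborts the process, so callbacks that can call back into panicking Rust code need the unwinding ABI. A minimal standalone sketch of the distinction, with hypothetical callbacks rather than the walproposer API:

```rust
// A callback that may panic (e.g. it asserts on its input) must be declared
// with the "C-unwind" ABI so the panic may legally unwind through any C
// frames between the panic site and the Rust code that observes it.
extern "C-unwind" fn fallible_callback(value: i32) -> i32 {
    assert!(value >= 0, "callback invoked with a negative value");
    value * 2
}

// A callback that can never panic can keep the plain "C" ABI.
extern "C" fn infallible_callback(value: i32) -> i32 {
    value.saturating_mul(2)
}

fn main() {
    // Both are ordinary functions from Rust's point of view; the ABI choice
    // only matters once a foreign (C) caller sits on the call stack.
    println!("{} {}", infallible_callback(2), fallible_callback(2));
}
```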
diff --git a/libs/walproposer/src/api_bindings.rs b/libs/walproposer/src/api_bindings.rs index bbc3663402..2fbea3fe45 100644 --- a/libs/walproposer/src/api_bindings.rs +++ b/libs/walproposer/src/api_bindings.rs @@ -33,7 +33,7 @@ extern "C" fn get_shmem_state(wp: *mut WalProposer) -> *mut WalproposerShmemStat } } -extern "C" fn start_streaming(wp: *mut WalProposer, startpos: XLogRecPtr) { +extern "C-unwind" fn start_streaming(wp: *mut WalProposer, startpos: XLogRecPtr) { unsafe { let callback_data = (*(*wp).config).callback_data; let api = callback_data as *mut Box; @@ -187,7 +187,7 @@ extern "C" fn conn_blocking_write( } } -extern "C" fn recovery_download(wp: *mut WalProposer, sk: *mut Safekeeper) -> bool { +extern "C-unwind" fn recovery_download(wp: *mut WalProposer, sk: *mut Safekeeper) -> bool { unsafe { let callback_data = (*(*(*sk).wp).config).callback_data; let api = callback_data as *mut Box; @@ -272,7 +272,7 @@ extern "C" fn rm_safekeeper_event_set(sk: *mut Safekeeper) { } } -extern "C" fn wait_event_set( +extern "C-unwind" fn wait_event_set( wp: *mut WalProposer, timeout: ::std::os::raw::c_long, event_sk: *mut *mut Safekeeper, @@ -324,7 +324,7 @@ extern "C" fn get_redo_start_lsn(wp: *mut WalProposer) -> XLogRecPtr { } } -extern "C" fn finish_sync_safekeepers(wp: *mut WalProposer, lsn: XLogRecPtr) { +extern "C-unwind" fn finish_sync_safekeepers(wp: *mut WalProposer, lsn: XLogRecPtr) { unsafe { let callback_data = (*(*wp).config).callback_data; let api = callback_data as *mut Box; @@ -340,7 +340,7 @@ extern "C" fn process_safekeeper_feedback(wp: *mut WalProposer, sk: *mut Safekee } } -extern "C" fn log_internal( +extern "C-unwind" fn log_internal( wp: *mut WalProposer, level: ::std::os::raw::c_int, line: *const ::std::os::raw::c_char, diff --git a/pageserver/client/src/lib.rs b/pageserver/client/src/lib.rs index 4a3f4dea47..cc8db37173 100644 --- a/pageserver/client/src/lib.rs +++ b/pageserver/client/src/lib.rs @@ -1,2 +1,20 @@ pub mod mgmt_api; pub mod page_service; + +/// For timeline_block_unblock_gc, distinguish the two different operations. This could be a bool. +// If file structure is per-kind not per-feature then where to put this? 
+#[derive(Clone, Copy)] +pub enum BlockUnblock { + Block, + Unblock, +} + +impl std::fmt::Display for BlockUnblock { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + let s = match self { + BlockUnblock::Block => "block", + BlockUnblock::Unblock => "unblock", + }; + f.write_str(s) + } +} diff --git a/pageserver/client/src/mgmt_api.rs b/pageserver/client/src/mgmt_api.rs index 737cb00835..a68f45a6d9 100644 --- a/pageserver/client/src/mgmt_api.rs +++ b/pageserver/client/src/mgmt_api.rs @@ -12,6 +12,8 @@ use utils::{ pub use reqwest::Body as ReqwestBody; +use crate::BlockUnblock; + pub mod util; #[derive(Debug, Clone)] @@ -454,6 +456,20 @@ impl Client { .map_err(Error::ReceiveBody) } + pub async fn timeline_block_unblock_gc( + &self, + tenant_shard_id: TenantShardId, + timeline_id: TimelineId, + dir: BlockUnblock, + ) -> Result<()> { + let uri = format!( + "{}/v1/tenant/{tenant_shard_id}/timeline/{timeline_id}/{dir}_gc", + self.mgmt_api_endpoint, + ); + + self.request(Method::POST, &uri, ()).await.map(|_| ()) + } + pub async fn tenant_reset(&self, tenant_shard_id: TenantShardId) -> Result<()> { let uri = format!( "{}/v1/tenant/{}/reset", diff --git a/pageserver/ctl/src/main.rs b/pageserver/ctl/src/main.rs index 3b66b0c4aa..cf001ef0d5 100644 --- a/pageserver/ctl/src/main.rs +++ b/pageserver/ctl/src/main.rs @@ -174,7 +174,7 @@ async fn main() -> anyhow::Result<()> { println!("specified prefix '{}' failed validation", cmd.prefix); return Ok(()); }; - let toml_document = toml_edit::Document::from_str(&cmd.config_toml_str)?; + let toml_document = toml_edit::DocumentMut::from_str(&cmd.config_toml_str)?; let toml_item = toml_document .get("remote_storage") .expect("need remote_storage"); diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs index 2c60e8d7d1..d15a0e47a4 100644 --- a/pageserver/src/bin/pageserver.rs +++ b/pageserver/src/bin/pageserver.rs @@ -37,6 +37,7 @@ use pageserver::{ virtual_file, }; use postgres_backend::AuthType; +use utils::crashsafe::syncfs; use utils::failpoint_support; use utils::logging::TracingErrorLayerEnablement; use utils::{ @@ -125,7 +126,6 @@ fn main() -> anyhow::Result<()> { // after setting up logging, log the effective IO engine choice and read path implementations info!(?conf.virtual_file_io_engine, "starting with virtual_file IO engine"); info!(?conf.virtual_file_direct_io, "starting with virtual_file Direct IO settings"); - info!(?conf.compact_level0_phase1_value_access, "starting with setting for compact_level0_phase1_value_access"); info!(?conf.io_buffer_alignment, "starting with setting for IO buffer alignment"); // The tenants directory contains all the pageserver local disk state. @@ -156,23 +156,7 @@ fn main() -> anyhow::Result<()> { }; let started = Instant::now(); - // Linux guarantees durability for syncfs. - // POSIX doesn't have syncfs, and further does not actually guarantee durability of sync(). - #[cfg(target_os = "linux")] - { - use std::os::fd::AsRawFd; - nix::unistd::syncfs(dirfd.as_raw_fd()).context("syncfs")?; - } - #[cfg(target_os = "macos")] - { - // macOS is not a production platform for Neon, don't even bother. 
- drop(dirfd); - } - #[cfg(not(any(target_os = "linux", target_os = "macos")))] - { - compile_error!("Unsupported OS"); - } - + syncfs(dirfd)?; let elapsed = started.elapsed(); info!( elapsed_ms = elapsed.as_millis(), diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs index c159b66905..29a98855d3 100644 --- a/pageserver/src/config.rs +++ b/pageserver/src/config.rs @@ -174,16 +174,14 @@ pub struct PageServerConf { pub l0_flush: crate::l0_flush::L0FlushConfig, - /// This flag is temporary and will be removed after gradual rollout. - /// See . - pub compact_level0_phase1_value_access: pageserver_api::config::CompactL0Phase1ValueAccess, - /// Direct IO settings pub virtual_file_direct_io: virtual_file::DirectIoMode, pub io_buffer_alignment: usize, } +/// Token for authentication to safekeepers +/// /// We do not want to store this in a PageServerConf because the latter may be logged /// and/or serialized at a whim, while the token is secret. Currently this token is the /// same for accessing all tenants/timelines, but may become per-tenant/per-timeline in @@ -338,7 +336,7 @@ impl PageServerConf { max_vectored_read_bytes, image_compression, ephemeral_bytes_per_memory_kb, - compact_level0_phase1_value_access, + compact_level0_phase1_value_access: _, l0_flush, virtual_file_direct_io, concurrent_tenant_warmup, @@ -383,7 +381,6 @@ impl PageServerConf { max_vectored_read_bytes, image_compression, ephemeral_bytes_per_memory_kb, - compact_level0_phase1_value_access, virtual_file_direct_io, io_buffer_alignment, @@ -561,6 +558,16 @@ mod tests { .expect("parse_and_validate"); } + #[test] + fn test_compactl0_phase1_access_mode_is_ignored_silently() { + let input = indoc::indoc! {r#" + [compact_level0_phase1_value_access] + mode = "streaming-kmerge" + validate = "key-lsn-value" + "#}; + toml_edit::de::from_str::(input).unwrap(); + } + /// If there's a typo in the pageserver config, we'd rather catch that typo /// and fail pageserver startup than silently ignoring the typo, leaving whoever /// made it in the believe that their config change is effective. @@ -637,14 +644,5 @@ mod tests { // some_invalid_field = 23 // "#} // ); - - test!( - compact_level0_phase1_value_access, - indoc! {r#" - [compact_level0_phase1_value_access] - mode = "streaming-kmerge" - some_invalid_field = 23 - "#} - ); } } diff --git a/pageserver/src/context.rs b/pageserver/src/context.rs index 012cb8d96f..7afcf52cf2 100644 --- a/pageserver/src/context.rs +++ b/pageserver/src/context.rs @@ -1,7 +1,9 @@ -//! This module defines `RequestContext`, a structure that we use throughout -//! the pageserver to propagate high-level context from places -//! that _originate_ activity down to the shared code paths at the -//! heart of the pageserver. It's inspired by Golang's `context.Context`. +//! Defines [`RequestContext`]. +//! +//! It is a structure that we use throughout the pageserver to propagate +//! high-level context from places that _originate_ activity down to the +//! shared code paths at the heart of the pageserver. It's inspired by +//! Golang's `context.Context`. //! //! For example, in `Timeline::get(page_nr, lsn)` we need to answer the following questions: //! 1. What high-level activity ([`TaskKind`]) needs this page? 
diff --git a/pageserver/src/control_plane_client.rs b/pageserver/src/control_plane_client.rs index 56a536c387..f6d1c35a8c 100644 --- a/pageserver/src/control_plane_client.rs +++ b/pageserver/src/control_plane_client.rs @@ -141,10 +141,24 @@ impl ControlPlaneGenerationsApi for ControlPlaneClient { m.other ); - let az_id = m - .other - .get("availability_zone_id") - .and_then(|jv| jv.as_str().map(|str| str.to_owned())); + let az_id = { + let az_id_from_metadata = m + .other + .get("availability_zone_id") + .and_then(|jv| jv.as_str().map(|str| str.to_owned())); + + match az_id_from_metadata { + Some(az_id) => Some(az_id), + None => { + tracing::warn!("metadata.json does not contain an 'availability_zone_id' field"); + conf.availability_zone.clone() + } + } + }; + + if az_id.is_none() { + panic!("Availability zone id could not be inferred from metadata.json or pageserver config"); + } Some(NodeRegisterRequest { node_id: conf.id, @@ -152,7 +166,7 @@ impl ControlPlaneGenerationsApi for ControlPlaneClient { listen_pg_port: m.postgres_port, listen_http_addr: m.http_host, listen_http_port: m.http_port, - availability_zone_id: az_id, + availability_zone_id: az_id.expect("Checked above"), }) } Err(e) => { diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs index c4011d593c..9197505876 100644 --- a/pageserver/src/metrics.rs +++ b/pageserver/src/metrics.rs @@ -9,7 +9,7 @@ use metrics::{ use once_cell::sync::Lazy; use pageserver_api::shard::TenantShardId; use strum::{EnumCount, VariantNames}; -use strum_macros::{EnumVariantNames, IntoStaticStr}; +use strum_macros::{IntoStaticStr, VariantNames}; use tracing::warn; use utils::id::TimelineId; @@ -27,7 +27,7 @@ const CRITICAL_OP_BUCKETS: &[f64] = &[ ]; // Metrics collected on operations on the storage repository. -#[derive(Debug, EnumVariantNames, IntoStaticStr)] +#[derive(Debug, VariantNames, IntoStaticStr)] #[strum(serialize_all = "kebab_case")] pub(crate) enum StorageTimeOperation { #[strum(serialize = "layer flush")] diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index 39c6a6fb74..9261b7481d 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -1199,7 +1199,6 @@ impl PageServerHandler { } } -#[async_trait::async_trait] impl<IO> postgres_backend::Handler<IO> for PageServerHandler where IO: AsyncRead + AsyncWrite + Send + Sync + Unpin, diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs index d28a214265..6dd8851b13 100644 --- a/pageserver/src/pgdatadir_mapping.rs +++ b/pageserver/src/pgdatadir_mapping.rs @@ -1021,9 +1021,10 @@ impl Timeline { } /// DatadirModification represents an operation to ingest an atomic set of -/// updates to the repository. It is created by the 'begin_record' -/// function. It is called for each WAL record, so that all the modifications -/// by a one WAL record appear atomic. +/// updates to the repository. +/// +/// It is created by the 'begin_record' function. It is called for each WAL +/// record, so that all the modifications by one WAL record appear atomic. pub struct DatadirModification<'a> { /// The timeline this modification applies to.
You can access this to /// read the state, but note that any pending updates are *not* reflected @@ -1204,6 +1205,13 @@ impl<'a> DatadirModification<'a> { img: Bytes, ) -> anyhow::Result<()> { anyhow::ensure!(rel.relnode != 0, RelationError::InvalidRelnode); + let key = rel_block_to_key(rel, blknum); + if !key.is_valid_key_on_write_path() { + anyhow::bail!( + "the request contains data not supported by pageserver at {}", + key + ); + } self.put(rel_block_to_key(rel, blknum), Value::Image(img)); Ok(()) } @@ -1215,14 +1223,34 @@ impl<'a> DatadirModification<'a> { blknum: BlockNumber, img: Bytes, ) -> anyhow::Result<()> { - self.put(slru_block_to_key(kind, segno, blknum), Value::Image(img)); + let key = slru_block_to_key(kind, segno, blknum); + if !key.is_valid_key_on_write_path() { + anyhow::bail!( + "the request contains data not supported by pageserver at {}", + key + ); + } + self.put(key, Value::Image(img)); Ok(()) } - pub(crate) fn put_rel_page_image_zero(&mut self, rel: RelTag, blknum: BlockNumber) { - self.pending_zero_data_pages - .insert(rel_block_to_key(rel, blknum).to_compact()); + pub(crate) fn put_rel_page_image_zero( + &mut self, + rel: RelTag, + blknum: BlockNumber, + ) -> anyhow::Result<()> { + anyhow::ensure!(rel.relnode != 0, RelationError::InvalidRelnode); + let key = rel_block_to_key(rel, blknum); + if !key.is_valid_key_on_write_path() { + anyhow::bail!( + "the request contains data not supported by pageserver: {} @ {}", + key, + self.lsn + ); + } + self.pending_zero_data_pages.insert(key.to_compact()); self.pending_bytes += ZERO_PAGE.len(); + Ok(()) } pub(crate) fn put_slru_page_image_zero( @@ -1230,10 +1258,18 @@ impl<'a> DatadirModification<'a> { kind: SlruKind, segno: u32, blknum: BlockNumber, - ) { - self.pending_zero_data_pages - .insert(slru_block_to_key(kind, segno, blknum).to_compact()); + ) -> anyhow::Result<()> { + let key = slru_block_to_key(kind, segno, blknum); + if !key.is_valid_key_on_write_path() { + anyhow::bail!( + "the request contains data not supported by pageserver: {} @ {}", + key, + self.lsn + ); + } + self.pending_zero_data_pages.insert(key.to_compact()); self.pending_bytes += ZERO_PAGE.len(); + Ok(()) } /// Call this at the end of each WAL record. @@ -2048,6 +2084,7 @@ impl<'a> DatadirModification<'a> { /// This struct facilitates accessing either a committed key from the timeline at a /// specific LSN, or the latest uncommitted key from a pending modification. +/// /// During WAL ingestion, the records from multiple LSNs may be batched in the same /// modification before being flushed to the timeline. Hence, the routines in WalIngest /// need to look up the keys in the modification first before looking them up in the diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index fb30857ddf..c6f0e48101 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -1,8 +1,9 @@ +//! Timeline repository implementation that keeps old data in layer files, and +//! the recent changes in ephemeral files. //! -//! Timeline repository implementation that keeps old data in files on disk, and -//! the recent changes in memory. See tenant/*_layer.rs files. -//! The functions here are responsible for locating the correct layer for the -//! get/put call, walking back the timeline branching history as needed. +//! See tenant/*_layer.rs files. The functions here are responsible for locating +//! the correct layer for the get/put call, walking back the timeline branching +//! history as needed. //! //! 
The files are stored in the .neon/tenants//timelines/ //! directory. See docs/pageserver-storage.md for how the files are managed. @@ -7090,13 +7091,13 @@ mod tests { vec![ // Image layer at GC horizon PersistentLayerKey { - key_range: Key::MIN..Key::NON_L0_MAX, + key_range: Key::MIN..Key::MAX, lsn_range: Lsn(0x30)..Lsn(0x31), is_delta: false }, - // The delta layer covers the full range (with the layer key hack to avoid being recognized as L0) + // The delta layer below the horizon PersistentLayerKey { - key_range: Key::MIN..Key::NON_L0_MAX, + key_range: get_key(3)..get_key(4), lsn_range: Lsn(0x30)..Lsn(0x48), is_delta: true }, diff --git a/pageserver/src/tenant/config.rs b/pageserver/src/tenant/config.rs index 7e0344666b..547b43a399 100644 --- a/pageserver/src/tenant/config.rs +++ b/pageserver/src/tenant/config.rs @@ -452,7 +452,8 @@ impl TryFrom for TenantConfOpt { .map_err(|e| anyhow::anyhow!("{}: {}", e.path(), e.inner().message())); } toml_edit::Item::Table(table) => { - let deserializer = toml_edit::de::Deserializer::new(table.into()); + let deserializer = + toml_edit::de::Deserializer::from(toml_edit::DocumentMut::from(table)); return serde_path_to_error::deserialize(deserializer) .map_err(|e| anyhow::anyhow!("{}: {}", e.path(), e.inner().message())); } diff --git a/pageserver/src/tenant/metadata.rs b/pageserver/src/tenant/metadata.rs index 190316df42..24440d4b35 100644 --- a/pageserver/src/tenant/metadata.rs +++ b/pageserver/src/tenant/metadata.rs @@ -1,7 +1,8 @@ -//! Describes the legacy now hopefully no longer modified per-timeline metadata stored in -//! `index_part.json` managed by [`remote_timeline_client`]. For many tenants and their timelines, -//! this struct and it's original serialization format is still needed because they were written a -//! long time ago. +//! Describes the legacy now hopefully no longer modified per-timeline metadata. +//! +//! It is stored in `index_part.json` managed by [`remote_timeline_client`]. For many tenants and +//! their timelines, this struct and its original serialization format is still needed because +//! they were written a long time ago. //! //! Instead of changing and adding versioning to this, just change [`IndexPart`] with soft json //! versioning. diff --git a/pageserver/src/tenant/mgr.rs b/pageserver/src/tenant/mgr.rs index 4e6ea0c8f9..2104f41531 100644 --- a/pageserver/src/tenant/mgr.rs +++ b/pageserver/src/tenant/mgr.rs @@ -282,9 +282,10 @@ impl BackgroundPurges { static TENANTS: Lazy> = Lazy::new(|| std::sync::RwLock::new(TenantsMap::Initializing)); -/// The TenantManager is responsible for storing and mutating the collection of all tenants -/// that this pageserver process has state for. Every Tenant and SecondaryTenant instance -/// lives inside the TenantManager. +/// Responsible for storing and mutating the collection of all tenants +/// that this pageserver has state for. +/// +/// Every Tenant and SecondaryTenant instance lives inside the TenantManager. /// /// The most important role of the TenantManager is to prevent conflicts: e.g. trying to attach /// the same tenant twice concurrently, or trying to configure the same tenant into secondary @@ -2346,8 +2347,9 @@ pub enum TenantMapError { ShuttingDown, } -/// Guards a particular tenant_id's content in the TenantsMap. While this -/// structure exists, the TenantsMap will contain a [`TenantSlot::InProgress`] +/// Guards a particular tenant_id's content in the TenantsMap. 
+/// +/// While this structure exists, the TenantsMap will contain a [`TenantSlot::InProgress`] /// for this tenant, which acts as a marker for any operations targeting /// this tenant to retry later, or wait for the InProgress state to end. /// diff --git a/pageserver/src/tenant/remote_timeline_client.rs b/pageserver/src/tenant/remote_timeline_client.rs index 71b766e4c7..1f9ae40af5 100644 --- a/pageserver/src/tenant/remote_timeline_client.rs +++ b/pageserver/src/tenant/remote_timeline_client.rs @@ -2184,6 +2184,8 @@ pub fn remote_timeline_path( remote_timelines_path(tenant_shard_id).join(Utf8Path::new(&timeline_id.to_string())) } +/// Obtains the path of the given Layer in the remote +/// /// Note that the shard component of a remote layer path is _not_ always the same /// as in the TenantShardId of the caller: tenants may reference layers from a different /// ShardIndex. Use the ShardIndex from the layer's metadata. diff --git a/pageserver/src/tenant/remote_timeline_client/download.rs b/pageserver/src/tenant/remote_timeline_client/download.rs index d9725ad756..9fbe2f0da5 100644 --- a/pageserver/src/tenant/remote_timeline_client/download.rs +++ b/pageserver/src/tenant/remote_timeline_client/download.rs @@ -548,7 +548,7 @@ pub(crate) async fn download_initdb_tar_zst( cancel, ) .await - .map_err(|e| { + .inspect_err(|_e| { // Do a best-effort attempt at deleting the temporary file upon encountering an error. // We don't have async here nor do we want to pile on any extra errors. if let Err(e) = std::fs::remove_file(&temp_path) { @@ -556,7 +556,6 @@ pub(crate) async fn download_initdb_tar_zst( warn!("error deleting temporary file {temp_path}: {e}"); } } - e })?; Ok((temp_path, file)) diff --git a/pageserver/src/tenant/remote_timeline_client/index.rs b/pageserver/src/tenant/remote_timeline_client/index.rs index 757fb9d032..c51ff54919 100644 --- a/pageserver/src/tenant/remote_timeline_client/index.rs +++ b/pageserver/src/tenant/remote_timeline_client/index.rs @@ -1,4 +1,5 @@ //! In-memory index to track the tenant files on the remote storage. +//! //! Able to restore itself from the storage index parts, that are located in every timeline's remote directory and contain all data about //! remote timeline layers and its metadata. diff --git a/pageserver/src/tenant/storage_layer.rs b/pageserver/src/tenant/storage_layer.rs index a1202ad507..dac6b2f893 100644 --- a/pageserver/src/tenant/storage_layer.rs +++ b/pageserver/src/tenant/storage_layer.rs @@ -434,10 +434,11 @@ impl ReadableLayer { } } -/// Layers contain a hint indicating whether they are likely to be used for reads. This is a hint rather -/// than an authoritative value, so that we do not have to update it synchronously when changing the visibility -/// of layers (for example when creating a branch that makes some previously covered layers visible). It should -/// be used for cache management but not for correctness-critical checks. +/// Layers contain a hint indicating whether they are likely to be used for reads. +/// +/// This is a hint rather than an authoritative value, so that we do not have to update it synchronously +/// when changing the visibility of layers (for example when creating a branch that makes some previously +/// covered layers visible). It should be used for cache management but not for correctness-critical checks. 
#[derive(Debug, Clone, PartialEq, Eq)] pub enum LayerVisibilityHint { /// A Visible layer might be read while serving a read, because there is not an image layer between it diff --git a/pageserver/src/tenant/storage_layer/delta_layer.rs b/pageserver/src/tenant/storage_layer/delta_layer.rs index 6a2cd94232..34f1b15138 100644 --- a/pageserver/src/tenant/storage_layer/delta_layer.rs +++ b/pageserver/src/tenant/storage_layer/delta_layer.rs @@ -136,10 +136,11 @@ impl Summary { // Flag indicating that this version initialize the page const WILL_INIT: u64 = 1; -/// Struct representing reference to BLOB in layers. Reference contains BLOB -/// offset, and for WAL records it also contains `will_init` flag. The flag -/// helps to determine the range of records that needs to be applied, without -/// reading/deserializing records themselves. +/// Struct representing reference to BLOB in layers. +/// +/// Reference contains BLOB offset, and for WAL records it also contains +/// `will_init` flag. The flag helps to determine the range of records +/// that needs to be applied, without reading/deserializing records themselves. #[derive(Debug, Serialize, Deserialize, Copy, Clone)] pub struct BlobRef(pub u64); diff --git a/pageserver/src/tenant/storage_layer/image_layer.rs b/pageserver/src/tenant/storage_layer/image_layer.rs index 77ce1ae670..875e223c9c 100644 --- a/pageserver/src/tenant/storage_layer/image_layer.rs +++ b/pageserver/src/tenant/storage_layer/image_layer.rs @@ -1,7 +1,9 @@ //! An ImageLayer represents an image or a snapshot of a key-range at -//! one particular LSN. It contains an image of all key-value pairs -//! in its key-range. Any key that falls into the image layer's range -//! but does not exist in the layer, does not exist. +//! one particular LSN. +//! +//! It contains an image of all key-value pairs in its key-range. Any key +//! that falls into the image layer's range but does not exist in the layer, +//! does not exist. //! //! An image layer is stored in a file on disk. The file is stored in //! timelines/ directory. Currently, there are no diff --git a/pageserver/src/tenant/storage_layer/layer_desc.rs b/pageserver/src/tenant/storage_layer/layer_desc.rs index cbd18e650f..e90ff3c4b2 100644 --- a/pageserver/src/tenant/storage_layer/layer_desc.rs +++ b/pageserver/src/tenant/storage_layer/layer_desc.rs @@ -12,8 +12,10 @@ use serde::{Deserialize, Serialize}; #[cfg(test)] use utils::id::TenantId; -/// A unique identifier of a persistent layer. This is different from `LayerDescriptor`, which is only used in the -/// benchmarks. This struct contains all necessary information to find the image / delta layer. It also provides +/// A unique identifier of a persistent layer. +/// +/// This is different from `LayerDescriptor`, which is only used in the benchmarks. +/// This struct contains all necessary information to find the image / delta layer. It also provides /// a unified way to generate layer information like file name. #[derive(Debug, PartialEq, Eq, Clone, Serialize, Deserialize, Hash)] pub struct PersistentLayerDesc { diff --git a/pageserver/src/tenant/storage_layer/layer_name.rs b/pageserver/src/tenant/storage_layer/layer_name.rs index 47ae556279..ffe7ca5f3e 100644 --- a/pageserver/src/tenant/storage_layer/layer_name.rs +++ b/pageserver/src/tenant/storage_layer/layer_name.rs @@ -217,8 +217,9 @@ impl fmt::Display for ImageLayerName { } } -/// LayerName is the logical identity of a layer within a LayerMap at a moment in time. 
The -/// LayerName is not a unique filename, as the same LayerName may have multiple physical incarnations +/// LayerName is the logical identity of a layer within a LayerMap at a moment in time. +/// +/// The LayerName is not a unique filename, as the same LayerName may have multiple physical incarnations /// over time (e.g. across shard splits or compression). The physical filenames of layers in local /// storage and object names in remote storage consist of the LayerName plus some extra qualifiers /// that uniquely identify the physical incarnation of a layer (see [crate::tenant::remote_timeline_client::remote_layer_path]) diff --git a/pageserver/src/tenant/storage_layer/merge_iterator.rs b/pageserver/src/tenant/storage_layer/merge_iterator.rs index d2c341e5ce..0831fd9530 100644 --- a/pageserver/src/tenant/storage_layer/merge_iterator.rs +++ b/pageserver/src/tenant/storage_layer/merge_iterator.rs @@ -226,9 +226,11 @@ impl<'a> IteratorWrapper<'a> { } } -/// A merge iterator over delta/image layer iterators. When duplicated records are -/// found, the iterator will not perform any deduplication, and the caller should handle -/// these situation. By saying duplicated records, there are many possibilities: +/// A merge iterator over delta/image layer iterators. +/// +/// When duplicated records are found, the iterator will not perform any +/// deduplication, and the caller should handle these situations. By saying +/// duplicated records, there are many possibilities: /// /// * Two same delta at the same LSN. /// * Two same image at the same LSN. diff --git a/pageserver/src/tenant/storage_layer/split_writer.rs b/pageserver/src/tenant/storage_layer/split_writer.rs index e8deb0a1e5..40a6a77a50 100644 --- a/pageserver/src/tenant/storage_layer/split_writer.rs +++ b/pageserver/src/tenant/storage_layer/split_writer.rs @@ -34,9 +34,10 @@ impl SplitWriterResult { } } -/// An image writer that takes images and produces multiple image layers. The interface does not -/// guarantee atomicity (i.e., if the image layer generation fails, there might be leftover files -/// to be cleaned up) +/// An image writer that takes images and produces multiple image layers. +/// +/// The interface does not guarantee atomicity (i.e., if the image layer generation +/// fails, there might be leftover files to be cleaned up) #[must_use] pub struct SplitImageLayerWriter { inner: ImageLayerWriter, @@ -187,22 +188,23 @@ impl SplitImageLayerWriter { .await } - /// When split writer fails, the caller should call this function and handle partially generated layers. + /// This function will be deprecated with #8841. pub(crate) fn take(self) -> anyhow::Result<(Vec<SplitWriterResult>, ImageLayerWriter)> { Ok((self.generated_layers, self.inner)) } } -/// A delta writer that takes key-lsn-values and produces multiple delta layers. The interface does not -/// guarantee atomicity (i.e., if the delta layer generation fails, there might be leftover files -/// to be cleaned up). +/// A delta writer that takes key-lsn-values and produces multiple delta layers. +/// +/// The interface does not guarantee atomicity (i.e., if the delta layer generation fails, +/// there might be leftover files to be cleaned up). /// /// Note that if updates of a single key exceed the target size limit, all of the updates will be batched /// into a single file. This behavior might change in the future. For reference, the legacy compaction algorithm /// will split them into multiple files based on size.
#[must_use] pub struct SplitDeltaLayerWriter { - inner: DeltaLayerWriter, + inner: Option<(Key, DeltaLayerWriter)>, target_layer_size: u64, generated_layers: Vec, conf: &'static PageServerConf, @@ -210,7 +212,6 @@ pub struct SplitDeltaLayerWriter { tenant_shard_id: TenantShardId, lsn_range: Range, last_key_written: Key, - start_key: Key, } impl SplitDeltaLayerWriter { @@ -218,29 +219,18 @@ impl SplitDeltaLayerWriter { conf: &'static PageServerConf, timeline_id: TimelineId, tenant_shard_id: TenantShardId, - start_key: Key, lsn_range: Range, target_layer_size: u64, - ctx: &RequestContext, ) -> anyhow::Result { Ok(Self { target_layer_size, - inner: DeltaLayerWriter::new( - conf, - timeline_id, - tenant_shard_id, - start_key, - lsn_range.clone(), - ctx, - ) - .await?, + inner: None, generated_layers: Vec::new(), conf, timeline_id, tenant_shard_id, lsn_range, last_key_written: Key::MIN, - start_key, }) } @@ -263,9 +253,26 @@ impl SplitDeltaLayerWriter { // // Also, keep all updates of a single key in a single file. TODO: split them using the legacy compaction // strategy. https://github.com/neondatabase/neon/issues/8837 + + if self.inner.is_none() { + self.inner = Some(( + key, + DeltaLayerWriter::new( + self.conf, + self.timeline_id, + self.tenant_shard_id, + key, + self.lsn_range.clone(), + ctx, + ) + .await?, + )); + } + let (_, inner) = self.inner.as_mut().unwrap(); + let addition_size_estimation = KEY_SIZE as u64 + 8 /* LSN u64 size */ + 80 /* value size estimation */; - if self.inner.num_keys() >= 1 - && self.inner.estimated_size() + addition_size_estimation >= self.target_layer_size + if inner.num_keys() >= 1 + && inner.estimated_size() + addition_size_estimation >= self.target_layer_size { if key != self.last_key_written { let next_delta_writer = DeltaLayerWriter::new( @@ -277,13 +284,13 @@ impl SplitDeltaLayerWriter { ctx, ) .await?; - let prev_delta_writer = std::mem::replace(&mut self.inner, next_delta_writer); + let (start_key, prev_delta_writer) = + std::mem::replace(&mut self.inner, Some((key, next_delta_writer))).unwrap(); let layer_key = PersistentLayerKey { - key_range: self.start_key..key, + key_range: start_key..key, lsn_range: self.lsn_range.clone(), is_delta: true, }; - self.start_key = key; if discard(&layer_key).await { drop(prev_delta_writer); self.generated_layers @@ -294,17 +301,18 @@ impl SplitDeltaLayerWriter { self.generated_layers .push(SplitWriterResult::Produced(delta_layer)); } - } else if self.inner.estimated_size() >= S3_UPLOAD_LIMIT { + } else if inner.estimated_size() >= S3_UPLOAD_LIMIT { // We have to produce a very large file b/c a key is updated too often. anyhow::bail!( "a single key is updated too often: key={}, estimated_size={}, and the layer file cannot be produced", key, - self.inner.estimated_size() + inner.estimated_size() ); } } self.last_key_written = key; - self.inner.put_value(key, lsn, val, ctx).await + let (_, inner) = self.inner.as_mut().unwrap(); + inner.put_value(key, lsn, val, ctx).await } pub async fn put_value( @@ -323,7 +331,6 @@ impl SplitDeltaLayerWriter { self, tline: &Arc, ctx: &RequestContext, - end_key: Key, discard: D, ) -> anyhow::Result> where @@ -335,11 +342,15 @@ impl SplitDeltaLayerWriter { inner, .. 
} = self; + let Some((start_key, inner)) = inner else { + return Ok(generated_layers); + }; if inner.num_keys() == 0 { return Ok(generated_layers); } + let end_key = self.last_key_written.next(); let layer_key = PersistentLayerKey { - key_range: self.start_key..end_key, + key_range: start_key..end_key, lsn_range: self.lsn_range.clone(), is_delta: true, }; @@ -358,15 +369,14 @@ impl SplitDeltaLayerWriter { self, tline: &Arc, ctx: &RequestContext, - end_key: Key, ) -> anyhow::Result> { - self.finish_with_discard_fn(tline, ctx, end_key, |_| async { false }) + self.finish_with_discard_fn(tline, ctx, |_| async { false }) .await } - /// When split writer fails, the caller should call this function and handle partially generated layers. - pub(crate) fn take(self) -> anyhow::Result<(Vec, DeltaLayerWriter)> { - Ok((self.generated_layers, self.inner)) + /// This function will be deprecated with #8841. + pub(crate) fn take(self) -> anyhow::Result<(Vec, Option)> { + Ok((self.generated_layers, self.inner.map(|x| x.1))) } } @@ -430,10 +440,8 @@ mod tests { tenant.conf, tline.timeline_id, tenant.tenant_shard_id, - get_key(0), Lsn(0x18)..Lsn(0x20), 4 * 1024 * 1024, - &ctx, ) .await .unwrap(); @@ -458,11 +466,22 @@ mod tests { ) .await .unwrap(); - let layers = delta_writer - .finish(&tline, &ctx, get_key(10)) - .await - .unwrap(); + let layers = delta_writer.finish(&tline, &ctx).await.unwrap(); assert_eq!(layers.len(), 1); + assert_eq!( + layers + .into_iter() + .next() + .unwrap() + .into_resident_layer() + .layer_desc() + .key(), + PersistentLayerKey { + key_range: get_key(0)..get_key(1), + lsn_range: Lsn(0x18)..Lsn(0x20), + is_delta: true + } + ); } #[tokio::test] @@ -499,10 +518,8 @@ mod tests { tenant.conf, tline.timeline_id, tenant.tenant_shard_id, - get_key(0), Lsn(0x18)..Lsn(0x20), 4 * 1024 * 1024, - &ctx, ) .await .unwrap(); @@ -531,10 +548,7 @@ mod tests { .finish(&tline, &ctx, get_key(N as u32)) .await .unwrap(); - let delta_layers = delta_writer - .finish(&tline, &ctx, get_key(N as u32)) - .await - .unwrap(); + let delta_layers = delta_writer.finish(&tline, &ctx).await.unwrap(); if discard { for layer in image_layers { layer.into_discarded_layer(); @@ -553,6 +567,14 @@ mod tests { .collect_vec(); assert_eq!(image_layers.len(), N / 512 + 1); assert_eq!(delta_layers.len(), N / 512 + 1); + assert_eq!( + delta_layers.first().unwrap().layer_desc().key_range.start, + get_key(0) + ); + assert_eq!( + delta_layers.last().unwrap().layer_desc().key_range.end, + get_key(N as u32) + ); for idx in 0..image_layers.len() { assert_ne!(image_layers[idx].layer_desc().key_range.start, Key::MIN); assert_ne!(image_layers[idx].layer_desc().key_range.end, Key::MAX); @@ -600,10 +622,8 @@ mod tests { tenant.conf, tline.timeline_id, tenant.tenant_shard_id, - get_key(0), Lsn(0x18)..Lsn(0x20), 4 * 1024, - &ctx, ) .await .unwrap(); @@ -642,11 +662,35 @@ mod tests { ) .await .unwrap(); - let layers = delta_writer - .finish(&tline, &ctx, get_key(10)) - .await - .unwrap(); + let layers = delta_writer.finish(&tline, &ctx).await.unwrap(); assert_eq!(layers.len(), 2); + let mut layers_iter = layers.into_iter(); + assert_eq!( + layers_iter + .next() + .unwrap() + .into_resident_layer() + .layer_desc() + .key(), + PersistentLayerKey { + key_range: get_key(0)..get_key(1), + lsn_range: Lsn(0x18)..Lsn(0x20), + is_delta: true + } + ); + assert_eq!( + layers_iter + .next() + .unwrap() + .into_resident_layer() + .layer_desc() + .key(), + PersistentLayerKey { + key_range: get_key(1)..get_key(2), + lsn_range: Lsn(0x18)..Lsn(0x20), + 
is_delta: true + } + ); } #[tokio::test] @@ -666,10 +710,8 @@ mod tests { tenant.conf, tline.timeline_id, tenant.tenant_shard_id, - get_key(0), Lsn(0x10)..Lsn(N as u64 * 16 + 0x10), 4 * 1024 * 1024, - &ctx, ) .await .unwrap(); @@ -687,10 +729,20 @@ mod tests { .await .unwrap(); } - let delta_layers = delta_writer - .finish(&tline, &ctx, get_key(N as u32)) - .await - .unwrap(); + let delta_layers = delta_writer.finish(&tline, &ctx).await.unwrap(); assert_eq!(delta_layers.len(), 1); + let delta_layer = delta_layers + .into_iter() + .next() + .unwrap() + .into_resident_layer(); + assert_eq!( + delta_layer.layer_desc().key(), + PersistentLayerKey { + key_range: get_key(0)..get_key(1), + lsn_range: Lsn(0x10)..Lsn(N as u64 * 16 + 0x10), + is_delta: true + } + ); } } diff --git a/pageserver/src/tenant/timeline/compaction.rs b/pageserver/src/tenant/timeline/compaction.rs index 6b9c8386f7..0b5c520ba7 100644 --- a/pageserver/src/tenant/timeline/compaction.rs +++ b/pageserver/src/tenant/timeline/compaction.rs @@ -19,7 +19,6 @@ use bytes::Bytes; use enumset::EnumSet; use fail::fail_point; use itertools::Itertools; -use pageserver_api::config::{CompactL0BypassPageCacheValidation, CompactL0Phase1ValueAccess}; use pageserver_api::key::KEY_SIZE; use pageserver_api::keyspace::ShardedRange; use pageserver_api::shard::{ShardCount, ShardIdentity, TenantShardId}; @@ -912,137 +911,13 @@ impl Timeline { // we're compacting, in key, LSN order. // If there's both a Value::Image and Value::WalRecord for the same (key,lsn), // then the Value::Image is ordered before Value::WalRecord. - // - // TODO(https://github.com/neondatabase/neon/issues/8184): remove the page cached blob_io - // option and validation code once we've reached confidence. - enum AllValuesIter<'a> { - PageCachedBlobIo { - all_keys_iter: VecIter<'a>, - }, - StreamingKmergeBypassingPageCache { - merge_iter: MergeIterator<'a>, - }, - ValidatingStreamingKmergeBypassingPageCache { - mode: CompactL0BypassPageCacheValidation, - merge_iter: MergeIterator<'a>, - all_keys_iter: VecIter<'a>, - }, - } - type VecIter<'a> = std::slice::Iter<'a, DeltaEntry<'a>>; // TODO: distinguished lifetimes - impl AllValuesIter<'_> { - async fn next_all_keys_iter( - iter: &mut VecIter<'_>, - ctx: &RequestContext, - ) -> anyhow::Result> { - let Some(DeltaEntry { - key, - lsn, - val: value_ref, - .. - }) = iter.next() - else { - return Ok(None); - }; - let value = value_ref.load(ctx).await?; - Ok(Some((*key, *lsn, value))) - } - async fn next( - &mut self, - ctx: &RequestContext, - ) -> anyhow::Result> { - match self { - AllValuesIter::PageCachedBlobIo { all_keys_iter: iter } => { - Self::next_all_keys_iter(iter, ctx).await - } - AllValuesIter::StreamingKmergeBypassingPageCache { merge_iter } => merge_iter.next().await, - AllValuesIter::ValidatingStreamingKmergeBypassingPageCache { mode, merge_iter, all_keys_iter } => async { - // advance both iterators - let all_keys_iter_item = Self::next_all_keys_iter(all_keys_iter, ctx).await; - let merge_iter_item = merge_iter.next().await; - // compare results & log warnings as needed - macro_rules! 
rate_limited_warn { - ($($arg:tt)*) => {{ - if cfg!(debug_assertions) || cfg!(feature = "testing") { - warn!($($arg)*); - panic!("CompactL0BypassPageCacheValidation failure, check logs"); - } - use once_cell::sync::Lazy; - use utils::rate_limit::RateLimit; - use std::sync::Mutex; - use std::time::Duration; - static LOGGED: Lazy> = - Lazy::new(|| Mutex::new(RateLimit::new(Duration::from_secs(10)))); - let mut rate_limit = LOGGED.lock().unwrap(); - rate_limit.call(|| { - warn!($($arg)*); - }); - }} - } - match (&all_keys_iter_item, &merge_iter_item) { - (Err(_), Err(_)) => { - // don't bother asserting equivality of the errors - } - (Err(all_keys), Ok(merge)) => { - rate_limited_warn!(?merge, "all_keys_iter returned an error where merge did not: {all_keys:?}"); - }, - (Ok(all_keys), Err(merge)) => { - rate_limited_warn!(?all_keys, "merge returned an error where all_keys_iter did not: {merge:?}"); - }, - (Ok(None), Ok(None)) => { } - (Ok(Some(all_keys)), Ok(None)) => { - rate_limited_warn!(?all_keys, "merge returned None where all_keys_iter returned Some"); - } - (Ok(None), Ok(Some(merge))) => { - rate_limited_warn!(?merge, "all_keys_iter returned None where merge returned Some"); - } - (Ok(Some((all_keys_key, all_keys_lsn, all_keys_value))), Ok(Some((merge_key, merge_lsn, merge_value)))) => { - match mode { - // TODO: in this mode, we still load the value from disk for both iterators, even though we only need the all_keys_iter one - CompactL0BypassPageCacheValidation::KeyLsn => { - let all_keys = (all_keys_key, all_keys_lsn); - let merge = (merge_key, merge_lsn); - if all_keys != merge { - rate_limited_warn!(?all_keys, ?merge, "merge returned a different (Key,LSN) than all_keys_iter"); - } - } - CompactL0BypassPageCacheValidation::KeyLsnValue => { - let all_keys = (all_keys_key, all_keys_lsn, all_keys_value); - let merge = (merge_key, merge_lsn, merge_value); - if all_keys != merge { - rate_limited_warn!(?all_keys, ?merge, "merge returned a different (Key,LSN,Value) than all_keys_iter"); - } - } - } - } - } - // in case of mismatch, trust the legacy all_keys_iter_item - all_keys_iter_item - }.instrument(info_span!("next")).await - } - } - } - let mut all_values_iter = match &self.conf.compact_level0_phase1_value_access { - CompactL0Phase1ValueAccess::PageCachedBlobIo => AllValuesIter::PageCachedBlobIo { - all_keys_iter: all_keys.iter(), - }, - CompactL0Phase1ValueAccess::StreamingKmerge { validate } => { - let merge_iter = { - let mut deltas = Vec::with_capacity(deltas_to_compact.len()); - for l in deltas_to_compact.iter() { - let l = l.get_as_delta(ctx).await.map_err(CompactionError::Other)?; - deltas.push(l); - } - MergeIterator::create(&deltas, &[], ctx) - }; - match validate { - None => AllValuesIter::StreamingKmergeBypassingPageCache { merge_iter }, - Some(validate) => AllValuesIter::ValidatingStreamingKmergeBypassingPageCache { - mode: validate.clone(), - merge_iter, - all_keys_iter: all_keys.iter(), - }, - } + let mut all_values_iter = { + let mut deltas = Vec::with_capacity(deltas_to_compact.len()); + for l in deltas_to_compact.iter() { + let l = l.get_as_delta(ctx).await.map_err(CompactionError::Other)?; + deltas.push(l); } + MergeIterator::create(&deltas, &[], ctx) }; // This iterator walks through all keys and is needed to calculate size used by each key @@ -1119,7 +994,7 @@ impl Timeline { let mut keys = 0; while let Some((key, lsn, value)) = all_values_iter - .next(ctx) + .next() .await .map_err(CompactionError::Other)? 
{ @@ -1934,7 +1809,6 @@ impl Timeline { .unwrap(); // We don't want any of the produced layers to cover the full key range (i.e., MIN..MAX) b/c it will then be recognized // as an L0 layer. - let hack_end_key = Key::NON_L0_MAX; let mut delta_layers = Vec::new(); let mut image_layers = Vec::new(); let mut downloaded_layers = Vec::new(); @@ -1980,10 +1854,8 @@ impl Timeline { self.conf, self.timeline_id, self.tenant_shard_id, - Key::MIN, lowest_retain_lsn..end_lsn, self.get_compaction_target_size(), - ctx, ) .await?; @@ -2090,7 +1962,7 @@ impl Timeline { let produced_image_layers = if let Some(writer) = image_layer_writer { if !dry_run { writer - .finish_with_discard_fn(self, ctx, hack_end_key, discard) + .finish_with_discard_fn(self, ctx, Key::MAX, discard) .await? } else { let (layers, _) = writer.take()?; @@ -2103,7 +1975,7 @@ impl Timeline { let produced_delta_layers = if !dry_run { delta_layer_writer - .finish_with_discard_fn(self, ctx, hack_end_key, discard) + .finish_with_discard_fn(self, ctx, discard) .await? } else { let (layers, _) = delta_layer_writer.take()?; diff --git a/pageserver/src/tenant/vectored_blob_io.rs b/pageserver/src/tenant/vectored_blob_io.rs index 4d51dc442d..553edf6d8b 100644 --- a/pageserver/src/tenant/vectored_blob_io.rs +++ b/pageserver/src/tenant/vectored_blob_io.rs @@ -593,8 +593,10 @@ impl<'a> VectoredBlobReader<'a> { } } -/// Read planner used in [`crate::tenant::storage_layer::image_layer::ImageLayerIterator`]. It provides a streaming API for -/// getting read blobs. It returns a batch when `handle` gets called and when the current key would just exceed the read_size and +/// Read planner used in [`crate::tenant::storage_layer::image_layer::ImageLayerIterator`]. +/// +/// It provides a streaming API for getting read blobs. It returns a batch when +/// `handle` gets called and when the current key would just exceed the read_size and /// max_cnt constraints. pub struct StreamingVectoredReadPlanner { read_builder: Option, diff --git a/pageserver/src/virtual_file.rs b/pageserver/src/virtual_file.rs index ed6ff86c10..57856eea80 100644 --- a/pageserver/src/virtual_file.rs +++ b/pageserver/src/virtual_file.rs @@ -1,6 +1,7 @@ -//! //! VirtualFile is like a normal File, but it's not bound directly to -//! a file descriptor. Instead, the file is opened when it's read from, +//! a file descriptor. +//! +//! Instead, the file is opened when it's read from, //! and if too many files are open globally in the system, least-recently //! used ones are closed. //! 
diff --git a/pageserver/src/walingest.rs b/pageserver/src/walingest.rs index 2d3841881b..6e15ad81c3 100644 --- a/pageserver/src/walingest.rs +++ b/pageserver/src/walingest.rs @@ -25,9 +25,7 @@ use std::time::Duration; use std::time::SystemTime; use pageserver_api::shard::ShardIdentity; -use postgres_ffi::v14::nonrelfile_utils::clogpage_precedes; -use postgres_ffi::v14::nonrelfile_utils::slru_may_delete_clogsegment; -use postgres_ffi::TimestampTz; +use postgres_ffi::{dispatch_pgversion, enum_pgversion, enum_pgversion_dispatch, TimestampTz}; use postgres_ffi::{fsm_logical_to_physical, page_is_new, page_set_lsn}; use anyhow::{bail, Context, Result}; @@ -48,16 +46,31 @@ use pageserver_api::key::rel_block_to_key; use pageserver_api::reltag::{BlockNumber, RelTag, SlruKind}; use postgres_ffi::pg_constants; use postgres_ffi::relfile_utils::{FSM_FORKNUM, INIT_FORKNUM, MAIN_FORKNUM, VISIBILITYMAP_FORKNUM}; -use postgres_ffi::v14::nonrelfile_utils::mx_offset_to_member_segment; -use postgres_ffi::v14::xlog_utils::*; -use postgres_ffi::v14::CheckPoint; use postgres_ffi::TransactionId; use postgres_ffi::BLCKSZ; +use utils::bin_ser::SerializeError; use utils::lsn::Lsn; +enum_pgversion! {CheckPoint, pgv::CheckPoint} + +impl CheckPoint { + fn encode(&self) -> Result { + enum_pgversion_dispatch!(self, CheckPoint, cp, { cp.encode() }) + } + + fn update_next_xid(&mut self, xid: u32) -> bool { + enum_pgversion_dispatch!(self, CheckPoint, cp, { cp.update_next_xid(xid) }) + } + + pub fn update_next_multixid(&mut self, multi_xid: u32, multi_offset: u32) -> bool { + enum_pgversion_dispatch!(self, CheckPoint, cp, { + cp.update_next_multixid(multi_xid, multi_offset) + }) + } +} + pub struct WalIngest { shard: ShardIdentity, - pg_version: u32, checkpoint: CheckPoint, checkpoint_modified: bool, warn_ingest_lag: WarnIngestLag, @@ -78,12 +91,16 @@ impl WalIngest { // Fetch the latest checkpoint into memory, so that we can compare with it // quickly in `ingest_record` and update it when it changes. let checkpoint_bytes = timeline.get_checkpoint(startpoint, ctx).await?; - let checkpoint = CheckPoint::decode(&checkpoint_bytes)?; - trace!("CheckPoint.nextXid = {}", checkpoint.nextXid.value); + let pgversion = timeline.pg_version; + + let checkpoint = dispatch_pgversion!(pgversion, { + let checkpoint = pgv::CheckPoint::decode(&checkpoint_bytes)?; + trace!("CheckPoint.nextXid = {}", checkpoint.nextXid.value); + >::into(checkpoint) + }); Ok(WalIngest { shard: *timeline.get_shard_identity(), - pg_version: timeline.pg_version, checkpoint, checkpoint_modified: false, warn_ingest_lag: WarnIngestLag { @@ -117,7 +134,7 @@ impl WalIngest { modification.set_lsn(lsn)?; - if decoded.is_dbase_create_copy(self.pg_version) { + if decoded.is_dbase_create_copy(pg_version) { // Records of this type should always be preceded by a commit(), as they // rely on reading data pages back from the Timeline. 
assert!(!modification.has_dirty_data_pages()); @@ -337,70 +354,67 @@ impl WalIngest { pg_constants::RM_XLOG_ID => { let info = decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK; - if info == pg_constants::XLOG_NEXTOID { - let next_oid = buf.get_u32_le(); - if self.checkpoint.nextOid != next_oid { - self.checkpoint.nextOid = next_oid; + enum_pgversion_dispatch!(&mut self.checkpoint, CheckPoint, cp, { + if info == pg_constants::XLOG_NEXTOID { + let next_oid = buf.get_u32_le(); + if cp.nextOid != next_oid { + cp.nextOid = next_oid; + self.checkpoint_modified = true; + } + } else if info == pg_constants::XLOG_CHECKPOINT_ONLINE + || info == pg_constants::XLOG_CHECKPOINT_SHUTDOWN + { + let mut checkpoint_bytes = [0u8; pgv::xlog_utils::SIZEOF_CHECKPOINT]; + buf.copy_to_slice(&mut checkpoint_bytes); + let xlog_checkpoint = pgv::CheckPoint::decode(&checkpoint_bytes)?; + trace!( + "xlog_checkpoint.oldestXid={}, checkpoint.oldestXid={}", + xlog_checkpoint.oldestXid, + cp.oldestXid + ); + if (cp.oldestXid.wrapping_sub(xlog_checkpoint.oldestXid) as i32) < 0 { + cp.oldestXid = xlog_checkpoint.oldestXid; + } + trace!( + "xlog_checkpoint.oldestActiveXid={}, checkpoint.oldestActiveXid={}", + xlog_checkpoint.oldestActiveXid, + cp.oldestActiveXid + ); + + // A shutdown checkpoint has `oldestActiveXid == InvalidTransactionid`, + // because at shutdown, all in-progress transactions will implicitly + // end. Postgres startup code knows that, and allows hot standby to start + // immediately from a shutdown checkpoint. + // + // In Neon, Postgres hot standby startup always behaves as if starting from + // an online checkpoint. It needs a valid `oldestActiveXid` value, so + // instead of overwriting self.checkpoint.oldestActiveXid with + // InvalidTransactionid from the checkpoint WAL record, update it to a + // proper value, knowing that there are no in-progress transactions at this + // point, except for prepared transactions. + // + // See also the neon code changes in the InitWalRecovery() function. + if xlog_checkpoint.oldestActiveXid == pg_constants::INVALID_TRANSACTION_ID + && info == pg_constants::XLOG_CHECKPOINT_SHUTDOWN + { + let mut oldest_active_xid = cp.nextXid.value as u32; + for xid in modification.tline.list_twophase_files(lsn, ctx).await? { + if (xid.wrapping_sub(oldest_active_xid) as i32) < 0 { + oldest_active_xid = xid; + } + } + cp.oldestActiveXid = oldest_active_xid; + } else { + cp.oldestActiveXid = xlog_checkpoint.oldestActiveXid; + } + + // Write a new checkpoint key-value pair on every checkpoint record, even + // if nothing really changed. Not strictly required, but it seems nice to + // have some trace of the checkpoint records in the layer files at the same + // LSNs. 
self.checkpoint_modified = true; } - } else if info == pg_constants::XLOG_CHECKPOINT_ONLINE - || info == pg_constants::XLOG_CHECKPOINT_SHUTDOWN - { - let mut checkpoint_bytes = [0u8; SIZEOF_CHECKPOINT]; - buf.copy_to_slice(&mut checkpoint_bytes); - let xlog_checkpoint = CheckPoint::decode(&checkpoint_bytes)?; - trace!( - "xlog_checkpoint.oldestXid={}, checkpoint.oldestXid={}", - xlog_checkpoint.oldestXid, - self.checkpoint.oldestXid - ); - if (self - .checkpoint - .oldestXid - .wrapping_sub(xlog_checkpoint.oldestXid) as i32) - < 0 - { - self.checkpoint.oldestXid = xlog_checkpoint.oldestXid; - } - trace!( - "xlog_checkpoint.oldestActiveXid={}, checkpoint.oldestActiveXid={}", - xlog_checkpoint.oldestActiveXid, - self.checkpoint.oldestActiveXid - ); - - // A shutdown checkpoint has `oldestActiveXid == InvalidTransactionid`, - // because at shutdown, all in-progress transactions will implicitly - // end. Postgres startup code knows that, and allows hot standby to start - // immediately from a shutdown checkpoint. - // - // In Neon, Postgres hot standby startup always behaves as if starting from - // an online checkpoint. It needs a valid `oldestActiveXid` value, so - // instead of overwriting self.checkpoint.oldestActiveXid with - // InvalidTransactionid from the checkpoint WAL record, update it to a - // proper value, knowing that there are no in-progress transactions at this - // point, except for prepared transactions. - // - // See also the neon code changes in the InitWalRecovery() function. - if xlog_checkpoint.oldestActiveXid == pg_constants::INVALID_TRANSACTION_ID - && info == pg_constants::XLOG_CHECKPOINT_SHUTDOWN - { - let mut oldest_active_xid = self.checkpoint.nextXid.value as u32; - for xid in modification.tline.list_twophase_files(lsn, ctx).await? { - if (xid.wrapping_sub(oldest_active_xid) as i32) < 0 { - oldest_active_xid = xid; - } - } - self.checkpoint.oldestActiveXid = oldest_active_xid; - } else { - self.checkpoint.oldestActiveXid = xlog_checkpoint.oldestActiveXid; - } - - // Write a new checkpoint key-value pair on every checkpoint record, even - // if nothing really changed. Not strictly required, but it seems nice to - // have some trace of the checkpoint records in the layer files at the same - // LSNs. - self.checkpoint_modified = true; - } + }); } pg_constants::RM_LOGICALMSG_ID => { let info = decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK; @@ -424,7 +438,11 @@ impl WalIngest { let info = decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK; if info == pg_constants::XLOG_RUNNING_XACTS { let xlrec = crate::walrecord::XlRunningXacts::decode(&mut buf); - self.checkpoint.oldestActiveXid = xlrec.oldest_running_xid; + + enum_pgversion_dispatch!(&mut self.checkpoint, CheckPoint, cp, { + cp.oldestActiveXid = xlrec.oldest_running_xid; + }); + self.checkpoint_modified = true; } } @@ -539,7 +557,7 @@ impl WalIngest { && blk.has_image && decoded.xl_rmid == pg_constants::RM_XLOG_ID && (decoded.xl_info == pg_constants::XLOG_FPI - || decoded.xl_info == pg_constants::XLOG_FPI_FOR_HINT) + || decoded.xl_info == pg_constants::XLOG_FPI_FOR_HINT) // compression of WAL is not yet supported: fall back to storing the original WAL record && !postgres_ffi::bkpimage_is_compressed(blk.bimg_info, modification.tline.pg_version) // do not materialize null pages because them most likely be soon replaced with real data @@ -1204,7 +1222,7 @@ impl WalIngest { if rec.blkno % pg_constants::SLOTS_PER_FSM_PAGE != 0 { // Tail of last remaining FSM page has to be zeroed. 
// We are not precise here and instead of digging in FSM bitmap format just clear the whole page. - modification.put_rel_page_image_zero(rel, fsm_physical_page_no); + modification.put_rel_page_image_zero(rel, fsm_physical_page_no)?; fsm_physical_page_no += 1; } let nblocks = get_relsize(modification, rel, ctx).await?; @@ -1226,7 +1244,7 @@ impl WalIngest { if rec.blkno % pg_constants::VM_HEAPBLOCKS_PER_PAGE != 0 { // Tail of last remaining vm page has to be zeroed. // We are not precise here and instead of digging in VM bitmap format just clear the whole page. - modification.put_rel_page_image_zero(rel, vm_page_no); + modification.put_rel_page_image_zero(rel, vm_page_no)?; vm_page_no += 1; } let nblocks = get_relsize(modification, rel, ctx).await?; @@ -1242,12 +1260,17 @@ impl WalIngest { fn warn_on_ingest_lag( &mut self, conf: &crate::config::PageServerConf, - wal_timestmap: TimestampTz, + wal_timestamp: TimestampTz, ) { debug_assert_current_span_has_tenant_and_timeline_id(); let now = SystemTime::now(); let rate_limits = &mut self.warn_ingest_lag; - match try_from_pg_timestamp(wal_timestmap) { + + let ts = enum_pgversion_dispatch!(&self.checkpoint, CheckPoint, _cp, { + pgv::xlog_utils::try_from_pg_timestamp(wal_timestamp) + }); + + match ts { Ok(ts) => { match now.duration_since(ts) { Ok(lag) => { @@ -1257,7 +1280,7 @@ impl WalIngest { warn!(%rate_limit_stats, %lag, "ingesting record with timestamp lagging more than wait_lsn_timeout"); }) } - }, + } Err(e) => { let delta_t = e.duration(); // determined by prod victoriametrics query: 1000 * (timestamp(node_time_seconds{neon_service="pageserver"}) - node_time_seconds) @@ -1271,7 +1294,6 @@ impl WalIngest { } } }; - } Err(error) => { rate_limits.timestamp_invalid_msg_ratelimit.call2(|rate_limit_stats| { @@ -1379,14 +1401,17 @@ impl WalIngest { // truncated, but a checkpoint record with the updated values isn't written until // later. In Neon, a server can start at any LSN, not just on a checkpoint record, // so we keep the oldestXid and oldestXidDB up-to-date. - self.checkpoint.oldestXid = xlrec.oldest_xid; - self.checkpoint.oldestXidDB = xlrec.oldest_xid_db; + enum_pgversion_dispatch!(&mut self.checkpoint, CheckPoint, cp, { + cp.oldestXid = xlrec.oldest_xid; + cp.oldestXidDB = xlrec.oldest_xid_db; + }); self.checkpoint_modified = true; // TODO Treat AdvanceOldestClogXid() or write a comment why we don't need it let latest_page_number = - self.checkpoint.nextXid.value as u32 / pg_constants::CLOG_XACTS_PER_PAGE; + enum_pgversion_dispatch!(self.checkpoint, CheckPoint, cp, { cp.nextXid.value }) as u32 + / pg_constants::CLOG_XACTS_PER_PAGE; // Now delete all segments containing pages between xlrec.pageno // and latest_page_number. @@ -1394,7 +1419,9 @@ impl WalIngest { // First, make an important safety check: // the current endpoint page must not be eligible for removal. // See SimpleLruTruncate() in slru.c - if clogpage_precedes(latest_page_number, xlrec.pageno) { + if dispatch_pgversion!(modification.tline.pg_version, { + pgv::nonrelfile_utils::clogpage_precedes(latest_page_number, xlrec.pageno) + }) { info!("could not truncate directory pg_xact apparent wraparound"); return Ok(()); } @@ -1411,7 +1438,12 @@ impl WalIngest { .await? 
{ let segpage = segno * pg_constants::SLRU_PAGES_PER_SEGMENT; - if slru_may_delete_clogsegment(segpage, xlrec.pageno) { + + let may_delete = dispatch_pgversion!(modification.tline.pg_version, { + pgv::nonrelfile_utils::slru_may_delete_clogsegment(segpage, xlrec.pageno) + }); + + if may_delete { modification .drop_slru_segment(SlruKind::Clog, segno, ctx) .await?; @@ -1530,14 +1562,23 @@ impl WalIngest { xlrec: &XlMultiXactTruncate, ctx: &RequestContext, ) -> Result<()> { - self.checkpoint.oldestMulti = xlrec.end_trunc_off; - self.checkpoint.oldestMultiDB = xlrec.oldest_multi_db; + let (maxsegment, startsegment, endsegment) = + enum_pgversion_dispatch!(&mut self.checkpoint, CheckPoint, cp, { + cp.oldestMulti = xlrec.end_trunc_off; + cp.oldestMultiDB = xlrec.oldest_multi_db; + let maxsegment: i32 = pgv::nonrelfile_utils::mx_offset_to_member_segment( + pg_constants::MAX_MULTIXACT_OFFSET, + ); + let startsegment: i32 = + pgv::nonrelfile_utils::mx_offset_to_member_segment(xlrec.start_trunc_memb); + let endsegment: i32 = + pgv::nonrelfile_utils::mx_offset_to_member_segment(xlrec.end_trunc_memb); + (maxsegment, startsegment, endsegment) + }); + self.checkpoint_modified = true; // PerformMembersTruncation - let maxsegment: i32 = mx_offset_to_member_segment(pg_constants::MAX_MULTIXACT_OFFSET); - let startsegment: i32 = mx_offset_to_member_segment(xlrec.start_trunc_memb); - let endsegment: i32 = mx_offset_to_member_segment(xlrec.end_trunc_memb); let mut segment: i32 = startsegment; // Delete all the segments except the last one. The last segment can still @@ -1696,7 +1737,7 @@ impl WalIngest { continue; } - modification.put_rel_page_image_zero(rel, gap_blknum); + modification.put_rel_page_image_zero(rel, gap_blknum)?; } } Ok(()) @@ -1762,7 +1803,7 @@ impl WalIngest { // fill the gap with zeros for gap_blknum in old_nblocks..blknum { - modification.put_slru_page_image_zero(kind, segno, gap_blknum); + modification.put_slru_page_image_zero(kind, segno, gap_blknum)?; } } Ok(()) @@ -1811,11 +1852,23 @@ mod tests { // TODO } - static ZERO_CHECKPOINT: Bytes = Bytes::from_static(&[0u8; SIZEOF_CHECKPOINT]); + #[tokio::test] + async fn test_zeroed_checkpoint_decodes_correctly() -> Result<()> { + for i in 14..=16 { + dispatch_pgversion!(i, { + pgv::CheckPoint::decode(&pgv::ZERO_CHECKPOINT)?; + }); + } + + Ok(()) + } async fn init_walingest_test(tline: &Timeline, ctx: &RequestContext) -> Result { let mut m = tline.begin_modification(Lsn(0x10)); - m.put_checkpoint(ZERO_CHECKPOINT.clone())?; + m.put_checkpoint(dispatch_pgversion!( + tline.pg_version, + pgv::ZERO_CHECKPOINT.clone() + ))?; m.put_relmap_file(0, 111, Bytes::from(""), ctx).await?; // dummy relmapper file m.commit(ctx).await?; let walingest = WalIngest::new(tline, Lsn(0x10), ctx).await?; diff --git a/pageserver/src/walredo.rs b/pageserver/src/walredo.rs index 82585f9ed8..a36955fa21 100644 --- a/pageserver/src/walredo.rs +++ b/pageserver/src/walredo.rs @@ -43,13 +43,12 @@ use utils::lsn::Lsn; use utils::sync::gate::GateError; use utils::sync::heavier_once_cell; +/// The real implementation that uses a Postgres process to +/// perform WAL replay. /// -/// This is the real implementation that uses a Postgres process to -/// perform WAL replay. Only one thread can use the process at a time, -/// that is controlled by the Mutex. In the future, we might want to -/// launch a pool of processes to allow concurrent replay of multiple -/// records. -/// +/// Only one thread can use the process at a time, that is controlled by the +/// Mutex. 
In the future, we might want to launch a pool of processes to allow +/// concurrent replay of multiple records. pub struct PostgresRedoManager { tenant_shard_id: TenantShardId, conf: &'static PageServerConf, diff --git a/pgxn/neon/walproposer.c b/pgxn/neon/walproposer.c index c53257923a..c1914421ec 100644 --- a/pgxn/neon/walproposer.c +++ b/pgxn/neon/walproposer.c @@ -1038,9 +1038,12 @@ DetermineEpochStartLsn(WalProposer *wp) if (SkipXLogPageHeader(wp, wp->propEpochStartLsn) != wp->api.get_redo_start_lsn(wp)) { /* - * However, allow to proceed if previously elected leader was me; - * plain restart of walproposer not intervened by concurrent - * compute (who could generate WAL) is ok. + * However, allow to proceed if the last_log_term on the node which gave + * the highest vote (i.e. the point where we are going to start writing) + * had actually been won by me; a plain restart of walproposer, not + * interrupted by a concurrent compute which wrote WAL, is ok. + * + * This avoids a compute crash after a manual term_bump. */ if (!((dth->n_entries >= 1) && (dth->entries[dth->n_entries - 1].term == pg_atomic_read_u64(&walprop_shared->mineLastElectedTerm)))) @@ -1442,12 +1445,17 @@ RecvAppendResponses(Safekeeper *sk) if (sk->appendResponse.term > wp->propTerm) { /* - * Another compute with higher term is running. Panic to restart - * PG as we likely need to retake basebackup. However, don't dump - * core as this is kinda expected scenario. + * + * The term has changed to a higher one; probably another compute is + * running. If that is the case we could PANIC as well, because it + * likely inserted some data and our basebackup is no longer + * usable. However, we also bump the term manually (term_bump endpoint) + * on safekeepers for migration purposes, and in that case we do want + * the compute to stay alive. So restart walproposer with FATAL instead + * of panicking; if the basebackup is spoiled, the next election will + * notice this. */ - disable_core_dump(); - wp_log(PANIC, "WAL acceptor %s:%s with term " INT64_FORMAT " rejected our request, our term " INT64_FORMAT ", meaning another compute is running at the same time, and it conflicts with us", + wp_log(FATAL, "WAL acceptor %s:%s with term " INT64_FORMAT " rejected our request, our term " INT64_FORMAT ", meaning another compute is running at the same time, and it conflicts with us", sk->host, sk->port, sk->appendResponse.term, wp->propTerm); } diff --git a/proxy/src/cache/timed_lru.rs b/proxy/src/cache/timed_lru.rs index 8bb482f7c6..5b08d74696 100644 --- a/proxy/src/cache/timed_lru.rs +++ b/proxy/src/cache/timed_lru.rs @@ -16,7 +16,7 @@ use tracing::debug; // On the other hand, `hashlink` has good download stats and appears to be maintained. use hashlink::{linked_hash_map::RawEntryMut, LruCache}; -use super::{common::Cached, *}; +use super::{common::Cached, timed_lru, Cache}; /// An implementation of timed LRU cache with fixed capacity. /// Key properties: diff --git a/proxy/src/console/mgmt.rs b/proxy/src/console/mgmt.rs index 2ed4f5f206..ee5f83ee76 100644 --- a/proxy/src/console/mgmt.rs +++ b/proxy/src/console/mgmt.rs @@ -78,7 +78,7 @@ pub(crate) type ComputeReady = DatabaseInfo; // TODO: replace with an http-based protocol.
struct MgmtHandler; -#[async_trait::async_trait] + impl postgres_backend::Handler for MgmtHandler { async fn process_query( &mut self, diff --git a/proxy/src/console/provider/neon.rs b/proxy/src/console/provider/neon.rs index 33eda72e65..b004bf4ecf 100644 --- a/proxy/src/console/provider/neon.rs +++ b/proxy/src/console/provider/neon.rs @@ -38,10 +38,7 @@ impl Api { locks: &'static ApiLocks, wake_compute_endpoint_rate_limiter: Arc, ) -> Self { - let jwt = match std::env::var("NEON_PROXY_TO_CONTROLPLANE_TOKEN") { - Ok(v) => v, - Err(_) => String::new(), - }; + let jwt = std::env::var("NEON_PROXY_TO_CONTROLPLANE_TOKEN").unwrap_or_default(); Self { endpoint, caches, diff --git a/proxy/src/context/parquet.rs b/proxy/src/context/parquet.rs index c6f83fd069..fafea2a08f 100644 --- a/proxy/src/context/parquet.rs +++ b/proxy/src/context/parquet.rs @@ -598,15 +598,15 @@ mod tests { assert_eq!( file_stats, [ - (1315874, 3, 6000), - (1315867, 3, 6000), - (1315927, 3, 6000), - (1315884, 3, 6000), - (1316014, 3, 6000), - (1315856, 3, 6000), - (1315648, 3, 6000), - (1315884, 3, 6000), - (438913, 1, 2000) + (1312632, 3, 6000), + (1312621, 3, 6000), + (1312680, 3, 6000), + (1312637, 3, 6000), + (1312773, 3, 6000), + (1312610, 3, 6000), + (1312404, 3, 6000), + (1312639, 3, 6000), + (437848, 1, 2000) ] ); @@ -638,11 +638,11 @@ mod tests { assert_eq!( file_stats, [ - (1208861, 5, 10000), - (1208592, 5, 10000), - (1208885, 5, 10000), - (1208873, 5, 10000), - (1209128, 5, 10000) + (1203465, 5, 10000), + (1203189, 5, 10000), + (1203490, 5, 10000), + (1203475, 5, 10000), + (1203729, 5, 10000) ] ); @@ -667,15 +667,15 @@ mod tests { assert_eq!( file_stats, [ - (1315874, 3, 6000), - (1315867, 3, 6000), - (1315927, 3, 6000), - (1315884, 3, 6000), - (1316014, 3, 6000), - (1315856, 3, 6000), - (1315648, 3, 6000), - (1315884, 3, 6000), - (438913, 1, 2000) + (1312632, 3, 6000), + (1312621, 3, 6000), + (1312680, 3, 6000), + (1312637, 3, 6000), + (1312773, 3, 6000), + (1312610, 3, 6000), + (1312404, 3, 6000), + (1312639, 3, 6000), + (437848, 1, 2000) ] ); @@ -712,7 +712,7 @@ mod tests { // files are smaller than the size threshold, but they took too long to fill so were flushed early assert_eq!( file_stats, - [(659836, 2, 3001), (659550, 2, 3000), (659346, 2, 2999)] + [(657696, 2, 3001), (657410, 2, 3000), (657206, 2, 2999)] ); tmpdir.close().unwrap(); diff --git a/proxy/src/lib.rs b/proxy/src/lib.rs index 8d7e586b3d..923d6ae288 100644 --- a/proxy/src/lib.rs +++ b/proxy/src/lib.rs @@ -44,16 +44,14 @@ clippy::items_after_statements, )] // List of temporarily allowed lints. -// TODO: Switch to except() once stable with 1.81. // TODO: fix code and reduce list or move to permanent list above. 
-#![allow( +#![expect( clippy::cargo_common_metadata, clippy::cast_possible_truncation, clippy::cast_possible_wrap, clippy::cast_precision_loss, clippy::cast_sign_loss, clippy::doc_markdown, - clippy::implicit_hasher, clippy::inline_always, clippy::match_same_arms, clippy::match_wild_err_arm, @@ -61,21 +59,28 @@ clippy::missing_panics_doc, clippy::module_name_repetitions, clippy::needless_pass_by_value, - clippy::needless_raw_string_hashes, clippy::redundant_closure_for_method_calls, - clippy::return_self_not_must_use, clippy::similar_names, clippy::single_match_else, clippy::struct_excessive_bools, clippy::struct_field_names, clippy::too_many_lines, - clippy::unreadable_literal, - clippy::unused_async, - clippy::unused_self, - clippy::wildcard_imports + clippy::unused_self +)] +#![cfg_attr( + any(test, feature = "testing"), + allow( + clippy::needless_raw_string_hashes, + clippy::unreadable_literal, + clippy::unused_async, + ) )] // List of temporarily allowed lints to unblock beta/nightly. -#![allow(unknown_lints, clippy::manual_inspect)] +#![allow( + unknown_lints, + // TODO: 1.82: Add `use` where necessary and remove from this list. + impl_trait_overcaptures, +)] use std::{convert::Infallible, future::Future}; diff --git a/proxy/src/scram/exchange.rs b/proxy/src/scram/exchange.rs index 786cbcaa19..afb5604666 100644 --- a/proxy/src/scram/exchange.rs +++ b/proxy/src/scram/exchange.rs @@ -217,6 +217,7 @@ impl sasl::Mechanism for Exchange<'_> { self.state = ExchangeState::SaltSent(sent); Ok(Step::Continue(self, msg)) } + #[allow(unreachable_patterns)] // TODO: 1.82: simply drop this match Step::Success(x, _) => match x {}, Step::Failure(msg) => Ok(Step::Failure(msg)), } @@ -224,6 +225,7 @@ impl sasl::Mechanism for Exchange<'_> { ExchangeState::SaltSent(sent) => { match sent.transition(self.secret, &self.tls_server_end_point, input)? { Step::Success(keys, msg) => Ok(Step::Success(keys, msg)), + #[allow(unreachable_patterns)] // TODO: 1.82: simply drop this match Step::Continue(x, _) => match x {}, Step::Failure(msg) => Ok(Step::Failure(msg)), } diff --git a/proxy/src/serverless/sql_over_http.rs b/proxy/src/serverless/sql_over_http.rs index 5b36f5e91d..2188edc8c5 100644 --- a/proxy/src/serverless/sql_over_http.rs +++ b/proxy/src/serverless/sql_over_http.rs @@ -745,22 +745,20 @@ impl BatchQueryData { builder = builder.deferrable(true); } - let transaction = builder.start().await.map_err(|e| { + let transaction = builder.start().await.inspect_err(|_| { // if we cannot start a transaction, we should return immediately // and not return to the pool. 
connection is clearly broken discard.discard(); - e })?; let json_output = match query_batch(cancel.child_token(), &transaction, self, parsed_headers).await { Ok(json_output) => { info!("commit"); - let status = transaction.commit().await.map_err(|e| { + let status = transaction.commit().await.inspect_err(|_| { // if we cannot commit - for now don't return connection to pool // TODO: get a query status from the error discard.discard(); - e })?; discard.check_idle(status); json_output @@ -776,11 +774,10 @@ impl BatchQueryData { } Err(err) => { info!("rollback"); - let status = transaction.rollback().await.map_err(|e| { + let status = transaction.rollback().await.inspect_err(|_| { // if we cannot rollback - for now don't return connection to pool // TODO: get a query status from the error discard.discard(); - e })?; discard.check_idle(status); return Err(err); diff --git a/proxy/src/stream.rs b/proxy/src/stream.rs index 332dc27787..c14dd18afe 100644 --- a/proxy/src/stream.rs +++ b/proxy/src/stream.rs @@ -14,6 +14,7 @@ use tokio::io::{AsyncRead, AsyncWrite, ReadBuf}; use tokio_rustls::server::TlsStream; /// Stream wrapper which implements libpq's protocol. +/// /// NOTE: This object deliberately doesn't implement [`AsyncRead`] /// or [`AsyncWrite`] to prevent subtle errors (e.g. trying /// to pass random malformed bytes through the connection). diff --git a/rust-toolchain.toml b/rust-toolchain.toml index 368b8d300a..3c5d0b12a6 100644 --- a/rust-toolchain.toml +++ b/rust-toolchain.toml @@ -1,7 +1,7 @@ [toolchain] -channel = "1.80.1" +channel = "1.81.0" profile = "default" # The default profile includes rustc, rust-std, cargo, rust-docs, rustfmt and clippy. # https://rust-lang.github.io/rustup/concepts/profiles.html -# but we also need `llvm-tools-preview` for coverage data merges on CI -components = ["llvm-tools-preview", "rustfmt", "clippy"] +# but we also need `llvm-tools` for coverage data merges on CI +components = ["llvm-tools", "rustfmt", "clippy"] diff --git a/safekeeper/src/auth.rs b/safekeeper/src/auth.rs index b8bc3f3e06..c5c9393c00 100644 --- a/safekeeper/src/auth.rs +++ b/safekeeper/src/auth.rs @@ -1,6 +1,9 @@ use utils::auth::{AuthError, Claims, Scope}; use utils::id::TenantId; +/// If tenant_id is provided, allow if token (claims) is for this tenant or +/// whole safekeeper scope (SafekeeperData). Else, allow only if token is +/// SafekeeperData. pub fn check_permission(claims: &Claims, tenant_id: Option) -> Result<(), AuthError> { match (&claims.scope, tenant_id) { (Scope::Tenant, None) => Err(AuthError( diff --git a/safekeeper/src/bin/safekeeper.rs b/safekeeper/src/bin/safekeeper.rs index 41c2d3fe08..5270934f5e 100644 --- a/safekeeper/src/bin/safekeeper.rs +++ b/safekeeper/src/bin/safekeeper.rs @@ -19,7 +19,7 @@ use std::fs::{self, File}; use std::io::{ErrorKind, Write}; use std::str::FromStr; use std::sync::Arc; -use std::time::Duration; +use std::time::{Duration, Instant}; use storage_broker::Uri; use tracing::*; @@ -261,6 +261,15 @@ async fn main() -> anyhow::Result<()> { // Change into the data directory. 
std::env::set_current_dir(&workdir)?; + // Prevent running multiple safekeepers on the same directory + let lock_file_path = workdir.join(PID_FILE_NAME); + let lock_file = + pid_file::claim_for_current_process(&lock_file_path).context("claim pid file")?; + info!("claimed pid file at {lock_file_path:?}"); + // ensure that the lock file is held even if the main thread of the process panics + // we need to release the lock file only when the current process is gone + std::mem::forget(lock_file); + // Set or read our ID. let id = set_id(&workdir, args.id.map(NodeId))?; if args.init { @@ -364,15 +373,15 @@ type JoinTaskRes = Result, JoinError>; async fn start_safekeeper(conf: SafeKeeperConf) -> Result<()> { - // Prevent running multiple safekeepers on the same directory - let lock_file_path = conf.workdir.join(PID_FILE_NAME); - let lock_file = - pid_file::claim_for_current_process(&lock_file_path).context("claim pid file")?; - info!("claimed pid file at {lock_file_path:?}"); - - // ensure that the lock file is held even if the main thread of the process is panics - // we need to release the lock file only when the current process is gone - std::mem::forget(lock_file); + // fsync the datadir to make sure we have a consistent state on disk. + let dfd = File::open(&conf.workdir).context("open datadir for syncfs")?; + let started = Instant::now(); + utils::crashsafe::syncfs(dfd)?; + let elapsed = started.elapsed(); + info!( + elapsed_ms = elapsed.as_millis(), + "syncfs data directory done" + ); info!("starting safekeeper WAL service on {}", conf.listen_pg_addr); let pg_listener = tcp_listener::bind(conf.listen_pg_addr.clone()).map_err(|e| { diff --git a/safekeeper/src/handler.rs b/safekeeper/src/handler.rs index 2c519433ef..3f00b69cde 100644 --- a/safekeeper/src/handler.rs +++ b/safekeeper/src/handler.rs @@ -2,6 +2,7 @@ //! protocol commands.
use anyhow::Context; +use std::future::Future; use std::str::{self, FromStr}; use std::sync::Arc; use tokio::io::{AsyncRead, AsyncWrite}; @@ -95,7 +96,6 @@ fn cmd_to_string(cmd: &SafekeeperPostgresCommand) -> &str { } } -#[async_trait::async_trait] impl postgres_backend::Handler for SafekeeperPostgresHandler { @@ -197,49 +197,51 @@ impl postgres_backend::Handler Ok(()) } - async fn process_query( + fn process_query( &mut self, pgb: &mut PostgresBackend, query_string: &str, - ) -> Result<(), QueryError> { - if query_string - .to_ascii_lowercase() - .starts_with("set datestyle to ") - { - // important for debug because psycopg2 executes "SET datestyle TO 'ISO'" on connect - pgb.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?; - return Ok(()); - } - - let cmd = parse_cmd(query_string)?; - let cmd_str = cmd_to_string(&cmd); - - let _guard = PG_QUERIES_GAUGE.with_label_values(&[cmd_str]).guard(); - - info!("got query {:?}", query_string); - - let tenant_id = self.tenant_id.context("tenantid is required")?; - let timeline_id = self.timeline_id.context("timelineid is required")?; - self.check_permission(Some(tenant_id))?; - self.ttid = TenantTimelineId::new(tenant_id, timeline_id); - - match cmd { - SafekeeperPostgresCommand::StartWalPush => { - self.handle_start_wal_push(pgb) - .instrument(info_span!("WAL receiver")) - .await + ) -> impl Future> { + Box::pin(async move { + if query_string + .to_ascii_lowercase() + .starts_with("set datestyle to ") + { + // important for debug because psycopg2 executes "SET datestyle TO 'ISO'" on connect + pgb.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?; + return Ok(()); } - SafekeeperPostgresCommand::StartReplication { start_lsn, term } => { - self.handle_start_replication(pgb, start_lsn, term) - .instrument(info_span!("WAL sender")) - .await + + let cmd = parse_cmd(query_string)?; + let cmd_str = cmd_to_string(&cmd); + + let _guard = PG_QUERIES_GAUGE.with_label_values(&[cmd_str]).guard(); + + info!("got query {:?}", query_string); + + let tenant_id = self.tenant_id.context("tenantid is required")?; + let timeline_id = self.timeline_id.context("timelineid is required")?; + self.check_permission(Some(tenant_id))?; + self.ttid = TenantTimelineId::new(tenant_id, timeline_id); + + match cmd { + SafekeeperPostgresCommand::StartWalPush => { + self.handle_start_wal_push(pgb) + .instrument(info_span!("WAL receiver")) + .await + } + SafekeeperPostgresCommand::StartReplication { start_lsn, term } => { + self.handle_start_replication(pgb, start_lsn, term) + .instrument(info_span!("WAL sender")) + .await + } + SafekeeperPostgresCommand::IdentifySystem => self.handle_identify_system(pgb).await, + SafekeeperPostgresCommand::TimelineStatus => self.handle_timeline_status(pgb).await, + SafekeeperPostgresCommand::JSONCtrl { ref cmd } => { + handle_json_ctrl(self, pgb, cmd).await + } } - SafekeeperPostgresCommand::IdentifySystem => self.handle_identify_system(pgb).await, - SafekeeperPostgresCommand::TimelineStatus => self.handle_timeline_status(pgb).await, - SafekeeperPostgresCommand::JSONCtrl { ref cmd } => { - handle_json_ctrl(self, pgb, cmd).await - } - } + }) } } diff --git a/safekeeper/src/http/routes.rs b/safekeeper/src/http/routes.rs index 9b7424a818..e482edea55 100644 --- a/safekeeper/src/http/routes.rs +++ b/safekeeper/src/http/routes.rs @@ -18,8 +18,8 @@ use utils::http::endpoint::{prometheus_metrics_handler, request_span, ChannelWri use utils::http::request::parse_query_param; use postgres_ffi::WAL_SEGMENT_SIZE; -use 
safekeeper_api::models::TimelineCreateRequest; use safekeeper_api::models::{SkTimelineInfo, TimelineCopyRequest}; +use safekeeper_api::models::{TimelineCreateRequest, TimelineTermBumpRequest}; use utils::{ auth::SwappableJwtAuth, http::{ @@ -408,6 +408,28 @@ async fn timeline_backup_partial_reset(request: Request) -> Result, +) -> Result, ApiError> { + let ttid = TenantTimelineId::new( + parse_request_param(&request, "tenant_id")?, + parse_request_param(&request, "timeline_id")?, + ); + check_permission(&request, Some(ttid.tenant_id))?; + + let request_data: TimelineTermBumpRequest = json_request(&mut request).await?; + + let tli = GlobalTimelines::get(ttid).map_err(ApiError::from)?; + let response = tli + .term_bump(request_data.term) + .await + .map_err(ApiError::InternalServerError)?; + + json_response(StatusCode::OK, response) +} + /// Used only in tests to hand craft required data. async fn record_safekeeper_info(mut request: Request) -> Result, ApiError> { let ttid = TenantTimelineId::new( @@ -630,6 +652,10 @@ pub fn make_router(conf: SafeKeeperConf) -> RouterBuilder "/v1/tenant/:tenant_id/timeline/:timeline_id/backup_partial_reset", |r| request_span(r, timeline_backup_partial_reset), ) + .post( + "/v1/tenant/:tenant_id/timeline/:timeline_id/term_bump", + |r| request_span(r, timeline_term_bump_handler), + ) .post("/v1/record_safekeeper_info/:tenant_id/:timeline_id", |r| { request_span(r, record_safekeeper_info) }) diff --git a/safekeeper/src/pull_timeline.rs b/safekeeper/src/pull_timeline.rs index 600a6bd8f0..64585f5edc 100644 --- a/safekeeper/src/pull_timeline.rs +++ b/safekeeper/src/pull_timeline.rs @@ -484,6 +484,7 @@ pub async fn validate_temp_timeline( } /// Move timeline from a temp directory to the main storage, and load it to the global map. +/// /// This operation is done under a lock to prevent bugs if several concurrent requests are /// trying to load the same timeline. Note that it doesn't guard against creating the /// timeline with the same ttid, but no one should be doing this anyway. diff --git a/safekeeper/src/receive_wal.rs b/safekeeper/src/receive_wal.rs index ab8c76dc17..e35f806e90 100644 --- a/safekeeper/src/receive_wal.rs +++ b/safekeeper/src/receive_wal.rs @@ -448,8 +448,10 @@ async fn network_write( const KEEPALIVE_INTERVAL: Duration = Duration::from_secs(1); /// Encapsulates a task which takes messages from msg_rx, processes and pushes -/// replies to reply_tx; reading from socket and writing to disk in parallel is -/// beneficial for performance, this struct provides writing to disk part. +/// replies to reply_tx. +/// +/// Reading from socket and writing to disk in parallel is beneficial for +/// performance, this struct provides the writing to disk part. 
pub struct WalAcceptor { tli: WalResidentTimeline, msg_rx: Receiver, diff --git a/safekeeper/src/safekeeper.rs b/safekeeper/src/safekeeper.rs index dbe0034de2..b3e006ab05 100644 --- a/safekeeper/src/safekeeper.rs +++ b/safekeeper/src/safekeeper.rs @@ -938,8 +938,9 @@ where } trace!( - "processed AppendRequest of len {}, end_lsn={:?}, commit_lsn={:?}, truncate_lsn={:?}, flushed={:?}", + "processed AppendRequest of len {}, begin_lsn={}, end_lsn={:?}, commit_lsn={:?}, truncate_lsn={:?}, flushed={:?}", msg.wal_data.len(), + msg.h.begin_lsn, msg.h.end_lsn, msg.h.commit_lsn, msg.h.truncate_lsn, diff --git a/safekeeper/src/send_wal.rs b/safekeeper/src/send_wal.rs index 90b1604adb..6d677f405a 100644 --- a/safekeeper/src/send_wal.rs +++ b/safekeeper/src/send_wal.rs @@ -758,9 +758,8 @@ impl ReplyReader { // pq_sendint32(&reply_message, xmin); // pq_sendint32(&reply_message, xmin_epoch); // So it is two big endian 32-bit words in low endian order! - hs_feedback.xmin = (hs_feedback.xmin >> 32) | (hs_feedback.xmin << 32); - hs_feedback.catalog_xmin = - (hs_feedback.catalog_xmin >> 32) | (hs_feedback.catalog_xmin << 32); + hs_feedback.xmin = hs_feedback.xmin.rotate_left(32); + hs_feedback.catalog_xmin = hs_feedback.catalog_xmin.rotate_left(32); self.ws_guard .walsenders .record_hs_feedback(self.ws_guard.id, &hs_feedback); diff --git a/safekeeper/src/state.rs b/safekeeper/src/state.rs index dca6414082..8ae749ded5 100644 --- a/safekeeper/src/state.rs +++ b/safekeeper/src/state.rs @@ -1,9 +1,10 @@ //! Defines per timeline data stored persistently (SafeKeeperPersistentState) //! and its wrapper with in memory layer (SafekeeperState). -use std::ops::Deref; +use std::{cmp::max, ops::Deref}; use anyhow::Result; +use safekeeper_api::models::TimelineTermBumpResponse; use serde::{Deserialize, Serialize}; use utils::{ id::{NodeId, TenantId, TenantTimelineId, TimelineId}, @@ -12,7 +13,7 @@ use utils::{ use crate::{ control_file, - safekeeper::{AcceptorState, PersistedPeerInfo, PgUuid, ServerInfo, TermHistory}, + safekeeper::{AcceptorState, PersistedPeerInfo, PgUuid, ServerInfo, Term, TermHistory}, wal_backup_partial::{self}, }; @@ -147,9 +148,11 @@ pub struct TimelineMemState { pub proposer_uuid: PgUuid, } -/// Safekeeper persistent state plus in memory layer, to avoid frequent fsyncs -/// when we update fields like commit_lsn which don't need immediate -/// persistence. Provides transactional like API to atomically update the state. +/// Safekeeper persistent state plus an in-memory layer. +/// +/// Allows us to avoid frequent fsyncs when we update fields like commit_lsn +/// which don't need immediate persistence. Provides a transaction-like API +/// to atomically update the state. /// /// Implements Deref into *persistent* part. pub struct TimelineState { @@ -209,6 +212,27 @@ where let s = self.start_change(); self.finish_change(&s).await } + + /// Make the term at least `to`. If `to` is None, increment the current one. This + /// is not in safekeeper.rs because we want to be able to do it even if the + /// timeline is offloaded.
+ pub async fn term_bump(&mut self, to: Option) -> Result { + let before = self.acceptor_state.term; + let mut state = self.start_change(); + let new = match to { + Some(to) => max(state.acceptor_state.term, to), + None => state.acceptor_state.term + 1, + }; + if new > state.acceptor_state.term { + state.acceptor_state.term = new; + self.finish_change(&state).await?; + } + let after = self.acceptor_state.term; + Ok(TimelineTermBumpResponse { + previous_term: before, + current_term: after, + }) + } } impl Deref for TimelineState diff --git a/safekeeper/src/timeline.rs b/safekeeper/src/timeline.rs index 95ee925e1a..fb98534768 100644 --- a/safekeeper/src/timeline.rs +++ b/safekeeper/src/timeline.rs @@ -4,6 +4,7 @@ use anyhow::{anyhow, bail, Result}; use camino::Utf8PathBuf; use remote_storage::RemotePath; +use safekeeper_api::models::TimelineTermBumpResponse; use serde::{Deserialize, Serialize}; use tokio::fs::{self}; use tokio_util::sync::CancellationToken; @@ -169,6 +170,7 @@ impl<'a> Drop for WriteGuardSharedState<'a> { } /// This structure is stored in shared state and represents the state of the timeline. +/// /// Usually it holds SafeKeeper, but it also supports offloaded timeline state. In this /// case, SafeKeeper is not available (because WAL is not present on disk) and all /// operations can be done only with control file. @@ -214,6 +216,10 @@ impl StateSK { .get_last_log_term(self.flush_lsn()) } + pub async fn term_bump(&mut self, to: Option) -> Result { + self.state_mut().term_bump(to).await + } + /// Close open WAL files to release FDs. fn close_wal_store(&mut self) { if let StateSK::Loaded(sk) = self { @@ -853,6 +859,11 @@ impl Timeline { Ok(res) } + pub async fn term_bump(self: &Arc, to: Option) -> Result { + let mut state = self.write_shared_state().await; + state.sk.term_bump(to).await + } + /// Get the timeline guard for reading/writing WAL files. /// If WAL files are not present on disk (evicted), they will be automatically /// downloaded from remote storage. This is done in the manager task, which is diff --git a/safekeeper/src/timeline_eviction.rs b/safekeeper/src/timeline_eviction.rs index 5d0567575c..5aa4921a92 100644 --- a/safekeeper/src/timeline_eviction.rs +++ b/safekeeper/src/timeline_eviction.rs @@ -1,6 +1,8 @@ -//! Code related to evicting WAL files to remote storage. The actual upload is done by the -//! partial WAL backup code. This file has code to delete and re-download WAL files, -//! cross-validate with partial WAL backup if local file is still present. +//! Code related to evicting WAL files to remote storage. +//! +//! The actual upload is done by the partial WAL backup code. This file has +//! code to delete and re-download WAL files, cross-validate with partial WAL +//! backup if local file is still present. use anyhow::Context; use camino::Utf8PathBuf; diff --git a/safekeeper/src/timeline_guard.rs b/safekeeper/src/timeline_guard.rs index dbdf46412d..1ddac573d2 100644 --- a/safekeeper/src/timeline_guard.rs +++ b/safekeeper/src/timeline_guard.rs @@ -1,4 +1,6 @@ -//! Timeline residence guard is needed to ensure that WAL segments are present on disk, +//! Timeline residence guard +//! +//! It is needed to ensure that WAL segments are present on disk, //! as long as the code is holding the guard. This file implements guard logic, to issue //! and drop guards, and to notify the manager when the guard is dropped. 
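A minimal, self-contained sketch of the term-bump semantics added above: take the maximum of the current term and the requested one (or increment when no target is given), and persist only when the term actually grows. The `AcceptorState` and `TimelineTermBumpResponse` stand-ins below are simplified placeholders rather than the real safekeeper types, and the persistence step is reduced to a comment:

```rust
use std::cmp::max;

type Term = u64;

/// Simplified stand-ins for the safekeeper types touched by the hunks above.
struct AcceptorState {
    term: Term,
}

struct TimelineTermBumpResponse {
    previous_term: Term,
    current_term: Term,
}

struct TimelineState {
    acceptor_state: AcceptorState,
}

impl TimelineState {
    /// Bump the term to at least `to`; with `None`, just increment it.
    /// Persistence only happens when the term actually grows.
    fn term_bump(&mut self, to: Option<Term>) -> TimelineTermBumpResponse {
        let before = self.acceptor_state.term;
        let new = match to {
            Some(to) => max(before, to),
            None => before + 1,
        };
        if new > before {
            // The real code writes the control file here before the in-memory
            // state is updated; this sketch just updates memory.
            self.acceptor_state.term = new;
        }
        TimelineTermBumpResponse {
            previous_term: before,
            current_term: self.acceptor_state.term,
        }
    }
}

fn main() {
    let mut state = TimelineState { acceptor_state: AcceptorState { term: 5 } };
    let r = state.term_bump(Some(3)); // lower target: no change
    assert_eq!((r.previous_term, r.current_term), (5, 5));
    let r = state.term_bump(None); // no target: increment
    assert_eq!((r.previous_term, r.current_term), (5, 6));
    let r = state.term_bump(Some(10)); // higher target: jump to it
    assert_eq!((r.previous_term, r.current_term), (6, 10));
}
```

The same max-or-increment rule is what keeps the new `term_bump` HTTP endpoint idempotent: repeating a request with the same target term is a no-op.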
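On the send_wal.rs hunk further above: replacing the pair of 64-bit shifts with `rotate_left(32)` is a pure refactor, since swapping the two 32-bit halves of a `u64` is exactly a 32-bit rotation. A quick stand-alone check of that equivalence, in plain Rust with no safekeeper types involved:

```rust
/// Swapping the two 32-bit halves of a u64 with shifts.
fn swap_halves_shift(x: u64) -> u64 {
    (x >> 32) | (x << 32)
}

/// The same swap expressed as a rotation.
fn swap_halves_rotate(x: u64) -> u64 {
    x.rotate_left(32)
}

fn main() {
    for x in [0u64, 1, 0x0000_0001_0000_0002, 0xDEAD_BEEF_CAFE_F00D, u64::MAX] {
        assert_eq!(swap_halves_shift(x), swap_halves_rotate(x));
    }
    println!("rotate_left(32) matches the shift-based swap");
}
```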
diff --git a/safekeeper/src/timeline_manager.rs b/safekeeper/src/timeline_manager.rs index f997f48454..6be75479db 100644 --- a/safekeeper/src/timeline_manager.rs +++ b/safekeeper/src/timeline_manager.rs @@ -1,4 +1,5 @@ //! The timeline manager task is responsible for managing the timeline's background tasks. +//! //! It is spawned alongside each timeline and exits when the timeline is deleted. //! It watches for changes in the timeline state and decides when to spawn or kill background tasks. //! It also can manage some reactive state, like should the timeline be active for broker pushes or not. diff --git a/safekeeper/src/timelines_set.rs b/safekeeper/src/timelines_set.rs index d6eea79f82..096e348295 100644 --- a/safekeeper/src/timelines_set.rs +++ b/safekeeper/src/timelines_set.rs @@ -60,7 +60,8 @@ impl TimelinesSet { } } -/// Guard is used to add or remove timeline from the set. +/// Guard is used to add or remove timelines from the set. +/// /// If the timeline present in set, it will be removed from it on drop. /// Note: do not use more than one guard for the same timeline, it caches the presence state. /// It is designed to be used in the manager task only. diff --git a/safekeeper/src/wal_backup_partial.rs b/safekeeper/src/wal_backup_partial.rs index 4050a82fff..bddfca50e4 100644 --- a/safekeeper/src/wal_backup_partial.rs +++ b/safekeeper/src/wal_backup_partial.rs @@ -1,6 +1,8 @@ //! Safekeeper timeline has a background task which is subscribed to `commit_lsn` -//! and `flush_lsn` updates. After the partial segment was updated (`flush_lsn` -//! was changed), the segment will be uploaded to S3 in about 15 minutes. +//! and `flush_lsn` updates. +//! +//! After the partial segment was updated (`flush_lsn` was changed), the segment +//! will be uploaded to S3 within the configured `partial_backup_timeout`. //! //! The filename format for partial segments is //! `Segment_Term_Flush_Commit_skNN.partial`, where: diff --git a/safekeeper/src/wal_service.rs b/safekeeper/src/wal_service.rs index 16f7748eb4..1ab54d4cce 100644 --- a/safekeeper/src/wal_service.rs +++ b/safekeeper/src/wal_service.rs @@ -17,6 +17,7 @@ use crate::SafeKeeperConf; use postgres_backend::{AuthType, PostgresBackend}; /// Accept incoming TCP connections and spawn them into a background thread. +/// /// allowed_auth_scope is either SafekeeperData (wide JWT tokens giving access /// to any tenant are allowed) or Tenant (only tokens giving access to specific /// tenant are allowed). Doesn't matter if auth is disabled in conf. diff --git a/safekeeper/src/wal_storage.rs b/safekeeper/src/wal_storage.rs index 89c2e98a94..46c260901d 100644 --- a/safekeeper/src/wal_storage.rs +++ b/safekeeper/src/wal_storage.rs @@ -98,7 +98,19 @@ pub struct PhysicalStorage { /// Also can be ahead of record_lsn, if happen to be in the middle of a WAL record. write_lsn: Lsn, - /// The LSN of the last WAL record written to disk. Still can be not fully flushed. + /// The LSN of the last WAL record written to disk. Still can be not fully + /// flushed. + /// + /// Note: Normally it (and flush_record_lsn) is <= write_lsn, but after xlog + /// switch ingest the reverse is true because we don't bump write_lsn up to + /// the next segment: WAL stream from the compute doesn't have the gap and + /// for simplicity / as a sanity check we disallow any non-sequential + /// writes, so write zeros as is. 
+ /// + /// Similar effect is in theory possible due to LSN alignment: if record + /// ends at *2, decoder will report end lsn as *8 even though we haven't + /// written these zeros yet. In practice compute likely never sends + /// non-aligned chunks of data. write_record_lsn: Lsn, /// The LSN of the last WAL record flushed to disk. @@ -167,8 +179,7 @@ impl PhysicalStorage { ) }; - // TODO: do we really know that write_lsn is fully flushed to disk? - // If not, maybe it's better to call fsync() here to be sure? + // note: this assumes we fsync'ed whole datadir on start. let flush_lsn = write_lsn; debug!( @@ -440,11 +451,12 @@ impl Storage for PhysicalStorage { .with_label_values(&["truncate_wal"]) .start_timer(); - // Streaming must not create a hole, so truncate cannot be called on non-written lsn - if self.write_lsn != Lsn(0) && end_pos > self.write_lsn { + // Streaming must not create a hole, so truncate cannot be called on + // non-written lsn. + if self.write_record_lsn != Lsn(0) && end_pos > self.write_record_lsn { bail!( - "truncate_wal called on non-written WAL, write_lsn={}, end_pos={}", - self.write_lsn, + "truncate_wal called on non-written WAL, write_record_lsn={}, end_pos={}", + self.write_record_lsn, end_pos ); } diff --git a/scripts/coverage b/scripts/coverage index 52a69c93b9..482dc58ff6 100755 --- a/scripts/coverage +++ b/scripts/coverage @@ -134,7 +134,7 @@ class LLVM: # Show a user-friendly warning raise Exception(' '.join([ f"It appears that you don't have `{name}` installed.", - "Please execute `rustup component add llvm-tools-preview`,", + "Please execute `rustup component add llvm-tools`,", "or install it via your package manager of choice.", "LLVM tools should be the same version as LLVM in `rustc --version --verbose`.", ])) @@ -518,7 +518,7 @@ def main() -> None: example = f""" prerequisites: # alternatively, install a system package for `llvm-tools` - rustup component add llvm-tools-preview + rustup component add llvm-tools self-contained example: {app} run make diff --git a/storage_controller/migrations/2024-08-28-150530_pageserver_az_not_null/down.sql b/storage_controller/migrations/2024-08-28-150530_pageserver_az_not_null/down.sql new file mode 100644 index 0000000000..4fcb928533 --- /dev/null +++ b/storage_controller/migrations/2024-08-28-150530_pageserver_az_not_null/down.sql @@ -0,0 +1 @@ +ALTER TABLE nodes ALTER availability_zone_id DROP NOT NULL; diff --git a/storage_controller/migrations/2024-08-28-150530_pageserver_az_not_null/up.sql b/storage_controller/migrations/2024-08-28-150530_pageserver_az_not_null/up.sql new file mode 100644 index 0000000000..c5b4534087 --- /dev/null +++ b/storage_controller/migrations/2024-08-28-150530_pageserver_az_not_null/up.sql @@ -0,0 +1 @@ +ALTER TABLE nodes ALTER availability_zone_id SET NOT NULL; diff --git a/storage_controller/migrations/2024-09-05-104500_tenant_shard_preferred_az/down.sql b/storage_controller/migrations/2024-09-05-104500_tenant_shard_preferred_az/down.sql new file mode 100644 index 0000000000..127972a2e4 --- /dev/null +++ b/storage_controller/migrations/2024-09-05-104500_tenant_shard_preferred_az/down.sql @@ -0,0 +1 @@ +ALTER TABLE tenant_shards DROP preferred_az_id; diff --git a/storage_controller/migrations/2024-09-05-104500_tenant_shard_preferred_az/up.sql b/storage_controller/migrations/2024-09-05-104500_tenant_shard_preferred_az/up.sql new file mode 100644 index 0000000000..641a54feb2 --- /dev/null +++ b/storage_controller/migrations/2024-09-05-104500_tenant_shard_preferred_az/up.sql @@ -0,0 +1 @@ 
+ALTER TABLE tenant_shards ADD preferred_az_id VARCHAR; diff --git a/storage_controller/src/http.rs b/storage_controller/src/http.rs index 32882c201a..a6638f5191 100644 --- a/storage_controller/src/http.rs +++ b/storage_controller/src/http.rs @@ -14,14 +14,14 @@ use metrics::{BuildInfo, NeonMetrics}; use pageserver_api::controller_api::{ MetadataHealthListOutdatedRequest, MetadataHealthListOutdatedResponse, MetadataHealthListUnhealthyResponse, MetadataHealthUpdateRequest, MetadataHealthUpdateResponse, - TenantCreateRequest, + ShardsPreferredAzsRequest, TenantCreateRequest, }; use pageserver_api::models::{ TenantConfigRequest, TenantLocationConfigRequest, TenantShardSplitRequest, TenantTimeTravelRequest, TimelineArchivalConfigRequest, TimelineCreateRequest, }; use pageserver_api::shard::TenantShardId; -use pageserver_client::mgmt_api; +use pageserver_client::{mgmt_api, BlockUnblock}; use std::sync::Arc; use std::time::{Duration, Instant}; use tokio_util::sync::CancellationToken; @@ -369,6 +369,23 @@ async fn handle_tenant_timeline_detach_ancestor( json_response(StatusCode::OK, res) } +async fn handle_tenant_timeline_block_unblock_gc( + service: Arc, + req: Request, + dir: BlockUnblock, +) -> Result, ApiError> { + let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?; + check_permissions(&req, Scope::PageServerApi)?; + + let timeline_id: TimelineId = parse_request_param(&req, "timeline_id")?; + + service + .tenant_timeline_block_unblock_gc(tenant_id, timeline_id, dir) + .await?; + + json_response(StatusCode::OK, ()) +} + async fn handle_tenant_timeline_passthrough( service: Arc, req: Request, @@ -539,6 +556,17 @@ async fn handle_node_status(req: Request) -> Result, ApiErr json_response(StatusCode::OK, node_status) } +async fn handle_node_shards(req: Request) -> Result, ApiError> { + check_permissions(&req, Scope::Admin)?; + + let state = get_state(&req); + let node_id: NodeId = parse_request_param(&req, "node_id")?; + + let node_status = state.service.get_node_shards(node_id).await?; + + json_response(StatusCode::OK, node_status) +} + async fn handle_get_leader(req: Request) -> Result, ApiError> { check_permissions(&req, Scope::Admin)?; @@ -688,6 +716,18 @@ async fn handle_tenant_update_policy(mut req: Request) -> Result) -> Result, ApiError> { + check_permissions(&req, Scope::Admin)?; + + let azs_req = json_request::(&mut req).await?; + let state = get_state(&req); + + json_response( + StatusCode::OK, + state.service.update_shards_preferred_azs(azs_req).await?, + ) +} + async fn handle_step_down(req: Request) -> Result, ApiError> { check_permissions(&req, Scope::Admin)?; @@ -1097,6 +1137,13 @@ pub fn make_router( .get("/control/v1/node/:node_id", |r| { named_request_span(r, handle_node_status, RequestName("control_v1_node_status")) }) + .get("/control/v1/node/:node_id/shards", |r| { + named_request_span( + r, + handle_node_shards, + RequestName("control_v1_node_describe"), + ) + }) .get("/control/v1/leader", |r| { named_request_span(r, handle_get_leader, RequestName("control_v1_get_leader")) }) @@ -1174,6 +1221,13 @@ pub fn make_router( RequestName("control_v1_tenant_policy"), ) }) + .put("/control/v1/preferred_azs", |r| { + named_request_span( + r, + handle_update_preferred_azs, + RequestName("control_v1_preferred_azs"), + ) + }) .put("/control/v1/step_down", |r| { named_request_span(r, handle_step_down, RequestName("control_v1_step_down")) }) @@ -1255,6 +1309,26 @@ pub fn make_router( ) }, ) + .post( + "/v1/tenant/:tenant_id/timeline/:timeline_id/block_gc", + |r| { + 
tenant_service_handler( + r, + |s, r| handle_tenant_timeline_block_unblock_gc(s, r, BlockUnblock::Block), + RequestName("v1_tenant_timeline_block_unblock_gc"), + ) + }, + ) + .post( + "/v1/tenant/:tenant_id/timeline/:timeline_id/unblock_gc", + |r| { + tenant_service_handler( + r, + |s, r| handle_tenant_timeline_block_unblock_gc(s, r, BlockUnblock::Unblock), + RequestName("v1_tenant_timeline_block_unblock_gc"), + ) + }, + ) // Tenant detail GET passthrough to shard zero: .get("/v1/tenant/:tenant_id", |r| { tenant_service_handler( diff --git a/storage_controller/src/node.rs b/storage_controller/src/node.rs index 73cecc491d..cb9ce10d23 100644 --- a/storage_controller/src/node.rs +++ b/storage_controller/src/node.rs @@ -36,7 +36,7 @@ pub(crate) struct Node { listen_pg_addr: String, listen_pg_port: u16, - availability_zone_id: Option, + availability_zone_id: String, // This cancellation token means "stop any RPCs in flight to this node, and don't start // any more". It is not related to process shutdown. @@ -63,8 +63,9 @@ impl Node { self.id } - pub(crate) fn get_availability_zone_id(&self) -> Option<&str> { - self.availability_zone_id.as_deref() + #[allow(unused)] + pub(crate) fn get_availability_zone_id(&self) -> &str { + self.availability_zone_id.as_str() } pub(crate) fn get_scheduling(&self) -> NodeSchedulingPolicy { @@ -78,22 +79,12 @@ impl Node { /// Does this registration request match `self`? This is used when deciding whether a registration /// request should be allowed to update an existing record with the same node ID. pub(crate) fn registration_match(&self, register_req: &NodeRegisterRequest) -> bool { - let az_ids_match = { - match ( - self.availability_zone_id.as_deref(), - register_req.availability_zone_id.as_deref(), - ) { - (Some(current_az), Some(register_req_az)) => current_az == register_req_az, - _ => true, - } - }; - - az_ids_match - && self.id == register_req.node_id + self.id == register_req.node_id && self.listen_http_addr == register_req.listen_http_addr && self.listen_http_port == register_req.listen_http_port && self.listen_pg_addr == register_req.listen_pg_addr && self.listen_pg_port == register_req.listen_pg_port + && self.availability_zone_id == register_req.availability_zone_id } /// For a shard located on this node, populate a response object @@ -190,7 +181,7 @@ impl Node { listen_http_port: u16, listen_pg_addr: String, listen_pg_port: u16, - availability_zone_id: Option, + availability_zone_id: String, ) -> Self { Self { id, diff --git a/storage_controller/src/pageserver_client.rs b/storage_controller/src/pageserver_client.rs index 20770ed703..961a1f78dd 100644 --- a/storage_controller/src/pageserver_client.rs +++ b/storage_controller/src/pageserver_client.rs @@ -7,7 +7,10 @@ use pageserver_api::{ }, shard::TenantShardId, }; -use pageserver_client::mgmt_api::{Client, Result}; +use pageserver_client::{ + mgmt_api::{Client, Result}, + BlockUnblock, +}; use reqwest::StatusCode; use utils::id::{NodeId, TenantId, TimelineId}; @@ -258,6 +261,24 @@ impl PageserverClient { ) } + pub(crate) async fn timeline_block_unblock_gc( + &self, + tenant_shard_id: TenantShardId, + timeline_id: TimelineId, + dir: BlockUnblock, + ) -> Result<()> { + // measuring these makes no sense because we synchronize with the gc loop and remote + // storage on block_gc so there should be huge outliers + measured_request!( + "timeline_block_unblock_gc", + crate::metrics::Method::Post, + &self.node_id_label, + self.inner + .timeline_block_unblock_gc(tenant_shard_id, timeline_id, dir) + .await + ) 
+ } + pub(crate) async fn get_utilization(&self) -> Result { measured_request!( "utilization", diff --git a/storage_controller/src/persistence.rs b/storage_controller/src/persistence.rs index e801289752..1dc1040d96 100644 --- a/storage_controller/src/persistence.rs +++ b/storage_controller/src/persistence.rs @@ -105,7 +105,7 @@ pub(crate) enum DatabaseOperation { ListMetadataHealthOutdated, GetLeader, UpdateLeader, - SetNodeAzId, + SetPreferredAzs, } #[must_use] @@ -325,31 +325,6 @@ impl Persistence { } } - pub(crate) async fn set_node_availability_zone_id( - &self, - input_node_id: NodeId, - input_az_id: String, - ) -> DatabaseResult<()> { - use crate::schema::nodes::dsl::*; - let updated = self - .with_measured_conn(DatabaseOperation::SetNodeAzId, move |conn| { - let updated = diesel::update(nodes) - .filter(node_id.eq(input_node_id.0 as i64)) - .set((availability_zone_id.eq(input_az_id.clone()),)) - .execute(conn)?; - Ok(updated) - }) - .await?; - - if updated != 1 { - Err(DatabaseError::Logical(format!( - "Node {node_id:?} not found for setting az id", - ))) - } else { - Ok(()) - } - } - /// At startup, load the high level state for shards, such as their config + policy. This will /// be enriched at runtime with state discovered on pageservers. pub(crate) async fn list_tenant_shards(&self) -> DatabaseResult> { @@ -690,6 +665,33 @@ impl Persistence { Ok(()) } + pub(crate) async fn set_tenant_shard_preferred_azs( + &self, + preferred_azs: Vec<(TenantShardId, String)>, + ) -> DatabaseResult> { + use crate::schema::tenant_shards::dsl::*; + + self.with_measured_conn(DatabaseOperation::SetPreferredAzs, move |conn| { + let mut shards_updated = Vec::default(); + + for (tenant_shard_id, preferred_az) in preferred_azs.iter() { + let updated = diesel::update(tenant_shards) + .filter(tenant_id.eq(tenant_shard_id.tenant_id.to_string())) + .filter(shard_number.eq(tenant_shard_id.shard_number.0 as i32)) + .filter(shard_count.eq(tenant_shard_id.shard_count.literal() as i32)) + .set(preferred_az_id.eq(preferred_az)) + .execute(conn)?; + + if updated == 1 { + shards_updated.push((*tenant_shard_id, preferred_az.clone())); + } + } + + Ok(shards_updated) + }) + .await + } + pub(crate) async fn detach(&self, tenant_shard_id: TenantShardId) -> anyhow::Result<()> { use crate::schema::tenant_shards::dsl::*; self.with_measured_conn(DatabaseOperation::Detach, move |conn| { @@ -1076,6 +1078,11 @@ pub(crate) struct TenantShardPersistence { pub(crate) config: String, #[serde(default)] pub(crate) scheduling_policy: String, + + // Hint that we should attempt to schedule this tenant shard in the given + // availability zone in order to minimise the chances of cross-AZ communication + // with compute. + pub(crate) preferred_az_id: Option, } impl TenantShardPersistence { @@ -1110,7 +1117,7 @@ pub(crate) struct NodePersistence { pub(crate) listen_http_port: i32, pub(crate) listen_pg_addr: String, pub(crate) listen_pg_port: i32, - pub(crate) availability_zone_id: Option, + pub(crate) availability_zone_id: String, } /// Tenant metadata health status that are stored durably.
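The persistence change above establishes the pattern the rest of the storage controller diff follows for the new `preferred_az_id` column: write the hint to the database first, then mirror only the rows the database reports as updated into the in-memory shard map, so memory never claims an AZ preference that was not durably stored. A rough sketch of that update order with placeholder types (the real Diesel-backed `Persistence` and `TenantShard` are only approximated here):

```rust
use std::collections::HashMap;

// Simplified stand-ins; the real storage controller types (TenantShardId,
// Persistence, TenantShard) are richer than this sketch assumes.
type TenantShardId = u64;

#[derive(Default)]
struct TenantShard {
    preferred_az_id: Option<String>,
}

impl TenantShard {
    fn set_preferred_az(&mut self, az: String) {
        self.preferred_az_id = Some(az);
    }
}

/// Pretend database layer: returns only the (shard, az) pairs it actually
/// updated, mirroring set_tenant_shard_preferred_azs in the hunk above.
fn persist_preferred_azs(
    known: &HashMap<TenantShardId, TenantShard>,
    wanted: Vec<(TenantShardId, String)>,
) -> Vec<(TenantShardId, String)> {
    wanted
        .into_iter()
        .filter(|(tid, _)| known.contains_key(tid))
        .collect()
}

fn main() {
    let mut tenants: HashMap<TenantShardId, TenantShard> = HashMap::new();
    tenants.insert(1, TenantShard::default());
    tenants.insert(2, TenantShard::default());

    // Persist first; only rows the "database" confirmed are applied to memory,
    // so in-memory and durable state cannot diverge for unknown shards.
    let updated = persist_preferred_azs(
        &tenants,
        vec![(1, "az-a".into()), (3, "az-c".into())],
    );
    for (tid, az) in updated {
        if let Some(shard) = tenants.get_mut(&tid) {
            shard.set_preferred_az(az);
        }
    }

    assert_eq!(tenants[&1].preferred_az_id.as_deref(), Some("az-a"));
    assert_eq!(tenants[&2].preferred_az_id, None);
}
```

Tenant creation, shard split, and the new `/control/v1/preferred_azs` handler in the later hunks reuse the same order: schedule, persist the AZ of the chosen pageserver, then apply the confirmed rows to the in-memory shards.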
diff --git a/storage_controller/src/scheduler.rs b/storage_controller/src/scheduler.rs index ef4da6861c..deb5f27226 100644 --- a/storage_controller/src/scheduler.rs +++ b/storage_controller/src/scheduler.rs @@ -528,7 +528,7 @@ pub(crate) mod test_utils { 80 + i as u16, format!("pghost-{i}"), 5432 + i as u16, - None, + "test-az".to_string(), ); node.set_availability(NodeAvailability::Active(test_utilization::simple(0, 0))); assert!(node.is_available()); diff --git a/storage_controller/src/schema.rs b/storage_controller/src/schema.rs index e0f515daea..1717a9369d 100644 --- a/storage_controller/src/schema.rs +++ b/storage_controller/src/schema.rs @@ -25,7 +25,7 @@ diesel::table! { listen_http_port -> Int4, listen_pg_addr -> Varchar, listen_pg_port -> Int4, - availability_zone_id -> Nullable, + availability_zone_id -> Varchar, } } @@ -41,6 +41,7 @@ diesel::table! { splitting -> Int2, config -> Text, scheduling_policy -> Varchar, + preferred_az_id -> Nullable, } } diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs index ca416095bb..be3efaf688 100644 --- a/storage_controller/src/service.rs +++ b/storage_controller/src/service.rs @@ -25,7 +25,7 @@ use crate::{ ShardGenerationState, TenantFilter, }, reconciler::{ReconcileError, ReconcileUnits, ReconcilerConfig, ReconcilerConfigBuilder}, - scheduler::{MaySchedule, ScheduleContext, ScheduleMode}, + scheduler::{MaySchedule, ScheduleContext, ScheduleError, ScheduleMode}, tenant_shard::{ MigrateAttachment, ReconcileNeeded, ReconcilerStatus, ScheduleOptimization, ScheduleOptimizationAction, @@ -41,7 +41,8 @@ use itertools::Itertools; use pageserver_api::{ controller_api::{ MetadataHealthRecord, MetadataHealthUpdateRequest, NodeAvailability, NodeRegisterRequest, - NodeSchedulingPolicy, PlacementPolicy, ShardSchedulingPolicy, TenantCreateRequest, + NodeSchedulingPolicy, NodeShard, NodeShardResponse, PlacementPolicy, ShardSchedulingPolicy, + ShardsPreferredAzsRequest, ShardsPreferredAzsResponse, TenantCreateRequest, TenantCreateResponse, TenantCreateResponseShard, TenantDescribeResponse, TenantDescribeResponseShard, TenantLocateResponse, TenantPolicyRequest, TenantShardMigrateRequest, TenantShardMigrateResponse, @@ -68,7 +69,7 @@ use pageserver_api::{ ValidateResponse, ValidateResponseTenant, }, }; -use pageserver_client::mgmt_api; +use pageserver_client::{mgmt_api, BlockUnblock}; use tokio::sync::mpsc::error::TrySendError; use tokio_util::sync::CancellationToken; use utils::{ @@ -116,7 +117,9 @@ pub(crate) const STARTUP_RECONCILE_TIMEOUT: Duration = Duration::from_secs(30); pub const MAX_OFFLINE_INTERVAL_DEFAULT: Duration = Duration::from_secs(30); /// How long a node may be unresponsive to heartbeats during start up before we declare it -/// offline. This is much more lenient than [`MAX_OFFLINE_INTERVAL_DEFAULT`] since the pageserver's +/// offline. +/// +/// This is much more lenient than [`MAX_OFFLINE_INTERVAL_DEFAULT`] since the pageserver's /// handling of the re-attach response may take a long time and blocks heartbeats from /// being handled on the pageserver side. 
pub const MAX_WARMING_UP_INTERVAL_DEFAULT: Duration = Duration::from_secs(300); @@ -139,6 +142,7 @@ enum TenantOperations { AttachHook, TimelineArchivalConfig, TimelineDetachAncestor, + TimelineGcBlockUnblock, } #[derive(Clone, strum_macros::Display)] @@ -353,6 +357,12 @@ impl From for ApiError { } } +enum InitialShardScheduleOutcome { + Scheduled(TenantCreateResponseShard), + NotScheduled, + ShardScheduleError(ScheduleError), +} + pub struct Service { inner: Arc>, config: Config, @@ -442,7 +452,7 @@ struct ShardSplitParams { // When preparing for a shard split, we may either choose to proceed with the split, // or find that the work is already done and return NoOp. enum ShardSplitAction { - Split(ShardSplitParams), + Split(Box), NoOp(TenantShardSplitResponse), } @@ -1264,7 +1274,7 @@ impl Service { 123, "".to_string(), 123, - None, + "test_az".to_string(), ); scheduler.node_upsert(&node); @@ -1452,6 +1462,7 @@ impl Service { splitting: SplitState::default(), scheduling_policy: serde_json::to_string(&ShardSchedulingPolicy::default()) .unwrap(), + preferred_az_id: None, }; match self.persistence.insert_tenant_shards(vec![tsp]).await { @@ -2023,6 +2034,7 @@ impl Service { splitting: SplitState::default(), scheduling_policy: serde_json::to_string(&ShardSchedulingPolicy::default()) .unwrap(), + preferred_az_id: None, }) .collect(); @@ -2046,99 +2058,87 @@ impl Service { }; let mut schedule_context = ScheduleContext::default(); + let mut schedule_error = None; + let mut response_shards = Vec::new(); + for tenant_shard_id in create_ids { + tracing::info!("Creating shard {tenant_shard_id}..."); - let (waiters, response_shards) = { + let outcome = self + .do_initial_shard_scheduling( + tenant_shard_id, + initial_generation, + &create_req.shard_parameters, + create_req.config.clone(), + placement_policy.clone(), + &mut schedule_context, + ) + .await; + + match outcome { + InitialShardScheduleOutcome::Scheduled(resp) => response_shards.push(resp), + InitialShardScheduleOutcome::NotScheduled => {} + InitialShardScheduleOutcome::ShardScheduleError(err) => { + schedule_error = Some(err); + } + } + } + + let preferred_azs = { + let locked = self.inner.read().unwrap(); + response_shards + .iter() + .filter_map(|resp| { + let az_id = locked + .nodes + .get(&resp.node_id) + .map(|n| n.get_availability_zone_id().to_string())?; + + Some((resp.shard_id, az_id)) + }) + .collect::>() + }; + + // Note that we persist the preferred AZ for the new shards separately. + // In theory, we could "peek" the scheduler to determine where the shard will + // land, but the subsequent "real" call into the scheduler might select a different + // node. Hence, we do this awkward update to keep things consistent. 
+ let updated = self + .persistence + .set_tenant_shard_preferred_azs(preferred_azs) + .await + .map_err(|err| { + ApiError::InternalServerError(anyhow::anyhow!( + "Failed to persist preferred az ids: {err}" + )) + })?; + + { let mut locked = self.inner.write().unwrap(); - let (nodes, tenants, scheduler) = locked.parts_mut(); - - let mut response_shards = Vec::new(); - let mut schcedule_error = None; - - for tenant_shard_id in create_ids { - tracing::info!("Creating shard {tenant_shard_id}..."); - - use std::collections::btree_map::Entry; - match tenants.entry(tenant_shard_id) { - Entry::Occupied(mut entry) => { - tracing::info!( - "Tenant shard {tenant_shard_id} already exists while creating" - ); - - // TODO: schedule() should take an anti-affinity expression that pushes - // attached and secondary locations (independently) away frorm those - // pageservers also holding a shard for this tenant. - - entry - .get_mut() - .schedule(scheduler, &mut schedule_context) - .map_err(|e| { - ApiError::Conflict(format!( - "Failed to schedule shard {tenant_shard_id}: {e}" - )) - })?; - - if let Some(node_id) = entry.get().intent.get_attached() { - let generation = entry - .get() - .generation - .expect("Generation is set when in attached mode"); - response_shards.push(TenantCreateResponseShard { - shard_id: tenant_shard_id, - node_id: *node_id, - generation: generation.into().unwrap(), - }); - } - - continue; - } - Entry::Vacant(entry) => { - let state = entry.insert(TenantShard::new( - tenant_shard_id, - ShardIdentity::from_params( - tenant_shard_id.shard_number, - &create_req.shard_parameters, - ), - placement_policy.clone(), - )); - - state.generation = initial_generation; - state.config = create_req.config.clone(); - if let Err(e) = state.schedule(scheduler, &mut schedule_context) { - schcedule_error = Some(e); - } - - // Only include shards in result if we are attaching: the purpose - // of the response is to tell the caller where the shards are attached. - if let Some(node_id) = state.intent.get_attached() { - let generation = state - .generation - .expect("Generation is set when in attached mode"); - response_shards.push(TenantCreateResponseShard { - shard_id: tenant_shard_id, - node_id: *node_id, - generation: generation.into().unwrap(), - }); - } - } - }; + for (tid, az_id) in updated { + if let Some(shard) = locked.tenants.get_mut(&tid) { + shard.set_preferred_az(az_id); + } } + } - // If we failed to schedule shards, then they are still created in the controller, - // but we return an error to the requester to avoid a silent failure when someone - // tries to e.g. create a tenant whose placement policy requires more nodes than - // are present in the system. We do this here rather than in the above loop, to - // avoid situations where we only create a subset of shards in the tenant. - if let Some(e) = schcedule_error { - return Err(ApiError::Conflict(format!( - "Failed to schedule shard(s): {e}" - ))); - } + // If we failed to schedule shards, then they are still created in the controller, + // but we return an error to the requester to avoid a silent failure when someone + // tries to e.g. create a tenant whose placement policy requires more nodes than + // are present in the system. We do this here rather than in the above loop, to + // avoid situations where we only create a subset of shards in the tenant. 
+ if let Some(e) = schedule_error { + return Err(ApiError::Conflict(format!( + "Failed to schedule shard(s): {e}" + ))); + } - let waiters = tenants + let waiters = { + let mut locked = self.inner.write().unwrap(); + let (nodes, tenants, _scheduler) = locked.parts_mut(); + tenants .range_mut(TenantShardId::tenant_range(tenant_id)) .filter_map(|(_shard_id, shard)| self.maybe_reconcile_shard(shard, nodes)) - .collect::>(); - (waiters, response_shards) + .collect::>() }; Ok(( @@ -2149,6 +2149,78 @@ impl Service { )) } + /// Helper for tenant creation that does the scheduling for an individual shard. Covers both the + /// case of a new tenant and a pre-existing one. + async fn do_initial_shard_scheduling( + &self, + tenant_shard_id: TenantShardId, + initial_generation: Option, + shard_params: &ShardParameters, + config: TenantConfig, + placement_policy: PlacementPolicy, + schedule_context: &mut ScheduleContext, + ) -> InitialShardScheduleOutcome { + let mut locked = self.inner.write().unwrap(); + let (_nodes, tenants, scheduler) = locked.parts_mut(); + + use std::collections::btree_map::Entry; + match tenants.entry(tenant_shard_id) { + Entry::Occupied(mut entry) => { + tracing::info!("Tenant shard {tenant_shard_id} already exists while creating"); + + // TODO: schedule() should take an anti-affinity expression that pushes + // attached and secondary locations (independently) away frorm those + // pageservers also holding a shard for this tenant. + + if let Err(err) = entry.get_mut().schedule(scheduler, schedule_context) { + return InitialShardScheduleOutcome::ShardScheduleError(err); + } + + if let Some(node_id) = entry.get().intent.get_attached() { + let generation = entry + .get() + .generation + .expect("Generation is set when in attached mode"); + InitialShardScheduleOutcome::Scheduled(TenantCreateResponseShard { + shard_id: tenant_shard_id, + node_id: *node_id, + generation: generation.into().unwrap(), + }) + } else { + InitialShardScheduleOutcome::NotScheduled + } + } + Entry::Vacant(entry) => { + let state = entry.insert(TenantShard::new( + tenant_shard_id, + ShardIdentity::from_params(tenant_shard_id.shard_number, shard_params), + placement_policy, + )); + + state.generation = initial_generation; + state.config = config; + if let Err(e) = state.schedule(scheduler, schedule_context) { + return InitialShardScheduleOutcome::ShardScheduleError(e); + } + + // Only include shards in result if we are attaching: the purpose + // of the response is to tell the caller where the shards are attached. + if let Some(node_id) = state.intent.get_attached() { + let generation = state + .generation + .expect("Generation is set when in attached mode"); + InitialShardScheduleOutcome::Scheduled(TenantCreateResponseShard { + shard_id: tenant_shard_id, + node_id: *node_id, + generation: generation.into().unwrap(), + }) + } else { + InitialShardScheduleOutcome::NotScheduled + } + } + } + } + /// Helper for functions that reconcile a number of shards, and would like to do a timeout-bounded /// wait for reconciliation to complete before responding. async fn await_waiters( @@ -3126,6 +3198,57 @@ impl Service { }).await? 
} + pub(crate) async fn tenant_timeline_block_unblock_gc( + &self, + tenant_id: TenantId, + timeline_id: TimelineId, + dir: BlockUnblock, + ) -> Result<(), ApiError> { + let _tenant_lock = trace_shared_lock( + &self.tenant_op_locks, + tenant_id, + TenantOperations::TimelineGcBlockUnblock, + ) + .await; + + self.tenant_remote_mutation(tenant_id, move |targets| async move { + if targets.is_empty() { + return Err(ApiError::NotFound( + anyhow::anyhow!("Tenant not found").into(), + )); + } + + async fn do_one( + tenant_shard_id: TenantShardId, + timeline_id: TimelineId, + node: Node, + jwt: Option, + dir: BlockUnblock, + ) -> Result<(), ApiError> { + let client = PageserverClient::new(node.get_id(), node.base_url(), jwt.as_deref()); + + client + .timeline_block_unblock_gc(tenant_shard_id, timeline_id, dir) + .await + .map_err(|e| passthrough_api_error(&node, e)) + } + + // no shard needs to go first/last; the operation should be idempotent + self.tenant_for_shards(targets, |tenant_shard_id, node| { + futures::FutureExt::boxed(do_one( + tenant_shard_id, + timeline_id, + node, + self.config.jwt_token.clone(), + dir, + )) + }) + .await + }) + .await??; + Ok(()) + } + /// Helper for concurrently calling a pageserver API on a number of shards, such as timeline creation. /// /// On success, the returned vector contains exactly the same number of elements as the input `locations`. @@ -3511,6 +3634,7 @@ impl Service { is_pending_compute_notification: shard.pending_compute_notification, is_splitting: matches!(shard.splitting, SplitState::Splitting), scheduling_policy: *shard.get_scheduling_policy(), + preferred_az_id: shard.preferred_az().map(ToString::to_string), }) } @@ -4114,7 +4238,7 @@ impl Service { let policy = policy.unwrap(); let config = config.unwrap(); - Ok(ShardSplitAction::Split(ShardSplitParams { + Ok(ShardSplitAction::Split(Box::new(ShardSplitParams { old_shard_count, new_shard_count: ShardCount::new(split_req.new_shard_count), new_stripe_size: split_req.new_stripe_size, @@ -4122,13 +4246,13 @@ impl Service { policy, config, shard_ident, - })) + }))) } async fn do_tenant_shard_split( &self, tenant_id: TenantId, - params: ShardSplitParams, + params: Box, ) -> Result<(TenantShardSplitResponse, Vec), ApiError> { // FIXME: we have dropped self.inner lock, and not yet written anything to the database: another // request could occur here, deleting or mutating the tenant. begin_shard_split checks that the @@ -4144,7 +4268,7 @@ impl Service { policy, config, shard_ident, - } = params; + } = *params; // Drop any secondary locations: pageservers do not support splitting these, and in any case the // end-state for a split tenant will usually be to have secondary locations on different nodes. @@ -4214,9 +4338,10 @@ impl Service { config: serde_json::to_string(&config).unwrap(), splitting: SplitState::Splitting, - // Scheduling policies do not carry through to children + // Scheduling policies and preferred AZ do not carry through to children scheduling_policy: serde_json::to_string(&ShardSchedulingPolicy::default()) .unwrap(), + preferred_az_id: None, }); } @@ -4336,6 +4461,47 @@ impl Service { let (response, child_locations, waiters) = self.tenant_shard_split_commit_inmem(tenant_id, new_shard_count, new_stripe_size); + // Now that we have scheduled the child shards, attempt to set their preferred AZ + // to that of the pageserver they've been attached on. 
+ let preferred_azs = { + let locked = self.inner.read().unwrap(); + child_locations + .iter() + .filter_map(|(tid, node_id, _stripe_size)| { + let az_id = locked + .nodes + .get(node_id) + .map(|n| n.get_availability_zone_id().to_string())?; + + Some((*tid, az_id)) + }) + .collect::>() + }; + + let updated = self + .persistence + .set_tenant_shard_preferred_azs(preferred_azs) + .await + .map_err(|err| { + ApiError::InternalServerError(anyhow::anyhow!( + "Failed to persist preferred az ids: {err}" + )) + }); + + match updated { + Ok(updated) => { + let mut locked = self.inner.write().unwrap(); + for (tid, az_id) in updated { + if let Some(shard) = locked.tenants.get_mut(&tid) { + shard.set_preferred_az(az_id); + } + } + } + Err(err) => { + tracing::warn!("Failed to persist preferred AZs after split: {err}"); + } + } + // Send compute notifications for all the new shards let mut failed_notifications = Vec::new(); for (child_id, child_ps, stripe_size) in child_locations { @@ -4810,6 +4976,45 @@ impl Service { )) } + pub(crate) async fn get_node_shards( + &self, + node_id: NodeId, + ) -> Result { + let locked = self.inner.read().unwrap(); + let mut shards = Vec::new(); + for (tid, tenant) in locked.tenants.iter() { + let is_intended_secondary = match ( + tenant.intent.get_attached() == &Some(node_id), + tenant.intent.get_secondary().contains(&node_id), + ) { + (true, true) => { + return Err(ApiError::InternalServerError(anyhow::anyhow!( + "{} attached as primary+secondary on the same node", + tid + ))) + } + (true, false) => Some(false), + (false, true) => Some(true), + (false, false) => None, + }; + let is_observed_secondary = if let Some(ObservedStateLocation { conf: Some(conf) }) = + tenant.observed.locations.get(&node_id) + { + Some(conf.secondary_conf.is_some()) + } else { + None + }; + if is_intended_secondary.is_some() || is_observed_secondary.is_some() { + shards.push(NodeShard { + tenant_shard_id: *tid, + is_intended_secondary, + is_observed_secondary, + }); + } + } + Ok(NodeShardResponse { node_id, shards }) + } + pub(crate) async fn get_leader(&self) -> DatabaseResult> { self.persistence.get_leader().await } @@ -4825,15 +5030,8 @@ impl Service { ) .await; - if register_req.availability_zone_id.is_none() { - tracing::warn!( - "Node {} registering without specific availability zone id", - register_req.node_id - ); - } - enum RegistrationStatus { - Matched(Node), + Matched, Mismatched, New, } @@ -4842,7 +5040,7 @@ impl Service { let locked = self.inner.read().unwrap(); if let Some(node) = locked.nodes.get(®ister_req.node_id) { if node.registration_match(®ister_req) { - RegistrationStatus::Matched(node.clone()) + RegistrationStatus::Matched } else { RegistrationStatus::Mismatched } @@ -4852,41 +5050,12 @@ impl Service { }; match registration_status { - RegistrationStatus::Matched(node) => { + RegistrationStatus::Matched => { tracing::info!( "Node {} re-registered with matching address", register_req.node_id ); - if node.get_availability_zone_id().is_none() { - if let Some(az_id) = register_req.availability_zone_id.clone() { - tracing::info!("Extracting availability zone id from registration request for node {}: {}", - register_req.node_id, az_id); - - // Persist to the database and update in memory state. See comment below - // on ordering. 
- self.persistence - .set_node_availability_zone_id(register_req.node_id, az_id) - .await?; - let node_with_az = Node::new( - register_req.node_id, - register_req.listen_http_addr, - register_req.listen_http_port, - register_req.listen_pg_addr, - register_req.listen_pg_port, - register_req.availability_zone_id, - ); - - let mut locked = self.inner.write().unwrap(); - let mut new_nodes = (*locked.nodes).clone(); - - locked.scheduler.node_upsert(&node_with_az); - new_nodes.insert(register_req.node_id, node_with_az); - - locked.nodes = Arc::new(new_nodes); - } - } - return Ok(()); } RegistrationStatus::Mismatched => { @@ -6533,4 +6702,35 @@ impl Service { ) -> Result<(), DatabaseError> { self.persistence.safekeeper_upsert(record).await } + + pub(crate) async fn update_shards_preferred_azs( + &self, + req: ShardsPreferredAzsRequest, + ) -> Result { + let preferred_azs = req.preferred_az_ids.into_iter().collect::>(); + let updated = self + .persistence + .set_tenant_shard_preferred_azs(preferred_azs) + .await + .map_err(|err| { + ApiError::InternalServerError(anyhow::anyhow!( + "Failed to persist preferred AZs: {err}" + )) + })?; + + let mut updated_in_mem_and_db = Vec::default(); + + let mut locked = self.inner.write().unwrap(); + for (tid, az_id) in updated { + let shard = locked.tenants.get_mut(&tid); + if let Some(shard) = shard { + shard.set_preferred_az(az_id); + updated_in_mem_and_db.push(tid); + } + } + + Ok(ShardsPreferredAzsResponse { + updated: updated_in_mem_and_db, + }) + } } diff --git a/storage_controller/src/tenant_shard.rs b/storage_controller/src/tenant_shard.rs index 30723a3b36..cdb0633e2b 100644 --- a/storage_controller/src/tenant_shard.rs +++ b/storage_controller/src/tenant_shard.rs @@ -140,6 +140,10 @@ pub(crate) struct TenantShard { // Support/debug tool: if something is going wrong or flapping with scheduling, this may // be set to a non-active state to avoid making changes while the issue is fixed. scheduling_policy: ShardSchedulingPolicy, + + // We should attempt to schedule this shard in the provided AZ to + // decrease chances of cross-AZ compute. + preferred_az_id: Option, } #[derive(Default, Clone, Debug, Serialize)] @@ -463,6 +467,7 @@ impl TenantShard { last_error: Arc::default(), pending_compute_notification: false, scheduling_policy: ShardSchedulingPolicy::default(), + preferred_az_id: None, } } @@ -1297,6 +1302,7 @@ impl TenantShard { pending_compute_notification: false, delayed_reconcile: false, scheduling_policy: serde_json::from_str(&tsp.scheduling_policy).unwrap(), + preferred_az_id: tsp.preferred_az_id, }) } @@ -1312,8 +1318,17 @@ impl TenantShard { config: serde_json::to_string(&self.config).unwrap(), splitting: SplitState::default(), scheduling_policy: serde_json::to_string(&self.scheduling_policy).unwrap(), + preferred_az_id: self.preferred_az_id.clone(), } } + + pub(crate) fn preferred_az(&self) -> Option<&str> { + self.preferred_az_id.as_deref() + } + + pub(crate) fn set_preferred_az(&mut self, preferred_az_id: String) { + self.preferred_az_id = Some(preferred_az_id); + } } #[cfg(test)] diff --git a/storage_scrubber/src/garbage.rs b/storage_scrubber/src/garbage.rs index 3e22960f8d..d53611ed6e 100644 --- a/storage_scrubber/src/garbage.rs +++ b/storage_scrubber/src/garbage.rs @@ -1,6 +1,7 @@ -//! Functionality for finding and purging garbage, as in "garbage collection". Garbage means -//! S3 objects which are either not referenced by any metadata, or are referenced by a -//! control plane tenant/timeline in a deleted state. +//! 
Functionality for finding and purging garbage, as in "garbage collection". +//! +//! Garbage means S3 objects which are either not referenced by any metadata, +//! or are referenced by a control plane tenant/timeline in a deleted state. use std::{ collections::{HashMap, HashSet}, diff --git a/storage_scrubber/src/metadata_stream.rs b/storage_scrubber/src/metadata_stream.rs index 10d77937f1..f896cff2d5 100644 --- a/storage_scrubber/src/metadata_stream.rs +++ b/storage_scrubber/src/metadata_stream.rs @@ -74,7 +74,9 @@ pub async fn stream_tenant_shards<'a>( } /// Given a `TenantShardId`, output a stream of the timelines within that tenant, discovered -/// using a listing. The listing is done before the stream is built, so that this +/// using a listing. +/// +/// The listing is done before the stream is built, so that this /// function can be used to generate concurrency on a stream using buffer_unordered. pub async fn stream_tenant_timelines<'a>( remote_client: &'a GenericRemoteStorage, diff --git a/storage_scrubber/src/pageserver_physical_gc.rs b/storage_scrubber/src/pageserver_physical_gc.rs index 88681e38c2..c96d9cad3b 100644 --- a/storage_scrubber/src/pageserver_physical_gc.rs +++ b/storage_scrubber/src/pageserver_physical_gc.rs @@ -440,9 +440,10 @@ async fn gc_ancestor( Ok(()) } -/// Physical garbage collection: removing unused S3 objects. This is distinct from the garbage collection -/// done inside the pageserver, which operates at a higher level (keys, layers). This type of garbage collection -/// is about removing: +/// Physical garbage collection: removing unused S3 objects. +/// +/// This is distinct from the garbage collection done inside the pageserver, which operates at a higher level +/// (keys, layers). This type of garbage collection is about removing: /// - Objects that were uploaded but never referenced in the remote index (e.g. because of a shutdown between /// uploading a layer and uploading an index) /// - Index objects from historic generations diff --git a/test_runner/README.md b/test_runner/README.md index 73aa29d4bb..d754e60d17 100644 --- a/test_runner/README.md +++ b/test_runner/README.md @@ -18,8 +18,7 @@ Prerequisites: Regression tests are in the 'regress' directory. They can be run in parallel to minimize total runtime. Most regression test sets up their -environment with its own pageservers and safekeepers (but see -`TEST_SHARED_FIXTURES`). +environment with its own pageservers and safekeepers. 'pg_clients' contains tests for connecting with various client libraries. Each client test uses a Dockerfile that pulls an image that @@ -74,7 +73,6 @@ This is used to construct full path to the postgres binaries. Format is 2-digit major version nubmer, i.e. `DEFAULT_PG_VERSION=16` `TEST_OUTPUT`: Set the directory where test state and test output files should go. -`TEST_SHARED_FIXTURES`: Try to re-use a single pageserver for all the tests. `RUST_LOG`: logging configuration to pass into Neon CLI Useful parameters and commands: @@ -259,11 +257,8 @@ compute Postgres nodes. The connections between them can be configured to use JW authentication tokens, and some other configuration options can be tweaked too. The easiest way to get access to a Neon Environment is by using the `neon_simple_env` -fixture. The 'simple' env may be shared across multiple tests, so don't shut down the nodes -or make other destructive changes in that environment. Also don't assume that -there are no tenants or branches or data in the cluster. 
For convenience, there is a -branch called `empty`, though. The convention is to create a test-specific branch of -that and load any test data there, instead of the 'main' branch. +fixture. For convenience, there is a branch called `main` in environments created with +'neon_simple_env', ready to be used in the test. For more complicated cases, you can build a custom Neon Environment, with the `neon_env` fixture: diff --git a/test_runner/fixtures/common_types.py b/test_runner/fixtures/common_types.py index 8eda19d1e2..064a678c96 100644 --- a/test_runner/fixtures/common_types.py +++ b/test_runner/fixtures/common_types.py @@ -140,6 +140,14 @@ class TenantId(Id): return self.id.hex() +class NodeId(Id): + def __repr__(self) -> str: + return f'`NodeId("{self.id.hex()}")' + + def __str__(self) -> str: + return self.id.hex() + + class TimelineId(Id): def __repr__(self) -> str: return f'TimelineId("{self.id.hex()}")' diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 2df45a7e0e..60887b9aed 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -57,12 +57,11 @@ from _pytest.fixtures import FixtureRequest from psycopg2.extensions import connection as PgConnection from psycopg2.extensions import cursor as PgCursor from psycopg2.extensions import make_dsn, parse_dsn -from typing_extensions import Literal from urllib3.util.retry import Retry from fixtures import overlayfs from fixtures.broker import NeonBroker -from fixtures.common_types import Lsn, TenantId, TenantShardId, TimelineId +from fixtures.common_types import Lsn, NodeId, TenantId, TenantShardId, TimelineId from fixtures.endpoint.http import EndpointHttpClient from fixtures.log_helper import log from fixtures.metrics import Metrics, MetricsGetter, parse_metrics @@ -221,33 +220,6 @@ def neon_api(neon_api_key: str, neon_api_base_url: str) -> NeonAPI: return NeonAPI(neon_api_key, neon_api_base_url) -def shareable_scope(fixture_name: str, config: Config) -> Literal["session", "function"]: - """Return either session of function scope, depending on TEST_SHARED_FIXTURES envvar. - - This function can be used as a scope like this: - @pytest.fixture(scope=shareable_scope) - def myfixture(...) - ... 
- """ - scope: Literal["session", "function"] - - if os.environ.get("TEST_SHARED_FIXTURES") is None: - # Create the environment in the per-test output directory - scope = "function" - elif ( - os.environ.get("BUILD_TYPE") is not None - and os.environ.get("DEFAULT_PG_VERSION") is not None - ): - scope = "session" - else: - pytest.fail( - "Shared environment(TEST_SHARED_FIXTURES) requires BUILD_TYPE and DEFAULT_PG_VERSION to be set", - pytrace=False, - ) - - return scope - - @pytest.fixture(scope="session") def worker_port_num(): return (32768 - BASE_PORT) // int(os.environ.get("PYTEST_XDIST_WORKER_COUNT", "1")) @@ -758,6 +730,9 @@ class NeonEnvBuilder: patch_script = "" for ps in self.env.pageservers: patch_script += f"UPDATE nodes SET listen_http_port={ps.service_port.http}, listen_pg_port={ps.service_port.pg} WHERE node_id = '{ps.id}';" + # This is a temporary to get the backward compat test happy + # since the compat snapshot was generated with an older version of neon local + patch_script += f"UPDATE nodes SET availability_zone_id='{ps.az_id}' WHERE node_id = '{ps.id}' AND availability_zone_id IS NULL;" patch_script_path.write_text(patch_script) # Update the config with info about tenants and timelines @@ -1428,8 +1403,8 @@ class NeonEnv: return "ep-" + str(self.endpoint_counter) -@pytest.fixture(scope=shareable_scope) -def _shared_simple_env( +@pytest.fixture(scope="function") +def neon_simple_env( request: FixtureRequest, pytestconfig: Config, port_distributor: PortDistributor, @@ -1447,19 +1422,13 @@ def _shared_simple_env( pageserver_io_buffer_alignment: Optional[int], ) -> Iterator[NeonEnv]: """ - # Internal fixture backing the `neon_simple_env` fixture. If TEST_SHARED_FIXTURES - is set, this is shared by all tests using `neon_simple_env`. + Simple Neon environment, with no authentication and no safekeepers. This fixture will use RemoteStorageKind.LOCAL_FS with pageserver. """ - if os.environ.get("TEST_SHARED_FIXTURES") is None: - # Create the environment in the per-test output directory - repo_dir = get_test_repo_dir(request, top_output_dir) - else: - # We're running shared fixtures. Share a single directory. - repo_dir = top_output_dir / "shared_repo" - shutil.rmtree(repo_dir, ignore_errors=True) + # Create the environment in the per-test output directory + repo_dir = get_test_repo_dir(request, top_output_dir) with NeonEnvBuilder( top_output_dir=top_output_dir, @@ -1481,27 +1450,9 @@ def _shared_simple_env( ) as builder: env = builder.init_start() - # For convenience in tests, create a branch from the freshly-initialized cluster. - env.neon_cli.create_branch("empty", ancestor_branch_name=DEFAULT_BRANCH_NAME) - yield env -@pytest.fixture(scope="function") -def neon_simple_env(_shared_simple_env: NeonEnv) -> Iterator[NeonEnv]: - """ - Simple Neon environment, with no authentication and no safekeepers. - - If TEST_SHARED_FIXTURES environment variable is set, we reuse the same - environment for all tests that use 'neon_simple_env', keeping the - page server and safekeepers running. Any compute nodes are stopped after - each the test, however. 
- """ - yield _shared_simple_env - - _shared_simple_env.endpoints.stop_all() - - @pytest.fixture(scope="function") def neon_env_builder( pytestconfig: Config, @@ -2557,7 +2508,7 @@ class NeonStorageController(MetricsGetter, LogUtils): def tenant_describe(self, tenant_id: TenantId): """ - :return: list of {"shard_id": "", "node_id": int, "listen_pg_addr": str, "listen_pg_port": int, "listen_http_addr: str, "listen_http_port: int} + :return: list of {"shard_id": "", "node_id": int, "listen_pg_addr": str, "listen_pg_port": int, "listen_http_addr: str, "listen_http_port: int, preferred_az_id: str} """ response = self.request( "GET", @@ -2567,6 +2518,30 @@ class NeonStorageController(MetricsGetter, LogUtils): response.raise_for_status() return response.json() + def nodes(self): + """ + :return: list of {"id": ""} + """ + response = self.request( + "GET", + f"{self.api}/control/v1/node", + headers=self.headers(TokenScope.ADMIN), + ) + response.raise_for_status() + return response.json() + + def node_shards(self, node_id: NodeId): + """ + :return: list of {"shard_id": "", "is_secondary": bool} + """ + response = self.request( + "GET", + f"{self.api}/control/v1/node/{node_id}/shards", + headers=self.headers(TokenScope.ADMIN), + ) + response.raise_for_status() + return response.json() + def tenant_shard_split( self, tenant_id: TenantId, shard_count: int, shard_stripe_size: Optional[int] = None ) -> list[TenantShardId]: @@ -2883,6 +2858,17 @@ class NeonStorageController(MetricsGetter, LogUtils): return None raise e + def set_preferred_azs(self, preferred_azs: dict[TenantShardId, str]) -> list[TenantShardId]: + response = self.request( + "PUT", + f"{self.api}/control/v1/preferred_azs", + headers=self.headers(TokenScope.ADMIN), + json={str(tid): az for tid, az in preferred_azs.items()}, + ) + + response.raise_for_status() + return [TenantShardId.parse(tid) for tid in response.json()["updated"]] + def __enter__(self) -> "NeonStorageController": return self @@ -4860,14 +4846,7 @@ SMALL_DB_FILE_NAME_REGEX: re.Pattern = re.compile( # type: ignore[type-arg] # This is autouse, so the test output directory always gets created, even -# if a test doesn't put anything there. It also solves a problem with the -# neon_simple_env fixture: if TEST_SHARED_FIXTURES is not set, it -# creates the repo in the test output directory. But it cannot depend on -# 'test_output_dir' fixture, because when TEST_SHARED_FIXTURES is not set, -# it has 'session' scope and cannot access fixtures with 'function' -# scope. So it uses the get_test_output_dir() function to get the path, and -# this fixture ensures that the directory exists. That works because -# 'autouse' fixtures are run before other fixtures. +# if a test doesn't put anything there. 
# # NB: we request the overlay dir fixture so the fixture does its cleanups @pytest.fixture(scope="function", autouse=True) diff --git a/test_runner/fixtures/safekeeper/http.py b/test_runner/fixtures/safekeeper/http.py index 9bf03554e7..96c84d1616 100644 --- a/test_runner/fixtures/safekeeper/http.py +++ b/test_runner/fixtures/safekeeper/http.py @@ -50,6 +50,19 @@ class SafekeeperMetrics(Metrics): ).value +@dataclass +class TermBumpResponse: + previous_term: int + current_term: int + + @classmethod + def from_json(cls, d: Dict[str, Any]) -> "TermBumpResponse": + return TermBumpResponse( + previous_term=d["previous_term"], + current_term=d["current_term"], + ) + + class SafekeeperHttpClient(requests.Session, MetricsGetter): HTTPError = requests.HTTPError @@ -252,6 +265,22 @@ class SafekeeperHttpClient(requests.Session, MetricsGetter): res.raise_for_status() return res.json() + def term_bump( + self, + tenant_id: TenantId, + timeline_id: TimelineId, + term: Optional[int], + ) -> TermBumpResponse: + body = {} + if term is not None: + body["term"] = term + res = self.post( + f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/term_bump", + json=body, + ) + res.raise_for_status() + return TermBumpResponse.from_json(res.json()) + def record_safekeeper_info(self, tenant_id: TenantId, timeline_id: TimelineId, body): res = self.post( f"http://localhost:{self.port}/v1/record_safekeeper_info/{tenant_id}/{timeline_id}", diff --git a/test_runner/performance/test_logical_replication.py b/test_runner/performance/test_logical_replication.py index 077f73ac06..29a0380524 100644 --- a/test_runner/performance/test_logical_replication.py +++ b/test_runner/performance/test_logical_replication.py @@ -22,10 +22,8 @@ if TYPE_CHECKING: def test_logical_replication(neon_simple_env: NeonEnv, pg_bin: PgBin, vanilla_pg): env = neon_simple_env - env.neon_cli.create_branch("test_logical_replication", "empty") - endpoint = env.endpoints.create_start("test_logical_replication") + endpoint = env.endpoints.create_start("main") - log.info("postgres is running on 'test_logical_replication' branch") pg_bin.run_capture(["pgbench", "-i", "-s10", endpoint.connstr()]) endpoint.safe_psql("create publication pub1 for table pgbench_accounts, pgbench_history") diff --git a/test_runner/regress/test_basebackup_error.py b/test_runner/regress/test_basebackup_error.py index 170b494884..13c080ea0e 100644 --- a/test_runner/regress/test_basebackup_error.py +++ b/test_runner/regress/test_basebackup_error.py @@ -8,11 +8,10 @@ from fixtures.neon_fixtures import NeonEnv # def test_basebackup_error(neon_simple_env: NeonEnv): env = neon_simple_env - env.neon_cli.create_branch("test_basebackup_error", "empty") pageserver_http = env.pageserver.http_client() # Introduce failpoint pageserver_http.configure_failpoints(("basebackup-before-control-file", "return")) with pytest.raises(Exception, match="basebackup-before-control-file"): - env.endpoints.create_start("test_basebackup_error") + env.endpoints.create_start("main") diff --git a/test_runner/regress/test_clog_truncate.py b/test_runner/regress/test_clog_truncate.py index 26e6e336b9..6e4880841a 100644 --- a/test_runner/regress/test_clog_truncate.py +++ b/test_runner/regress/test_clog_truncate.py @@ -11,7 +11,6 @@ from fixtures.utils import query_scalar # def test_clog_truncate(neon_simple_env: NeonEnv): env = neon_simple_env - env.neon_cli.create_branch("test_clog_truncate", "empty") # set aggressive autovacuum to make sure that truncation will happen config = [ @@ -24,7 +23,7 
@@ def test_clog_truncate(neon_simple_env: NeonEnv): "autovacuum_freeze_max_age=100000", ] - endpoint = env.endpoints.create_start("test_clog_truncate", config_lines=config) + endpoint = env.endpoints.create_start("main", config_lines=config) # Install extension containing function needed for test endpoint.safe_psql("CREATE EXTENSION neon_test_utils") @@ -58,7 +57,7 @@ def test_clog_truncate(neon_simple_env: NeonEnv): # create new branch after clog truncation and start a compute node on it log.info(f"create branch at lsn_after_truncation {lsn_after_truncation}") env.neon_cli.create_branch( - "test_clog_truncate_new", "test_clog_truncate", ancestor_start_lsn=lsn_after_truncation + "test_clog_truncate_new", "main", ancestor_start_lsn=lsn_after_truncation ) endpoint2 = env.endpoints.create_start("test_clog_truncate_new") diff --git a/test_runner/regress/test_compute_catalog.py b/test_runner/regress/test_compute_catalog.py index dd36190fcd..8b8c970357 100644 --- a/test_runner/regress/test_compute_catalog.py +++ b/test_runner/regress/test_compute_catalog.py @@ -4,9 +4,8 @@ from fixtures.neon_fixtures import NeonEnv def test_compute_catalog(neon_simple_env: NeonEnv): env = neon_simple_env - env.neon_cli.create_branch("test_config", "empty") - endpoint = env.endpoints.create_start("test_config", config_lines=["log_min_messages=debug1"]) + endpoint = env.endpoints.create_start("main", config_lines=["log_min_messages=debug1"]) client = endpoint.http_client() objects = client.dbs_and_roles() diff --git a/test_runner/regress/test_config.py b/test_runner/regress/test_config.py index 2ef28eb94b..d8ef0b8dbd 100644 --- a/test_runner/regress/test_config.py +++ b/test_runner/regress/test_config.py @@ -9,10 +9,9 @@ from fixtures.neon_fixtures import NeonEnv, NeonEnvBuilder # def test_config(neon_simple_env: NeonEnv): env = neon_simple_env - env.neon_cli.create_branch("test_config", "empty") # change config - endpoint = env.endpoints.create_start("test_config", config_lines=["log_min_messages=debug1"]) + endpoint = env.endpoints.create_start("main", config_lines=["log_min_messages=debug1"]) with closing(endpoint.connect()) as conn: with conn.cursor() as cur: diff --git a/test_runner/regress/test_createdropdb.py b/test_runner/regress/test_createdropdb.py index f741a9fc87..af643f45d7 100644 --- a/test_runner/regress/test_createdropdb.py +++ b/test_runner/regress/test_createdropdb.py @@ -17,9 +17,7 @@ def test_createdb(neon_simple_env: NeonEnv, strategy: str): if env.pg_version == PgVersion.V14 and strategy == "wal_log": pytest.skip("wal_log strategy not supported on PostgreSQL 14") - env.neon_cli.create_branch("test_createdb", "empty") - - endpoint = env.endpoints.create_start("test_createdb") + endpoint = env.endpoints.create_start("main") with endpoint.cursor() as cur: # Cause a 'relmapper' change in the original branch @@ -33,7 +31,7 @@ def test_createdb(neon_simple_env: NeonEnv, strategy: str): lsn = query_scalar(cur, "SELECT pg_current_wal_insert_lsn()") # Create a branch - env.neon_cli.create_branch("test_createdb2", "test_createdb", ancestor_start_lsn=lsn) + env.neon_cli.create_branch("test_createdb2", "main", ancestor_start_lsn=lsn) endpoint2 = env.endpoints.create_start("test_createdb2") # Test that you can connect to the new database on both branches @@ -62,8 +60,7 @@ def test_createdb(neon_simple_env: NeonEnv, strategy: str): # def test_dropdb(neon_simple_env: NeonEnv, test_output_dir): env = neon_simple_env - env.neon_cli.create_branch("test_dropdb", "empty") - endpoint = 
env.endpoints.create_start("test_dropdb") + endpoint = env.endpoints.create_start("main") with endpoint.cursor() as cur: cur.execute("CREATE DATABASE foodb") @@ -80,14 +77,10 @@ def test_dropdb(neon_simple_env: NeonEnv, test_output_dir): lsn_after_drop = query_scalar(cur, "SELECT pg_current_wal_insert_lsn()") # Create two branches before and after database drop. - env.neon_cli.create_branch( - "test_before_dropdb", "test_dropdb", ancestor_start_lsn=lsn_before_drop - ) + env.neon_cli.create_branch("test_before_dropdb", "main", ancestor_start_lsn=lsn_before_drop) endpoint_before = env.endpoints.create_start("test_before_dropdb") - env.neon_cli.create_branch( - "test_after_dropdb", "test_dropdb", ancestor_start_lsn=lsn_after_drop - ) + env.neon_cli.create_branch("test_after_dropdb", "main", ancestor_start_lsn=lsn_after_drop) endpoint_after = env.endpoints.create_start("test_after_dropdb") # Test that database exists on the branch before drop diff --git a/test_runner/regress/test_createuser.py b/test_runner/regress/test_createuser.py index 17d9824f52..d6f138e126 100644 --- a/test_runner/regress/test_createuser.py +++ b/test_runner/regress/test_createuser.py @@ -7,8 +7,7 @@ from fixtures.utils import query_scalar # def test_createuser(neon_simple_env: NeonEnv): env = neon_simple_env - env.neon_cli.create_branch("test_createuser", "empty") - endpoint = env.endpoints.create_start("test_createuser") + endpoint = env.endpoints.create_start("main") with endpoint.cursor() as cur: # Cause a 'relmapper' change in the original branch @@ -19,7 +18,7 @@ def test_createuser(neon_simple_env: NeonEnv): lsn = query_scalar(cur, "SELECT pg_current_wal_insert_lsn()") # Create a branch - env.neon_cli.create_branch("test_createuser2", "test_createuser", ancestor_start_lsn=lsn) + env.neon_cli.create_branch("test_createuser2", "main", ancestor_start_lsn=lsn) endpoint2 = env.endpoints.create_start("test_createuser2") # Test that you can connect to new branch as a new user diff --git a/test_runner/regress/test_ddl_forwarding.py b/test_runner/regress/test_ddl_forwarding.py index 50da673d87..65f310c27a 100644 --- a/test_runner/regress/test_ddl_forwarding.py +++ b/test_runner/regress/test_ddl_forwarding.py @@ -290,9 +290,8 @@ def assert_db_connlimit(endpoint: Any, db_name: str, connlimit: int, msg: str): # Here we test the latter. 
The first one is tested in test_ddl_forwarding def test_ddl_forwarding_invalid_db(neon_simple_env: NeonEnv): env = neon_simple_env - env.neon_cli.create_branch("test_ddl_forwarding_invalid_db", "empty") endpoint = env.endpoints.create_start( - "test_ddl_forwarding_invalid_db", + "main", # Some non-existent url config_lines=["neon.console_url=http://localhost:9999/unknown/api/v0/roles_and_databases"], ) diff --git a/test_runner/regress/test_explain_with_lfc_stats.py b/test_runner/regress/test_explain_with_lfc_stats.py index 5231dedcda..0217c9ac7b 100644 --- a/test_runner/regress/test_explain_with_lfc_stats.py +++ b/test_runner/regress/test_explain_with_lfc_stats.py @@ -10,11 +10,9 @@ def test_explain_with_lfc_stats(neon_simple_env: NeonEnv): cache_dir = Path(env.repo_dir) / "file_cache" cache_dir.mkdir(exist_ok=True) - branchname = "test_explain_with_lfc_stats" - env.neon_cli.create_branch(branchname, "empty") - log.info(f"Creating endopint with 1MB shared_buffers and 64 MB LFC for branch {branchname}") + log.info("Creating endpoint with 1MB shared_buffers and 64 MB LFC") endpoint = env.endpoints.create_start( - branchname, + "main", config_lines=[ "shared_buffers='1MB'", f"neon.file_cache_path='{cache_dir}/file.cache'", diff --git a/test_runner/regress/test_lfc_resize.py b/test_runner/regress/test_lfc_resize.py index 1b2c7f808f..cb0b30d9c6 100644 --- a/test_runner/regress/test_lfc_resize.py +++ b/test_runner/regress/test_lfc_resize.py @@ -16,9 +16,8 @@ from fixtures.neon_fixtures import NeonEnv, PgBin @pytest.mark.timeout(600) def test_lfc_resize(neon_simple_env: NeonEnv, pg_bin: PgBin): env = neon_simple_env - env.neon_cli.create_branch("test_lfc_resize", "empty") endpoint = env.endpoints.create_start( - "test_lfc_resize", + "main", config_lines=[ "neon.file_cache_path='file.cache'", "neon.max_file_cache_size=512MB", diff --git a/test_runner/regress/test_lfc_working_set_approximation.py b/test_runner/regress/test_lfc_working_set_approximation.py index 4c53e4e2fd..4a3a949d1a 100644 --- a/test_runner/regress/test_lfc_working_set_approximation.py +++ b/test_runner/regress/test_lfc_working_set_approximation.py @@ -12,11 +12,9 @@ def test_lfc_working_set_approximation(neon_simple_env: NeonEnv): cache_dir = Path(env.repo_dir) / "file_cache" cache_dir.mkdir(exist_ok=True) - branchname = "test_approximate_working_set_size" - env.neon_cli.create_branch(branchname, "empty") - log.info(f"Creating endopint with 1MB shared_buffers and 64 MB LFC for branch {branchname}") + log.info("Creating endpoint with 1MB shared_buffers and 64 MB LFC") endpoint = env.endpoints.create_start( - branchname, + "main", config_lines=[ "shared_buffers='1MB'", f"neon.file_cache_path='{cache_dir}/file.cache'", diff --git a/test_runner/regress/test_local_file_cache.py b/test_runner/regress/test_local_file_cache.py index 3c404c3b23..9c38200937 100644 --- a/test_runner/regress/test_local_file_cache.py +++ b/test_runner/regress/test_local_file_cache.py @@ -5,7 +5,7 @@ import threading import time from typing import List -from fixtures.neon_fixtures import DEFAULT_BRANCH_NAME, NeonEnvBuilder +from fixtures.neon_fixtures import NeonEnvBuilder from fixtures.utils import query_scalar @@ -15,11 +15,8 @@ def test_local_file_cache_unlink(neon_env_builder: NeonEnvBuilder): cache_dir = os.path.join(env.repo_dir, "file_cache") os.mkdir(cache_dir) - env.neon_cli.create_branch("empty", ancestor_branch_name=DEFAULT_BRANCH_NAME) - env.neon_cli.create_branch("test_local_file_cache_unlink", "empty") - endpoint = env.endpoints.create_start( - 
"test_local_file_cache_unlink", + "main", config_lines=[ "shared_buffers='1MB'", f"neon.file_cache_path='{cache_dir}/file.cache'", diff --git a/test_runner/regress/test_logical_replication.py b/test_runner/regress/test_logical_replication.py index f83a833dda..15a3719e0b 100644 --- a/test_runner/regress/test_logical_replication.py +++ b/test_runner/regress/test_logical_replication.py @@ -36,10 +36,8 @@ def test_logical_replication(neon_simple_env: NeonEnv, vanilla_pg): env = neon_simple_env tenant_id = env.initial_tenant - timeline_id = env.neon_cli.create_branch("test_logical_replication", "empty") - endpoint = env.endpoints.create_start( - "test_logical_replication", config_lines=["log_statement=all"] - ) + timeline_id = env.initial_timeline + endpoint = env.endpoints.create_start("main", config_lines=["log_statement=all"]) pg_conn = endpoint.connect() cur = pg_conn.cursor() @@ -185,10 +183,9 @@ def test_obsolete_slot_drop(neon_simple_env: NeonEnv, vanilla_pg): env = neon_simple_env - env.neon_cli.create_branch("test_logical_replication", "empty") # set low neon.logical_replication_max_snap_files endpoint = env.endpoints.create_start( - "test_logical_replication", + "main", config_lines=["log_statement=all", "neon.logical_replication_max_snap_files=1"], ) @@ -472,7 +469,7 @@ def test_slots_and_branching(neon_simple_env: NeonEnv): def test_replication_shutdown(neon_simple_env: NeonEnv): # Ensure Postgres can exit without stuck when a replication job is active + neon extension installed env = neon_simple_env - env.neon_cli.create_branch("test_replication_shutdown_publisher", "empty") + env.neon_cli.create_branch("test_replication_shutdown_publisher", "main") pub = env.endpoints.create("test_replication_shutdown_publisher") env.neon_cli.create_branch("test_replication_shutdown_subscriber") diff --git a/test_runner/regress/test_migrations.py b/test_runner/regress/test_migrations.py index bdc5ca907e..e88e56d030 100644 --- a/test_runner/regress/test_migrations.py +++ b/test_runner/regress/test_migrations.py @@ -9,9 +9,8 @@ if TYPE_CHECKING: def test_migrations(neon_simple_env: NeonEnv): env = neon_simple_env - env.neon_cli.create_branch("test_migrations", "empty") - endpoint = env.endpoints.create("test_migrations") + endpoint = env.endpoints.create("main") endpoint.respec(skip_pg_catalog_updates=False) endpoint.start() diff --git a/test_runner/regress/test_multixact.py b/test_runner/regress/test_multixact.py index 88f7a5db59..8a00f8835f 100644 --- a/test_runner/regress/test_multixact.py +++ b/test_runner/regress/test_multixact.py @@ -14,8 +14,7 @@ from fixtures.utils import query_scalar # def test_multixact(neon_simple_env: NeonEnv, test_output_dir): env = neon_simple_env - env.neon_cli.create_branch("test_multixact", "empty") - endpoint = env.endpoints.create_start("test_multixact") + endpoint = env.endpoints.create_start("main") cur = endpoint.connect().cursor() cur.execute( @@ -73,7 +72,9 @@ def test_multixact(neon_simple_env: NeonEnv, test_output_dir): assert int(next_multixact_id) > int(next_multixact_id_old) # Branch at this point - env.neon_cli.create_branch("test_multixact_new", "test_multixact", ancestor_start_lsn=lsn) + env.neon_cli.create_branch( + "test_multixact_new", ancestor_branch_name="main", ancestor_start_lsn=lsn + ) endpoint_new = env.endpoints.create_start("test_multixact_new") next_multixact_id_new = endpoint_new.safe_psql( diff --git a/test_runner/regress/test_neon_superuser.py b/test_runner/regress/test_neon_superuser.py index fd31df84da..7825ec772c 100644 --- 
a/test_runner/regress/test_neon_superuser.py +++ b/test_runner/regress/test_neon_superuser.py @@ -6,7 +6,7 @@ from fixtures.utils import wait_until def test_neon_superuser(neon_simple_env: NeonEnv, pg_version: PgVersion): env = neon_simple_env - env.neon_cli.create_branch("test_neon_superuser_publisher", "empty") + env.neon_cli.create_branch("test_neon_superuser_publisher", "main") pub = env.endpoints.create("test_neon_superuser_publisher") env.neon_cli.create_branch("test_neon_superuser_subscriber") diff --git a/test_runner/regress/test_parallel_copy.py b/test_runner/regress/test_parallel_copy.py index b33e387a66..a5037e8694 100644 --- a/test_runner/regress/test_parallel_copy.py +++ b/test_runner/regress/test_parallel_copy.py @@ -41,8 +41,7 @@ async def parallel_load_same_table(endpoint: Endpoint, n_parallel: int): # Load data into one table with COPY TO from 5 parallel connections def test_parallel_copy(neon_simple_env: NeonEnv, n_parallel=5): env = neon_simple_env - env.neon_cli.create_branch("test_parallel_copy", "empty") - endpoint = env.endpoints.create_start("test_parallel_copy") + endpoint = env.endpoints.create_start("main") # Create test table conn = endpoint.connect() diff --git a/test_runner/regress/test_pg_query_cancellation.py b/test_runner/regress/test_pg_query_cancellation.py index bad2e5865e..c6b4eff516 100644 --- a/test_runner/regress/test_pg_query_cancellation.py +++ b/test_runner/regress/test_pg_query_cancellation.py @@ -42,11 +42,9 @@ def test_cancellations(neon_simple_env: NeonEnv): ps_http = ps.http_client() ps_http.is_testing_enabled_or_skip() - env.neon_cli.create_branch("test_config", "empty") - # We don't want to have any racy behaviour with autovacuum IOs ep = env.endpoints.create_start( - "test_config", + "main", config_lines=[ "autovacuum = off", "shared_buffers = 128MB", diff --git a/test_runner/regress/test_pg_waldump.py b/test_runner/regress/test_pg_waldump.py index 8e80efd9ba..1990d69b6a 100644 --- a/test_runner/regress/test_pg_waldump.py +++ b/test_runner/regress/test_pg_waldump.py @@ -22,8 +22,8 @@ def check_wal_segment(pg_waldump_path: str, segment_path: str, test_output_dir): def test_pg_waldump(neon_simple_env: NeonEnv, test_output_dir, pg_bin: PgBin): env = neon_simple_env tenant_id = env.initial_tenant - timeline_id = env.neon_cli.create_branch("test_pg_waldump", "empty") - endpoint = env.endpoints.create_start("test_pg_waldump") + timeline_id = env.initial_timeline + endpoint = env.endpoints.create_start("main") cur = endpoint.connect().cursor() cur.execute( diff --git a/test_runner/regress/test_read_validation.py b/test_runner/regress/test_read_validation.py index 1ac881553f..78798c5abf 100644 --- a/test_runner/regress/test_read_validation.py +++ b/test_runner/regress/test_read_validation.py @@ -15,12 +15,8 @@ extensions = ["pageinspect", "neon_test_utils", "pg_buffercache"] # def test_read_validation(neon_simple_env: NeonEnv): env = neon_simple_env - env.neon_cli.create_branch("test_read_validation", "empty") - - endpoint = env.endpoints.create_start( - "test_read_validation", - ) + endpoint = env.endpoints.create_start("main") with closing(endpoint.connect()) as con: with con.cursor() as c: for e in extensions: @@ -131,13 +127,9 @@ def test_read_validation(neon_simple_env: NeonEnv): def test_read_validation_neg(neon_simple_env: NeonEnv): env = neon_simple_env - env.neon_cli.create_branch("test_read_validation_neg", "empty") - env.pageserver.allowed_errors.append(".*invalid LSN\\(0\\) in request.*") - endpoint = env.endpoints.create_start( - 
"test_read_validation_neg", - ) + endpoint = env.endpoints.create_start("main") with closing(endpoint.connect()) as con: with con.cursor() as c: diff --git a/test_runner/regress/test_readonly_node.py b/test_runner/regress/test_readonly_node.py index 368f60127e..347fc3a04d 100644 --- a/test_runner/regress/test_readonly_node.py +++ b/test_runner/regress/test_readonly_node.py @@ -22,8 +22,7 @@ from fixtures.utils import query_scalar # def test_readonly_node(neon_simple_env: NeonEnv): env = neon_simple_env - env.neon_cli.create_branch("test_readonly_node", "empty") - endpoint_main = env.endpoints.create_start("test_readonly_node") + endpoint_main = env.endpoints.create_start("main") env.pageserver.allowed_errors.extend( [ @@ -74,12 +73,12 @@ def test_readonly_node(neon_simple_env: NeonEnv): # Create first read-only node at the point where only 100 rows were inserted endpoint_hundred = env.endpoints.create_start( - branch_name="test_readonly_node", endpoint_id="ep-readonly_node_hundred", lsn=lsn_a + branch_name="main", endpoint_id="ep-readonly_node_hundred", lsn=lsn_a ) # And another at the point where 200100 rows were inserted endpoint_more = env.endpoints.create_start( - branch_name="test_readonly_node", endpoint_id="ep-readonly_node_more", lsn=lsn_b + branch_name="main", endpoint_id="ep-readonly_node_more", lsn=lsn_b ) # On the 'hundred' node, we should see only 100 rows @@ -100,7 +99,7 @@ def test_readonly_node(neon_simple_env: NeonEnv): # Check creating a node at segment boundary endpoint = env.endpoints.create_start( - branch_name="test_readonly_node", + branch_name="main", endpoint_id="ep-branch_segment_boundary", lsn=Lsn("0/3000000"), ) @@ -112,7 +111,7 @@ def test_readonly_node(neon_simple_env: NeonEnv): with pytest.raises(Exception, match="invalid basebackup lsn"): # compute node startup with invalid LSN should fail env.endpoints.create_start( - branch_name="test_readonly_node", + branch_name="main", endpoint_id="ep-readonly_node_preinitdb", lsn=Lsn("0/42"), ) @@ -218,14 +217,10 @@ def test_readonly_node_gc(neon_env_builder: NeonEnvBuilder): # Similar test, but with more data, and we force checkpoints def test_timetravel(neon_simple_env: NeonEnv): env = neon_simple_env - pageserver_http_client = env.pageserver.http_client() - env.neon_cli.create_branch("test_timetravel", "empty") - endpoint = env.endpoints.create_start("test_timetravel") - + tenant_id = env.initial_tenant + timeline_id = env.initial_timeline client = env.pageserver.http_client() - - tenant_id = endpoint.safe_psql("show neon.tenant_id")[0][0] - timeline_id = endpoint.safe_psql("show neon.timeline_id")[0][0] + endpoint = env.endpoints.create_start("main") lsns = [] @@ -249,7 +244,7 @@ def test_timetravel(neon_simple_env: NeonEnv): wait_for_last_record_lsn(client, tenant_id, timeline_id, current_lsn) # run checkpoint manually to force a new layer file - pageserver_http_client.timeline_checkpoint(tenant_id, timeline_id) + client.timeline_checkpoint(tenant_id, timeline_id) ##### Restart pageserver env.endpoints.stop_all() @@ -258,7 +253,7 @@ def test_timetravel(neon_simple_env: NeonEnv): for i, lsn in lsns: endpoint_old = env.endpoints.create_start( - branch_name="test_timetravel", endpoint_id=f"ep-old_lsn_{i}", lsn=lsn + branch_name="main", endpoint_id=f"ep-old_lsn_{i}", lsn=lsn ) with endpoint_old.cursor() as cur: assert query_scalar(cur, f"select count(*) from testtab where iteration={i}") == 100000 diff --git a/test_runner/regress/test_storage_controller.py b/test_runner/regress/test_storage_controller.py index 
8da42294b0..eea05d7548 100644 --- a/test_runner/regress/test_storage_controller.py +++ b/test_runner/regress/test_storage_controller.py @@ -1552,6 +1552,12 @@ def test_tenant_import(neon_env_builder: NeonEnvBuilder, shard_count, remote_sto literal_shard_count = 1 if shard_count is None else shard_count assert len(describe["shards"]) == literal_shard_count + nodes = env.storage_controller.nodes() + assert len(nodes) == 2 + describe1 = env.storage_controller.node_shards(nodes[0]["id"]) + describe2 = env.storage_controller.node_shards(nodes[1]["id"]) + assert len(describe1["shards"]) + len(describe2["shards"]) == literal_shard_count + # Check the data is still there: this implicitly proves that we recovered generation numbers # properly, for the timeline which was written to after a generation bump. for timeline, branch, expect_rows in [ @@ -2512,3 +2518,55 @@ def eq_safekeeper_records(a: dict[str, Any], b: dict[str, Any]) -> bool: del d[key] return compared[0] == compared[1] + + +@run_only_on_default_postgres("this is like a 'unit test' against storcon db") +def test_shard_preferred_azs(neon_env_builder: NeonEnvBuilder): + def assign_az(ps_cfg): + az = f"az-{ps_cfg['id']}" + ps_cfg["availability_zone"] = az + + neon_env_builder.pageserver_config_override = assign_az + neon_env_builder.num_pageservers = 2 + env = neon_env_builder.init_configs() + env.start() + + tids = [TenantId.generate() for _ in range(0, 3)] + for tid in tids: + env.storage_controller.tenant_create(tid) + + shards = env.storage_controller.tenant_describe(tid)["shards"] + assert len(shards) == 1 + attached_to = shards[0]["node_attached"] + expected_az = env.get_pageserver(attached_to).az_id + + assert shards[0]["preferred_az_id"] == expected_az + + updated = env.storage_controller.set_preferred_azs( + {TenantShardId(tid, 0, 0): "foo" for tid in tids} + ) + + assert set(updated) == set([TenantShardId(tid, 0, 0) for tid in tids]) + + for tid in tids: + shards = env.storage_controller.tenant_describe(tid)["shards"] + assert len(shards) == 1 + assert shards[0]["preferred_az_id"] == "foo" + + # Generate a layer to avoid shard split handling on ps from tripping + # up on debug assert. + timeline_id = TimelineId.generate() + env.neon_cli.create_timeline("bar", tids[0], timeline_id) + + workload = Workload(env, tids[0], timeline_id, branch_name="bar") + workload.init() + workload.write_rows(256) + workload.validate() + + env.storage_controller.tenant_shard_split(tids[0], shard_count=2) + shards = env.storage_controller.tenant_describe(tids[0])["shards"] + assert len(shards) == 2 + for shard in shards: + attached_to = shard["node_attached"] + expected_az = env.get_pageserver(attached_to).az_id + assert shard["preferred_az_id"] == expected_az diff --git a/test_runner/regress/test_subxacts.py b/test_runner/regress/test_subxacts.py index 10cb00c780..82075bd723 100644 --- a/test_runner/regress/test_subxacts.py +++ b/test_runner/regress/test_subxacts.py @@ -9,8 +9,7 @@ from fixtures.neon_fixtures import NeonEnv, check_restored_datadir_content # CLOG. 
def test_subxacts(neon_simple_env: NeonEnv, test_output_dir): env = neon_simple_env - env.neon_cli.create_branch("test_subxacts", "empty") - endpoint = env.endpoints.create_start("test_subxacts") + endpoint = env.endpoints.create_start("main") pg_conn = endpoint.connect() cur = pg_conn.cursor() diff --git a/test_runner/regress/test_timeline_delete.py b/test_runner/regress/test_timeline_delete.py index 328131cd08..711fcd5016 100644 --- a/test_runner/regress/test_timeline_delete.py +++ b/test_runner/regress/test_timeline_delete.py @@ -68,10 +68,13 @@ def test_timeline_delete(neon_simple_env: NeonEnv): # construct pair of branches to validate that pageserver prohibits # deletion of ancestor timelines when they have child branches - parent_timeline_id = env.neon_cli.create_branch("test_ancestor_branch_delete_parent", "empty") + parent_timeline_id = env.neon_cli.create_branch( + new_branch_name="test_ancestor_branch_delete_parent", ancestor_branch_name="main" + ) leaf_timeline_id = env.neon_cli.create_branch( - "test_ancestor_branch_delete_branch1", "test_ancestor_branch_delete_parent" + new_branch_name="test_ancestor_branch_delete_branch1", + ancestor_branch_name="test_ancestor_branch_delete_parent", ) timeline_path = env.pageserver.timeline_dir(env.initial_tenant, parent_timeline_id) diff --git a/test_runner/regress/test_timeline_gc_blocking.py b/test_runner/regress/test_timeline_gc_blocking.py index 24de894687..ddfe9b911f 100644 --- a/test_runner/regress/test_timeline_gc_blocking.py +++ b/test_runner/regress/test_timeline_gc_blocking.py @@ -1,17 +1,32 @@ import time +from concurrent.futures import ThreadPoolExecutor +from dataclasses import dataclass +from typing import List, Optional +import pytest +from fixtures.log_helper import log from fixtures.neon_fixtures import ( + LogCursor, NeonEnvBuilder, + NeonPageserver, ) from fixtures.pageserver.utils import wait_timeline_detail_404 -def test_gc_blocking_by_timeline(neon_env_builder: NeonEnvBuilder): +@pytest.mark.parametrize("sharded", [True, False]) +def test_gc_blocking_by_timeline(neon_env_builder: NeonEnvBuilder, sharded: bool): + neon_env_builder.num_pageservers = 2 if sharded else 1 env = neon_env_builder.init_start( - initial_tenant_conf={"gc_period": "1s", "lsn_lease_length": "0s"} + initial_tenant_conf={"gc_period": "1s", "lsn_lease_length": "0s"}, + initial_tenant_shard_count=2 if sharded else None, ) - ps = env.pageserver - http = ps.http_client() + + if sharded: + http = env.storage_controller.pageserver_api() + else: + http = env.pageserver.http_client() + + pss = ManyPageservers(list(map(lambda ps: ScrollableLog(ps, None), env.pageservers))) foo_branch = env.neon_cli.create_branch("foo", "main", env.initial_tenant) @@ -22,9 +37,8 @@ def test_gc_blocking_by_timeline(neon_env_builder: NeonEnvBuilder): tenant_before = http.tenant_status(env.initial_tenant) wait_for_another_gc_round() - _, offset = ps.assert_log_contains(gc_active_line) - - assert ps.log_contains(gc_skipped_line, offset) is None + pss.assert_log_contains(gc_active_line) + pss.assert_log_does_not_contain(gc_skipped_line) http.timeline_block_gc(env.initial_tenant, foo_branch) @@ -34,34 +48,78 @@ def test_gc_blocking_by_timeline(neon_env_builder: NeonEnvBuilder): assert gc_blocking == "BlockingReasons { timelines: 1, reasons: EnumSet(Manual) }" wait_for_another_gc_round() - _, offset = ps.assert_log_contains(gc_skipped_line, offset) + pss.assert_log_contains(gc_skipped_line) - ps.restart() - ps.quiesce_tenants() + pss.restart() + pss.quiesce_tenants() - _, offset = 
env.pageserver.assert_log_contains(init_gc_skipped, offset) + pss.assert_log_contains(init_gc_skipped) wait_for_another_gc_round() - _, offset = ps.assert_log_contains(gc_skipped_line, offset) + pss.assert_log_contains(gc_skipped_line) # deletion unblocks gc http.timeline_delete(env.initial_tenant, foo_branch) wait_timeline_detail_404(http, env.initial_tenant, foo_branch, 10, 1.0) wait_for_another_gc_round() - _, offset = ps.assert_log_contains(gc_active_line, offset) + pss.assert_log_contains(gc_active_line) http.timeline_block_gc(env.initial_tenant, env.initial_timeline) wait_for_another_gc_round() - _, offset = ps.assert_log_contains(gc_skipped_line, offset) + pss.assert_log_contains(gc_skipped_line) # removing the manual block also unblocks gc http.timeline_unblock_gc(env.initial_tenant, env.initial_timeline) wait_for_another_gc_round() - _, offset = ps.assert_log_contains(gc_active_line, offset) + pss.assert_log_contains(gc_active_line) def wait_for_another_gc_round(): time.sleep(2) + + +@dataclass +class ScrollableLog: + pageserver: NeonPageserver + offset: Optional[LogCursor] + + def assert_log_contains(self, what: str): + msg, offset = self.pageserver.assert_log_contains(what, offset=self.offset) + old = self.offset + self.offset = offset + log.info(f"{old} -> {offset}: {msg}") + + def assert_log_does_not_contain(self, what: str): + assert self.pageserver.log_contains(what) is None + + +@dataclass(frozen=True) +class ManyPageservers: + many: List[ScrollableLog] + + def assert_log_contains(self, what: str): + for one in self.many: + one.assert_log_contains(what) + + def assert_log_does_not_contain(self, what: str): + for one in self.many: + one.assert_log_does_not_contain(what) + + def restart(self): + def do_restart(x: ScrollableLog): + x.pageserver.restart() + + with ThreadPoolExecutor(max_workers=len(self.many)) as rt: + rt.map(do_restart, self.many) + rt.shutdown(wait=True) + + def quiesce_tenants(self): + def do_quiesce(x: ScrollableLog): + x.pageserver.quiesce_tenants() + + with ThreadPoolExecutor(max_workers=len(self.many)) as rt: + rt.map(do_quiesce, self.many) + rt.shutdown(wait=True) diff --git a/test_runner/regress/test_timeline_size.py b/test_runner/regress/test_timeline_size.py index 9bf5f8680b..f2265dd3d9 100644 --- a/test_runner/regress/test_timeline_size.py +++ b/test_runner/regress/test_timeline_size.py @@ -36,7 +36,7 @@ from fixtures.utils import get_timeline_dir_size, wait_until def test_timeline_size(neon_simple_env: NeonEnv): env = neon_simple_env - new_timeline_id = env.neon_cli.create_branch("test_timeline_size", "empty") + new_timeline_id = env.neon_cli.create_branch("test_timeline_size", "main") client = env.pageserver.http_client() client.timeline_wait_logical_size(env.initial_tenant, new_timeline_id) @@ -68,7 +68,7 @@ def test_timeline_size(neon_simple_env: NeonEnv): def test_timeline_size_createdropdb(neon_simple_env: NeonEnv): env = neon_simple_env - new_timeline_id = env.neon_cli.create_branch("test_timeline_size_createdropdb", "empty") + new_timeline_id = env.neon_cli.create_branch("test_timeline_size_createdropdb", "main") client = env.pageserver.http_client() client.timeline_wait_logical_size(env.initial_tenant, new_timeline_id) diff --git a/test_runner/regress/test_twophase.py b/test_runner/regress/test_twophase.py index dd76689008..ea900b07b8 100644 --- a/test_runner/regress/test_twophase.py +++ b/test_runner/regress/test_twophase.py @@ -9,10 +9,7 @@ from fixtures.neon_fixtures import NeonEnv, fork_at_current_lsn # def 
test_twophase(neon_simple_env: NeonEnv): env = neon_simple_env - env.neon_cli.create_branch("test_twophase", "empty") - endpoint = env.endpoints.create_start( - "test_twophase", config_lines=["max_prepared_transactions=5"] - ) + endpoint = env.endpoints.create_start("main", config_lines=["max_prepared_transactions=5"]) conn = endpoint.connect() cur = conn.cursor() @@ -56,7 +53,7 @@ def test_twophase(neon_simple_env: NeonEnv): assert len(twophase_files) == 2 # Create a branch with the transaction in prepared state - fork_at_current_lsn(env, endpoint, "test_twophase_prepared", "test_twophase") + fork_at_current_lsn(env, endpoint, "test_twophase_prepared", "main") # Start compute on the new branch endpoint2 = env.endpoints.create_start( diff --git a/test_runner/regress/test_unlogged.py b/test_runner/regress/test_unlogged.py index 137d28b9fa..deba29536c 100644 --- a/test_runner/regress/test_unlogged.py +++ b/test_runner/regress/test_unlogged.py @@ -9,8 +9,7 @@ from fixtures.pg_version import PgVersion # def test_unlogged(neon_simple_env: NeonEnv): env = neon_simple_env - env.neon_cli.create_branch("test_unlogged", "empty") - endpoint = env.endpoints.create_start("test_unlogged") + endpoint = env.endpoints.create_start("main") conn = endpoint.connect() cur = conn.cursor() @@ -22,7 +21,7 @@ def test_unlogged(neon_simple_env: NeonEnv): cur.execute("INSERT INTO iut (id) values (42);") # create another compute to fetch inital empty contents from pageserver - fork_at_current_lsn(env, endpoint, "test_unlogged_basebackup", "test_unlogged") + fork_at_current_lsn(env, endpoint, "test_unlogged_basebackup", "main") endpoint2 = env.endpoints.create_start("test_unlogged_basebackup") conn2 = endpoint2.connect() diff --git a/test_runner/regress/test_vm_bits.py b/test_runner/regress/test_vm_bits.py index 7272979c4a..3075211ada 100644 --- a/test_runner/regress/test_vm_bits.py +++ b/test_runner/regress/test_vm_bits.py @@ -13,8 +13,7 @@ from fixtures.utils import query_scalar def test_vm_bit_clear(neon_simple_env: NeonEnv): env = neon_simple_env - env.neon_cli.create_branch("test_vm_bit_clear", "empty") - endpoint = env.endpoints.create_start("test_vm_bit_clear") + endpoint = env.endpoints.create_start("main") pg_conn = endpoint.connect() cur = pg_conn.cursor() @@ -58,7 +57,7 @@ def test_vm_bit_clear(neon_simple_env: NeonEnv): cur.execute("UPDATE vmtest_cold_update2 SET id = 5000, filler=repeat('x', 200) WHERE id = 1") # Branch at this point, to test that later - fork_at_current_lsn(env, endpoint, "test_vm_bit_clear_new", "test_vm_bit_clear") + fork_at_current_lsn(env, endpoint, "test_vm_bit_clear_new", "main") # Clear the buffer cache, to force the VM page to be re-fetched from # the page server diff --git a/test_runner/regress/test_wal_acceptor.py b/test_runner/regress/test_wal_acceptor.py index 3785651aed..50fac441c0 100644 --- a/test_runner/regress/test_wal_acceptor.py +++ b/test_runner/regress/test_wal_acceptor.py @@ -1057,6 +1057,24 @@ def test_restart_endpoint(neon_env_builder: NeonEnvBuilder): endpoint.start() +# Try restarting endpoint immediately after xlog switch. +# https://github.com/neondatabase/neon/issues/8911 +def test_restart_endpoint_after_switch_wal(neon_env_builder: NeonEnvBuilder): + env = neon_env_builder.init_start() + + endpoint = env.endpoints.create_start("main") + + endpoint.safe_psql("create table t (i int)") + + endpoint.safe_psql("SELECT pg_switch_wal()") + + # we want immediate shutdown to have endpoint restart on xlog switch record, + # so prevent shutdown checkpoint. 
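+    # (an orderly shutdown would write a shutdown checkpoint record after the switch,
+    # so the endpoint would no longer restart right at the xlog switch record)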
+ endpoint.stop(mode="immediate") + endpoint = env.endpoints.create_start("main") + endpoint.safe_psql("SELECT 'works'") + + # Context manager which logs passed time on exit. class DurationLogger: def __init__(self, desc): @@ -2176,6 +2194,43 @@ def test_patch_control_file(neon_env_builder: NeonEnvBuilder): assert res["timelines"][0]["control_file"]["timeline_start_lsn"] == "0/1" +def test_term_bump(neon_env_builder: NeonEnvBuilder): + neon_env_builder.num_safekeepers = 1 + env = neon_env_builder.init_start() + + tenant_id = env.initial_tenant + timeline_id = env.initial_timeline + + endpoint = env.endpoints.create_start("main") + # initialize safekeeper + endpoint.safe_psql("create table t(key int, value text)") + + http_cli = env.safekeepers[0].http_client() + + # check that bump up to specific term works + curr_term = http_cli.timeline_status(tenant_id, timeline_id).term + bump_to = curr_term + 3 + res = http_cli.term_bump(tenant_id, timeline_id, bump_to) + log.info(f"bump to {bump_to} res: {res}") + assert res.current_term >= bump_to + + # check that bump to none increments current term + res = http_cli.term_bump(tenant_id, timeline_id, None) + log.info(f"bump to None res: {res}") + assert res.current_term > bump_to + assert res.current_term > res.previous_term + + # check that bumping doesn't work downward + res = http_cli.term_bump(tenant_id, timeline_id, 2) + log.info(f"bump to 2 res: {res}") + assert res.current_term > bump_to + assert res.current_term == res.previous_term + + # check that this doesn't kill endpoint because last WAL flush was his and + # thus its basebackup is still good + endpoint.safe_psql("insert into t values (1, 'payload')") + + # Test disables periodic pushes from safekeeper to the broker and checks that # pageserver can still discover safekeepers with discovery requests. def test_broker_discovery(neon_env_builder: NeonEnvBuilder): diff --git a/test_runner/test_broken.py b/test_runner/test_broken.py index 7e8aef5a5f..d710b53528 100644 --- a/test_runner/test_broken.py +++ b/test_runner/test_broken.py @@ -23,8 +23,7 @@ run_broken = pytest.mark.skipif( def test_broken(neon_simple_env: NeonEnv, pg_bin): env = neon_simple_env - env.neon_cli.create_branch("test_broken", "empty") - env.endpoints.create_start("test_broken") + env.endpoints.create_start("main") log.info("postgres is running") log.info("THIS NEXT COMMAND WILL FAIL:") diff --git a/workspace_hack/Cargo.toml b/workspace_hack/Cargo.toml index 20693ad63d..140c43639e 100644 --- a/workspace_hack/Cargo.toml +++ b/workspace_hack/Cargo.toml @@ -8,6 +8,8 @@ version = "0.1.0" description = "workspace-hack package, managed by hakari" # You can choose to publish this crate: see https://docs.rs/cargo-hakari/latest/cargo_hakari/publishing. publish = false +edition.workspace = true +license.workspace = true # The parts of the file between the BEGIN HAKARI SECTION and END HAKARI SECTION comments # are managed by hakari. 
@@ -47,7 +49,8 @@ hex = { version = "0.4", features = ["serde"] }
 hmac = { version = "0.12", default-features = false, features = ["reset"] }
 hyper = { version = "0.14", features = ["full"] }
 indexmap = { version = "1", default-features = false, features = ["std"] }
-itertools = { version = "0.10" }
+itertools-5ef9efb8ec2df382 = { package = "itertools", version = "0.12", default-features = false, features = ["use_std"] }
+itertools-93f6ce9d446188ac = { package = "itertools", version = "0.10" }
 lazy_static = { version = "1", default-features = false, features = ["spin_no_std"] }
 libc = { version = "0.2", features = ["extra_traits", "use_std"] }
 log = { version = "0.4", default-features = false, features = ["std"] }
@@ -57,7 +60,7 @@ num-bigint = { version = "0.4" }
 num-integer = { version = "0.1", features = ["i128"] }
 num-traits = { version = "0.2", features = ["i128", "libm"] }
 once_cell = { version = "1" }
-parquet = { git = "https://github.com/apache/arrow-rs", branch = "master", default-features = false, features = ["zstd"] }
+parquet = { version = "53", default-features = false, features = ["zstd"] }
 prost = { version = "0.11" }
 rand = { version = "0.8", features = ["small_rng"] }
 regex = { version = "1" }
@@ -80,6 +83,7 @@ time = { version = "0.3", features = ["macros", "serde-well-known"] }
 tokio = { version = "1", features = ["fs", "io-std", "io-util", "macros", "net", "process", "rt-multi-thread", "signal", "test-util"] }
 tokio-rustls = { version = "0.24" }
 tokio-util = { version = "0.7", features = ["codec", "compat", "io", "rt"] }
+toml_edit = { version = "0.22", features = ["serde"] }
 tonic = { version = "0.9", features = ["tls-roots"] }
 tower = { version = "0.4", default-features = false, features = ["balance", "buffer", "limit", "log", "timeout", "util"] }
 tracing = { version = "0.1", features = ["log"] }
@@ -101,7 +105,8 @@ either = { version = "1" }
 getrandom = { version = "0.2", default-features = false, features = ["std"] }
 hashbrown = { version = "0.14", features = ["raw"] }
 indexmap = { version = "1", default-features = false, features = ["std"] }
-itertools = { version = "0.10" }
+itertools-5ef9efb8ec2df382 = { package = "itertools", version = "0.12", default-features = false, features = ["use_std"] }
+itertools-93f6ce9d446188ac = { package = "itertools", version = "0.10" }
 lazy_static = { version = "1", default-features = false, features = ["spin_no_std"] }
 libc = { version = "0.2", features = ["extra_traits", "use_std"] }
 log = { version = "0.4", default-features = false, features = ["std"] }
@@ -111,7 +116,7 @@ num-bigint = { version = "0.4" }
 num-integer = { version = "0.1", features = ["i128"] }
 num-traits = { version = "0.2", features = ["i128", "libm"] }
 once_cell = { version = "1" }
-parquet = { git = "https://github.com/apache/arrow-rs", branch = "master", default-features = false, features = ["zstd"] }
+parquet = { version = "53", default-features = false, features = ["zstd"] }
 proc-macro2 = { version = "1" }
 prost = { version = "0.11" }
 quote = { version = "1" }
@@ -122,6 +127,7 @@ serde = { version = "1", features = ["alloc", "derive"] }
 syn-dff4ba8e3ae991db = { package = "syn", version = "1", features = ["extra-traits", "full", "visit"] }
 syn-f595c2ba2a3f28df = { package = "syn", version = "2", features = ["extra-traits", "fold", "full", "visit", "visit-mut"] }
 time-macros = { version = "0.2", default-features = false, features = ["formatting", "parsing", "serde"] }
+toml_edit = { version = "0.22", features = ["serde"] }
 zstd = { version = "0.13" }
 zstd-safe = { version = "7", default-features = false, features = ["arrays", "legacy", "std", "zdict_builder"] }
 zstd-sys = { version = "2", default-features = false, features = ["legacy", "std", "zdict_builder"] }