diff --git a/.github/actions/allure-report-generate/action.yml b/.github/actions/allure-report-generate/action.yml index 7f7fa9e7a1..54b69d6d48 100644 --- a/.github/actions/allure-report-generate/action.yml +++ b/.github/actions/allure-report-generate/action.yml @@ -57,14 +57,14 @@ runs: if ! which allure; then ALLURE_ZIP=allure-${ALLURE_VERSION}.zip wget -q https://github.com/allure-framework/allure2/releases/download/${ALLURE_VERSION}/${ALLURE_ZIP} - echo "${ALLURE_ZIP_MD5} ${ALLURE_ZIP}" | md5sum -c + echo "${ALLURE_ZIP_SHA256} ${ALLURE_ZIP}" | sha256sum --check unzip -q ${ALLURE_ZIP} echo "$(pwd)/allure-${ALLURE_VERSION}/bin" >> $GITHUB_PATH rm -f ${ALLURE_ZIP} fi env: - ALLURE_VERSION: 2.22.0 - ALLURE_ZIP_MD5: d5c9f0989b896482536956340a7d5ec9 + ALLURE_VERSION: 2.22.1 + ALLURE_ZIP_SHA256: fdc7a62d94b14c5e0bf25198ae1feded6b005fdbed864b4d3cb4e5e901720b0b # Potentially we could have several running build for the same key (for example, for the main branch), so we use improvised lock for this - name: Acquire lock diff --git a/.github/actions/run-python-test-set/action.yml b/.github/actions/run-python-test-set/action.yml index 4493985587..dec1f47e47 100644 --- a/.github/actions/run-python-test-set/action.yml +++ b/.github/actions/run-python-test-set/action.yml @@ -36,14 +36,6 @@ inputs: description: 'Region name for real s3 tests' required: false default: '' - real_s3_access_key_id: - description: 'Access key id' - required: false - default: '' - real_s3_secret_access_key: - description: 'Secret access key' - required: false - default: '' rerun_flaky: description: 'Whether to rerun flaky tests' required: false @@ -104,8 +96,6 @@ runs: COMPATIBILITY_POSTGRES_DISTRIB_DIR: /tmp/neon-previous/pg_install TEST_OUTPUT: /tmp/test_output BUILD_TYPE: ${{ inputs.build_type }} - AWS_ACCESS_KEY_ID: ${{ inputs.real_s3_access_key_id }} - AWS_SECRET_ACCESS_KEY: ${{ inputs.real_s3_secret_access_key }} COMPATIBILITY_SNAPSHOT_DIR: /tmp/compatibility_snapshot_pg${{ inputs.pg_version }} ALLOW_BACKWARD_COMPATIBILITY_BREAKAGE: contains(github.event.pull_request.labels.*.name, 'backward compatibility breakage') ALLOW_FORWARD_COMPATIBILITY_BREAKAGE: contains(github.event.pull_request.labels.*.name, 'forward compatibility breakage') diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 564251ef8f..897e1a7aad 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -264,7 +264,7 @@ jobs: export REMOTE_STORAGE_S3_BUCKET=neon-github-public-dev export REMOTE_STORAGE_S3_REGION=eu-central-1 # Avoid `$CARGO_FEATURES` since there's no `testing` feature in the e2e tests now - ${cov_prefix} cargo test $CARGO_FLAGS --package remote_storage --test pagination_tests -- s3_pagination_should_work --exact + ${cov_prefix} cargo test $CARGO_FLAGS --package remote_storage --test test_real_s3 - name: Install rust binaries run: | @@ -346,10 +346,8 @@ jobs: test_selection: regress needs_postgres_source: true run_with_real_s3: true - real_s3_bucket: ci-tests-s3 - real_s3_region: us-west-2 - real_s3_access_key_id: "${{ secrets.AWS_ACCESS_KEY_ID_CI_TESTS_S3 }}" - real_s3_secret_access_key: "${{ secrets.AWS_SECRET_ACCESS_KEY_CI_TESTS_S3 }}" + real_s3_bucket: neon-github-ci-tests + real_s3_region: eu-central-1 rerun_flaky: true pg_version: ${{ matrix.pg_version }} env: @@ -409,9 +407,7 @@ jobs: uses: ./.github/actions/allure-report-generate - uses: actions/github-script@v6 - if: > - !cancelled() && - github.event_name == 'pull_request' + if: ${{ !cancelled() }} 
with: # Retry script for 5XX server errors: https://github.com/actions/github-script#retries retries: 5 @@ -421,7 +417,7 @@ jobs: reportJsonUrl: "${{ steps.create-allure-report.outputs.report-json-url }}", } - const script = require("./scripts/pr-comment-test-report.js") + const script = require("./scripts/comment-test-report.js") await script({ github, context, @@ -496,19 +492,24 @@ jobs: env: COMMIT_URL: ${{ github.server_url }}/${{ github.repository }}/commit/${{ github.event.pull_request.head.sha || github.sha }} run: | - scripts/coverage \ - --dir=/tmp/coverage report \ + scripts/coverage --dir=/tmp/coverage \ + report \ --input-objects=/tmp/coverage/binaries.list \ --commit-url=${COMMIT_URL} \ --format=github + scripts/coverage --dir=/tmp/coverage \ + report \ + --input-objects=/tmp/coverage/binaries.list \ + --format=lcov + - name: Upload coverage report id: upload-coverage-report env: BUCKET: neon-github-public-dev COMMIT_SHA: ${{ github.event.pull_request.head.sha || github.sha }} run: | - aws s3 cp --only-show-errors --recursive /tmp/coverage/report s3://neon-github-public-dev/code-coverage/${COMMIT_SHA} + aws s3 cp --only-show-errors --recursive /tmp/coverage/report s3://${BUCKET}/code-coverage/${COMMIT_SHA} REPORT_URL=https://${BUCKET}.s3.amazonaws.com/code-coverage/${COMMIT_SHA}/index.html echo "report-url=${REPORT_URL}" >> $GITHUB_OUTPUT @@ -663,6 +664,9 @@ jobs: project: nrdv0s4kcs push: true tags: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:depot-${{needs.tag.outputs.build-tag}} + build-args: | + GIT_VERSION=${{ github.sha }} + REPOSITORY=369495373322.dkr.ecr.eu-central-1.amazonaws.com compute-tools-image: runs-on: [ self-hosted, gen3, large ] @@ -777,7 +781,7 @@ jobs: run: shell: sh -eu {0} env: - VM_BUILDER_VERSION: v0.4.6 + VM_BUILDER_VERSION: v0.8.0 steps: - name: Checkout @@ -787,21 +791,18 @@ jobs: - name: Downloading vm-builder run: | - curl -L https://github.com/neondatabase/neonvm/releases/download/$VM_BUILDER_VERSION/vm-builder -o vm-builder + curl -fL https://github.com/neondatabase/autoscaling/releases/download/$VM_BUILDER_VERSION/vm-builder -o vm-builder chmod +x vm-builder + # Note: we need a separate pull step here because otherwise vm-builder will try to pull, and + # it won't have the proper authentication (written at v0.6.0) - name: Pulling compute-node image run: | docker pull 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}} - - name: Building VM compute-node rootfs - run: | - docker build -t temp-vm-compute-node --build-arg SRC_IMAGE=369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}} -f Dockerfile.vm-compute-node . 
- - name: Build vm image run: | - # note: as of 2023-01-12, vm-builder requires a trailing ":latest" for local images - ./vm-builder -use-inittab -src=temp-vm-compute-node:latest -dst=369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}} + ./vm-builder -enable-file-cache -src=369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}} -dst=369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}} - name: Pushing vm-compute-node image run: | diff --git a/Cargo.lock b/Cargo.lock index 4d63ebd99d..6856b9e3ac 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -17,17 +17,6 @@ version = "1.0.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f26201604c87b1e01bd3d98f8d5d9a8fcbb815e8cedb41ffccbeb4bf593a35fe" -[[package]] -name = "ahash" -version = "0.7.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fcb51a0695d8f838b1ee009b3fbf66bda078cd64590202a864a8f3e8c4315c47" -dependencies = [ - "getrandom", - "once_cell", - "version_check", -] - [[package]] name = "ahash" version = "0.8.3" @@ -41,9 +30,9 @@ dependencies = [ [[package]] name = "aho-corasick" -version = "0.7.20" +version = "1.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cc936419f96fa211c1b9166887b38e5e40b19958e5b895be7c1f93adec7071ac" +checksum = "67fc08ce920c31afb70f013dcce1bfc3a3195de6a228474e45e1f145b36f8d04" dependencies = [ "memchr", ] @@ -65,9 +54,9 @@ checksum = "4b46cbb362ab8752921c97e041f5e366ee6297bd428a31275b9fcf1e380f7299" [[package]] name = "anstream" -version = "0.3.0" +version = "0.3.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9e579a7752471abc2a8268df8b20005e3eadd975f585398f17efcfd8d4927371" +checksum = "0ca84f3628370c59db74ee214b3263d58f9aadd9b4fe7e711fd87dc452b7f163" dependencies = [ "anstyle", "anstyle-parse", @@ -104,9 +93,9 @@ dependencies = [ [[package]] name = "anstyle-wincon" -version = "1.0.0" +version = "1.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4bcd8291a340dd8ac70e18878bc4501dd7b4ff970cfa21c207d36ece51ea88fd" +checksum = "180abfa45703aebe0093f79badacc01b8fd4ea2e35118747e5811127f926e188" dependencies = [ "anstyle", "windows-sys 0.48.0", @@ -114,9 +103,9 @@ dependencies = [ [[package]] name = "anyhow" -version = "1.0.70" +version = "1.0.71" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7de8ce5e0f9f8d88245311066a578d72b7af3e7088f32783804676302df237e4" +checksum = "9c7d0618f0e0b7e8ff11427422b64564d5fb0be1940354bfe2e0529b18a9d9b8" dependencies = [ "backtrace", ] @@ -188,7 +177,7 @@ checksum = "16e62a023e7c117e27523144c5d2459f4397fcc3cab0085af8e2224f643a0193" dependencies = [ "proc-macro2", "quote", - "syn 2.0.15", + "syn 2.0.16", ] [[package]] @@ -199,7 +188,7 @@ checksum = "b9ccdd8f2a161be9bd5c023df56f1b2a0bd1d83872ae53b71a84a12c9bf6e842" dependencies = [ "proc-macro2", "quote", - "syn 2.0.15", + "syn 2.0.16", ] [[package]] @@ -230,9 +219,9 @@ checksum = "d468802bab17cbc0cc575e9b053f41e72aa36bfa6b7f55e3529ffa43161b97fa" [[package]] name = "aws-config" -version = "0.55.2" +version = "0.55.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fc00553f5f3c06ffd4510a9d576f92143618706c45ea6ff81e84ad9be9588abd" +checksum = "bcdcf0d683fe9c23d32cf5b53c9918ea0a500375a9fb20109802552658e576c9" dependencies = [ 
"aws-credential-types", "aws-http", @@ -256,9 +245,9 @@ dependencies = [ [[package]] name = "aws-credential-types" -version = "0.55.2" +version = "0.55.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4cb57ac6088805821f78d282c0ba8aec809f11cbee10dda19a97b03ab040ccc2" +checksum = "1fcdb2f7acbc076ff5ad05e7864bdb191ca70a6fd07668dc3a1a8bcd051de5ae" dependencies = [ "aws-smithy-async", "aws-smithy-types", @@ -270,9 +259,9 @@ dependencies = [ [[package]] name = "aws-endpoint" -version = "0.55.2" +version = "0.55.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9c5f6f84a4f46f95a9bb71d9300b73cd67eb868bc43ae84f66ad34752299f4ac" +checksum = "8cce1c41a6cfaa726adee9ebb9a56fcd2bbfd8be49fd8a04c5e20fd968330b04" dependencies = [ "aws-smithy-http", "aws-smithy-types", @@ -284,9 +273,9 @@ dependencies = [ [[package]] name = "aws-http" -version = "0.55.2" +version = "0.55.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a754683c322f7dc5167484266489fdebdcd04d26e53c162cad1f3f949f2c5671" +checksum = "aadbc44e7a8f3e71c8b374e03ecd972869eb91dd2bc89ed018954a52ba84bc44" dependencies = [ "aws-credential-types", "aws-smithy-http", @@ -303,9 +292,9 @@ dependencies = [ [[package]] name = "aws-sdk-s3" -version = "0.25.1" +version = "0.27.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "392b9811ca489747ac84349790e49deaa1f16631949e7dd4156000251c260eae" +checksum = "37c77060408d653d3efa6ea7b66c1389bc35a0342352984c8bf8bcb814a8fc27" dependencies = [ "aws-credential-types", "aws-endpoint", @@ -336,9 +325,9 @@ dependencies = [ [[package]] name = "aws-sdk-sts" -version = "0.27.0" +version = "0.28.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2d0fbe3c2c342bc8dfea4bb43937405a8ec06f99140a0dcb9c7b59e54dfa93a1" +checksum = "265fac131fbfc188e5c3d96652ea90ecc676a934e3174eaaee523c6cec040b3b" dependencies = [ "aws-credential-types", "aws-endpoint", @@ -362,9 +351,9 @@ dependencies = [ [[package]] name = "aws-sig-auth" -version = "0.55.2" +version = "0.55.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "84dc92a63ede3c2cbe43529cb87ffa58763520c96c6a46ca1ced80417afba845" +checksum = "3b94acb10af0c879ecd5c7bdf51cda6679a0a4f4643ce630905a77673bfa3c61" dependencies = [ "aws-credential-types", "aws-sigv4", @@ -377,9 +366,9 @@ dependencies = [ [[package]] name = "aws-sigv4" -version = "0.55.2" +version = "0.55.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "392fefab9d6fcbd76d518eb3b1c040b84728ab50f58df0c3c53ada4bea9d327e" +checksum = "9d2ce6f507be68e968a33485ced670111d1cbad161ddbbab1e313c03d37d8f4c" dependencies = [ "aws-smithy-eventstream", "aws-smithy-http", @@ -398,9 +387,9 @@ dependencies = [ [[package]] name = "aws-smithy-async" -version = "0.55.2" +version = "0.55.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ae23b9fe7a07d0919000116c4c5c0578303fbce6fc8d32efca1f7759d4c20faf" +checksum = "13bda3996044c202d75b91afeb11a9afae9db9a721c6a7a427410018e286b880" dependencies = [ "futures-util", "pin-project-lite", @@ -410,9 +399,9 @@ dependencies = [ [[package]] name = "aws-smithy-checksums" -version = "0.55.2" +version = "0.55.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a6367acbd6849b8c7c659e166955531274ae147bf83ab4312885991f6b6706cb" +checksum = "07ed8b96d95402f3f6b8b57eb4e0e45ee365f78b1a924faf20ff6e97abf1eae6" dependencies = [ "aws-smithy-http", 
"aws-smithy-types", @@ -431,9 +420,9 @@ dependencies = [ [[package]] name = "aws-smithy-client" -version = "0.55.2" +version = "0.55.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5230d25d244a51339273b8870f0f77874cd4449fb4f8f629b21188ae10cfc0ba" +checksum = "0a86aa6e21e86c4252ad6a0e3e74da9617295d8d6e374d552be7d3059c41cedd" dependencies = [ "aws-smithy-async", "aws-smithy-http", @@ -444,7 +433,7 @@ dependencies = [ "http", "http-body", "hyper", - "hyper-rustls", + "hyper-rustls 0.23.2", "lazy_static", "pin-project-lite", "rustls 0.20.8", @@ -455,9 +444,9 @@ dependencies = [ [[package]] name = "aws-smithy-eventstream" -version = "0.55.2" +version = "0.55.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "22d2a2bcc16e5c4d949ffd2b851da852b9bbed4bb364ed4ae371b42137ca06d9" +checksum = "460c8da5110835e3d9a717c61f5556b20d03c32a1dec57f8fc559b360f733bb8" dependencies = [ "aws-smithy-types", "bytes", @@ -466,9 +455,9 @@ dependencies = [ [[package]] name = "aws-smithy-http" -version = "0.55.2" +version = "0.55.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b60e2133beb9fe6ffe0b70deca57aaeff0a35ad24a9c6fab2fd3b4f45b99fdb5" +checksum = "2b3b693869133551f135e1f2c77cb0b8277d9e3e17feaf2213f735857c4f0d28" dependencies = [ "aws-smithy-eventstream", "aws-smithy-types", @@ -489,9 +478,9 @@ dependencies = [ [[package]] name = "aws-smithy-http-tower" -version = "0.55.2" +version = "0.55.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3a4d94f556c86a0dd916a5d7c39747157ea8cb909ca469703e20fee33e448b67" +checksum = "3ae4f6c5798a247fac98a867698197d9ac22643596dc3777f0c76b91917616b9" dependencies = [ "aws-smithy-http", "aws-smithy-types", @@ -505,18 +494,18 @@ dependencies = [ [[package]] name = "aws-smithy-json" -version = "0.55.2" +version = "0.55.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5ce3d6e6ebb00b2cce379f079ad5ec508f9bcc3a9510d9b9c1840ed1d6f8af39" +checksum = "23f9f42fbfa96d095194a632fbac19f60077748eba536eb0b9fecc28659807f8" dependencies = [ "aws-smithy-types", ] [[package]] name = "aws-smithy-query" -version = "0.55.2" +version = "0.55.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d58edfca32ef9bfbc1ca394599e17ea329cb52d6a07359827be74235b64b3298" +checksum = "98819eb0b04020a1c791903533b638534ae6c12e2aceda3e6e6fba015608d51d" dependencies = [ "aws-smithy-types", "urlencoding", @@ -524,9 +513,9 @@ dependencies = [ [[package]] name = "aws-smithy-types" -version = "0.55.2" +version = "0.55.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "58db46fc1f4f26be01ebdb821751b4e2482cd43aa2b64a0348fb89762defaffa" +checksum = "16a3d0bf4f324f4ef9793b86a1701d9700fbcdbd12a846da45eed104c634c6e8" dependencies = [ "base64-simd", "itoa", @@ -537,18 +526,18 @@ dependencies = [ [[package]] name = "aws-smithy-xml" -version = "0.55.2" +version = "0.55.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fb557fe4995bd9ec87fb244bbb254666a971dc902a783e9da8b7711610e9664c" +checksum = "b1b9d12875731bd07e767be7baad95700c3137b56730ec9ddeedb52a5e5ca63b" dependencies = [ "xmlparser", ] [[package]] name = "aws-types" -version = "0.55.2" +version = "0.55.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "de0869598bfe46ec44ffe17e063ed33336e59df90356ca8ff0e8da6f7c1d994b" +checksum = "6dd209616cc8d7bfb82f87811a5c655dc97537f592689b18743bddf5dc5c4829" dependencies = 
[ "aws-credential-types", "aws-smithy-async", @@ -562,9 +551,9 @@ dependencies = [ [[package]] name = "axum" -version = "0.6.15" +version = "0.6.18" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3b32c5ea3aabaf4deb5f5ced2d688ec0844c881c9e6c696a8b769a05fc691e62" +checksum = "f8175979259124331c1d7bf6586ee7e0da434155e4b2d48ec2c8386281d8df39" dependencies = [ "async-trait", "axum-core", @@ -634,9 +623,9 @@ checksum = "0ea22880d78093b0cbe17c89f64a7d457941e65759157ec6cb31a31d652b05e5" [[package]] name = "base64" -version = "0.21.0" +version = "0.21.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a4a4ddaa51a5bc52a6948f74c06d20aaaddb71924eab79b8c97a8c556e942d6a" +checksum = "3f1e31e207a6b8fb791a38ea3105e6cb541f55e4d029902d3039a4ad07cc4105" [[package]] name = "base64-simd" @@ -670,13 +659,13 @@ dependencies = [ "lazycell", "log", "peeking_take_while", - "prettyplease 0.2.4", + "prettyplease 0.2.6", "proc-macro2", "quote", "regex", "rustc-hash", "shlex", - "syn 2.0.15", + "syn 2.0.16", "which", ] @@ -697,9 +686,9 @@ dependencies = [ [[package]] name = "bstr" -version = "1.4.0" +version = "1.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c3d4260bcc2e8fc9df1eac4919a720effeb63a3f0952f5bf4944adfa18897f09" +checksum = "a246e68bb43f6cd9db24bea052a53e40405417c5fb372e3d1a8a7f770a564ef5" dependencies = [ "memchr", "once_cell", @@ -709,9 +698,9 @@ dependencies = [ [[package]] name = "bumpalo" -version = "3.12.0" +version = "3.13.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0d261e256854913907f67ed06efbc3338dfe6179796deefc1ff763fc1aee5535" +checksum = "a3e2c3daef883ecc1b5d58c15adae93470a91d425f3532ba1695849656af3fc1" [[package]] name = "byteorder" @@ -780,9 +769,9 @@ dependencies = [ [[package]] name = "ciborium" -version = "0.2.0" +version = "0.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b0c137568cc60b904a7724001b35ce2630fd00d5d84805fbb608ab89509d788f" +checksum = "effd91f6c78e5a4ace8a5d3c0b6bfaec9e2baaef55f3efc00e45fb2e477ee926" dependencies = [ "ciborium-io", "ciborium-ll", @@ -791,15 +780,15 @@ dependencies = [ [[package]] name = "ciborium-io" -version = "0.2.0" +version = "0.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "346de753af073cc87b52b2083a506b38ac176a44cfb05497b622e27be899b369" +checksum = "cdf919175532b369853f5d5e20b26b43112613fd6fe7aee757e35f7a44642656" [[package]] name = "ciborium-ll" -version = "0.2.0" +version = "0.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "213030a2b5a4e0c0892b6652260cf6ccac84827b83a85a534e178e3906c4cf1b" +checksum = "defaa24ecc093c77630e6c15e17c51f5e187bf35ee514f4e2d67baaa96dae22b" dependencies = [ "ciborium-io", "half", @@ -818,9 +807,9 @@ dependencies = [ [[package]] name = "clap" -version = "3.2.23" +version = "3.2.25" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "71655c45cb9845d3270c9d6df84ebe72b4dad3c2ba3f7023ad47c144e4e473a5" +checksum = "4ea181bf566f71cb9a5d17a59e1871af638180a18fb0035c92ae62b705207123" dependencies = [ "bitflags", "clap_lex 0.2.4", @@ -830,9 +819,9 @@ dependencies = [ [[package]] name = "clap" -version = "4.2.2" +version = "4.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9b802d85aaf3a1cdb02b224ba472ebdea62014fccfcb269b95a4d76443b5ee5a" +checksum = "93aae7a4192245f70fe75dd9157fc7b4a5bf53e88d30bd4396f7d8f9284d5acc" dependencies = [ 
"clap_builder", "clap_derive", @@ -841,27 +830,27 @@ dependencies = [ [[package]] name = "clap_builder" -version = "4.2.2" +version = "4.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "14a1a858f532119338887a4b8e1af9c60de8249cd7bafd68036a489e261e37b6" +checksum = "4f423e341edefb78c9caba2d9c7f7687d0e72e89df3ce3394554754393ac3990" dependencies = [ "anstream", "anstyle", "bitflags", - "clap_lex 0.4.1", + "clap_lex 0.5.0", "strsim", ] [[package]] name = "clap_derive" -version = "4.2.0" +version = "4.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3f9644cd56d6b87dbe899ef8b053e331c0637664e9e21a33dfcdc36093f5c5c4" +checksum = "191d9573962933b4027f932c600cd252ce27a8ad5979418fe78e43c07996f27b" dependencies = [ "heck", "proc-macro2", "quote", - "syn 2.0.15", + "syn 2.0.16", ] [[package]] @@ -875,9 +864,9 @@ dependencies = [ [[package]] name = "clap_lex" -version = "0.4.1" +version = "0.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8a2dd5a6fe8c6e3502f568a6353e5273bbb15193ad9a89e457b9970798efbea1" +checksum = "2da6da31387c7e4ef160ffab6d5e7f00c42626fe39aea70a7b0f1773f7dd6c1b" [[package]] name = "close_fds" @@ -889,16 +878,6 @@ dependencies = [ "libc", ] -[[package]] -name = "codespan-reporting" -version = "0.11.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3538270d33cc669650c4b093848450d380def10c331d38c768e34cac80576e6e" -dependencies = [ - "termcolor", - "unicode-width", -] - [[package]] name = "colorchoice" version = "1.0.0" @@ -936,7 +915,7 @@ version = "0.1.0" dependencies = [ "anyhow", "chrono", - "clap 4.2.2", + "clap 4.3.0", "compute_api", "futures", "hyper", @@ -998,7 +977,7 @@ name = "control_plane" version = "0.1.0" dependencies = [ "anyhow", - "clap 4.2.2", + "clap 4.3.0", "comfy-table", "compute_api", "git-version", @@ -1041,9 +1020,9 @@ checksum = "e496a50fda8aacccc86d7529e2c1e0892dbd0f898a6b5645b5561b89c3210efa" [[package]] name = "cpufeatures" -version = "0.2.6" +version = "0.2.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "280a9f2d8b3a38871a3c8a46fb80db65e5e5ed97da80c4d08bf27fb63e35e181" +checksum = "3e4c1eaa2012c47becbbad2ab175484c2a84d1185b566fb2cc5b8707343dfe58" dependencies = [ "libc", ] @@ -1076,7 +1055,7 @@ dependencies = [ "atty", "cast", "ciborium", - "clap 3.2.23", + "clap 3.2.25", "criterion-plot", "itertools", "lazy_static", @@ -1186,55 +1165,11 @@ dependencies = [ "typenum", ] -[[package]] -name = "cxx" -version = "1.0.94" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f61f1b6389c3fe1c316bf8a4dccc90a38208354b330925bce1f74a6c4756eb93" -dependencies = [ - "cc", - "cxxbridge-flags", - "cxxbridge-macro", - "link-cplusplus", -] - -[[package]] -name = "cxx-build" -version = "1.0.94" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "12cee708e8962df2aeb38f594aae5d827c022b6460ac71a7a3e2c3c2aae5a07b" -dependencies = [ - "cc", - "codespan-reporting", - "once_cell", - "proc-macro2", - "quote", - "scratch", - "syn 2.0.15", -] - -[[package]] -name = "cxxbridge-flags" -version = "1.0.94" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7944172ae7e4068c533afbb984114a56c46e9ccddda550499caa222902c7f7bb" - -[[package]] -name = "cxxbridge-macro" -version = "1.0.94" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2345488264226bf682893e25de0769f3360aac9957980ec49361b083ddaa5bc5" -dependencies = [ - 
"proc-macro2", - "quote", - "syn 2.0.15", -] - [[package]] name = "darling" -version = "0.14.4" +version = "0.20.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7b750cb3417fd1b327431a470f388520309479ab0bf5e323505daf0290cd3850" +checksum = "0558d22a7b463ed0241e993f76f09f30b126687447751a8638587b864e4b3944" dependencies = [ "darling_core", "darling_macro", @@ -1242,27 +1177,27 @@ dependencies = [ [[package]] name = "darling_core" -version = "0.14.4" +version = "0.20.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "109c1ca6e6b7f82cc233a97004ea8ed7ca123a9af07a8230878fcfda9b158bf0" +checksum = "ab8bfa2e259f8ee1ce5e97824a3c55ec4404a0d772ca7fa96bf19f0752a046eb" dependencies = [ "fnv", "ident_case", "proc-macro2", "quote", "strsim", - "syn 1.0.109", + "syn 2.0.16", ] [[package]] name = "darling_macro" -version = "0.14.4" +version = "0.20.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a4aab4dbc9f7611d8b55048a3a16d2d010c2c8334e46304b40ac1cc14bf3b48e" +checksum = "29a358ff9f12ec09c3e61fef9b5a9902623a695a46a917b07f269bff1445611a" dependencies = [ "darling_core", "quote", - "syn 1.0.109", + "syn 2.0.16", ] [[package]] @@ -1280,9 +1215,9 @@ dependencies = [ [[package]] name = "data-encoding" -version = "2.3.3" +version = "2.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "23d8666cb01533c39dde32bcbab8e227b4ed6679b2c925eba05feabea39508fb" +checksum = "c2e66c9d817f1720209181c316d28635c050fa304f9c79e47a520882661b7308" [[package]] name = "debugid" @@ -1310,9 +1245,9 @@ dependencies = [ [[package]] name = "digest" -version = "0.10.6" +version = "0.10.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8168378f4e5023e7218c89c891c0fd8ecdb5e5e4f18cb78f38cf245dd021e76f" +checksum = "9ed9a281f7bc9b7576e61468ba615a66a5c8cfdff42420a70aa82701a3b1e292" dependencies = [ "block-buffer", "crypto-common", @@ -1321,13 +1256,13 @@ dependencies = [ [[package]] name = "displaydoc" -version = "0.2.3" +version = "0.2.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3bf95dc3f046b9da4f2d51833c0d3547d8564ef6910f5c1ed130306a75b92886" +checksum = "487585f4d0c6655fe74905e2504d8ad6908e4db67f744eb140876906c2f3175d" dependencies = [ "proc-macro2", "quote", - "syn 1.0.109", + "syn 2.0.16", ] [[package]] @@ -1367,23 +1302,23 @@ dependencies = [ [[package]] name = "enumset" -version = "1.0.12" +version = "1.1.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "19be8061a06ab6f3a6cf21106c873578bf01bd42ad15e0311a9c76161cb1c753" +checksum = "e875f1719c16de097dee81ed675e2d9bb63096823ed3f0ca827b7dea3028bbbb" dependencies = [ "enumset_derive", ] [[package]] name = "enumset_derive" -version = "0.6.1" +version = "0.8.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "03e7b551eba279bf0fa88b83a46330168c1560a52a94f5126f892f0b364ab3e0" +checksum = "e08b6c6ab82d70f08844964ba10c7babb716de2ecaeab9be5717918a5177d3af" dependencies = [ "darling", "proc-macro2", "quote", - "syn 1.0.109", + "syn 2.0.16", ] [[package]] @@ -1569,7 +1504,7 @@ checksum = "89ca545a94061b6365f2c7355b4b32bd20df3ff95f02da9329b34ccc3bd6ee72" dependencies = [ "proc-macro2", "quote", - "syn 2.0.15", + "syn 2.0.16", ] [[package]] @@ -1667,9 +1602,9 @@ checksum = "d2fabcfbdc87f4758337ca535fb41a6d701b65693ce38287d856d1674551ec9b" [[package]] name = "h2" -version = "0.3.18" +version = "0.3.19" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "17f8a914c2987b688368b5138aa05321db91f4090cf26118185672ad588bce21" +checksum = "d357c7ae988e7d2182f7d7871d0b963962420b0678b0997ce7de72001aeab782" dependencies = [ "bytes", "fnv", @@ -1704,9 +1639,6 @@ name = "hashbrown" version = "0.12.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8a9ee70c43aaf417c914396645a0fa852624801b24ebb7ae78fe8272889ac888" -dependencies = [ - "ahash 0.7.6", -] [[package]] name = "hashbrown" @@ -1714,16 +1646,16 @@ version = "0.13.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "43a3c133739dddd0d2990f9a4bdf8eb4b21ef50e4851ca85ab661199821d510e" dependencies = [ - "ahash 0.8.3", + "ahash", ] [[package]] name = "hashlink" -version = "0.8.1" +version = "0.8.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "69fe1fcf8b4278d860ad0548329f892a3631fb63f82574df68275f34cdbe0ffa" +checksum = "0761a1b9491c4f2e3d66aa0f62d0fba0af9a0e2852e4d48ea506632a4b56e6aa" dependencies = [ - "hashbrown 0.12.3", + "hashbrown 0.13.2", ] [[package]] @@ -1892,6 +1824,19 @@ dependencies = [ "tokio-rustls 0.23.4", ] +[[package]] +name = "hyper-rustls" +version = "0.24.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0646026eb1b3eea4cd9ba47912ea5ce9cc07713d105b1a14698f4e6433d348b7" +dependencies = [ + "http", + "hyper", + "rustls 0.21.1", + "tokio", + "tokio-rustls 0.24.0", +] + [[package]] name = "hyper-timeout" version = "0.4.1" @@ -1933,12 +1878,11 @@ dependencies = [ [[package]] name = "iana-time-zone-haiku" -version = "0.1.1" +version = "0.1.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0703ae284fc167426161c2e3f1da3ea71d94b21bedbcc9494e92b28e334e3dca" +checksum = "f31827a206f56af32e590ba56d5d2d085f558508192593743f16b2306495269f" dependencies = [ - "cxx", - "cxx-build", + "cc", ] [[package]] @@ -1999,9 +1943,9 @@ dependencies = [ [[package]] name = "io-lifetimes" -version = "1.0.10" +version = "1.0.11" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9c66c74d2ae7e79a5a8f7ac924adbe38ee42a859c6539ad869eb51f0b52dc220" +checksum = "eae7b9aee968036d54dce06cebaefd919e4472e753296daccd6d344e3e2df0c2" dependencies = [ "hermit-abi 0.3.1", "libc", @@ -2022,7 +1966,7 @@ checksum = "adcf93614601c8129ddf72e2d5633df827ba6551541c6d8c59520a371475be1f" dependencies = [ "hermit-abi 0.3.1", "io-lifetimes", - "rustix 0.37.11", + "rustix 0.37.19", "windows-sys 0.48.0", ] @@ -2043,9 +1987,9 @@ checksum = "453ad9f582a441959e5f0d088b02ce04cfe8d51a8eaf077f12ac6d3e94164ca6" [[package]] name = "js-sys" -version = "0.3.61" +version = "0.3.63" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "445dde2150c55e483f3d8416706b97ec8e8237c307e5b7b4b8dd15e6af2a0730" +checksum = "2f37a4a5928311ac501dee68b3c7613a1037d0edb30c8e5427bd832d55d1b790" dependencies = [ "wasm-bindgen", ] @@ -2056,7 +2000,7 @@ version = "8.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6971da4d9c3aa03c3d8f3ff0f4155b534aad021292003895a469716b2a230378" dependencies = [ - "base64 0.21.0", + "base64 0.21.1", "pem", "ring", "serde", @@ -2098,9 +2042,9 @@ checksum = "830d08ce1d1d941e6b30645f1a0eb5643013d835ce3779a5fc208261dbe10f55" [[package]] name = "libc" -version = "0.2.141" +version = "0.2.144" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3304a64d199bb964be99741b7a14d26972741915b3649639149b2479bb46f4b5" +checksum = 
"2b00cc1c228a6782d0f076e7b232802e0c5689d41bb5df366f2a6b6621cfdfe1" [[package]] name = "libloading" @@ -2112,15 +2056,6 @@ dependencies = [ "winapi", ] -[[package]] -name = "link-cplusplus" -version = "1.0.8" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ecd207c9c713c34f95a097a5b029ac2ce6010530c7b49d7fea24d977dede04f5" -dependencies = [ - "cc", -] - [[package]] name = "linux-raw-sys" version = "0.1.4" @@ -2129,9 +2064,9 @@ checksum = "f051f77a7c8e6957c0696eac88f26b0117e54f52d3fc682ab19397a8812846a4" [[package]] name = "linux-raw-sys" -version = "0.3.1" +version = "0.3.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d59d8c75012853d2e872fb56bc8a2e53718e2cafe1a4c823143141c6d90c322f" +checksum = "ef53942eb7bf7ff43a617b3e2c1c4a5ecf5944a7c1bc12d7ee39bbb15e5c1519" [[package]] name = "lock_api" @@ -2316,9 +2251,9 @@ dependencies = [ [[package]] name = "notify" -version = "5.1.0" +version = "5.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "58ea850aa68a06e48fdb069c0ec44d0d64c8dbffa49bf3b6f7f0a901fdea1ba9" +checksum = "729f63e1ca555a43fe3efa4f3efdf4801c479da85b432242a7b726f353c88486" dependencies = [ "bitflags", "crossbeam-channel", @@ -2329,7 +2264,7 @@ dependencies = [ "libc", "mio", "walkdir", - "windows-sys 0.42.0", + "windows-sys 0.45.0", ] [[package]] @@ -2435,7 +2370,7 @@ checksum = "a948666b637a0f465e8564c73e89d4dde00d72d4d473cc972f390fc3dcee7d9c" dependencies = [ "proc-macro2", "quote", - "syn 2.0.15", + "syn 2.0.16", ] [[package]] @@ -2587,6 +2522,21 @@ version = "0.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b15813163c1d831bf4a13c3610c05c0d03b39feb07f7e09fa234dac9b15aaf39" +[[package]] +name = "pagectl" +version = "0.1.0" +dependencies = [ + "anyhow", + "bytes", + "clap 4.3.0", + "git-version", + "pageserver", + "postgres_ffi", + "svg_fmt", + "utils", + "workspace_hack", +] + [[package]] name = "pageserver" version = "0.1.0" @@ -2597,7 +2547,7 @@ dependencies = [ "byteorder", "bytes", "chrono", - "clap 4.2.2", + "clap 4.3.0", "close_fds", "const_format", "consumption_metrics", @@ -2753,22 +2703,22 @@ dependencies = [ [[package]] name = "pin-project" -version = "1.0.12" +version = "1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ad29a609b6bcd67fee905812e544992d216af9d755757c05ed2d0e15a74c6ecc" +checksum = "c95a7476719eab1e366eaf73d0260af3021184f18177925b07f54b30089ceead" dependencies = [ "pin-project-internal", ] [[package]] name = "pin-project-internal" -version = "1.0.12" +version = "1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "069bdb1e05adc7a8990dce9cc75370895fbe4e3d58b9b73bf1aee56359344a55" +checksum = "39407670928234ebc5e6e580247dd567ad73a3578460c5990f9503df207e8f07" dependencies = [ "proc-macro2", "quote", - "syn 1.0.109", + "syn 2.0.16", ] [[package]] @@ -2785,9 +2735,9 @@ checksum = "8b870d8c151b6f2fb93e84a13146138f05d02ed11c7e7c54f8826aaaf7c9f184" [[package]] name = "pkg-config" -version = "0.3.26" +version = "0.3.27" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6ac9a59f73473f1b8d852421e59e64809f025994837ef743615c6d0c5b305160" +checksum = "26072860ba924cbfa98ea39c8c19b4dd6a4a25423dbdf219c1eca91aa0cf6964" [[package]] name = "plotters" @@ -2820,7 +2770,7 @@ dependencies = [ [[package]] name = "postgres" version = "0.19.4" -source = 
"git+https://github.com/neondatabase/rust-postgres.git?rev=2e9b5f1ddc481d1a98fa79f6b9378ac4f170b7c9#2e9b5f1ddc481d1a98fa79f6b9378ac4f170b7c9" +source = "git+https://github.com/neondatabase/rust-postgres.git?rev=f6ec31df3bcce89cb34f300f17c8a8c031c5ee8c#f6ec31df3bcce89cb34f300f17c8a8c031c5ee8c" dependencies = [ "bytes", "fallible-iterator", @@ -2833,7 +2783,7 @@ dependencies = [ [[package]] name = "postgres-native-tls" version = "0.5.0" -source = "git+https://github.com/neondatabase/rust-postgres.git?rev=2e9b5f1ddc481d1a98fa79f6b9378ac4f170b7c9#2e9b5f1ddc481d1a98fa79f6b9378ac4f170b7c9" +source = "git+https://github.com/neondatabase/rust-postgres.git?rev=f6ec31df3bcce89cb34f300f17c8a8c031c5ee8c#f6ec31df3bcce89cb34f300f17c8a8c031c5ee8c" dependencies = [ "native-tls", "tokio", @@ -2844,7 +2794,7 @@ dependencies = [ [[package]] name = "postgres-protocol" version = "0.6.4" -source = "git+https://github.com/neondatabase/rust-postgres.git?rev=2e9b5f1ddc481d1a98fa79f6b9378ac4f170b7c9#2e9b5f1ddc481d1a98fa79f6b9378ac4f170b7c9" +source = "git+https://github.com/neondatabase/rust-postgres.git?rev=f6ec31df3bcce89cb34f300f17c8a8c031c5ee8c#f6ec31df3bcce89cb34f300f17c8a8c031c5ee8c" dependencies = [ "base64 0.20.0", "byteorder", @@ -2862,7 +2812,7 @@ dependencies = [ [[package]] name = "postgres-types" version = "0.2.4" -source = "git+https://github.com/neondatabase/rust-postgres.git?rev=2e9b5f1ddc481d1a98fa79f6b9378ac4f170b7c9#2e9b5f1ddc481d1a98fa79f6b9378ac4f170b7c9" +source = "git+https://github.com/neondatabase/rust-postgres.git?rev=f6ec31df3bcce89cb34f300f17c8a8c031c5ee8c#f6ec31df3bcce89cb34f300f17c8a8c031c5ee8c" dependencies = [ "bytes", "fallible-iterator", @@ -2924,7 +2874,6 @@ dependencies = [ "serde", "thiserror", "utils", - "wal_craft", "workspace_hack", ] @@ -2961,12 +2910,12 @@ dependencies = [ [[package]] name = "prettyplease" -version = "0.2.4" +version = "0.2.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1ceca8aaf45b5c46ec7ed39fff75f57290368c1846d33d24a122ca81416ab058" +checksum = "3b69d39aab54d069e7f2fe8cb970493e7834601ca2d8c65fd7bbd183578080d1" dependencies = [ "proc-macro2", - "syn 2.0.15", + "syn 2.0.16", ] [[package]] @@ -2977,9 +2926,9 @@ checksum = "dc375e1527247fe1a97d8b7156678dfe7c1af2fc075c9a4db3690ecd2a148068" [[package]] name = "proc-macro2" -version = "1.0.56" +version = "1.0.58" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2b63bdb0cd06f1f4dedf69b254734f9b45af66e4a031e42a7480257d9898b435" +checksum = "fa1fb82fc0c281dd9671101b66b771ebbe1eaf967b96ac8740dcba4b70005ca8" dependencies = [ "unicode-ident", ] @@ -2994,7 +2943,7 @@ dependencies = [ "byteorder", "hex", "lazy_static", - "rustix 0.36.12", + "rustix 0.36.14", ] [[package]] @@ -3078,7 +3027,7 @@ dependencies = [ "bstr", "bytes", "chrono", - "clap 4.2.2", + "clap 4.3.0", "consumption_metrics", "futures", "git-version", @@ -3116,7 +3065,7 @@ dependencies = [ "serde", "serde_json", "sha2", - "socket2 0.5.2", + "socket2 0.5.3", "sync_wrapper", "thiserror", "tls-listener", @@ -3139,9 +3088,9 @@ dependencies = [ [[package]] name = "quote" -version = "1.0.26" +version = "1.0.27" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4424af4bf778aae2051a77b60283332f386554255d722233d09fbfc7e30da2fc" +checksum = "8f4f29d145265ec1c483c7c654450edde0bfe043d3938d6972630663356d9500" dependencies = [ "proc-macro2", ] @@ -3230,13 +3179,13 @@ dependencies = [ [[package]] name = "regex" -version = "1.7.3" +version = "1.8.2" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "8b1f693b24f6ac912f4893ef08244d70b6067480d2f1a46e950c9691e6749d1d" +checksum = "d1a59b5d8e97dee33696bf13c5ba8ab85341c002922fba050069326b9c498974" dependencies = [ "aho-corasick", "memchr", - "regex-syntax", + "regex-syntax 0.7.2", ] [[package]] @@ -3245,7 +3194,7 @@ version = "0.1.10" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6c230d73fb8d8c1b9c0b3135c5142a8acee3a0558fb8db5cf1cb65f8d7862132" dependencies = [ - "regex-syntax", + "regex-syntax 0.6.29", ] [[package]] @@ -3254,6 +3203,12 @@ version = "0.6.29" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f162c6dd7b008981e4d40210aca20b4bd0f9b60ca9271061b07f78537722f2e1" +[[package]] +name = "regex-syntax" +version = "0.7.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "436b050e76ed2903236f032a59761c1eb99e1b0aead2c257922771dab1fc8c78" + [[package]] name = "remote_storage" version = "0.1.0" @@ -3283,11 +3238,11 @@ dependencies = [ [[package]] name = "reqwest" -version = "0.11.16" +version = "0.11.18" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "27b71749df584b7f4cac2c426c127a7c785a5106cc98f7a8feb044115f0fa254" +checksum = "cde824a14b7c14f85caff81225f411faacc04a2013f41670f41443742b1c1c55" dependencies = [ - "base64 0.21.0", + "base64 0.21.1", "bytes", "encoding_rs", "futures-core", @@ -3296,7 +3251,7 @@ dependencies = [ "http", "http-body", "hyper", - "hyper-rustls", + "hyper-rustls 0.24.0", "ipnet", "js-sys", "log", @@ -3305,13 +3260,13 @@ dependencies = [ "once_cell", "percent-encoding", "pin-project-lite", - "rustls 0.20.8", + "rustls 0.21.1", "rustls-pemfile", "serde", "serde_json", "serde_urlencoded", "tokio", - "tokio-rustls 0.23.4", + "tokio-rustls 0.24.0", "tower-service", "url", "wasm-bindgen", @@ -3323,9 +3278,9 @@ dependencies = [ [[package]] name = "reqwest-middleware" -version = "0.2.1" +version = "0.2.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "99c50db2c7ccd815f976473dd7d0bde296f8c3b77c383acf4fc021cdcf10852b" +checksum = "4531c89d50effe1fac90d095c8b133c20c5c714204feee0bfc3fd158e784209d" dependencies = [ "anyhow", "async-trait", @@ -3338,12 +3293,14 @@ dependencies = [ [[package]] name = "reqwest-tracing" -version = "0.4.1" +version = "0.4.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8a71d77945a1c5ae9604f0504901e77a1e2e71f2932b1cb8103078179ca62ff8" +checksum = "783e8130d2427ddd7897dd3f814d4a3aea31b05deb42a4fdf8c18258fe5aefd1" dependencies = [ + "anyhow", "async-trait", "getrandom", + "matchit", "opentelemetry", "reqwest", "reqwest-middleware", @@ -3417,9 +3374,9 @@ dependencies = [ [[package]] name = "rustc-demangle" -version = "0.1.22" +version = "0.1.23" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d4a36c42d1873f9a77c53bde094f9664d9891bc604a45b4798fd2c389ed12e5b" +checksum = "d626bb9dae77e28219937af045c257c28bfd3f69333c512553507f5f9798cb76" [[package]] name = "rustc-hash" @@ -3447,9 +3404,9 @@ dependencies = [ [[package]] name = "rustix" -version = "0.36.12" +version = "0.36.14" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e0af200a3324fa5bcd922e84e9b55a298ea9f431a489f01961acdebc6e908f25" +checksum = "14e4d67015953998ad0eb82887a0eb0129e18a7e2f3b7b0f6c422fddcd503d62" dependencies = [ "bitflags", "errno", @@ -3461,15 +3418,15 @@ dependencies = [ [[package]] name = "rustix" -version = "0.37.11" +version = 
"0.37.19" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "85597d61f83914ddeba6a47b3b8ffe7365107221c2e557ed94426489fefb5f77" +checksum = "acf8729d8542766f1b2cf77eb034d52f40d375bb8b615d0b147089946e16613d" dependencies = [ "bitflags", "errno", "io-lifetimes", "libc", - "linux-raw-sys 0.3.1", + "linux-raw-sys 0.3.8", "windows-sys 0.48.0", ] @@ -3487,9 +3444,9 @@ dependencies = [ [[package]] name = "rustls" -version = "0.21.0" +version = "0.21.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "07180898a28ed6a7f7ba2311594308f595e3dd2e3c3812fa0a80a47b45f17e5d" +checksum = "c911ba11bc8433e811ce56fde130ccf32f5127cab0e0194e9c68c5a5b671791e" dependencies = [ "log", "ring", @@ -3515,7 +3472,7 @@ version = "1.0.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d194b56d58803a43635bdc398cd17e383d6f71f9182b9a192c127ca42494a59b" dependencies = [ - "base64 0.21.0", + "base64 0.21.1", ] [[package]] @@ -3550,7 +3507,7 @@ dependencies = [ "byteorder", "bytes", "chrono", - "clap 4.2.2", + "clap 4.3.0", "const_format", "crc32c", "fs2", @@ -3624,12 +3581,6 @@ version = "1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d29ab0c6d3fc0ee92fe66e2d99f700eab17a8d57d1c1d3b748380fb20baa78cd" -[[package]] -name = "scratch" -version = "1.0.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1792db035ce95be60c3f8853017b3999209281c24e2ba5bc8e59bf97a0c590c1" - [[package]] name = "sct" version = "0.7.0" @@ -3642,9 +3593,9 @@ dependencies = [ [[package]] name = "security-framework" -version = "2.8.2" +version = "2.9.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a332be01508d814fed64bf28f798a146d73792121129962fdf335bb3c49a4254" +checksum = "1fc758eb7bffce5b308734e9b0c1468893cae9ff70ebf13e7090be8dcbcc83a8" dependencies = [ "bitflags", "core-foundation", @@ -3655,9 +3606,9 @@ dependencies = [ [[package]] name = "security-framework-sys" -version = "2.8.0" +version = "2.9.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "31c9bb296072e961fcbd8853511dd39c2d8be2deb1e17c6860b1d30732b323b4" +checksum = "f51d0c0d83bec45f16480d0ce0058397a69e48fcdc52d1dc8855fb68acbd31a7" dependencies = [ "core-foundation-sys", "libc", @@ -3755,22 +3706,22 @@ dependencies = [ [[package]] name = "serde" -version = "1.0.160" +version = "1.0.163" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bb2f3770c8bce3bcda7e149193a069a0f4365bda1fa5cd88e03bca26afc1216c" +checksum = "2113ab51b87a539ae008b5c6c02dc020ffa39afd2d83cffcb3f4eb2722cebec2" dependencies = [ "serde_derive", ] [[package]] name = "serde_derive" -version = "1.0.160" +version = "1.0.163" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "291a097c63d8497e00160b166a967a4a79c64f3facdd01cbd7502231688d77df" +checksum = "8c805777e3930c8883389c602315a24224bcc738b63905ef87cd1420353ea93e" dependencies = [ "proc-macro2", "quote", - "syn 2.0.15", + "syn 2.0.16", ] [[package]] @@ -3786,9 +3737,9 @@ dependencies = [ [[package]] name = "serde_spanned" -version = "0.6.1" +version = "0.6.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0efd8caf556a6cebd3b285caf480045fcc1ac04f6bd786b09a6f11af30c4fcf4" +checksum = "93107647184f6027e3b7dcb2e11034cf95ffa1e3a682c67951963ac69c1c007d" dependencies = [ "serde", ] @@ -3807,9 +3758,9 @@ dependencies = [ [[package]] name = "serde_with" -version = "2.3.2" +version = "2.3.3" 
source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "331bb8c3bf9b92457ab7abecf07078c13f7d270ba490103e84e8b014490cd0b0" +checksum = "07ff71d2c147a7b57362cead5e22f772cd52f6ab31cfcd9edcd7f6aeb2a0afbe" dependencies = [ "base64 0.13.1", "chrono", @@ -3823,14 +3774,14 @@ dependencies = [ [[package]] name = "serde_with_macros" -version = "2.3.2" +version = "2.3.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "859011bddcc11f289f07f467cc1fe01c7a941daa4d8f6c40d4d1c92eb6d9319c" +checksum = "881b6f881b17d13214e5d494c939ebab463d01264ce1811e9d4ac3a882e7695f" dependencies = [ "darling", "proc-macro2", "quote", - "syn 1.0.109", + "syn 2.0.16", ] [[package]] @@ -3944,9 +3895,9 @@ dependencies = [ [[package]] name = "socket2" -version = "0.5.2" +version = "0.5.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6d283f86695ae989d1e18440a943880967156325ba025f05049946bff47bcc2b" +checksum = "2538b18701741680e0322a2302176d3253a35388e2e62f172f64f4f16605f877" dependencies = [ "libc", "windows-sys 0.48.0", @@ -3986,7 +3937,7 @@ dependencies = [ "anyhow", "async-stream", "bytes", - "clap 4.2.2", + "clap 4.3.0", "const_format", "futures", "futures-core", @@ -4000,8 +3951,8 @@ dependencies = [ "prost", "tokio", "tokio-stream", - "tonic 0.9.1", - "tonic-build 0.9.1", + "tonic 0.9.2", + "tonic-build 0.9.2", "tracing", "utils", "workspace_hack", @@ -4044,9 +3995,9 @@ dependencies = [ [[package]] name = "subtle" -version = "2.4.1" +version = "2.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6bdef32e8150c2a081110b42772ffe7d7c9032b606bc226c8260fd97e0976601" +checksum = "81cdd64d312baedb58e21336b31bc043b77e01cc99033ce76ef539f78e965ebc" [[package]] name = "svg_fmt" @@ -4067,9 +4018,9 @@ dependencies = [ [[package]] name = "syn" -version = "2.0.15" +version = "2.0.16" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a34fcf3e8b60f57e6a14301a2e916d323af98b0ea63c599441eec8558660c822" +checksum = "a6f671d4b5ffdb8eadec19c0ae67fe2639df8684bd7bc4b83d986b8db549cf01" dependencies = [ "proc-macro2", "quote", @@ -4123,7 +4074,7 @@ dependencies = [ "cfg-if", "fastrand", "redox_syscall 0.3.5", - "rustix 0.37.11", + "rustix 0.37.19", "windows-sys 0.45.0", ] @@ -4190,7 +4141,7 @@ checksum = "f9456a42c5b0d803c8cd86e73dd7cc9edd429499f37a3550d286d5e86720569f" dependencies = [ "proc-macro2", "quote", - "syn 2.0.15", + "syn 2.0.16", ] [[package]] @@ -4205,9 +4156,9 @@ dependencies = [ [[package]] name = "time" -version = "0.3.20" +version = "0.3.21" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cd0cbfecb4d19b5ea75bb31ad904eb5b9fa13f21079c3b92017ebdf4999a5890" +checksum = "8f3403384eaacbca9923fa06940178ac13e4edb725486d70e8e15881d0c836cc" dependencies = [ "itoa", "serde", @@ -4217,15 +4168,15 @@ dependencies = [ [[package]] name = "time-core" -version = "0.1.0" +version = "0.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2e153e1f1acaef8acc537e68b44906d2db6436e2b35ac2c6b42640fff91f00fd" +checksum = "7300fbefb4dadc1af235a9cef3737cea692a9d97e1b9cbcd4ebdae6f8868e6fb" [[package]] name = "time-macros" -version = "0.2.8" +version = "0.2.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fd80a657e71da814b8e5d60d3374fc6d35045062245d80224748ae522dd76f36" +checksum = "372950940a5f07bf38dbe211d7283c9e6d7327df53794992d293e534c733d09b" dependencies = [ "time-core", ] @@ -4271,9 +4222,9 @@ dependencies = [ [[package]] 
name = "tokio" -version = "1.27.0" +version = "1.28.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d0de47a4eecbe11f498978a9b29d792f0d2692d1dd003650c24c76510e3bc001" +checksum = "0aa32867d44e6f2ce3385e89dceb990188b8bb0fb25b0cf576647a6f98ac5105" dependencies = [ "autocfg", "bytes", @@ -4284,7 +4235,7 @@ dependencies = [ "signal-hook-registry", "socket2 0.4.9", "tokio-macros", - "windows-sys 0.45.0", + "windows-sys 0.48.0", ] [[package]] @@ -4299,13 +4250,13 @@ dependencies = [ [[package]] name = "tokio-macros" -version = "2.0.0" +version = "2.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "61a573bdc87985e9d6ddeed1b3d864e8a302c847e40d647746df2f1de209d1ce" +checksum = "630bdcf245f78637c13ec01ffae6187cca34625e8c63150d424b59e55af2675e" dependencies = [ "proc-macro2", "quote", - "syn 2.0.15", + "syn 2.0.16", ] [[package]] @@ -4321,7 +4272,7 @@ dependencies = [ [[package]] name = "tokio-postgres" version = "0.7.7" -source = "git+https://github.com/neondatabase/rust-postgres.git?rev=2e9b5f1ddc481d1a98fa79f6b9378ac4f170b7c9#2e9b5f1ddc481d1a98fa79f6b9378ac4f170b7c9" +source = "git+https://github.com/neondatabase/rust-postgres.git?rev=f6ec31df3bcce89cb34f300f17c8a8c031c5ee8c#f6ec31df3bcce89cb34f300f17c8a8c031c5ee8c" dependencies = [ "async-trait", "byteorder", @@ -4372,15 +4323,15 @@ version = "0.24.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e0d409377ff5b1e3ca6437aa86c1eb7d40c134bfec254e44c830defa92669db5" dependencies = [ - "rustls 0.21.0", + "rustls 0.21.1", "tokio", ] [[package]] name = "tokio-stream" -version = "0.1.12" +version = "0.1.14" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8fb52b74f05dbf495a8fba459fdc331812b96aa086d9eb78101fa0d4569c3313" +checksum = "397c988d37662c7dda6d2208364a706264bf3d6138b11d436cbac0ad38832842" dependencies = [ "futures-core", "pin-project-lite", @@ -4415,9 +4366,9 @@ dependencies = [ [[package]] name = "tokio-util" -version = "0.7.7" +version = "0.7.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5427d89453009325de0d8f342c9490009f76e999cb7672d77e46267448f7e6b2" +checksum = "806fe8c2c87eccc8b3267cbae29ed3ab2d0bd37fca70ab622e46aaa9375ddb7d" dependencies = [ "bytes", "futures-core", @@ -4429,9 +4380,9 @@ dependencies = [ [[package]] name = "toml" -version = "0.7.3" +version = "0.7.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b403acf6f2bb0859c93c7f0d967cb4a75a7ac552100f9322faf64dc047669b21" +checksum = "d6135d499e69981f9ff0ef2167955a5333c35e36f6937d382974566b3d5b94ec" dependencies = [ "serde", "serde_spanned", @@ -4441,18 +4392,18 @@ dependencies = [ [[package]] name = "toml_datetime" -version = "0.6.1" +version = "0.6.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3ab8ed2edee10b50132aed5f331333428b011c99402b5a534154ed15746f9622" +checksum = "5a76a9312f5ba4c2dec6b9161fdf25d87ad8a09256ccea5a556fef03c706a10f" dependencies = [ "serde", ] [[package]] name = "toml_edit" -version = "0.19.8" +version = "0.19.10" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "239410c8609e8125456927e6707163a3b1fdb40561e4b803bc041f466ccfdc13" +checksum = "2380d56e8670370eee6566b0bfd4265f65b3f432e8c6d85623f728d4fa31f739" dependencies = [ "indexmap", "serde", @@ -4495,14 +4446,14 @@ dependencies = [ [[package]] name = "tonic" -version = "0.9.1" +version = "0.9.2" source = "registry+https://github.com/rust-lang/crates.io-index" 
-checksum = "38bd8e87955eb13c1986671838177d6792cdc52af9bffced0d2c8a9a7f741ab3" +checksum = "3082666a3a6433f7f511c7192923fa1fe07c69332d3c6a2e6bb040b569199d5a" dependencies = [ "async-stream", "async-trait", "axum", - "base64 0.21.0", + "base64 0.21.1", "bytes", "futures-core", "futures-util", @@ -4540,9 +4491,9 @@ dependencies = [ [[package]] name = "tonic-build" -version = "0.9.1" +version = "0.9.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0f60a933bbea70c95d633c04c951197ddf084958abaa2ed502a3743bdd8d8dd7" +checksum = "a6fdaae4c2c638bb70fe42803a26fbd6fc6ac8c72f5c59f67ecc2a2dcabf4b07" dependencies = [ "prettyplease 0.1.25", "proc-macro2", @@ -4588,7 +4539,7 @@ name = "trace" version = "0.1.0" dependencies = [ "anyhow", - "clap 4.2.2", + "clap 4.3.0", "pageserver_api", "utils", "workspace_hack", @@ -4609,20 +4560,20 @@ dependencies = [ [[package]] name = "tracing-attributes" -version = "0.1.23" +version = "0.1.24" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4017f8f45139870ca7e672686113917c71c7a6e02d4924eda67186083c03081a" +checksum = "0f57e3ca2a01450b1a921183a9c9cbfda207fd822cef4ccb00a65402cbba7a74" dependencies = [ "proc-macro2", "quote", - "syn 1.0.109", + "syn 2.0.16", ] [[package]] name = "tracing-core" -version = "0.1.30" +version = "0.1.31" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "24eb03ba0eab1fd845050058ce5e616558e8f8d8fca633e6b163fe25c797213a" +checksum = "0955b8137a1df6f1a2e9a37d8a6656291ff0297c1a97c24e0d8425fe2312f79a" dependencies = [ "once_cell", "valuable", @@ -4685,9 +4636,9 @@ dependencies = [ [[package]] name = "tracing-subscriber" -version = "0.3.16" +version = "0.3.17" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a6176eae26dd70d0c919749377897b54a9276bd7061339665dd68777926b5a70" +checksum = "30a651bc37f915e81f087d86e62a18eec5f79550c7faff886f7090b4ea757c77" dependencies = [ "matchers", "nu-ansi-term", @@ -4777,9 +4728,9 @@ checksum = "92888ba5573ff080736b3648696b70cafad7d250551175acbaa4e0385b3e1460" [[package]] name = "unicode-ident" -version = "1.0.8" +version = "1.0.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e5464a87b239f13a63a501f2701565754bae92d243d4bb7eb12f6d57d2269bf4" +checksum = "b15811caf2415fb889178633e7724bad2509101cde276048e013b9def5e51fa0" [[package]] name = "unicode-normalization" @@ -4899,9 +4850,9 @@ dependencies = [ [[package]] name = "uuid" -version = "1.3.1" +version = "1.3.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5b55a3fef2a1e3b3a00ce878640918820d3c51081576ac657d23af9fc7928fdb" +checksum = "345444e32442451b267fc254ae85a209c64be56d2890e601a0c37ff0c3c5ecd2" dependencies = [ "getrandom", "serde", @@ -4936,13 +4887,15 @@ name = "wal_craft" version = "0.1.0" dependencies = [ "anyhow", - "clap 4.2.2", + "clap 4.3.0", "env_logger", "log", "once_cell", "postgres", "postgres_ffi", + "regex", "tempfile", + "utils", "workspace_hack", ] @@ -4974,9 +4927,9 @@ checksum = "9c8d87e72b64a3b4db28d11ce29237c246188f4f51057d65a7eab63b7987e423" [[package]] name = "wasm-bindgen" -version = "0.2.84" +version = "0.2.86" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "31f8dcbc21f30d9b8f2ea926ecb58f6b91192c17e9d33594b3df58b2007ca53b" +checksum = "5bba0e8cb82ba49ff4e229459ff22a191bbe9a1cb3a341610c9c33efc27ddf73" dependencies = [ "cfg-if", "wasm-bindgen-macro", @@ -4984,24 +4937,24 @@ dependencies = [ [[package]] name = "wasm-bindgen-backend" 
-version = "0.2.84" +version = "0.2.86" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "95ce90fd5bcc06af55a641a86428ee4229e44e07033963a2290a8e241607ccb9" +checksum = "19b04bc93f9d6bdee709f6bd2118f57dd6679cf1176a1af464fca3ab0d66d8fb" dependencies = [ "bumpalo", "log", "once_cell", "proc-macro2", "quote", - "syn 1.0.109", + "syn 2.0.16", "wasm-bindgen-shared", ] [[package]] name = "wasm-bindgen-futures" -version = "0.4.34" +version = "0.4.36" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f219e0d211ba40266969f6dbdd90636da12f75bee4fc9d6c23d1260dadb51454" +checksum = "2d1985d03709c53167ce907ff394f5316aa22cb4e12761295c5dc57dacb6297e" dependencies = [ "cfg-if", "js-sys", @@ -5011,9 +4964,9 @@ dependencies = [ [[package]] name = "wasm-bindgen-macro" -version = "0.2.84" +version = "0.2.86" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4c21f77c0bedc37fd5dc21f897894a5ca01e7bb159884559461862ae90c0b4c5" +checksum = "14d6b024f1a526bb0234f52840389927257beb670610081360e5a03c5df9c258" dependencies = [ "quote", "wasm-bindgen-macro-support", @@ -5021,28 +4974,28 @@ dependencies = [ [[package]] name = "wasm-bindgen-macro-support" -version = "0.2.84" +version = "0.2.86" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2aff81306fcac3c7515ad4e177f521b5c9a15f2b08f4e32d823066102f35a5f6" +checksum = "e128beba882dd1eb6200e1dc92ae6c5dbaa4311aa7bb211ca035779e5efc39f8" dependencies = [ "proc-macro2", "quote", - "syn 1.0.109", + "syn 2.0.16", "wasm-bindgen-backend", "wasm-bindgen-shared", ] [[package]] name = "wasm-bindgen-shared" -version = "0.2.84" +version = "0.2.86" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0046fef7e28c3804e5e38bfa31ea2a0f73905319b677e57ebe37e49358989b5d" +checksum = "ed9d5b4305409d1fc9482fee2d7f9bcbf24b3972bf59817ef757e23982242a93" [[package]] name = "web-sys" -version = "0.3.61" +version = "0.3.63" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e33b99f4b23ba3eec1a53ac264e35a755f00e966e0065077d6027c0f575b0b97" +checksum = "3bdd9ef4e984da1187bf8110c5cf5b845fbc87a23602cdf912386a76fcd3a7c2" dependencies = [ "js-sys", "wasm-bindgen", @@ -5276,9 +5229,9 @@ checksum = "1a515f5799fe4961cb532f983ce2b23082366b898e52ffbce459c86f67c8378a" [[package]] name = "winnow" -version = "0.4.1" +version = "0.4.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ae8970b36c66498d8ff1d66685dc86b91b29db0c7739899012f63a63814b4b28" +checksum = "61de7bac303dc551fe038e2b3cef0f571087a47571ea6e79a87692ac99b99699" dependencies = [ "memchr", ] @@ -5299,7 +5252,7 @@ dependencies = [ "anyhow", "bytes", "chrono", - "clap 4.2.2", + "clap 4.3.0", "clap_builder", "crossbeam-utils", "either", @@ -5310,7 +5263,6 @@ dependencies = [ "futures-executor", "futures-sink", "futures-util", - "hashbrown 0.12.3", "itertools", "libc", "log", @@ -5322,7 +5274,7 @@ dependencies = [ "prost", "rand", "regex", - "regex-syntax", + "regex-syntax 0.7.2", "reqwest", "ring", "rustls 0.20.8", @@ -5331,7 +5283,7 @@ dependencies = [ "serde_json", "socket2 0.4.9", "syn 1.0.109", - "syn 2.0.15", + "syn 2.0.16", "tokio", "tokio-rustls 0.23.4", "tokio-util", diff --git a/Cargo.toml b/Cargo.toml index 7895459841..dc34705f8d 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -3,12 +3,26 @@ members = [ "compute_tools", "control_plane", "pageserver", + "pageserver/ctl", "proxy", "safekeeper", "storage_broker", "workspace_hack", "trace", - "libs/*", + 
"libs/compute_api", + "libs/pageserver_api", + "libs/postgres_ffi", + "libs/safekeeper_api", + "libs/utils", + "libs/consumption_metrics", + "libs/postgres_backend", + "libs/pq_proto", + "libs/tenant_size_model", + "libs/metrics", + "libs/postgres_connection", + "libs/remote_storage", + "libs/tracing-utils", + "libs/postgres_ffi/wal_craft", ] [workspace.package] @@ -22,7 +36,7 @@ async-stream = "0.3" async-trait = "0.1" atty = "0.2.14" aws-config = { version = "0.55", default-features = false, features=["rustls"] } -aws-sdk-s3 = "0.25" +aws-sdk-s3 = "0.27" aws-smithy-http = "0.55" aws-credential-types = "0.55" aws-types = "0.55" @@ -126,11 +140,11 @@ env_logger = "0.10" log = "0.4" ## Libraries from neondatabase/ git forks, ideally with changes to be upstreamed -postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="2e9b5f1ddc481d1a98fa79f6b9378ac4f170b7c9" } -postgres-native-tls = { git = "https://github.com/neondatabase/rust-postgres.git", rev="2e9b5f1ddc481d1a98fa79f6b9378ac4f170b7c9" } -postgres-protocol = { git = "https://github.com/neondatabase/rust-postgres.git", rev="2e9b5f1ddc481d1a98fa79f6b9378ac4f170b7c9" } -postgres-types = { git = "https://github.com/neondatabase/rust-postgres.git", rev="2e9b5f1ddc481d1a98fa79f6b9378ac4f170b7c9" } -tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="2e9b5f1ddc481d1a98fa79f6b9378ac4f170b7c9" } +postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="f6ec31df3bcce89cb34f300f17c8a8c031c5ee8c" } +postgres-native-tls = { git = "https://github.com/neondatabase/rust-postgres.git", rev="f6ec31df3bcce89cb34f300f17c8a8c031c5ee8c" } +postgres-protocol = { git = "https://github.com/neondatabase/rust-postgres.git", rev="f6ec31df3bcce89cb34f300f17c8a8c031c5ee8c" } +postgres-types = { git = "https://github.com/neondatabase/rust-postgres.git", rev="f6ec31df3bcce89cb34f300f17c8a8c031c5ee8c" } +tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="f6ec31df3bcce89cb34f300f17c8a8c031c5ee8c" } tokio-tar = { git = "https://github.com/neondatabase/tokio-tar.git", rev="404df61437de0feef49ba2ccdbdd94eb8ad6e142" } ## Other git libraries @@ -166,7 +180,7 @@ tonic-build = "0.9" # This is only needed for proxy's tests. # TODO: we should probably fork `tokio-postgres-rustls` instead. -tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="2e9b5f1ddc481d1a98fa79f6b9378ac4f170b7c9" } +tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="f6ec31df3bcce89cb34f300f17c8a8c031c5ee8c" } # Changes the MAX_THREADS limit from 4096 to 32768. 
# This is a temporary workaround for using tracing from many threads in safekeepers code, diff --git a/Dockerfile b/Dockerfile index 7364654641..9467e41ae4 100644 --- a/Dockerfile +++ b/Dockerfile @@ -47,8 +47,7 @@ RUN set -e \ && mold -run cargo build \ --bin pg_sni_router \ --bin pageserver \ - --bin pageserver_binutils \ - --bin draw_timeline_dir \ + --bin pagectl \ --bin safekeeper \ --bin storage_broker \ --bin proxy \ @@ -73,8 +72,7 @@ RUN set -e \ COPY --from=build --chown=neon:neon /home/nonroot/target/release/pg_sni_router /usr/local/bin COPY --from=build --chown=neon:neon /home/nonroot/target/release/pageserver /usr/local/bin -COPY --from=build --chown=neon:neon /home/nonroot/target/release/pageserver_binutils /usr/local/bin -COPY --from=build --chown=neon:neon /home/nonroot/target/release/draw_timeline_dir /usr/local/bin +COPY --from=build --chown=neon:neon /home/nonroot/target/release/pagectl /usr/local/bin COPY --from=build --chown=neon:neon /home/nonroot/target/release/safekeeper /usr/local/bin COPY --from=build --chown=neon:neon /home/nonroot/target/release/storage_broker /usr/local/bin COPY --from=build --chown=neon:neon /home/nonroot/target/release/proxy /usr/local/bin diff --git a/Dockerfile.compute-node b/Dockerfile.compute-node index 3a3dee8a8a..44e13a6c73 100644 --- a/Dockerfile.compute-node +++ b/Dockerfile.compute-node @@ -517,6 +517,22 @@ RUN wget https://github.com/kelvich/pg_tiktoken/archive/801f84f08c6881c8aa30f405 cargo pgx install --release && \ echo "trusted = true" >> /usr/local/pgsql/share/extension/pg_tiktoken.control +######################################################################################### +# +# Layer "pg-pgx-ulid-build" +# Compile "pgx_ulid" extension +# +######################################################################################### + +FROM rust-extensions-build AS pg-pgx-ulid-build + +RUN wget https://github.com/pksunkara/pgx_ulid/archive/refs/tags/v0.1.0.tar.gz -O pgx_ulid.tar.gz && \ + echo "908b7358e6f846e87db508ae5349fb56a88ee6305519074b12f3d5b0ff09f791 pgx_ulid.tar.gz" | sha256sum --check && \ + mkdir pgx_ulid-src && cd pgx_ulid-src && tar xvzf ../pgx_ulid.tar.gz --strip-components=1 -C . 
&& \ + sed -i 's/pgx = "=0.7.3"/pgx = { version = "0.7.3", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \ + cargo pgx install --release && \ + echo "trusted = true" >> /usr/local/pgsql/share/extension/ulid.control + ######################################################################################### # # Layer "neon-pg-ext-build" @@ -547,6 +563,7 @@ COPY --from=timescaledb-pg-build /usr/local/pgsql/ /usr/local/pgsql/ COPY --from=pg-hint-plan-pg-build /usr/local/pgsql/ /usr/local/pgsql/ COPY --from=kq-imcx-pg-build /usr/local/pgsql/ /usr/local/pgsql/ COPY --from=pg-cron-pg-build /usr/local/pgsql/ /usr/local/pgsql/ +COPY --from=pg-pgx-ulid-build /usr/local/pgsql/ /usr/local/pgsql/ COPY pgxn/ pgxn/ RUN make -j $(getconf _NPROCESSORS_ONLN) \ @@ -556,6 +573,10 @@ RUN make -j $(getconf _NPROCESSORS_ONLN) \ make -j $(getconf _NPROCESSORS_ONLN) \ PG_CONFIG=/usr/local/pgsql/bin/pg_config \ -C pgxn/neon_utils \ + -s install && \ + make -j $(getconf _NPROCESSORS_ONLN) \ + PG_CONFIG=/usr/local/pgsql/bin/pg_config \ + -C pgxn/hnsw \ -s install ######################################################################################### @@ -632,6 +653,7 @@ RUN apt update && \ libxml2 \ libxslt1.1 \ libzstd1 \ + libcurl4-openssl-dev \ procps && \ rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* && \ localedef -i en_US -c -f UTF-8 -A /usr/share/locale/locale.alias en_US.UTF-8 diff --git a/Dockerfile.vm-compute-node b/Dockerfile.vm-compute-node deleted file mode 100644 index aabb3c9953..0000000000 --- a/Dockerfile.vm-compute-node +++ /dev/null @@ -1,70 +0,0 @@ -# Note: this file *mostly* just builds on Dockerfile.compute-node - -ARG SRC_IMAGE -ARG VM_INFORMANT_VERSION=v0.1.14 -# on libcgroup update, make sure to check bootstrap.sh for changes -ARG LIBCGROUP_VERSION=v2.0.3 - -# Pull VM informant, to copy from later -FROM neondatabase/vm-informant:$VM_INFORMANT_VERSION as informant - -# Build cgroup-tools -# -# At time of writing (2023-03-14), debian bullseye has a version of cgroup-tools (technically -# libcgroup) that doesn't support cgroup v2 (version 0.41-11). Unfortunately, the vm-informant -# requires cgroup v2, so we'll build cgroup-tools ourselves. -FROM debian:bullseye-slim as libcgroup-builder -ARG LIBCGROUP_VERSION - -RUN set -exu \ - && apt update \ - && apt install --no-install-recommends -y \ - git \ - ca-certificates \ - automake \ - cmake \ - make \ - gcc \ - byacc \ - flex \ - libtool \ - libpam0g-dev \ - && git clone --depth 1 -b $LIBCGROUP_VERSION https://github.com/libcgroup/libcgroup \ - && INSTALL_DIR="/libcgroup-install" \ - && mkdir -p "$INSTALL_DIR/bin" "$INSTALL_DIR/include" \ - && cd libcgroup \ - # extracted from bootstrap.sh, with modified flags: - && (test -d m4 || mkdir m4) \ - && autoreconf -fi \ - && rm -rf autom4te.cache \ - && CFLAGS="-O3" ./configure --prefix="$INSTALL_DIR" --sysconfdir=/etc --localstatedir=/var --enable-opaque-hierarchy="name=systemd" \ - # actually build the thing... - && make install - -# Combine, starting from non-VM compute node image. 
-FROM $SRC_IMAGE as base - -# Temporarily set user back to root so we can run adduser, set inittab -USER root -RUN adduser vm-informant --disabled-password --no-create-home - -RUN set -e \ - && rm -f /etc/inittab \ - && touch /etc/inittab - -RUN set -e \ - && echo "::sysinit:cgconfigparser -l /etc/cgconfig.conf -s 1664" >> /etc/inittab \ - && CONNSTR="dbname=postgres user=cloud_admin sslmode=disable" \ - && ARGS="--auto-restart --cgroup=neon-postgres --pgconnstr=\"$CONNSTR\"" \ - && echo "::respawn:su vm-informant -c '/usr/local/bin/vm-informant $ARGS'" >> /etc/inittab - -USER postgres - -ADD vm-cgconfig.conf /etc/cgconfig.conf -COPY --from=informant /usr/bin/vm-informant /usr/local/bin/vm-informant - -COPY --from=libcgroup-builder /libcgroup-install/bin/* /usr/bin/ -COPY --from=libcgroup-builder /libcgroup-install/lib/* /usr/lib/ -COPY --from=libcgroup-builder /libcgroup-install/sbin/* /usr/sbin/ - -ENTRYPOINT ["/usr/sbin/cgexec", "-g", "*:neon-postgres", "/usr/local/bin/compute_ctl"] diff --git a/Makefile b/Makefile index 9d78c5d0fc..ae979b8b4c 100644 --- a/Makefile +++ b/Makefile @@ -138,6 +138,11 @@ neon-pg-ext-%: postgres-% $(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/$*/bin/pg_config CFLAGS='$(PG_CFLAGS) $(COPT)' \ -C $(POSTGRES_INSTALL_DIR)/build/neon-utils-$* \ -f $(ROOT_PROJECT_DIR)/pgxn/neon_utils/Makefile install + +@echo "Compiling hnsw $*" + mkdir -p $(POSTGRES_INSTALL_DIR)/build/hnsw-$* + $(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/$*/bin/pg_config CFLAGS='$(PG_CFLAGS) $(COPT)' \ + -C $(POSTGRES_INSTALL_DIR)/build/hnsw-$* \ + -f $(ROOT_PROJECT_DIR)/pgxn/hnsw/Makefile install .PHONY: neon-pg-ext-clean-% neon-pg-ext-clean-%: @@ -153,6 +158,9 @@ neon-pg-ext-clean-%: $(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/$*/bin/pg_config \ -C $(POSTGRES_INSTALL_DIR)/build/neon-utils-$* \ -f $(ROOT_PROJECT_DIR)/pgxn/neon_utils/Makefile clean + $(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/$*/bin/pg_config \ + -C $(POSTGRES_INSTALL_DIR)/build/hnsw-$* \ + -f $(ROOT_PROJECT_DIR)/pgxn/hnsw/Makefile clean .PHONY: neon-pg-ext neon-pg-ext: \ diff --git a/README.md b/README.md index 8e6f2cda81..efa714e5be 100644 --- a/README.md +++ b/README.md @@ -28,18 +28,19 @@ See developer documentation in [SUMMARY.md](/docs/SUMMARY.md) for more informati * On Ubuntu or Debian, this set of packages should be sufficient to build the code: ```bash apt install build-essential libtool libreadline-dev zlib1g-dev flex bison libseccomp-dev \ -libssl-dev clang pkg-config libpq-dev cmake postgresql-client protobuf-compiler +libssl-dev clang pkg-config libpq-dev cmake postgresql-client protobuf-compiler \ +libcurl4-openssl-dev ``` * On Fedora, these packages are needed: ```bash dnf install flex bison readline-devel zlib-devel openssl-devel \ libseccomp-devel perl clang cmake postgresql postgresql-contrib protobuf-compiler \ - protobuf-devel + protobuf-devel libcurl-devel ``` * On Arch based systems, these packages are needed: ```bash pacman -S base-devel readline zlib libseccomp openssl clang \ -postgresql-libs cmake postgresql protobuf +postgresql-libs cmake postgresql protobuf curl ``` Building Neon requires 3.15+ version of `protoc` (protobuf-compiler). If your distribution provides an older version, you can install a newer version from [here](https://github.com/protocolbuffers/protobuf/releases). 
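For context on the `protoc` (>= 3.15) requirement in the build prerequisites above: the workspace pins `tonic-build = "0.9"`, and tonic-build/prost-build generate the gRPC/protobuf code at build time by invoking the external `protoc` binary. A minimal sketch of the kind of `build.rs` that creates this dependency is shown below; the proto path is hypothetical and the repository's actual build scripts may differ.

```rust
// build.rs — illustrative sketch only; the proto path here is hypothetical,
// not necessarily the one used by this repository.
fn main() -> Result<(), Box<dyn std::error::Error>> {
    // Re-run code generation when the (hypothetical) proto definition changes.
    println!("cargo:rerun-if-changed=proto/broker.proto");

    // tonic-build delegates to prost-build, which shells out to the external
    // `protoc` compiler — hence protobuf-compiler in the package lists above.
    tonic_build::compile_protos("proto/broker.proto")?;
    Ok(())
}
```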
diff --git a/compute_tools/src/bin/compute_ctl.rs b/compute_tools/src/bin/compute_ctl.rs index 2f515c9bf1..c6cfde1d1a 100644 --- a/compute_tools/src/bin/compute_ctl.rs +++ b/compute_tools/src/bin/compute_ctl.rs @@ -59,6 +59,9 @@ fn main() -> Result<()> { let matches = cli().get_matches(); + let http_port = *matches + .get_one::("http-port") + .expect("http-port is required"); let pgdata = matches .get_one::("pgdata") .expect("PGDATA path is required"); @@ -178,7 +181,8 @@ fn main() -> Result<()> { // Launch http service first, so we were able to serve control-plane // requests, while configuration is still in progress. - let _http_handle = launch_http_server(&compute).expect("cannot launch http endpoint thread"); + let _http_handle = + launch_http_server(http_port, &compute).expect("cannot launch http endpoint thread"); if !spec_set { // No spec provided, hang waiting for it. @@ -286,6 +290,14 @@ fn cli() -> clap::Command { let version = option_env!("CARGO_PKG_VERSION").unwrap_or("unknown"); clap::Command::new("compute_ctl") .version(version) + .arg( + Arg::new("http-port") + .long("http-port") + .value_name("HTTP_PORT") + .default_value("3080") + .value_parser(clap::value_parser!(u16)) + .required(false), + ) .arg( Arg::new("connstr") .short('C') diff --git a/compute_tools/src/compute.rs b/compute_tools/src/compute.rs index da5ad00da6..617b330704 100644 --- a/compute_tools/src/compute.rs +++ b/compute_tools/src/compute.rs @@ -1,19 +1,3 @@ -// -// XXX: This starts to be scarry similar to the `PostgresNode` from `control_plane`, -// but there are several things that makes `PostgresNode` usage inconvenient in the -// cloud: -// - it inherits from `LocalEnv`, which contains **all-all** the information about -// a complete service running -// - it uses `PageServerNode` with information about http endpoint, which we do not -// need in the cloud again -// - many tiny pieces like, for example, we do not use `pg_ctl` in the cloud -// -// Thus, to use `PostgresNode` in the cloud, we need to 'mock' a bunch of required -// attributes (not required for the cloud). Yet, it is still tempting to unify these -// `PostgresNode` and `ComputeNode` and use one in both places. -// -// TODO: stabilize `ComputeNode` and think about using it in the `control_plane`. -// use std::fs; use std::os::unix::fs::PermissionsExt; use std::path::Path; @@ -106,26 +90,38 @@ pub struct ParsedSpec { impl TryFrom for ParsedSpec { type Error = String; fn try_from(spec: ComputeSpec) -> Result { + // Extract the options from the spec file that are needed to connect to + // the storage system. + // + // For backwards-compatibility, the top-level fields in the spec file + // may be empty. In that case, we need to dig them from the GUCs in the + // cluster.settings field. let pageserver_connstr = spec - .cluster - .settings - .find("neon.pageserver_connstring") + .pageserver_connstring + .clone() + .or_else(|| spec.cluster.settings.find("neon.pageserver_connstring")) .ok_or("pageserver connstr should be provided")?; let storage_auth_token = spec.storage_auth_token.clone(); - let tenant_id: TenantId = spec - .cluster - .settings - .find("neon.tenant_id") - .ok_or("tenant id should be provided") - .map(|s| TenantId::from_str(&s))? - .or(Err("invalid tenant id"))?; - let timeline_id: TimelineId = spec - .cluster - .settings - .find("neon.timeline_id") - .ok_or("timeline id should be provided") - .map(|s| TimelineId::from_str(&s))? 
- .or(Err("invalid timeline id"))?; + let tenant_id: TenantId = if let Some(tenant_id) = spec.tenant_id { + tenant_id + } else { + spec.cluster + .settings + .find("neon.tenant_id") + .ok_or("tenant id should be provided") + .map(|s| TenantId::from_str(&s))? + .or(Err("invalid tenant id"))? + }; + let timeline_id: TimelineId = if let Some(timeline_id) = spec.timeline_id { + timeline_id + } else { + spec.cluster + .settings + .find("neon.timeline_id") + .ok_or("timeline id should be provided") + .map(|s| TimelineId::from_str(&s))? + .or(Err("invalid timeline id"))? + }; Ok(ParsedSpec { spec, @@ -295,8 +291,8 @@ impl ComputeNode { update_pg_hba(pgdata_path)?; match spec.mode { - ComputeMode::Primary | ComputeMode::Static(..) => {} - ComputeMode::Replica => { + ComputeMode::Primary => {} + ComputeMode::Replica | ComputeMode::Static(..) => { add_standby_signal(pgdata_path)?; } } @@ -362,6 +358,8 @@ impl ComputeNode { }; // Proceed with post-startup configuration. Note, that order of operations is important. + // Disable DDL forwarding because control plane already knows about these roles/databases. + client.simple_query("SET neon.forward_ddl = false")?; let spec = &compute_state.pspec.as_ref().expect("spec must be set").spec; handle_roles(spec, &mut client)?; handle_databases(spec, &mut client)?; @@ -374,7 +372,7 @@ impl ComputeNode { info!( "finished configuration of compute for project {}", - spec.cluster.cluster_id + spec.cluster.cluster_id.as_deref().unwrap_or("None") ); Ok(()) @@ -403,7 +401,9 @@ impl ComputeNode { self.pg_reload_conf(&mut client)?; // Proceed with post-startup configuration. Note, that order of operations is important. + // Disable DDL forwarding because control plane already knows about these roles/databases. if spec.mode == ComputeMode::Primary { + client.simple_query("SET neon.forward_ddl = false")?; handle_roles(&spec, &mut client)?; handle_databases(&spec, &mut client)?; handle_role_deletions(&spec, self.connstr.as_str(), &mut client)?; @@ -430,7 +430,7 @@ impl ComputeNode { let spec = compute_state.pspec.as_ref().expect("spec must be set"); info!( "starting compute for project {}, operation {}, tenant {}, timeline {}", - spec.spec.cluster.cluster_id, + spec.spec.cluster.cluster_id.as_deref().unwrap_or("None"), spec.spec.operation_uuid.as_deref().unwrap_or("None"), spec.tenant_id, spec.timeline_id, diff --git a/compute_tools/src/config.rs b/compute_tools/src/config.rs index 1168f3876a..99346433d0 100644 --- a/compute_tools/src/config.rs +++ b/compute_tools/src/config.rs @@ -5,6 +5,7 @@ use std::path::Path; use anyhow::Result; +use crate::pg_helpers::escape_conf_value; use crate::pg_helpers::PgOptionsSerialize; use compute_api::spec::{ComputeMode, ComputeSpec}; @@ -36,10 +37,44 @@ pub fn write_postgres_conf(path: &Path, spec: &ComputeSpec) -> Result<()> { // File::create() destroys the file content if it exists. let mut file = File::create(path)?; - writeln!(file, "# Managed by compute_ctl: begin")?; + // Write the postgresql.conf content from the spec file as is. 
+ if let Some(conf) = &spec.cluster.postgresql_conf { + writeln!(file, "{}", conf)?; + } write!(file, "{}", &spec.cluster.settings.as_pg_settings())?; + // Add options for connecting to storage + writeln!(file, "# Neon storage settings")?; + if let Some(s) = &spec.pageserver_connstring { + writeln!( + file, + "neon.pageserver_connstring='{}'", + escape_conf_value(s) + )?; + } + if !spec.safekeeper_connstrings.is_empty() { + writeln!( + file, + "neon.safekeepers='{}'", + escape_conf_value(&spec.safekeeper_connstrings.join(",")) + )?; + } + if let Some(s) = &spec.tenant_id { + writeln!( + file, + "neon.tenant_id='{}'", + escape_conf_value(&s.to_string()) + )?; + } + if let Some(s) = &spec.timeline_id { + writeln!( + file, + "neon.timeline_id='{}'", + escape_conf_value(&s.to_string()) + )?; + } + match spec.mode { ComputeMode::Primary => {} ComputeMode::Static(lsn) => { @@ -53,7 +88,12 @@ pub fn write_postgres_conf(path: &Path, spec: &ComputeSpec) -> Result<()> { } } - writeln!(file, "# Managed by compute_ctl: end")?; + // If there are any extra options in the 'settings' field, append those + if spec.cluster.settings.is_some() { + writeln!(file, "# Managed by compute_ctl: begin")?; + write!(file, "{}", spec.cluster.settings.as_pg_settings())?; + writeln!(file, "# Managed by compute_ctl: end")?; + } Ok(()) } diff --git a/compute_tools/src/http/api.rs b/compute_tools/src/http/api.rs index 4468f6f5e4..afd9c2fb54 100644 --- a/compute_tools/src/http/api.rs +++ b/compute_tools/src/http/api.rs @@ -220,8 +220,8 @@ fn render_json_error(e: &str, status: StatusCode) -> Response { // Main Hyper HTTP server function that runs it and blocks waiting on it forever. #[tokio::main] -async fn serve(state: Arc) { - let addr = SocketAddr::from(([0, 0, 0, 0], 3080)); +async fn serve(port: u16, state: Arc) { + let addr = SocketAddr::from(([0, 0, 0, 0], port)); let make_service = make_service_fn(move |_conn| { let state = state.clone(); @@ -256,10 +256,10 @@ async fn serve(state: Arc) { } /// Launch a separate Hyper HTTP API server thread and return its `JoinHandle`. -pub fn launch_http_server(state: &Arc) -> Result> { +pub fn launch_http_server(port: u16, state: &Arc) -> Result> { let state = Arc::clone(state); Ok(thread::Builder::new() .name("http-endpoint".into()) - .spawn(move || serve(state))?) + .spawn(move || serve(port, state))?) } diff --git a/compute_tools/src/logger.rs b/compute_tools/src/logger.rs index 1b5cf647b0..f6fc882968 100644 --- a/compute_tools/src/logger.rs +++ b/compute_tools/src/logger.rs @@ -33,5 +33,7 @@ pub fn init_tracing_and_logging(default_log_level: &str) -> anyhow::Result<()> { .init(); tracing::info!("logging and tracing started"); + utils::logging::replace_panic_hook_with_tracing_panic_hook().forget(); + Ok(()) } diff --git a/compute_tools/src/pg_helpers.rs b/compute_tools/src/pg_helpers.rs index 40dbea6907..d5c845e9ea 100644 --- a/compute_tools/src/pg_helpers.rs +++ b/compute_tools/src/pg_helpers.rs @@ -23,7 +23,7 @@ fn escape_literal(s: &str) -> String { /// Escape a string so that it can be used in postgresql.conf. /// Same as escape_literal, currently. -fn escape_conf_value(s: &str) -> String { +pub fn escape_conf_value(s: &str) -> String { s.replace('\'', "''").replace('\\', "\\\\") } @@ -121,9 +121,8 @@ impl RoleExt for Role { /// string of arguments. fn to_pg_options(&self) -> String { // XXX: consider putting LOGIN as a default option somewhere higher, e.g. in control-plane. - // For now, we do not use generic `options` for roles. 
Once used, add - // `self.options.as_pg_options()` somewhere here. - let mut params: String = "LOGIN".to_string(); + let mut params: String = self.options.as_pg_options(); + params.push_str(" LOGIN"); if let Some(pass) = &self.encrypted_password { // Some time ago we supported only md5 and treated all encrypted_password as md5. diff --git a/compute_tools/src/spec.rs b/compute_tools/src/spec.rs index bf3c407202..a2a19ae0da 100644 --- a/compute_tools/src/spec.rs +++ b/compute_tools/src/spec.rs @@ -62,7 +62,7 @@ fn do_control_plane_request( } } -/// Request spec from the control-plane by compute_id. If `NEON_CONSOLE_JWT` +/// Request spec from the control-plane by compute_id. If `NEON_CONTROL_PLANE_TOKEN` /// env variable is set, it will be used for authorization. pub fn get_spec_from_control_plane( base_uri: &str, diff --git a/compute_tools/tests/pg_helpers_tests.rs b/compute_tools/tests/pg_helpers_tests.rs index a63ee038c7..265556d3b9 100644 --- a/compute_tools/tests/pg_helpers_tests.rs +++ b/compute_tools/tests/pg_helpers_tests.rs @@ -16,7 +16,7 @@ mod pg_helpers_tests { ); assert_eq!( spec.cluster.roles.first().unwrap().to_pg_options(), - "LOGIN PASSWORD 'md56b1d16b78004bbd51fa06af9eda75972'" + " LOGIN PASSWORD 'md56b1d16b78004bbd51fa06af9eda75972'" ); } diff --git a/control_plane/src/bin/neon_local.rs b/control_plane/src/bin/neon_local.rs index 30880565ab..52af936d7b 100644 --- a/control_plane/src/bin/neon_local.rs +++ b/control_plane/src/bin/neon_local.rs @@ -41,7 +41,7 @@ const DEFAULT_PAGESERVER_ID: NodeId = NodeId(1); const DEFAULT_BRANCH_NAME: &str = "main"; project_git_version!(GIT_VERSION); -const DEFAULT_PG_VERSION: &str = "14"; +const DEFAULT_PG_VERSION: &str = "15"; fn default_conf() -> String { format!( @@ -476,10 +476,11 @@ fn handle_timeline(timeline_match: &ArgMatches, env: &mut local_env::LocalEnv) - println!("Creating endpoint for imported timeline ..."); cplane.new_endpoint( - tenant_id, name, + tenant_id, timeline_id, None, + None, pg_version, ComputeMode::Primary, )?; @@ -591,7 +592,7 @@ fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<( table.add_row([ endpoint_id.as_str(), - &endpoint.address.to_string(), + &endpoint.pg_address.to_string(), &endpoint.timeline_id.to_string(), branch_name, lsn_str.as_str(), @@ -620,8 +621,8 @@ fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<( .get_branch_timeline_id(branch_name, tenant_id) .ok_or_else(|| anyhow!("Found no timeline id for branch name '{branch_name}'"))?; - let port: Option = sub_args.get_one::("port").copied(); - + let pg_port: Option = sub_args.get_one::("pg-port").copied(); + let http_port: Option = sub_args.get_one::("http-port").copied(); let pg_version = sub_args .get_one::("pg-version") .copied() @@ -639,14 +640,38 @@ fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<( (Some(_), true) => anyhow::bail!("cannot specify both lsn and hot-standby"), }; - cplane.new_endpoint(tenant_id, &endpoint_id, timeline_id, port, pg_version, mode)?; + cplane.new_endpoint( + &endpoint_id, + tenant_id, + timeline_id, + pg_port, + http_port, + pg_version, + mode, + )?; } "start" => { - let port: Option = sub_args.get_one::("port").copied(); + let pg_port: Option = sub_args.get_one::("pg-port").copied(); + let http_port: Option = sub_args.get_one::("http-port").copied(); let endpoint_id = sub_args .get_one::("endpoint_id") .ok_or_else(|| anyhow!("No endpoint ID was provided to start"))?; + // If --safekeepers argument is given, use only the listed 
safekeeper nodes. + let safekeepers = + if let Some(safekeepers_str) = sub_args.get_one::("safekeepers") { + let mut safekeepers: Vec = Vec::new(); + for sk_id in safekeepers_str.split(',').map(str::trim) { + let sk_id = NodeId(u64::from_str(sk_id).map_err(|_| { + anyhow!("invalid node ID \"{sk_id}\" in --safekeepers list") + })?); + safekeepers.push(sk_id); + } + safekeepers + } else { + env.safekeepers.iter().map(|sk| sk.id).collect() + }; + let endpoint = cplane.endpoints.get(endpoint_id.as_str()); let auth_token = if matches!(env.pageserver.pg_auth_type, AuthType::NeonJWT) { @@ -673,7 +698,7 @@ fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<( _ => {} } println!("Starting existing endpoint {endpoint_id}..."); - endpoint.start(&auth_token)?; + endpoint.start(&auth_token, safekeepers)?; } else { let branch_name = sub_args .get_one::("branch-name") @@ -709,14 +734,15 @@ fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<( println!("Starting new endpoint {endpoint_id} (PostgreSQL v{pg_version}) on timeline {timeline_id} ..."); let ep = cplane.new_endpoint( - tenant_id, endpoint_id, + tenant_id, timeline_id, - port, + pg_port, + http_port, pg_version, mode, )?; - ep.start(&auth_token)?; + ep.start(&auth_token, safekeepers)?; } } "stop" => { @@ -944,11 +970,22 @@ fn cli() -> Command { .value_parser(value_parser!(u32)) .default_value(DEFAULT_PG_VERSION); - let port_arg = Arg::new("port") - .long("port") + let pg_port_arg = Arg::new("pg-port") + .long("pg-port") .required(false) .value_parser(value_parser!(u16)) - .value_name("port"); + .value_name("pg-port"); + + let http_port_arg = Arg::new("http-port") + .long("http-port") + .required(false) + .value_parser(value_parser!(u16)) + .value_name("http-port"); + + let safekeepers_arg = Arg::new("safekeepers") + .long("safekeepers") + .required(false) + .value_name("safekeepers"); let stop_mode_arg = Arg::new("stop-mode") .short('m') @@ -1093,7 +1130,8 @@ fn cli() -> Command { .arg(branch_name_arg.clone()) .arg(tenant_id_arg.clone()) .arg(lsn_arg.clone()) - .arg(port_arg.clone()) + .arg(pg_port_arg.clone()) + .arg(http_port_arg.clone()) .arg( Arg::new("config-only") .help("Don't do basebackup, create endpoint directory with only config files") @@ -1109,9 +1147,11 @@ fn cli() -> Command { .arg(branch_name_arg) .arg(timeline_id_arg) .arg(lsn_arg) - .arg(port_arg) + .arg(pg_port_arg) + .arg(http_port_arg) .arg(pg_version_arg) .arg(hot_standby_arg) + .arg(safekeepers_arg) ) .subcommand( Command::new("stop") diff --git a/control_plane/src/broker.rs b/control_plane/src/broker.rs index 6c0604a076..ad19dfa204 100644 --- a/control_plane/src/broker.rs +++ b/control_plane/src/broker.rs @@ -1,3 +1,9 @@ +//! Code to manage the storage broker +//! +//! In the local test environment, the data for each safekeeper is stored in +//! +//! .neon/safekeepers/ +//! use anyhow::Context; use std::path::PathBuf; diff --git a/control_plane/src/endpoint.rs b/control_plane/src/endpoint.rs index cc5a7a4168..b28315a35d 100644 --- a/control_plane/src/endpoint.rs +++ b/control_plane/src/endpoint.rs @@ -1,40 +1,71 @@ +//! Code to manage compute endpoints +//! +//! In the local test environment, the data for each endpoint is stored in +//! +//! .neon/endpoints/ +//! +//! Some basic information about the endpoint, like the tenant and timeline IDs, +//! are stored in the `endpoint.json` file. The `endpoint.json` file is created +//! when the endpoint is created, and doesn't change afterwards. +//! +//! 
The endpoint is managed by the `compute_ctl` binary. When an endpoint is +//! started, we launch `compute_ctl` It synchronizes the safekeepers, downloads +//! the basebackup from the pageserver to initialize the the data directory, and +//! finally launches the PostgreSQL process. It watches the PostgreSQL process +//! until it exits. +//! +//! When an endpoint is created, a `postgresql.conf` file is also created in +//! the endpoint's directory. The file can be modified before starting PostgreSQL. +//! However, the `postgresql.conf` file in the endpoint directory is not used directly +//! by PostgreSQL. It is passed to `compute_ctl`, and `compute_ctl` writes another +//! copy of it in the data directory. +//! +//! Directory contents: +//! +//! ```ignore +//! .neon/endpoints/main/ +//! compute.log - log output of `compute_ctl` and `postgres` +//! endpoint.json - serialized `EndpointConf` struct +//! postgresql.conf - postgresql settings +//! spec.json - passed to `compute_ctl` +//! pgdata/ +//! postgresql.conf - copy of postgresql.conf created by `compute_ctl` +//! zenith.signal +//! +//! ``` +//! use std::collections::BTreeMap; -use std::fs::{self, File}; -use std::io::Write; use std::net::SocketAddr; use std::net::TcpStream; -use std::os::unix::fs::PermissionsExt; use std::path::PathBuf; -use std::process::{Command, Stdio}; -use std::str::FromStr; +use std::process::Command; use std::sync::Arc; use std::time::Duration; -use anyhow::{Context, Result}; +use anyhow::{anyhow, bail, Context, Result}; use serde::{Deserialize, Serialize}; use serde_with::{serde_as, DisplayFromStr}; -use utils::{ - id::{TenantId, TimelineId}, - lsn::Lsn, -}; +use utils::id::{NodeId, TenantId, TimelineId}; use crate::local_env::LocalEnv; use crate::pageserver::PageServerNode; use crate::postgresql_conf::PostgresConf; -use compute_api::spec::ComputeMode; +use compute_api::responses::{ComputeState, ComputeStatus}; +use compute_api::spec::{Cluster, ComputeMode, ComputeSpec}; // contents of a endpoint.json file #[serde_as] #[derive(Serialize, Deserialize, PartialEq, Eq, Clone, Debug)] pub struct EndpointConf { - name: String, + endpoint_id: String, #[serde_as(as = "DisplayFromStr")] tenant_id: TenantId, #[serde_as(as = "DisplayFromStr")] timeline_id: TimelineId, mode: ComputeMode, - port: u16, + pg_port: u16, + http_port: u16, pg_version: u32, } @@ -57,11 +88,11 @@ impl ComputeControlPlane { let pageserver = Arc::new(PageServerNode::from_env(&env)); let mut endpoints = BTreeMap::default(); - for endpoint_dir in fs::read_dir(env.endpoints_path()) + for endpoint_dir in std::fs::read_dir(env.endpoints_path()) .with_context(|| format!("failed to list {}", env.endpoints_path().display()))? 
{ let ep = Endpoint::from_dir_entry(endpoint_dir?, &env, &pageserver)?; - endpoints.insert(ep.name.clone(), Arc::new(ep)); + endpoints.insert(ep.endpoint_id.clone(), Arc::new(ep)); } Ok(ComputeControlPlane { @@ -76,25 +107,28 @@ impl ComputeControlPlane { 1 + self .endpoints .values() - .map(|ep| ep.address.port()) + .map(|ep| std::cmp::max(ep.pg_address.port(), ep.http_address.port())) .max() .unwrap_or(self.base_port) } + #[allow(clippy::too_many_arguments)] pub fn new_endpoint( &mut self, + endpoint_id: &str, tenant_id: TenantId, - name: &str, timeline_id: TimelineId, - port: Option, + pg_port: Option, + http_port: Option, pg_version: u32, mode: ComputeMode, ) -> Result> { - let port = port.unwrap_or_else(|| self.get_port()); - + let pg_port = pg_port.unwrap_or_else(|| self.get_port()); + let http_port = http_port.unwrap_or_else(|| self.get_port() + 1); let ep = Arc::new(Endpoint { - name: name.to_owned(), - address: SocketAddr::new("127.0.0.1".parse().unwrap(), port), + endpoint_id: endpoint_id.to_owned(), + pg_address: SocketAddr::new("127.0.0.1".parse().unwrap(), pg_port), + http_address: SocketAddr::new("127.0.0.1".parse().unwrap(), http_port), env: self.env.clone(), pageserver: Arc::clone(&self.pageserver), timeline_id, @@ -102,21 +136,27 @@ impl ComputeControlPlane { tenant_id, pg_version, }); - ep.create_pgdata()?; + + ep.create_endpoint_dir()?; std::fs::write( ep.endpoint_path().join("endpoint.json"), serde_json::to_string_pretty(&EndpointConf { - name: name.to_string(), + endpoint_id: endpoint_id.to_string(), tenant_id, timeline_id, mode, - port, + http_port, + pg_port, pg_version, })?, )?; - ep.setup_pg_conf()?; + std::fs::write( + ep.endpoint_path().join("postgresql.conf"), + ep.setup_pg_conf()?.to_string(), + )?; - self.endpoints.insert(ep.name.clone(), Arc::clone(&ep)); + self.endpoints + .insert(ep.endpoint_id.clone(), Arc::clone(&ep)); Ok(ep) } @@ -127,13 +167,15 @@ impl ComputeControlPlane { #[derive(Debug)] pub struct Endpoint { /// used as the directory name - name: String, + endpoint_id: String, pub tenant_id: TenantId, pub timeline_id: TimelineId, pub mode: ComputeMode, - // port and address of the Postgres server - pub address: SocketAddr, + // port and address of the Postgres server and `compute_ctl`'s HTTP API + pub pg_address: SocketAddr, + pub http_address: SocketAddr, + // postgres major version in the format: 14, 15, etc. 
pg_version: u32, @@ -158,16 +200,16 @@ impl Endpoint { // parse data directory name let fname = entry.file_name(); - let name = fname.to_str().unwrap().to_string(); + let endpoint_id = fname.to_str().unwrap().to_string(); // Read the endpoint.json file let conf: EndpointConf = serde_json::from_slice(&std::fs::read(entry.path().join("endpoint.json"))?)?; - // ok now Ok(Endpoint { - address: SocketAddr::new("127.0.0.1".parse().unwrap(), conf.port), - name, + pg_address: SocketAddr::new("127.0.0.1".parse().unwrap(), conf.pg_port), + http_address: SocketAddr::new("127.0.0.1".parse().unwrap(), conf.http_port), + endpoint_id, env: env.clone(), pageserver: Arc::clone(pageserver), timeline_id: conf.timeline_id, @@ -177,104 +219,17 @@ impl Endpoint { }) } - fn sync_safekeepers(&self, auth_token: &Option, pg_version: u32) -> Result { - let pg_path = self.env.pg_bin_dir(pg_version)?.join("postgres"); - let mut cmd = Command::new(pg_path); - - cmd.arg("--sync-safekeepers") - .env_clear() - .env( - "LD_LIBRARY_PATH", - self.env.pg_lib_dir(pg_version)?.to_str().unwrap(), - ) - .env( - "DYLD_LIBRARY_PATH", - self.env.pg_lib_dir(pg_version)?.to_str().unwrap(), - ) - .env("PGDATA", self.pgdata().to_str().unwrap()) - .stdout(Stdio::piped()) - // Comment this to avoid capturing stderr (useful if command hangs) - .stderr(Stdio::piped()); - - if let Some(token) = auth_token { - cmd.env("NEON_AUTH_TOKEN", token); - } - - let sync_handle = cmd - .spawn() - .expect("postgres --sync-safekeepers failed to start"); - - let sync_output = sync_handle - .wait_with_output() - .expect("postgres --sync-safekeepers failed"); - if !sync_output.status.success() { - anyhow::bail!( - "sync-safekeepers failed: '{}'", - String::from_utf8_lossy(&sync_output.stderr) - ); - } - - let lsn = Lsn::from_str(std::str::from_utf8(&sync_output.stdout)?.trim())?; - println!("Safekeepers synced on {}", lsn); - Ok(lsn) - } - - /// Get basebackup from the pageserver as a tar archive and extract it - /// to the `self.pgdata()` directory. - fn do_basebackup(&self, lsn: Option) -> Result<()> { - println!( - "Extracting base backup to create postgres instance: path={} port={}", - self.pgdata().display(), - self.address.port() - ); - - let sql = if let Some(lsn) = lsn { - format!("basebackup {} {} {}", self.tenant_id, self.timeline_id, lsn) - } else { - format!("basebackup {} {}", self.tenant_id, self.timeline_id) - }; - - let mut client = self - .pageserver - .page_server_psql_client() - .context("connecting to page server failed")?; - - let copyreader = client - .copy_out(sql.as_str()) - .context("page server 'basebackup' command failed")?; - - // Read the archive directly from the `CopyOutReader` - // - // Set `ignore_zeros` so that unpack() reads all the Copy data and - // doesn't stop at the end-of-archive marker. Otherwise, if the server - // sends an Error after finishing the tarball, we will not notice it. 
- let mut ar = tar::Archive::new(copyreader); - ar.set_ignore_zeros(true); - ar.unpack(&self.pgdata()) - .context("extracting base backup failed")?; - - Ok(()) - } - - fn create_pgdata(&self) -> Result<()> { - fs::create_dir_all(self.pgdata()).with_context(|| { + fn create_endpoint_dir(&self) -> Result<()> { + std::fs::create_dir_all(self.endpoint_path()).with_context(|| { format!( - "could not create data directory {}", - self.pgdata().display() + "could not create endpoint directory {}", + self.endpoint_path().display() ) - })?; - fs::set_permissions(self.pgdata().as_path(), fs::Permissions::from_mode(0o700)) - .with_context(|| { - format!( - "could not set permissions in data directory {}", - self.pgdata().display() - ) - }) + }) } - // Write postgresql.conf with default configuration - // and PG_VERSION file to the data directory of a new endpoint. - fn setup_pg_conf(&self) -> Result<()> { + // Generate postgresql.conf with default configuration + fn setup_pg_conf(&self) -> Result { let mut conf = PostgresConf::new(); conf.append("max_wal_senders", "10"); conf.append("wal_log_hints", "off"); @@ -287,25 +242,14 @@ impl Endpoint { // wal_sender_timeout is the maximum time to wait for WAL replication. // It also defines how often the walreciever will send a feedback message to the wal sender. conf.append("wal_sender_timeout", "5s"); - conf.append("listen_addresses", &self.address.ip().to_string()); - conf.append("port", &self.address.port().to_string()); + conf.append("listen_addresses", &self.pg_address.ip().to_string()); + conf.append("port", &self.pg_address.port().to_string()); conf.append("wal_keep_size", "0"); // walproposer panics when basebackup is invalid, it is pointless to restart in this case. conf.append("restart_after_crash", "off"); - // Configure the Neon Postgres extension to fetch pages from pageserver - let pageserver_connstr = { - let config = &self.pageserver.pg_connection_config; - let (host, port) = (config.host(), config.port()); - - // NOTE: avoid spaces in connection string, because it is less error prone if we forward it somewhere. - format!("postgresql://no_user@{host}:{port}") - }; + // Load the 'neon' extension conf.append("shared_preload_libraries", "neon"); - conf.append_line(""); - conf.append("neon.pageserver_connstring", &pageserver_connstr); - conf.append("neon.tenant_id", &self.tenant_id.to_string()); - conf.append("neon.timeline_id", &self.timeline_id.to_string()); conf.append_line(""); // Replication-related configurations, such as WAL sending @@ -390,46 +334,11 @@ impl Endpoint { } } - let mut file = File::create(self.pgdata().join("postgresql.conf"))?; - file.write_all(conf.to_string().as_bytes())?; - - let mut file = File::create(self.pgdata().join("PG_VERSION"))?; - file.write_all(self.pg_version.to_string().as_bytes())?; - - Ok(()) - } - - fn load_basebackup(&self, auth_token: &Option) -> Result<()> { - let backup_lsn = match &self.mode { - ComputeMode::Primary => { - if !self.env.safekeepers.is_empty() { - // LSN 0 means that it is bootstrap and we need to download just - // latest data from the pageserver. That is a bit clumsy but whole bootstrap - // procedure evolves quite actively right now, so let's think about it again - // when things would be more stable (TODO). 
- let lsn = self.sync_safekeepers(auth_token, self.pg_version)?; - if lsn == Lsn(0) { - None - } else { - Some(lsn) - } - } else { - None - } - } - ComputeMode::Static(lsn) => Some(*lsn), - ComputeMode::Replica => { - None // Take the latest snapshot available to start with - } - }; - - self.do_basebackup(backup_lsn)?; - - Ok(()) + Ok(conf) } pub fn endpoint_path(&self) -> PathBuf { - self.env.endpoints_path().join(&self.name) + self.env.endpoints_path().join(&self.endpoint_id) } pub fn pgdata(&self) -> PathBuf { @@ -439,7 +348,7 @@ impl Endpoint { pub fn status(&self) -> &str { let timeout = Duration::from_millis(300); let has_pidfile = self.pgdata().join("postmaster.pid").exists(); - let can_connect = TcpStream::connect_timeout(&self.address, timeout).is_ok(); + let can_connect = TcpStream::connect_timeout(&self.pg_address, timeout).is_ok(); match (has_pidfile, can_connect) { (true, true) => "running", @@ -457,8 +366,6 @@ impl Endpoint { &[ "-D", self.pgdata().to_str().unwrap(), - "-l", - self.pgdata().join("pg.log").to_str().unwrap(), "-w", //wait till pg_ctl actually does what was asked ], args, @@ -494,36 +401,183 @@ impl Endpoint { Ok(()) } - pub fn start(&self, auth_token: &Option) -> Result<()> { + pub fn start(&self, auth_token: &Option, safekeepers: Vec) -> Result<()> { if self.status() == "running" { anyhow::bail!("The endpoint is already running"); } - // 1. We always start Postgres from scratch, so - // if old dir exists, preserve 'postgresql.conf' and drop the directory - let postgresql_conf_path = self.pgdata().join("postgresql.conf"); - let postgresql_conf = fs::read(&postgresql_conf_path).with_context(|| { - format!( - "failed to read config file in {}", - postgresql_conf_path.to_str().unwrap() - ) - })?; - fs::remove_dir_all(self.pgdata())?; - self.create_pgdata()?; + // Slurp the endpoints//postgresql.conf file into + // memory. We will include it in the spec file that we pass to + // `compute_ctl`, and `compute_ctl` will write it to the postgresql.conf + // in the data directory. + let postgresql_conf_path = self.endpoint_path().join("postgresql.conf"); + let postgresql_conf = match std::fs::read(&postgresql_conf_path) { + Ok(content) => String::from_utf8(content)?, + Err(e) if e.kind() == std::io::ErrorKind::NotFound => "".to_string(), + Err(e) => { + return Err(anyhow::Error::new(e).context(format!( + "failed to read config file in {}", + postgresql_conf_path.to_str().unwrap() + ))) + } + }; - // 2. Bring back config files - fs::write(&postgresql_conf_path, postgresql_conf)?; - - // 3. Load basebackup - self.load_basebackup(auth_token)?; - - if self.mode != ComputeMode::Primary { - File::create(self.pgdata().join("standby.signal"))?; + // We always start the compute node from scratch, so if the Postgres + // data dir exists from a previous launch, remove it first. + if self.pgdata().exists() { + std::fs::remove_dir_all(self.pgdata())?; } - // 4. Finally start postgres - println!("Starting postgres at '{}'", self.connstr()); - self.pg_ctl(&["start"], auth_token) + let pageserver_connstring = { + let config = &self.pageserver.pg_connection_config; + let (host, port) = (config.host(), config.port()); + + // NOTE: avoid spaces in connection string, because it is less error prone if we forward it somewhere. 
+ format!("postgresql://no_user@{host}:{port}") + }; + let mut safekeeper_connstrings = Vec::new(); + if self.mode == ComputeMode::Primary { + for sk_id in safekeepers { + let sk = self + .env + .safekeepers + .iter() + .find(|node| node.id == sk_id) + .ok_or_else(|| anyhow!("safekeeper {sk_id} does not exist"))?; + safekeeper_connstrings.push(format!("127.0.0.1:{}", sk.pg_port)); + } + } + + // Create spec file + let spec = ComputeSpec { + format_version: 1.0, + operation_uuid: None, + cluster: Cluster { + cluster_id: None, // project ID: not used + name: None, // project name: not used + state: None, + roles: vec![], + databases: vec![], + settings: None, + postgresql_conf: Some(postgresql_conf), + }, + delta_operations: None, + tenant_id: Some(self.tenant_id), + timeline_id: Some(self.timeline_id), + mode: self.mode, + pageserver_connstring: Some(pageserver_connstring), + safekeeper_connstrings, + storage_auth_token: auth_token.clone(), + }; + let spec_path = self.endpoint_path().join("spec.json"); + std::fs::write(spec_path, serde_json::to_string_pretty(&spec)?)?; + + // Open log file. We'll redirect the stdout and stderr of `compute_ctl` to it. + let logfile = std::fs::OpenOptions::new() + .create(true) + .append(true) + .open(self.endpoint_path().join("compute.log"))?; + + // Launch compute_ctl + println!("Starting postgres node at '{}'", self.connstr()); + let mut cmd = Command::new(self.env.neon_distrib_dir.join("compute_ctl")); + cmd.args(["--http-port", &self.http_address.port().to_string()]) + .args(["--pgdata", self.pgdata().to_str().unwrap()]) + .args(["--connstr", &self.connstr()]) + .args([ + "--spec-path", + self.endpoint_path().join("spec.json").to_str().unwrap(), + ]) + .args([ + "--pgbin", + self.env + .pg_bin_dir(self.pg_version)? + .join("postgres") + .to_str() + .unwrap(), + ]) + .stdin(std::process::Stdio::null()) + .stderr(logfile.try_clone()?) + .stdout(logfile); + let _child = cmd.spawn()?; + + // Wait for it to start + let mut attempt = 0; + const ATTEMPT_INTERVAL: Duration = Duration::from_millis(100); + const MAX_ATTEMPTS: u32 = 10 * 30; // Wait up to 30 s + loop { + attempt += 1; + match self.get_status() { + Ok(state) => { + match state.status { + ComputeStatus::Init => { + if attempt == MAX_ATTEMPTS { + bail!("compute startup timed out; still in Init state"); + } + // keep retrying + } + ComputeStatus::Running => { + // All good! + break; + } + ComputeStatus::Failed => { + bail!( + "compute startup failed: {}", + state + .error + .as_deref() + .unwrap_or("") + ); + } + ComputeStatus::Empty + | ComputeStatus::ConfigurationPending + | ComputeStatus::Configuration => { + bail!("unexpected compute status: {:?}", state.status) + } + } + } + Err(e) => { + if attempt == MAX_ATTEMPTS { + return Err(e).context( + "timed out waiting to connect to compute_ctl HTTP; last error: {e}", + ); + } + } + } + std::thread::sleep(ATTEMPT_INTERVAL); + } + + Ok(()) + } + + // Call the /status HTTP API + pub fn get_status(&self) -> Result { + let client = reqwest::blocking::Client::new(); + + let response = client + .request( + reqwest::Method::GET, + format!( + "http://{}:{}/status", + self.http_address.ip(), + self.http_address.port() + ), + ) + .send()?; + + // Interpret the response + let status = response.status(); + if !(status.is_client_error() || status.is_server_error()) { + Ok(response.json()?) 
+ } else { + // reqwest does not export its error construction utility functions, so let's craft the message ourselves + let url = response.url().to_owned(); + let msg = match response.text() { + Ok(err_body) => format!("Error: {}", err_body), + Err(_) => format!("Http error ({}) at {}.", status.as_u16(), url), + }; + Err(anyhow::anyhow!(msg)) + } } pub fn stop(&self, destroy: bool) -> Result<()> { @@ -540,7 +594,7 @@ impl Endpoint { "Destroying postgres data directory '{}'", self.pgdata().to_str().unwrap() ); - fs::remove_dir_all(self.endpoint_path())?; + std::fs::remove_dir_all(self.endpoint_path())?; } else { self.pg_ctl(&["stop"], &None)?; } @@ -549,10 +603,10 @@ impl Endpoint { pub fn connstr(&self) -> String { format!( - "host={} port={} user={} dbname={}", - self.address.ip(), - self.address.port(), + "postgresql://{}@{}:{}/{}", "cloud_admin", + self.pg_address.ip(), + self.pg_address.port(), "postgres" ) } diff --git a/control_plane/src/local_env.rs b/control_plane/src/local_env.rs index 2b1eec7c4b..df70cb3139 100644 --- a/control_plane/src/local_env.rs +++ b/control_plane/src/local_env.rs @@ -24,7 +24,7 @@ use utils::{ use crate::safekeeper::SafekeeperNode; -pub const DEFAULT_PG_VERSION: u32 = 14; +pub const DEFAULT_PG_VERSION: u32 = 15; // // This data structures represents neon_local CLI config @@ -37,7 +37,7 @@ pub const DEFAULT_PG_VERSION: u32 = 14; #[derive(Serialize, Deserialize, PartialEq, Eq, Clone, Debug)] pub struct LocalEnv { // Base directory for all the nodes (the pageserver, safekeepers and - // compute nodes). + // compute endpoints). // // This is not stored in the config file. Rather, this is the path where the // config file itself is. It is read from the NEON_REPO_DIR env variable or diff --git a/control_plane/src/pageserver.rs b/control_plane/src/pageserver.rs index 6fc3c43842..2ff09021e5 100644 --- a/control_plane/src/pageserver.rs +++ b/control_plane/src/pageserver.rs @@ -1,3 +1,9 @@ +//! Code to manage pageservers +//! +//! In the local test environment, the pageserver stores its data directly in +//! +//! .neon/ +//! use std::borrow::Cow; use std::collections::HashMap; use std::fs::File; @@ -369,6 +375,11 @@ impl PageServerNode { evictions_low_residence_duration_metric_threshold: settings .remove("evictions_low_residence_duration_metric_threshold") .map(|x| x.to_string()), + gc_feedback: settings + .remove("gc_feedback") + .map(|x| x.parse::()) + .transpose() + .context("Failed to parse 'gc_feedback' as bool")?, }; // If tenant ID was not specified, generate one @@ -463,6 +474,11 @@ impl PageServerNode { evictions_low_residence_duration_metric_threshold: settings .remove("evictions_low_residence_duration_metric_threshold") .map(|x| x.to_string()), + gc_feedback: settings + .remove("gc_feedback") + .map(|x| x.parse::()) + .transpose() + .context("Failed to parse 'gc_feedback' as bool")?, } }; @@ -499,6 +515,9 @@ impl PageServerNode { ancestor_timeline_id: Option, pg_version: Option, ) -> anyhow::Result { + // If timeline ID was not specified, generate one + let new_timeline_id = new_timeline_id.unwrap_or(TimelineId::generate()); + self.http_request( Method::POST, format!("{}/tenant/{}/timeline", self.http_base_url, tenant_id), diff --git a/control_plane/src/safekeeper.rs b/control_plane/src/safekeeper.rs index d358f73343..9e053ff1f1 100644 --- a/control_plane/src/safekeeper.rs +++ b/control_plane/src/safekeeper.rs @@ -1,3 +1,9 @@ +//! Code to manage safekeepers +//! +//! In the local test environment, the data for each safekeeper is stored in +//! +//! 
.neon/safekeepers/ +//! use std::io::Write; use std::path::PathBuf; use std::process::Child; diff --git a/docker-compose/compute_wrapper/shell/compute.sh b/docker-compose/compute_wrapper/shell/compute.sh index cef2b485f3..22660a63ce 100755 --- a/docker-compose/compute_wrapper/shell/compute.sh +++ b/docker-compose/compute_wrapper/shell/compute.sh @@ -1,6 +1,14 @@ #!/bin/bash set -eux +# Generate a random tenant or timeline ID +# +# Takes a variable name as argument. The result is stored in that variable. +generate_id() { + local -n resvar=$1 + printf -v resvar '%08x%08x%08x%08x' $SRANDOM $SRANDOM $SRANDOM $SRANDOM +} + PG_VERSION=${PG_VERSION:-14} SPEC_FILE_ORG=/var/db/postgres/specs/spec.json @@ -13,29 +21,29 @@ done echo "Page server is ready." echo "Create a tenant and timeline" +generate_id tenant_id PARAMS=( -sb -X POST -H "Content-Type: application/json" - -d "{}" + -d "{\"new_tenant_id\": \"${tenant_id}\"}" http://pageserver:9898/v1/tenant/ ) -tenant_id=$(curl "${PARAMS[@]}" | sed 's/"//g') +result=$(curl "${PARAMS[@]}") +echo $result | jq . +generate_id timeline_id PARAMS=( -sb -X POST -H "Content-Type: application/json" - -d "{\"tenant_id\":\"${tenant_id}\", \"pg_version\": ${PG_VERSION}}" + -d "{\"new_timeline_id\": \"${timeline_id}\", \"pg_version\": ${PG_VERSION}}" "http://pageserver:9898/v1/tenant/${tenant_id}/timeline/" ) result=$(curl "${PARAMS[@]}") echo $result | jq . echo "Overwrite tenant id and timeline id in spec file" -tenant_id=$(echo ${result} | jq -r .tenant_id) -timeline_id=$(echo ${result} | jq -r .timeline_id) - sed "s/TENANT_ID/${tenant_id}/" ${SPEC_FILE_ORG} > ${SPEC_FILE} sed -i "s/TIMELINE_ID/${timeline_id}/" ${SPEC_FILE} diff --git a/docs/pageserver-thread-mgmt.md b/docs/pageserver-thread-mgmt.md index 0cc897f154..b911933528 100644 --- a/docs/pageserver-thread-mgmt.md +++ b/docs/pageserver-thread-mgmt.md @@ -52,9 +52,7 @@ completion, or shield the rest of the code from surprise cancellations by spawning a separate task. The code that handles incoming HTTP requests, for example, spawns a separate task for each request, because Hyper will drop the request-handling Future if the HTTP -connection is lost. (FIXME: our HTTP handlers do not do that -currently, but we should fix that. See [issue -3478](https://github.com/neondatabase/neon/issues/3478)). +connection is lost. #### How to cancel, then? 
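The thread-management doc change above relies on the pattern of shielding work from surprise cancellation by spawning a separate task. The sketch below illustrates that pattern with `tokio`; it is not the pageserver's actual handler code, just a minimal example of the semantics described.

```rust
use tokio::task::JoinError;

/// Illustrative sketch only. If the caller's future is dropped (for example,
/// Hyper drops the request-handling future when the HTTP connection is lost),
/// the spawned task still runs to completion.
async fn handle_request() -> Result<String, JoinError> {
    let work = tokio::spawn(async {
        // ... long-running work that must not be silently cancelled ...
        "done".to_string()
    });

    // Awaiting the JoinHandle is cancellable, but dropping the handle does
    // NOT cancel the underlying task.
    work.await
}
```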
diff --git a/libs/compute_api/src/responses.rs b/libs/compute_api/src/responses.rs index d181c018b1..ce73dda08a 100644 --- a/libs/compute_api/src/responses.rs +++ b/libs/compute_api/src/responses.rs @@ -5,13 +5,13 @@ use serde::{Deserialize, Serialize, Serializer}; use crate::spec::ComputeSpec; -#[derive(Serialize, Debug)] +#[derive(Serialize, Debug, Deserialize)] pub struct GenericAPIError { pub error: String, } /// Response of the /status API -#[derive(Serialize, Debug)] +#[derive(Serialize, Debug, Deserialize)] #[serde(rename_all = "snake_case")] pub struct ComputeStatusResponse { pub start_time: DateTime, @@ -23,7 +23,7 @@ pub struct ComputeStatusResponse { pub error: Option, } -#[derive(Serialize)] +#[derive(Deserialize, Serialize)] #[serde(rename_all = "snake_case")] pub struct ComputeState { pub status: ComputeStatus, @@ -33,7 +33,7 @@ pub struct ComputeState { pub error: Option, } -#[derive(Serialize, Clone, Copy, Debug, PartialEq, Eq)] +#[derive(Serialize, Clone, Copy, Debug, Deserialize, PartialEq, Eq)] #[serde(rename_all = "snake_case")] pub enum ComputeStatus { // Spec wasn't provided at start, waiting for it to be diff --git a/libs/compute_api/src/spec.rs b/libs/compute_api/src/spec.rs index 6072980ed8..4014774a7e 100644 --- a/libs/compute_api/src/spec.rs +++ b/libs/compute_api/src/spec.rs @@ -5,6 +5,7 @@ //! and connect it to the storage nodes. use serde::{Deserialize, Serialize}; use serde_with::{serde_as, DisplayFromStr}; +use utils::id::{TenantId, TimelineId}; use utils::lsn::Lsn; /// String type alias representing Postgres identifier and @@ -14,7 +15,7 @@ pub type PgIdent = String; /// Cluster spec or configuration represented as an optional number of /// delta operations + final cluster state description. #[serde_as] -#[derive(Clone, Debug, Default, Deserialize)] +#[derive(Clone, Debug, Default, Deserialize, Serialize)] pub struct ComputeSpec { pub format_version: f32, @@ -26,9 +27,32 @@ pub struct ComputeSpec { pub cluster: Cluster, pub delta_operations: Option>, + // Information needed to connect to the storage layer. + // + // `tenant_id`, `timeline_id` and `pageserver_connstring` are always needed. + // + // Depending on `mode`, this can be a primary read-write node, a read-only + // replica, or a read-only node pinned at an older LSN. + // `safekeeper_connstrings` must be set for a primary. + // + // For backwards compatibility, the control plane may leave out all of + // these, and instead set the "neon.tenant_id", "neon.timeline_id", + // etc. GUCs in cluster.settings. TODO: Once the control plane has been + // updated to fill these fields, we can make these non optional. + #[serde_as(as = "Option")] + pub tenant_id: Option, + #[serde_as(as = "Option")] + pub timeline_id: Option, + #[serde_as(as = "Option")] + pub pageserver_connstring: Option, + #[serde(default)] + pub safekeeper_connstrings: Vec, + #[serde(default)] pub mode: ComputeMode, + /// If set, 'storage_auth_token' is used as the password to authenticate to + /// the pageserver and safekeepers. pub storage_auth_token: Option, } @@ -47,13 +71,19 @@ pub enum ComputeMode { Replica, } -#[derive(Clone, Debug, Default, Deserialize)] +#[derive(Clone, Debug, Default, Deserialize, Serialize)] pub struct Cluster { - pub cluster_id: String, - pub name: String, + pub cluster_id: Option, + pub name: Option, pub state: Option, pub roles: Vec, pub databases: Vec, + + /// Desired contents of 'postgresql.conf' file. (The 'compute_ctl' + /// tool may add additional settings to the final file.) 
+ pub postgresql_conf: Option, + + /// Additional settings that will be appended to the 'postgresql.conf' file. pub settings: GenericOptions, } @@ -63,7 +93,7 @@ pub struct Cluster { /// - DROP ROLE /// - ALTER ROLE name RENAME TO new_name /// - ALTER DATABASE name RENAME TO new_name -#[derive(Clone, Debug, Deserialize)] +#[derive(Clone, Debug, Deserialize, Serialize)] pub struct DeltaOp { pub action: String, pub name: PgIdent, @@ -72,7 +102,7 @@ pub struct DeltaOp { /// Rust representation of Postgres role info with only those fields /// that matter for us. -#[derive(Clone, Debug, Deserialize)] +#[derive(Clone, Debug, Deserialize, Serialize)] pub struct Role { pub name: PgIdent, pub encrypted_password: Option, @@ -81,7 +111,7 @@ pub struct Role { /// Rust representation of Postgres database info with only those fields /// that matter for us. -#[derive(Clone, Debug, Deserialize)] +#[derive(Clone, Debug, Deserialize, Serialize)] pub struct Database { pub name: PgIdent, pub owner: PgIdent, @@ -91,7 +121,7 @@ pub struct Database { /// Common type representing both SQL statement params with or without value, /// like `LOGIN` or `OWNER username` in the `CREATE/ALTER ROLE`, and config /// options like `wal_level = logical`. -#[derive(Clone, Debug, Deserialize)] +#[derive(Clone, Debug, Deserialize, Serialize)] pub struct GenericOption { pub name: String, pub value: Option, diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs index 9e3d7af351..df5f5896a1 100644 --- a/libs/pageserver_api/src/models.rs +++ b/libs/pageserver_api/src/models.rs @@ -18,7 +18,29 @@ use crate::reltag::RelTag; use anyhow::bail; use bytes::{BufMut, Bytes, BytesMut}; -/// A state of a tenant in pageserver's memory. +/// The state of a tenant in this pageserver. +/// +/// ```mermaid +/// stateDiagram-v2 +/// +/// [*] --> Loading: spawn_load() +/// [*] --> Attaching: spawn_attach() +/// +/// Loading --> Activating: activate() +/// Attaching --> Activating: activate() +/// Activating --> Active: infallible +/// +/// Loading --> Broken: load() failure +/// Attaching --> Broken: attach() failure +/// +/// Active --> Stopping: set_stopping(), part of shutdown & detach +/// Stopping --> Broken: late error in remove_tenant_from_memory +/// +/// Broken --> [*]: ignore / detach / shutdown +/// Stopping --> [*]: remove_from_memory complete +/// +/// Active --> Broken: cfg(testing)-only tenant break point +/// ``` #[derive( Clone, PartialEq, @@ -26,51 +48,73 @@ use bytes::{BufMut, Bytes, BytesMut}; serde::Serialize, serde::Deserialize, strum_macros::Display, - strum_macros::EnumString, strum_macros::EnumVariantNames, strum_macros::AsRefStr, strum_macros::IntoStaticStr, )] #[serde(tag = "slug", content = "data")] pub enum TenantState { - /// This tenant is being loaded from local disk + /// This tenant is being loaded from local disk. + /// + /// `set_stopping()` and `set_broken()` do not work in this state and wait for it to pass. Loading, - /// This tenant is being downloaded from cloud storage. + /// This tenant is being attached to the pageserver. + /// + /// `set_stopping()` and `set_broken()` do not work in this state and wait for it to pass. Attaching, - /// Tenant is fully operational + /// The tenant is transitioning from Loading/Attaching to Active. + /// + /// While in this state, the individual timelines are being activated. + /// + /// `set_stopping()` and `set_broken()` do not work in this state and wait for it to pass. 
+ Activating(ActivatingFrom), + /// The tenant has finished activating and is open for business. + /// + /// Transitions out of this state are possible through `set_stopping()` and `set_broken()`. Active, - /// A tenant is recognized by pageserver, but it is being detached or the + /// The tenant is recognized by pageserver, but it is being detached or the /// system is being shut down. + /// + /// Transitions out of this state are possible through `set_broken()`. Stopping, - /// A tenant is recognized by the pageserver, but can no longer be used for - /// any operations, because it failed to be activated. + /// The tenant is recognized by the pageserver, but can no longer be used for + /// any operations. + /// + /// If the tenant fails to load or attach, it will transition to this state + /// and it is guaranteed that no background tasks are running in its name. + /// + /// The other way to transition into this state is from `Stopping` state + /// through `set_broken()` called from `remove_tenant_from_memory()`. That happens + /// if the cleanup future executed by `remove_tenant_from_memory()` fails. Broken { reason: String, backtrace: String }, } impl TenantState { pub fn attachment_status(&self) -> TenantAttachmentStatus { use TenantAttachmentStatus::*; + + // Below TenantState::Activating is used as "transient" or "transparent" state for + // attachment_status determining. match self { // The attach procedure writes the marker file before adding the Attaching tenant to the tenants map. // So, technically, we can return Attached here. // However, as soon as Console observes Attached, it will proceed with the Postgres-level health check. // But, our attach task might still be fetching the remote timelines, etc. // So, return `Maybe` while Attaching, making Console wait for the attach task to finish. - Self::Attaching => Maybe, + Self::Attaching | Self::Activating(ActivatingFrom::Attaching) => Maybe, // tenant mgr startup distinguishes attaching from loading via marker file. // If it's loading, there is no attach marker file, i.e., attach had finished in the past. - Self::Loading => Attached, + Self::Loading | Self::Activating(ActivatingFrom::Loading) => Attached, // We only reach Active after successful load / attach. // So, call atttachment status Attached. Self::Active => Attached, // If the (initial or resumed) attach procedure fails, the tenant becomes Broken. // However, it also becomes Broken if the regular load fails. - // We would need a separate TenantState variant to distinguish these cases. - // However, there's no practical difference from Console's perspective. - // It will run a Postgres-level health check as soon as it observes Attached. - // That will fail on Broken tenants. - // Console can then rollback the attach, or, wait for operator to fix the Broken tenant. - Self::Broken { .. } => Attached, + // From Console's perspective there's no practical difference + // because attachment_status is polled by console only during attach operation execution. + Self::Broken { reason, .. } => Failed { + reason: reason.to_owned(), + }, // Why is Stopping a Maybe case? Because, during pageserver shutdown, // we set the Stopping state irrespective of whether the tenant // has finished attaching or not. @@ -98,8 +142,17 @@ impl std::fmt::Debug for TenantState { } } +/// The only [`TenantState`] variants we could be `TenantState::Activating` from. 
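Because `attachment_status` now reports `Failed { reason }` instead of folding broken tenants into `Attached`, a caller polling the attach API can stop retrying as soon as the failure is visible. A small sketch of such a consumer, assuming the models are importable as `pageserver_api::models`; the decision labels are illustrative, not an existing API:

```rust
use pageserver_api::models::TenantAttachmentStatus;

/// What a hypothetical polling loop might do with one status observation.
fn next_step(status: &TenantAttachmentStatus) -> &'static str {
    match status {
        // Attach may still be running (or the tenant is Activating/Stopping):
        // poll again after a delay.
        TenantAttachmentStatus::Maybe => "poll again",
        // Load/attach finished; safe to run the Postgres-level health check now.
        TenantAttachmentStatus::Attached => "run health check",
        // Attach or load failed; surface the reason and stop polling.
        TenantAttachmentStatus::Failed { reason } => {
            eprintln!("attach failed: {reason}");
            "roll back the attach"
        }
    }
}
```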
+#[derive(Clone, Copy, Debug, PartialEq, Eq, serde::Serialize, serde::Deserialize)] +pub enum ActivatingFrom { + /// Arrived to [`TenantState::Activating`] from [`TenantState::Loading`] + Loading, + /// Arrived to [`TenantState::Activating`] from [`TenantState::Attaching`] + Attaching, +} + /// A state of a timeline in pageserver's memory. -#[derive(Debug, Clone, Copy, PartialEq, Eq, serde::Serialize, serde::Deserialize)] +#[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)] pub enum TimelineState { /// The timeline is recognized by the pageserver but is not yet operational. /// In particular, the walreceiver connection loop is not running for this timeline. @@ -112,15 +165,14 @@ pub enum TimelineState { /// It cannot transition back into any other state. Stopping, /// The timeline is broken and not operational (previous states: Loading or Active). - Broken, + Broken { reason: String, backtrace: String }, } #[serde_as] #[derive(Serialize, Deserialize)] pub struct TimelineCreateRequest { - #[serde(default)] - #[serde_as(as = "Option")] - pub new_timeline_id: Option, + #[serde_as(as = "DisplayFromStr")] + pub new_timeline_id: TimelineId, #[serde(default)] #[serde_as(as = "Option")] pub ancestor_timeline_id: Option, @@ -170,6 +222,7 @@ pub struct TenantConfig { pub eviction_policy: Option, pub min_resident_size_override: Option, pub evictions_low_residence_duration_metric_threshold: Option, + pub gc_feedback: Option, } #[serde_as] @@ -228,17 +281,41 @@ impl TenantConfigRequest { eviction_policy: None, min_resident_size_override: None, evictions_low_residence_duration_metric_threshold: None, + gc_feedback: None, }; TenantConfigRequest { tenant_id, config } } } +#[derive(Debug, Serialize, Deserialize)] +pub struct TenantAttachRequest { + pub config: TenantAttachConfig, +} + +/// Newtype to enforce deny_unknown_fields on TenantConfig for +/// its usage inside `TenantAttachRequest`. +#[derive(Debug, Serialize, Deserialize)] +#[serde(deny_unknown_fields)] +pub struct TenantAttachConfig { + #[serde(flatten)] + allowing_unknown_fields: TenantConfig, +} + +impl std::ops::Deref for TenantAttachConfig { + type Target = TenantConfig; + + fn deref(&self) -> &Self::Target { + &self.allowing_unknown_fields + } +} + /// See [`TenantState::attachment_status`] and the OpenAPI docs for context. 
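The `TenantAttachConfig` newtype exists only to turn on `deny_unknown_fields` for the attach request while keeping normal field access through `Deref`. A hedged sketch of what that buys callers; it assumes the remaining `TenantConfig` fields may be omitted from the JSON, and the misspelled key is invented for illustration:

```rust
use pageserver_api::models::TenantAttachRequest;

fn main() -> anyhow::Result<()> {
    // Typos in config keys are rejected up front instead of being silently dropped.
    let bad = serde_json::json!({ "config": { "gc_horizonn": 1 } });
    assert!(serde_json::from_value::<TenantAttachRequest>(bad).is_err());

    // A valid (here: empty) config deserializes, and known fields are still
    // reachable through the `Deref` impl on `TenantAttachConfig`.
    let ok = serde_json::json!({ "config": {} });
    let req: TenantAttachRequest = serde_json::from_value(ok)?;
    assert!(req.config.gc_feedback.is_none());
    Ok(())
}
```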
#[derive(Serialize, Deserialize, Clone)] -#[serde(rename_all = "snake_case")] +#[serde(tag = "slug", content = "data", rename_all = "snake_case")] pub enum TenantAttachmentStatus { Maybe, Attached, + Failed { reason: String }, } #[serde_as] @@ -732,7 +809,9 @@ mod tests { "slug": "Active", }, "current_physical_size": 42, - "attachment_status": "attached", + "attachment_status": { + "slug":"attached", + } }); let original_broken = TenantInfo { @@ -754,7 +833,9 @@ mod tests { } }, "current_physical_size": 42, - "attachment_status": "attached", + "attachment_status": { + "slug":"attached", + } }); assert_eq!( @@ -795,5 +876,68 @@ mod tests { "expect unknown field `unknown_field` error, got: {}", err ); + + let attach_request = json!({ + "config": { + "unknown_field": "unknown_value".to_string(), + }, + }); + let err = serde_json::from_value::(attach_request).unwrap_err(); + assert!( + err.to_string().contains("unknown field `unknown_field`"), + "expect unknown field `unknown_field` error, got: {}", + err + ); + } + + #[test] + fn tenantstatus_activating_serde() { + let states = [ + TenantState::Activating(ActivatingFrom::Loading), + TenantState::Activating(ActivatingFrom::Attaching), + ]; + let expected = "[{\"slug\":\"Activating\",\"data\":\"Loading\"},{\"slug\":\"Activating\",\"data\":\"Attaching\"}]"; + + let actual = serde_json::to_string(&states).unwrap(); + + assert_eq!(actual, expected); + + let parsed = serde_json::from_str::>(&actual).unwrap(); + + assert_eq!(states.as_slice(), &parsed); + } + + #[test] + fn tenantstatus_activating_strum() { + // tests added, because we use these for metrics + let examples = [ + (line!(), TenantState::Loading, "Loading"), + (line!(), TenantState::Attaching, "Attaching"), + ( + line!(), + TenantState::Activating(ActivatingFrom::Loading), + "Activating", + ), + ( + line!(), + TenantState::Activating(ActivatingFrom::Attaching), + "Activating", + ), + (line!(), TenantState::Active, "Active"), + (line!(), TenantState::Stopping, "Stopping"), + ( + line!(), + TenantState::Broken { + reason: "Example".into(), + backtrace: "Looooong backtrace".into(), + }, + "Broken", + ), + ]; + + for (line, rendered, expected) in examples { + let actual: &'static str = rendered.into(); + assert_eq!(actual, expected, "example on {line}"); + } } } diff --git a/libs/postgres_ffi/Cargo.toml b/libs/postgres_ffi/Cargo.toml index 159fc5946d..86e72f6bdd 100644 --- a/libs/postgres_ffi/Cargo.toml +++ b/libs/postgres_ffi/Cargo.toml @@ -24,7 +24,6 @@ workspace_hack.workspace = true [dev-dependencies] env_logger.workspace = true postgres.workspace = true -wal_craft = { path = "wal_craft" } [build-dependencies] anyhow.workspace = true diff --git a/libs/postgres_ffi/src/lib.rs b/libs/postgres_ffi/src/lib.rs index b8eb469cb0..cc115664d5 100644 --- a/libs/postgres_ffi/src/lib.rs +++ b/libs/postgres_ffi/src/lib.rs @@ -33,6 +33,7 @@ macro_rules! postgres_ffi { } pub mod controlfile_utils; pub mod nonrelfile_utils; + pub mod wal_craft_test_export; pub mod waldecoder_handler; pub mod xlog_utils; @@ -45,8 +46,15 @@ macro_rules! postgres_ffi { }; } -postgres_ffi!(v14); -postgres_ffi!(v15); +#[macro_export] +macro_rules! for_all_postgres_versions { + ($macro:tt) => { + $macro!(v14); + $macro!(v15); + }; +} + +for_all_postgres_versions! 
{ postgres_ffi } pub mod pg_constants; pub mod relfile_utils; diff --git a/libs/postgres_ffi/src/wal_craft_test_export.rs b/libs/postgres_ffi/src/wal_craft_test_export.rs new file mode 100644 index 0000000000..147567c442 --- /dev/null +++ b/libs/postgres_ffi/src/wal_craft_test_export.rs @@ -0,0 +1,6 @@ +//! This module is for WAL craft to test with postgres_ffi. Should not import any thing in normal usage. + +pub use super::PG_MAJORVERSION; +pub use super::xlog_utils::*; +pub use super::bindings::*; +pub use crate::WAL_SEGMENT_SIZE; diff --git a/libs/postgres_ffi/src/xlog_utils.rs b/libs/postgres_ffi/src/xlog_utils.rs index 4d7bb61883..61a9c38a84 100644 --- a/libs/postgres_ffi/src/xlog_utils.rs +++ b/libs/postgres_ffi/src/xlog_utils.rs @@ -481,220 +481,4 @@ pub fn encode_logical_message(prefix: &str, message: &str) -> Vec { wal } -#[cfg(test)] -mod tests { - use super::super::PG_MAJORVERSION; - use super::*; - use regex::Regex; - use std::cmp::min; - use std::fs; - use std::{env, str::FromStr}; - use utils::const_assert; - - fn init_logging() { - let _ = env_logger::Builder::from_env(env_logger::Env::default().default_filter_or( - format!("wal_craft=info,postgres_ffi::{PG_MAJORVERSION}::xlog_utils=trace"), - )) - .is_test(true) - .try_init(); - } - - fn test_end_of_wal(test_name: &str) { - use wal_craft::*; - - let pg_version = PG_MAJORVERSION[1..3].parse::().unwrap(); - - // Craft some WAL - let top_path = PathBuf::from(env!("CARGO_MANIFEST_DIR")) - .join("..") - .join(".."); - let cfg = Conf { - pg_version, - pg_distrib_dir: top_path.join("pg_install"), - datadir: top_path.join(format!("test_output/{}-{PG_MAJORVERSION}", test_name)), - }; - if cfg.datadir.exists() { - fs::remove_dir_all(&cfg.datadir).unwrap(); - } - cfg.initdb().unwrap(); - let srv = cfg.start_server().unwrap(); - let (intermediate_lsns, expected_end_of_wal_partial) = - C::craft(&mut srv.connect_with_timeout().unwrap()).unwrap(); - let intermediate_lsns: Vec = intermediate_lsns - .iter() - .map(|&lsn| u64::from(lsn).into()) - .collect(); - let expected_end_of_wal: Lsn = u64::from(expected_end_of_wal_partial).into(); - srv.kill(); - - // Check find_end_of_wal on the initial WAL - let last_segment = cfg - .wal_dir() - .read_dir() - .unwrap() - .map(|f| f.unwrap().file_name().into_string().unwrap()) - .filter(|fname| IsXLogFileName(fname)) - .max() - .unwrap(); - check_pg_waldump_end_of_wal(&cfg, &last_segment, expected_end_of_wal); - for start_lsn in intermediate_lsns - .iter() - .chain(std::iter::once(&expected_end_of_wal)) - { - // Erase all WAL before `start_lsn` to ensure it's not used by `find_end_of_wal`. - // We assume that `start_lsn` is non-decreasing. 
- info!( - "Checking with start_lsn={}, erasing WAL before it", - start_lsn - ); - for file in fs::read_dir(cfg.wal_dir()).unwrap().flatten() { - let fname = file.file_name().into_string().unwrap(); - if !IsXLogFileName(&fname) { - continue; - } - let (segno, _) = XLogFromFileName(&fname, WAL_SEGMENT_SIZE); - let seg_start_lsn = XLogSegNoOffsetToRecPtr(segno, 0, WAL_SEGMENT_SIZE); - if seg_start_lsn > u64::from(*start_lsn) { - continue; - } - let mut f = File::options().write(true).open(file.path()).unwrap(); - const ZEROS: [u8; WAL_SEGMENT_SIZE] = [0u8; WAL_SEGMENT_SIZE]; - f.write_all( - &ZEROS[0..min( - WAL_SEGMENT_SIZE, - (u64::from(*start_lsn) - seg_start_lsn) as usize, - )], - ) - .unwrap(); - } - check_end_of_wal(&cfg, &last_segment, *start_lsn, expected_end_of_wal); - } - } - - fn check_pg_waldump_end_of_wal( - cfg: &wal_craft::Conf, - last_segment: &str, - expected_end_of_wal: Lsn, - ) { - // Get the actual end of WAL by pg_waldump - let waldump_output = cfg - .pg_waldump("000000010000000000000001", last_segment) - .unwrap() - .stderr; - let waldump_output = std::str::from_utf8(&waldump_output).unwrap(); - let caps = match Regex::new(r"invalid record length at (.+):") - .unwrap() - .captures(waldump_output) - { - Some(caps) => caps, - None => { - error!("Unable to parse pg_waldump's stderr:\n{}", waldump_output); - panic!(); - } - }; - let waldump_wal_end = Lsn::from_str(caps.get(1).unwrap().as_str()).unwrap(); - info!( - "waldump erred on {}, expected wal end at {}", - waldump_wal_end, expected_end_of_wal - ); - assert_eq!(waldump_wal_end, expected_end_of_wal); - } - - fn check_end_of_wal( - cfg: &wal_craft::Conf, - last_segment: &str, - start_lsn: Lsn, - expected_end_of_wal: Lsn, - ) { - // Check end_of_wal on non-partial WAL segment (we treat it as fully populated) - // let wal_end = find_end_of_wal(&cfg.wal_dir(), WAL_SEGMENT_SIZE, start_lsn).unwrap(); - // info!( - // "find_end_of_wal returned wal_end={} with non-partial WAL segment", - // wal_end - // ); - // assert_eq!(wal_end, expected_end_of_wal_non_partial); - - // Rename file to partial to actually find last valid lsn, then rename it back. - fs::rename( - cfg.wal_dir().join(last_segment), - cfg.wal_dir().join(format!("{}.partial", last_segment)), - ) - .unwrap(); - let wal_end = find_end_of_wal(&cfg.wal_dir(), WAL_SEGMENT_SIZE, start_lsn).unwrap(); - info!( - "find_end_of_wal returned wal_end={} with partial WAL segment", - wal_end - ); - assert_eq!(wal_end, expected_end_of_wal); - fs::rename( - cfg.wal_dir().join(format!("{}.partial", last_segment)), - cfg.wal_dir().join(last_segment), - ) - .unwrap(); - } - - const_assert!(WAL_SEGMENT_SIZE == 16 * 1024 * 1024); - - #[test] - pub fn test_find_end_of_wal_simple() { - init_logging(); - test_end_of_wal::("test_find_end_of_wal_simple"); - } - - #[test] - pub fn test_find_end_of_wal_crossing_segment_followed_by_small_one() { - init_logging(); - test_end_of_wal::( - "test_find_end_of_wal_crossing_segment_followed_by_small_one", - ); - } - - #[test] - pub fn test_find_end_of_wal_last_crossing_segment() { - init_logging(); - test_end_of_wal::( - "test_find_end_of_wal_last_crossing_segment", - ); - } - - /// Check the math in update_next_xid - /// - /// NOTE: These checks are sensitive to the value of XID_CHECKPOINT_INTERVAL, - /// currently 1024. 
- #[test] - pub fn test_update_next_xid() { - let checkpoint_buf = [0u8; std::mem::size_of::()]; - let mut checkpoint = CheckPoint::decode(&checkpoint_buf).unwrap(); - - checkpoint.nextXid = FullTransactionId { value: 10 }; - assert_eq!(checkpoint.nextXid.value, 10); - - // The input XID gets rounded up to the next XID_CHECKPOINT_INTERVAL - // boundary - checkpoint.update_next_xid(100); - assert_eq!(checkpoint.nextXid.value, 1024); - - // No change - checkpoint.update_next_xid(500); - assert_eq!(checkpoint.nextXid.value, 1024); - checkpoint.update_next_xid(1023); - assert_eq!(checkpoint.nextXid.value, 1024); - - // The function returns the *next* XID, given the highest XID seen so - // far. So when we pass 1024, the nextXid gets bumped up to the next - // XID_CHECKPOINT_INTERVAL boundary. - checkpoint.update_next_xid(1024); - assert_eq!(checkpoint.nextXid.value, 2048); - } - - #[test] - pub fn test_encode_logical_message() { - let expected = [ - 64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 21, 0, 0, 170, 34, 166, 227, 255, - 38, 0, 0, 0, 0, 0, 0, 0, 0, 7, 0, 0, 0, 0, 0, 0, 0, 7, 0, 0, 0, 0, 0, 0, 0, 112, 114, - 101, 102, 105, 120, 0, 109, 101, 115, 115, 97, 103, 101, - ]; - let actual = encode_logical_message("prefix", "message"); - assert_eq!(expected, actual[..]); - } -} +// If you need to craft WAL and write tests for this module, put it at wal_craft crate. diff --git a/libs/postgres_ffi/wal_craft/Cargo.toml b/libs/postgres_ffi/wal_craft/Cargo.toml index 992bf7460b..bea888b23e 100644 --- a/libs/postgres_ffi/wal_craft/Cargo.toml +++ b/libs/postgres_ffi/wal_craft/Cargo.toml @@ -15,3 +15,7 @@ postgres_ffi.workspace = true tempfile.workspace = true workspace_hack.workspace = true + +[dev-dependencies] +regex.workspace = true +utils.workspace = true diff --git a/libs/postgres_ffi/wal_craft/src/lib.rs b/libs/postgres_ffi/wal_craft/src/lib.rs index 9f3f4dc20d..d4aed88048 100644 --- a/libs/postgres_ffi/wal_craft/src/lib.rs +++ b/libs/postgres_ffi/wal_craft/src/lib.rs @@ -10,6 +10,20 @@ use std::process::Command; use std::time::{Duration, Instant}; use tempfile::{tempdir, TempDir}; +macro_rules! xlog_utils_test { + ($version:ident) => { + #[path = "."] + mod $version { + pub use postgres_ffi::$version::wal_craft_test_export::*; + #[allow(clippy::duplicate_mod)] + #[cfg(test)] + mod xlog_utils_test; + } + }; +} + +postgres_ffi::for_all_postgres_versions! { xlog_utils_test } + #[derive(Debug, Clone, PartialEq, Eq)] pub struct Conf { pub pg_version: u32, diff --git a/libs/postgres_ffi/wal_craft/src/xlog_utils_test.rs b/libs/postgres_ffi/wal_craft/src/xlog_utils_test.rs new file mode 100644 index 0000000000..6ff4c563b2 --- /dev/null +++ b/libs/postgres_ffi/wal_craft/src/xlog_utils_test.rs @@ -0,0 +1,219 @@ +//! Tests for postgres_ffi xlog_utils module. Put it here to break cyclic dependency. 
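The `for_all_postgres_versions!` / `xlog_utils_test!` wiring above lets a downstream crate stamp out one module per supported Postgres version without hard-coding the version list. A minimal sketch of the same pattern from another crate that depends on `postgres_ffi`; the `per_version_probe` macro and its contents are illustrative:

```rust
// Build one module per supported Postgres version (v14 and v15 at the time of
// this change); adding a version inside postgres_ffi updates every caller.
macro_rules! per_version_probe {
    ($version:ident) => {
        pub mod $version {
            // Re-exports published specifically for out-of-crate test helpers.
            pub use postgres_ffi::$version::wal_craft_test_export::*;

            pub fn wal_segment_size() -> usize {
                WAL_SEGMENT_SIZE
            }
        }
    };
}

postgres_ffi::for_all_postgres_versions! { per_version_probe }

fn main() {
    // Each expansion is a separate module with its own re-exports.
    assert_eq!(v14::wal_segment_size(), 16 * 1024 * 1024);
    assert_eq!(v15::wal_segment_size(), 16 * 1024 * 1024);
}
```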
+ +use super::*; +use crate::{error, info}; +use regex::Regex; +use std::cmp::min; +use std::fs::{self, File}; +use std::io::Write; +use std::{env, str::FromStr}; +use utils::const_assert; +use utils::lsn::Lsn; + +fn init_logging() { + let _ = env_logger::Builder::from_env(env_logger::Env::default().default_filter_or( + format!("crate=info,postgres_ffi::{PG_MAJORVERSION}::xlog_utils=trace"), + )) + .is_test(true) + .try_init(); +} + +fn test_end_of_wal(test_name: &str) { + use crate::*; + + let pg_version = PG_MAJORVERSION[1..3].parse::().unwrap(); + + // Craft some WAL + let top_path = PathBuf::from(env!("CARGO_MANIFEST_DIR")) + .join("..") + .join("..") + .join(".."); + let cfg = Conf { + pg_version, + pg_distrib_dir: top_path.join("pg_install"), + datadir: top_path.join(format!("test_output/{}-{PG_MAJORVERSION}", test_name)), + }; + if cfg.datadir.exists() { + fs::remove_dir_all(&cfg.datadir).unwrap(); + } + cfg.initdb().unwrap(); + let srv = cfg.start_server().unwrap(); + let (intermediate_lsns, expected_end_of_wal_partial) = + C::craft(&mut srv.connect_with_timeout().unwrap()).unwrap(); + let intermediate_lsns: Vec = intermediate_lsns + .iter() + .map(|&lsn| u64::from(lsn).into()) + .collect(); + let expected_end_of_wal: Lsn = u64::from(expected_end_of_wal_partial).into(); + srv.kill(); + + // Check find_end_of_wal on the initial WAL + let last_segment = cfg + .wal_dir() + .read_dir() + .unwrap() + .map(|f| f.unwrap().file_name().into_string().unwrap()) + .filter(|fname| IsXLogFileName(fname)) + .max() + .unwrap(); + check_pg_waldump_end_of_wal(&cfg, &last_segment, expected_end_of_wal); + for start_lsn in intermediate_lsns + .iter() + .chain(std::iter::once(&expected_end_of_wal)) + { + // Erase all WAL before `start_lsn` to ensure it's not used by `find_end_of_wal`. + // We assume that `start_lsn` is non-decreasing. 
+ info!( + "Checking with start_lsn={}, erasing WAL before it", + start_lsn + ); + for file in fs::read_dir(cfg.wal_dir()).unwrap().flatten() { + let fname = file.file_name().into_string().unwrap(); + if !IsXLogFileName(&fname) { + continue; + } + let (segno, _) = XLogFromFileName(&fname, WAL_SEGMENT_SIZE); + let seg_start_lsn = XLogSegNoOffsetToRecPtr(segno, 0, WAL_SEGMENT_SIZE); + if seg_start_lsn > u64::from(*start_lsn) { + continue; + } + let mut f = File::options().write(true).open(file.path()).unwrap(); + const ZEROS: [u8; WAL_SEGMENT_SIZE] = [0u8; WAL_SEGMENT_SIZE]; + f.write_all( + &ZEROS[0..min( + WAL_SEGMENT_SIZE, + (u64::from(*start_lsn) - seg_start_lsn) as usize, + )], + ) + .unwrap(); + } + check_end_of_wal(&cfg, &last_segment, *start_lsn, expected_end_of_wal); + } +} + +fn check_pg_waldump_end_of_wal( + cfg: &crate::Conf, + last_segment: &str, + expected_end_of_wal: Lsn, +) { + // Get the actual end of WAL by pg_waldump + let waldump_output = cfg + .pg_waldump("000000010000000000000001", last_segment) + .unwrap() + .stderr; + let waldump_output = std::str::from_utf8(&waldump_output).unwrap(); + let caps = match Regex::new(r"invalid record length at (.+):") + .unwrap() + .captures(waldump_output) + { + Some(caps) => caps, + None => { + error!("Unable to parse pg_waldump's stderr:\n{}", waldump_output); + panic!(); + } + }; + let waldump_wal_end = Lsn::from_str(caps.get(1).unwrap().as_str()).unwrap(); + info!( + "waldump erred on {}, expected wal end at {}", + waldump_wal_end, expected_end_of_wal + ); + assert_eq!(waldump_wal_end, expected_end_of_wal); +} + +fn check_end_of_wal( + cfg: &crate::Conf, + last_segment: &str, + start_lsn: Lsn, + expected_end_of_wal: Lsn, +) { + // Check end_of_wal on non-partial WAL segment (we treat it as fully populated) + // let wal_end = find_end_of_wal(&cfg.wal_dir(), WAL_SEGMENT_SIZE, start_lsn).unwrap(); + // info!( + // "find_end_of_wal returned wal_end={} with non-partial WAL segment", + // wal_end + // ); + // assert_eq!(wal_end, expected_end_of_wal_non_partial); + + // Rename file to partial to actually find last valid lsn, then rename it back. + fs::rename( + cfg.wal_dir().join(last_segment), + cfg.wal_dir().join(format!("{}.partial", last_segment)), + ) + .unwrap(); + let wal_end = find_end_of_wal(&cfg.wal_dir(), WAL_SEGMENT_SIZE, start_lsn).unwrap(); + info!( + "find_end_of_wal returned wal_end={} with partial WAL segment", + wal_end + ); + assert_eq!(wal_end, expected_end_of_wal); + fs::rename( + cfg.wal_dir().join(format!("{}.partial", last_segment)), + cfg.wal_dir().join(last_segment), + ) + .unwrap(); +} + +const_assert!(WAL_SEGMENT_SIZE == 16 * 1024 * 1024); + +#[test] +pub fn test_find_end_of_wal_simple() { + init_logging(); + test_end_of_wal::("test_find_end_of_wal_simple"); +} + +#[test] +pub fn test_find_end_of_wal_crossing_segment_followed_by_small_one() { + init_logging(); + test_end_of_wal::( + "test_find_end_of_wal_crossing_segment_followed_by_small_one", + ); +} + +#[test] +pub fn test_find_end_of_wal_last_crossing_segment() { + init_logging(); + test_end_of_wal::( + "test_find_end_of_wal_last_crossing_segment", + ); +} + +/// Check the math in update_next_xid +/// +/// NOTE: These checks are sensitive to the value of XID_CHECKPOINT_INTERVAL, +/// currently 1024. 
+#[test] +pub fn test_update_next_xid() { + let checkpoint_buf = [0u8; std::mem::size_of::()]; + let mut checkpoint = CheckPoint::decode(&checkpoint_buf).unwrap(); + + checkpoint.nextXid = FullTransactionId { value: 10 }; + assert_eq!(checkpoint.nextXid.value, 10); + + // The input XID gets rounded up to the next XID_CHECKPOINT_INTERVAL + // boundary + checkpoint.update_next_xid(100); + assert_eq!(checkpoint.nextXid.value, 1024); + + // No change + checkpoint.update_next_xid(500); + assert_eq!(checkpoint.nextXid.value, 1024); + checkpoint.update_next_xid(1023); + assert_eq!(checkpoint.nextXid.value, 1024); + + // The function returns the *next* XID, given the highest XID seen so + // far. So when we pass 1024, the nextXid gets bumped up to the next + // XID_CHECKPOINT_INTERVAL boundary. + checkpoint.update_next_xid(1024); + assert_eq!(checkpoint.nextXid.value, 2048); +} + +#[test] +pub fn test_encode_logical_message() { + let expected = [ + 64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 21, 0, 0, 170, 34, 166, 227, 255, + 38, 0, 0, 0, 0, 0, 0, 0, 0, 7, 0, 0, 0, 0, 0, 0, 0, 7, 0, 0, 0, 0, 0, 0, 0, 112, 114, + 101, 102, 105, 120, 0, 109, 101, 115, 115, 97, 103, 101, + ]; + let actual = encode_logical_message("prefix", "message"); + assert_eq!(expected, actual[..]); +} diff --git a/libs/remote_storage/src/lib.rs b/libs/remote_storage/src/lib.rs index e0cc3ca543..ac1f8a357e 100644 --- a/libs/remote_storage/src/lib.rs +++ b/libs/remote_storage/src/lib.rs @@ -111,6 +111,8 @@ pub trait RemoteStorage: Send + Sync + 'static { ) -> Result; async fn delete(&self, path: &RemotePath) -> anyhow::Result<()>; + + async fn delete_objects<'a>(&self, paths: &'a [RemotePath]) -> anyhow::Result<()>; } pub struct Download { @@ -223,6 +225,14 @@ impl GenericRemoteStorage { Self::Unreliable(s) => s.delete(path).await, } } + + pub async fn delete_objects<'a>(&self, paths: &'a [RemotePath]) -> anyhow::Result<()> { + match self { + Self::LocalFs(s) => s.delete_objects(paths).await, + Self::AwsS3(s) => s.delete_objects(paths).await, + Self::Unreliable(s) => s.delete_objects(paths).await, + } + } } impl GenericRemoteStorage { diff --git a/libs/remote_storage/src/local_fs.rs b/libs/remote_storage/src/local_fs.rs index c081a6d361..59304c2481 100644 --- a/libs/remote_storage/src/local_fs.rs +++ b/libs/remote_storage/src/local_fs.rs @@ -17,7 +17,7 @@ use tokio::{ io::{self, AsyncReadExt, AsyncSeekExt, AsyncWriteExt}, }; use tracing::*; -use utils::crashsafe::path_with_suffix_extension; +use utils::{crashsafe::path_with_suffix_extension, fs_ext::is_directory_empty}; use crate::{Download, DownloadError, RemotePath}; @@ -101,19 +101,35 @@ impl RemoteStorage for LocalFs { Some(prefix) => Cow::Owned(prefix.with_base(&self.storage_root)), None => Cow::Borrowed(&self.storage_root), }; - Ok(get_all_files(path.as_ref(), false) + + let prefixes_to_filter = get_all_files(path.as_ref(), false) .await - .map_err(DownloadError::Other)? - .into_iter() - .map(|path| { - path.strip_prefix(&self.storage_root) - .context("Failed to strip preifix") + .map_err(DownloadError::Other)?; + + let mut prefixes = Vec::with_capacity(prefixes_to_filter.len()); + + // filter out empty directories to mirror s3 behavior. + for prefix in prefixes_to_filter { + if prefix.is_dir() + && is_directory_empty(&prefix) + .await + .map_err(DownloadError::Other)? 
+ { + continue; + } + + prefixes.push( + prefix + .strip_prefix(&self.storage_root) + .context("Failed to strip prefix") .and_then(RemotePath::new) .expect( "We list files for storage root, hence should be able to remote the prefix", - ) - }) - .collect()) + ), + ) + } + + Ok(prefixes) } async fn upload( @@ -291,11 +307,25 @@ impl RemoteStorage for LocalFs { async fn delete(&self, path: &RemotePath) -> anyhow::Result<()> { let file_path = path.with_base(&self.storage_root); - if file_path.exists() && file_path.is_file() { - Ok(fs::remove_file(file_path).await?) - } else { - bail!("File {file_path:?} either does not exist or is not a file") + if !file_path.exists() { + // See https://docs.aws.amazon.com/AmazonS3/latest/API/API_DeleteObject.html + // > If there isn't a null version, Amazon S3 does not remove any objects but will still respond that the command was successful. + return Ok(()); } + + if !file_path.is_file() { + anyhow::bail!("{file_path:?} is not a file"); + } + Ok(fs::remove_file(file_path) + .await + .map_err(|e| anyhow::anyhow!(e))?) + } + + async fn delete_objects<'a>(&self, paths: &'a [RemotePath]) -> anyhow::Result<()> { + for path in paths { + self.delete(path).await? + } + Ok(()) } } @@ -320,7 +350,7 @@ where let file_type = dir_entry.file_type().await?; let entry_path = dir_entry.path(); if file_type.is_symlink() { - debug!("{entry_path:?} us a symlink, skipping") + debug!("{entry_path:?} is a symlink, skipping") } else if file_type.is_dir() { if recursive { paths.extend(get_all_files(&entry_path, true).await?.into_iter()) @@ -595,15 +625,11 @@ mod fs_tests { storage.delete(&upload_target).await?; assert!(storage.list().await?.is_empty()); - match storage.delete(&upload_target).await { - Ok(()) => panic!("Should not allow deleting non-existing storage files"), - Err(e) => { - let error_string = e.to_string(); - assert!(error_string.contains("does not exist")); - let expected_path = upload_target.with_base(&storage.storage_root); - assert!(error_string.contains(expected_path.to_str().unwrap())); - } - } + storage + .delete(&upload_target) + .await + .expect("Should allow deleting non-existing storage files"); + Ok(()) } diff --git a/libs/remote_storage/src/s3_bucket.rs b/libs/remote_storage/src/s3_bucket.rs index 0be8c72fe0..38e1bf00f8 100644 --- a/libs/remote_storage/src/s3_bucket.rs +++ b/libs/remote_storage/src/s3_bucket.rs @@ -17,6 +17,7 @@ use aws_sdk_s3::{ error::SdkError, operation::get_object::GetObjectError, primitives::ByteStream, + types::{Delete, ObjectIdentifier}, Client, }; use aws_smithy_http::body::SdkBody; @@ -81,12 +82,24 @@ pub(super) mod metrics { .inc(); } + pub fn inc_delete_objects(count: u64) { + S3_REQUESTS_COUNT + .with_label_values(&["delete_object"]) + .inc_by(count); + } + pub fn inc_delete_object_fail() { S3_REQUESTS_FAIL_COUNT .with_label_values(&["delete_object"]) .inc(); } + pub fn inc_delete_objects_fail(count: u64) { + S3_REQUESTS_FAIL_COUNT + .with_label_values(&["delete_object"]) + .inc_by(count); + } + pub fn inc_list_objects() { S3_REQUESTS_COUNT.with_label_values(&["list_objects"]).inc(); } @@ -396,6 +409,34 @@ impl RemoteStorage for S3Bucket { }) .await } + async fn delete_objects<'a>(&self, paths: &'a [RemotePath]) -> anyhow::Result<()> { + let _guard = self + .concurrency_limiter + .acquire() + .await + .context("Concurrency limiter semaphore got closed during S3 delete")?; + + let mut delete_objects = Vec::with_capacity(paths.len()); + for path in paths { + let obj_id = ObjectIdentifier::builder() + 
.set_key(Some(self.relative_path_to_s3_object(path))) + .build(); + delete_objects.push(obj_id); + } + + metrics::inc_delete_objects(paths.len() as u64); + self.client + .delete_objects() + .bucket(self.bucket_name.clone()) + .delete(Delete::builder().set_objects(Some(delete_objects)).build()) + .send() + .await + .map_err(|e| { + metrics::inc_delete_objects_fail(paths.len() as u64); + e + })?; + Ok(()) + } async fn delete(&self, path: &RemotePath) -> anyhow::Result<()> { let _guard = self diff --git a/libs/remote_storage/src/simulate_failures.rs b/libs/remote_storage/src/simulate_failures.rs index cb40859831..2f341bb29d 100644 --- a/libs/remote_storage/src/simulate_failures.rs +++ b/libs/remote_storage/src/simulate_failures.rs @@ -119,4 +119,11 @@ impl RemoteStorage for UnreliableWrapper { self.attempt(RemoteOp::Delete(path.clone()))?; self.inner.delete(path).await } + + async fn delete_objects<'a>(&self, paths: &'a [RemotePath]) -> anyhow::Result<()> { + for path in paths { + self.delete(path).await? + } + Ok(()) + } } diff --git a/libs/remote_storage/tests/pagination_tests.rs b/libs/remote_storage/tests/test_real_s3.rs similarity index 74% rename from libs/remote_storage/tests/pagination_tests.rs rename to libs/remote_storage/tests/test_real_s3.rs index 86a6888f98..5f52b0754c 100644 --- a/libs/remote_storage/tests/pagination_tests.rs +++ b/libs/remote_storage/tests/test_real_s3.rs @@ -7,6 +7,7 @@ use std::sync::Arc; use std::time::UNIX_EPOCH; use anyhow::Context; +use once_cell::sync::OnceCell; use remote_storage::{ GenericRemoteStorage, RemotePath, RemoteStorageConfig, RemoteStorageKind, S3Config, }; @@ -14,8 +15,12 @@ use test_context::{test_context, AsyncTestContext}; use tokio::task::JoinSet; use tracing::{debug, error, info}; +static LOGGING_DONE: OnceCell<()> = OnceCell::new(); + const ENABLE_REAL_S3_REMOTE_STORAGE_ENV_VAR_NAME: &str = "ENABLE_REAL_S3_REMOTE_STORAGE"; +const BASE_PREFIX: &str = "test/"; + /// Tests that S3 client can list all prefixes, even if the response come paginated and requires multiple S3 queries. /// Uses real S3 and requires [`ENABLE_REAL_S3_REMOTE_STORAGE_ENV_VAR_NAME`] and related S3 cred env vars specified. /// See the client creation in [`create_s3_client`] for details on the required env vars. @@ -38,20 +43,20 @@ const ENABLE_REAL_S3_REMOTE_STORAGE_ENV_VAR_NAME: &str = "ENABLE_REAL_S3_REMOTE_ /// /// Lastly, the test attempts to clean up and remove all uploaded S3 files. /// If any errors appear during the clean up, they get logged, but the test is not failed or stopped until clean up is finished. 
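The new `delete_objects` entry point batches deletions into a single S3 `DeleteObjects` call, with per-path fallbacks in the local-fs and failure-injecting backends. A small usage sketch; the helper name and layer names are illustrative:

```rust
use std::path::Path;

use remote_storage::{GenericRemoteStorage, RemotePath};

/// Illustrative helper: delete a set of remote files in one request.
async fn delete_layers(
    storage: &GenericRemoteStorage,
    layer_names: &[&str],
) -> anyhow::Result<()> {
    let paths: Vec<RemotePath> = layer_names
        .iter()
        .map(|name| RemotePath::new(Path::new(name)))
        .collect::<anyhow::Result<_>>()?;

    // One call instead of `paths.len()` individual `delete()` round-trips.
    // Note: S3 caps a single DeleteObjects request at 1000 keys, so very large
    // batches would need to be chunked by the caller.
    storage.delete_objects(&paths).await
}
```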
-#[test_context(MaybeEnabledS3)] +#[test_context(MaybeEnabledS3WithTestBlobs)] #[tokio::test] -async fn s3_pagination_should_work(ctx: &mut MaybeEnabledS3) -> anyhow::Result<()> { +async fn s3_pagination_should_work(ctx: &mut MaybeEnabledS3WithTestBlobs) -> anyhow::Result<()> { let ctx = match ctx { - MaybeEnabledS3::Enabled(ctx) => ctx, - MaybeEnabledS3::Disabled => return Ok(()), - MaybeEnabledS3::UploadsFailed(e, _) => anyhow::bail!("S3 init failed: {e:?}"), + MaybeEnabledS3WithTestBlobs::Enabled(ctx) => ctx, + MaybeEnabledS3WithTestBlobs::Disabled => return Ok(()), + MaybeEnabledS3WithTestBlobs::UploadsFailed(e, _) => anyhow::bail!("S3 init failed: {e:?}"), }; - let test_client = Arc::clone(&ctx.client_with_excessive_pagination); + let test_client = Arc::clone(&ctx.enabled.client); let expected_remote_prefixes = ctx.remote_prefixes.clone(); - let base_prefix = - RemotePath::new(Path::new(ctx.base_prefix_str)).context("common_prefix construction")?; + let base_prefix = RemotePath::new(Path::new(ctx.enabled.base_prefix)) + .context("common_prefix construction")?; let root_remote_prefixes = test_client .list_prefixes(None) .await @@ -83,27 +88,122 @@ async fn s3_pagination_should_work(ctx: &mut MaybeEnabledS3) -> anyhow::Result<( Ok(()) } +#[test_context(MaybeEnabledS3)] +#[tokio::test] +async fn s3_delete_non_exising_works(ctx: &mut MaybeEnabledS3) -> anyhow::Result<()> { + let ctx = match ctx { + MaybeEnabledS3::Enabled(ctx) => ctx, + MaybeEnabledS3::Disabled => return Ok(()), + }; + + let path = RemotePath::new(&PathBuf::from(format!( + "{}/for_sure_there_is_nothing_there_really", + ctx.base_prefix, + ))) + .with_context(|| "RemotePath conversion")?; + + ctx.client.delete(&path).await.expect("should succeed"); + + Ok(()) +} + +#[test_context(MaybeEnabledS3)] +#[tokio::test] +async fn s3_delete_objects_works(ctx: &mut MaybeEnabledS3) -> anyhow::Result<()> { + let ctx = match ctx { + MaybeEnabledS3::Enabled(ctx) => ctx, + MaybeEnabledS3::Disabled => return Ok(()), + }; + + let path1 = RemotePath::new(&PathBuf::from(format!("{}/path1", ctx.base_prefix,))) + .with_context(|| "RemotePath conversion")?; + + let path2 = RemotePath::new(&PathBuf::from(format!("{}/path2", ctx.base_prefix,))) + .with_context(|| "RemotePath conversion")?; + + let data1 = "remote blob data1".as_bytes(); + let data1_len = data1.len(); + let data2 = "remote blob data2".as_bytes(); + let data2_len = data2.len(); + ctx.client + .upload(std::io::Cursor::new(data1), data1_len, &path1, None) + .await?; + + ctx.client + .upload(std::io::Cursor::new(data2), data2_len, &path2, None) + .await?; + + ctx.client.delete_objects(&[path1, path2]).await?; + + Ok(()) +} + +fn ensure_logging_ready() { + LOGGING_DONE.get_or_init(|| { + utils::logging::init( + utils::logging::LogFormat::Test, + utils::logging::TracingErrorLayerEnablement::Disabled, + ) + .expect("logging init failed"); + }); +} + +struct EnabledS3 { + client: Arc, + base_prefix: &'static str, +} + +impl EnabledS3 { + async fn setup(max_keys_in_list_response: Option) -> Self { + let client = create_s3_client(max_keys_in_list_response) + .context("S3 client creation") + .expect("S3 client creation failed"); + + EnabledS3 { + client, + base_prefix: BASE_PREFIX, + } + } +} + enum MaybeEnabledS3 { + Enabled(EnabledS3), + Disabled, +} + +#[async_trait::async_trait] +impl AsyncTestContext for MaybeEnabledS3 { + async fn setup() -> Self { + ensure_logging_ready(); + + if env::var(ENABLE_REAL_S3_REMOTE_STORAGE_ENV_VAR_NAME).is_err() { + info!( + "`{}` env variable is not set, 
skipping the test", + ENABLE_REAL_S3_REMOTE_STORAGE_ENV_VAR_NAME + ); + return Self::Disabled; + } + + Self::Enabled(EnabledS3::setup(None).await) + } +} + +enum MaybeEnabledS3WithTestBlobs { Enabled(S3WithTestBlobs), Disabled, UploadsFailed(anyhow::Error, S3WithTestBlobs), } struct S3WithTestBlobs { - client_with_excessive_pagination: Arc, - base_prefix_str: &'static str, + enabled: EnabledS3, remote_prefixes: HashSet, remote_blobs: HashSet, } #[async_trait::async_trait] -impl AsyncTestContext for MaybeEnabledS3 { +impl AsyncTestContext for MaybeEnabledS3WithTestBlobs { async fn setup() -> Self { - utils::logging::init( - utils::logging::LogFormat::Test, - utils::logging::TracingErrorLayerEnablement::Disabled, - ) - .expect("logging init failed"); + ensure_logging_ready(); if env::var(ENABLE_REAL_S3_REMOTE_STORAGE_ENV_VAR_NAME).is_err() { info!( "`{}` env variable is not set, skipping the test", @@ -115,23 +215,14 @@ impl AsyncTestContext for MaybeEnabledS3 { let max_keys_in_list_response = 10; let upload_tasks_count = 1 + (2 * usize::try_from(max_keys_in_list_response).unwrap()); - let client_with_excessive_pagination = create_s3_client(max_keys_in_list_response) - .context("S3 client creation") - .expect("S3 client creation failed"); + let enabled = EnabledS3::setup(Some(max_keys_in_list_response)).await; - let base_prefix_str = "test/"; - match upload_s3_data( - &client_with_excessive_pagination, - base_prefix_str, - upload_tasks_count, - ) - .await - { + match upload_s3_data(&enabled.client, enabled.base_prefix, upload_tasks_count).await { ControlFlow::Continue(uploads) => { info!("Remote objects created successfully"); + Self::Enabled(S3WithTestBlobs { - client_with_excessive_pagination, - base_prefix_str, + enabled, remote_prefixes: uploads.prefixes, remote_blobs: uploads.blobs, }) @@ -139,8 +230,7 @@ impl AsyncTestContext for MaybeEnabledS3 { ControlFlow::Break(uploads) => Self::UploadsFailed( anyhow::anyhow!("One or multiple blobs failed to upload to S3"), S3WithTestBlobs { - client_with_excessive_pagination, - base_prefix_str, + enabled, remote_prefixes: uploads.prefixes, remote_blobs: uploads.blobs, }, @@ -152,13 +242,15 @@ impl AsyncTestContext for MaybeEnabledS3 { match self { Self::Disabled => {} Self::Enabled(ctx) | Self::UploadsFailed(_, ctx) => { - cleanup(&ctx.client_with_excessive_pagination, ctx.remote_blobs).await; + cleanup(&ctx.enabled.client, ctx.remote_blobs).await; } } } } -fn create_s3_client(max_keys_per_list_response: i32) -> anyhow::Result> { +fn create_s3_client( + max_keys_per_list_response: Option, +) -> anyhow::Result> { let remote_storage_s3_bucket = env::var("REMOTE_STORAGE_S3_BUCKET") .context("`REMOTE_STORAGE_S3_BUCKET` env var is not set, but real S3 tests are enabled")?; let remote_storage_s3_region = env::var("REMOTE_STORAGE_S3_REGION") @@ -176,7 +268,7 @@ fn create_s3_client(max_keys_per_list_response: i32) -> anyhow::Result); + +/// Barrier will wait until all clones of [`Completion`] have been dropped. +#[derive(Clone)] +pub struct Barrier(Arc>>); + +impl Barrier { + pub async fn wait(self) { + self.0.lock().await.recv().await; + } + + pub async fn maybe_wait(barrier: Option) { + if let Some(b) = barrier { + b.wait().await + } + } +} + +/// Create new Guard and Barrier pair. 
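The `Completion`/`Barrier` pair above is a drop-based rendezvous: waiters block until every clone of the `Completion` guard has been dropped. A minimal tokio sketch of the intended usage; the task structure is illustrative:

```rust
use utils::completion;

#[tokio::main]
async fn main() {
    let (guard, barrier) = completion::channel();

    // The worker holds the guard (clones are fine) while it initializes.
    let worker = tokio::spawn(async move {
        // ... perform startup work ...
        drop(guard); // dropping the last Completion releases all waiters
    });

    // Any number of Barrier clones can wait; `Barrier::maybe_wait(Some(b))`
    // is a convenience wrapper for optional barriers.
    barrier.wait().await;

    worker.await.unwrap();
}
```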
+pub fn channel() -> (Completion, Barrier) { + let (tx, rx) = mpsc::channel::<()>(1); + let rx = Mutex::new(rx); + let rx = Arc::new(rx); + (Completion(tx), Barrier(rx)) +} diff --git a/libs/utils/src/fs_ext.rs b/libs/utils/src/fs_ext.rs index d2cb7be816..0ef0464267 100644 --- a/libs/utils/src/fs_ext.rs +++ b/libs/utils/src/fs_ext.rs @@ -1,6 +1,8 @@ /// Extensions to `std::fs` types. use std::{fs, io, path::Path}; +use anyhow::Context; + pub trait PathExt { /// Returns an error if `self` is not a directory. fn is_empty_dir(&self) -> io::Result; @@ -15,10 +17,19 @@ where } } +pub async fn is_directory_empty(path: impl AsRef) -> anyhow::Result { + let mut dir = tokio::fs::read_dir(&path) + .await + .context(format!("read_dir({})", path.as_ref().display()))?; + Ok(dir.next_entry().await?.is_none()) +} + #[cfg(test)] mod test { use std::path::PathBuf; + use crate::fs_ext::is_directory_empty; + #[test] fn is_empty_dir() { use super::PathExt; @@ -42,4 +53,26 @@ mod test { std::fs::remove_file(&file_path).unwrap(); assert!(file_path.is_empty_dir().is_err()); } + + #[tokio::test] + async fn is_empty_dir_async() { + let dir = tempfile::tempdir().unwrap(); + let dir_path = dir.path(); + + // test positive case + assert!( + is_directory_empty(dir_path).await.expect("test failure"), + "new tempdir should be empty" + ); + + // invoke on a file to ensure it returns an error + let file_path: PathBuf = dir_path.join("testfile"); + let f = std::fs::File::create(&file_path).unwrap(); + drop(f); + assert!(is_directory_empty(&file_path).await.is_err()); + + // do it again on a path, we know to be nonexistent + std::fs::remove_file(&file_path).unwrap(); + assert!(is_directory_empty(file_path).await.is_err()); + } } diff --git a/libs/utils/src/http/endpoint.rs b/libs/utils/src/http/endpoint.rs index 4bfb5bf994..33241dbdf7 100644 --- a/libs/utils/src/http/endpoint.rs +++ b/libs/utils/src/http/endpoint.rs @@ -1,23 +1,20 @@ use crate::auth::{Claims, JwtAuth}; -use crate::http::error; -use anyhow::{anyhow, Context}; +use crate::http::error::{api_error_handler, route_error_handler, ApiError}; +use anyhow::Context; use hyper::header::{HeaderName, AUTHORIZATION}; use hyper::http::HeaderValue; use hyper::Method; -use hyper::{header::CONTENT_TYPE, Body, Request, Response, Server}; +use hyper::{header::CONTENT_TYPE, Body, Request, Response}; use metrics::{register_int_counter, Encoder, IntCounter, TextEncoder}; use once_cell::sync::Lazy; use routerify::ext::RequestExt; -use routerify::{Middleware, RequestInfo, Router, RouterBuilder, RouterService}; +use routerify::{Middleware, RequestInfo, Router, RouterBuilder}; use tokio::task::JoinError; use tracing::{self, debug, info, info_span, warn, Instrument}; use std::future::Future; -use std::net::TcpListener; use std::str::FromStr; -use super::error::ApiError; - static SERVE_METRICS_COUNT: Lazy = Lazy::new(|| { register_int_counter!( "libmetrics_metric_handler_requests_total", @@ -35,8 +32,18 @@ struct RequestId(String); /// Adds a tracing info_span! instrumentation around the handler events, /// logs the request start and end events for non-GET requests and non-200 responses. /// +/// Usage: Replace `my_handler` with `|r| request_span(r, my_handler)` +/// /// Use this to distinguish between logs of different HTTP requests: every request handler wrapped -/// in this type will get request info logged in the wrapping span, including the unique request ID. +/// with this will get request info logged in the wrapping span, including the unique request ID. 
+/// +/// This also handles errors, logging them and converting them to an HTTP error response. +/// +/// NB: If the client disconnects, Hyper will drop the Future, without polling it to +/// completion. In other words, the handler must be async cancellation safe! request_span +/// prints a warning to the log when that happens, so that you have some trace of it in +/// the log. +/// /// /// There could be other ways to implement similar functionality: /// @@ -54,60 +61,56 @@ struct RequestId(String); /// tries to achive with its `.instrument` used in the current approach. /// /// If needed, a declarative macro to substitute the |r| ... closure boilerplate could be introduced. -pub struct RequestSpan(pub H) +pub async fn request_span(request: Request, handler: H) -> R::Output where - E: Into> + 'static, - R: Future, E>> + Send + 'static, - H: Fn(Request) -> R + Send + Sync + 'static; - -impl RequestSpan -where - E: Into> + 'static, - R: Future, E>> + Send + 'static, - H: Fn(Request) -> R + Send + Sync + 'static, + R: Future, ApiError>> + Send + 'static, + H: FnOnce(Request) -> R + Send + Sync + 'static, { - /// Creates a tracing span around inner request handler and executes the request handler in the contex of that span. - /// Use as `|r| RequestSpan(my_handler).handle(r)` instead of `my_handler` as the request handler to get the span enabled. - pub async fn handle(self, request: Request) -> Result, E> { - let request_id = request.context::().unwrap_or_default().0; - let method = request.method(); - let path = request.uri().path(); - let request_span = info_span!("request", %method, %path, %request_id); + let request_id = request.context::().unwrap_or_default().0; + let method = request.method(); + let path = request.uri().path(); + let request_span = info_span!("request", %method, %path, %request_id); - let log_quietly = method == Method::GET; - async move { - let cancellation_guard = RequestCancelled::warn_when_dropped_without_responding(); - if log_quietly { - debug!("Handling request"); - } else { - info!("Handling request"); - } - - // Note that we reuse `error::handler` here and not returning and error at all, - // yet cannot use `!` directly in the method signature due to `routerify::RouterBuilder` limitation. - // Usage of the error handler also means that we expect only the `ApiError` errors to be raised in this call. - // - // Panics are not handled separately, there's a `tracing_panic_hook` from another module to do that globally. - let res = (self.0)(request).await; - - cancellation_guard.disarm(); - - match res { - Ok(response) => { - let response_status = response.status(); - if log_quietly && response_status.is_success() { - debug!("Request handled, status: {response_status}"); - } else { - info!("Request handled, status: {response_status}"); - } - Ok(response) - } - Err(e) => Ok(error::handler(e.into()).await), - } + let log_quietly = method == Method::GET; + async move { + let cancellation_guard = RequestCancelled::warn_when_dropped_without_responding(); + if log_quietly { + debug!("Handling request"); + } else { + info!("Handling request"); + } + + // No special handling for panics here. There's a `tracing_panic_hook` from another + // module to do that globally. + let res = handler(request).await; + + cancellation_guard.disarm(); + + // Log the result if needed. + // + // We also convert any errors into an Ok response with HTTP error code here. 
+ // `make_router` sets a last-resort error handler that would do the same, but + // we prefer to do it here, before we exit the request span, so that the error + // is still logged with the span. + // + // (Because we convert errors to Ok response, we never actually return an error, + // and we could declare the function to return the never type (`!`). However, + // using `routerify::RouterBuilder` requires a proper error type.) + match res { + Ok(response) => { + let response_status = response.status(); + if log_quietly && response_status.is_success() { + debug!("Request handled, status: {response_status}"); + } else { + info!("Request handled, status: {response_status}"); + } + Ok(response) + } + Err(err) => Ok(api_error_handler(err)), } - .instrument(request_span) - .await } + .instrument(request_span) + .await } /// Drop guard to WARN in case the request was dropped before completion. @@ -207,10 +210,8 @@ pub fn make_router() -> RouterBuilder { .middleware(Middleware::post_with_info( add_request_id_header_to_response, )) - .get("/metrics", |r| { - RequestSpan(prometheus_metrics_handler).handle(r) - }) - .err_handler(error::handler) + .get("/metrics", |r| request_span(r, prometheus_metrics_handler)) + .err_handler(route_error_handler) } pub fn attach_openapi_ui( @@ -220,12 +221,14 @@ pub fn attach_openapi_ui( ui_mount_path: &'static str, ) -> RouterBuilder { router_builder - .get(spec_mount_path, move |r| { - RequestSpan(move |_| async move { Ok(Response::builder().body(Body::from(spec)).unwrap()) }) - .handle(r) - }) - .get(ui_mount_path, move |r| RequestSpan( move |_| async move { - Ok(Response::builder().body(Body::from(format!(r#" + .get(spec_mount_path, + move |r| request_span(r, move |_| async move { + Ok(Response::builder().body(Body::from(spec)).unwrap()) + }) + ) + .get(ui_mount_path, + move |r| request_span(r, move |_| async move { + Ok(Response::builder().body(Body::from(format!(r#" @@ -255,7 +258,8 @@ pub fn attach_openapi_ui( "#, spec_mount_path))).unwrap()) - }).handle(r)) + }) + ) } fn parse_token(header_value: &str) -> Result<&str, ApiError> { @@ -343,40 +347,6 @@ pub fn check_permission_with( } } -/// -/// Start listening for HTTP requests on given socket. -/// -/// 'shutdown_future' can be used to stop. If the Future becomes -/// ready, we stop listening for new requests, and the function returns. -/// -pub fn serve_thread_main( - router_builder: RouterBuilder, - listener: TcpListener, - shutdown_future: S, -) -> anyhow::Result<()> -where - S: Future + Send + Sync, -{ - info!("Starting an HTTP endpoint at {}", listener.local_addr()?); - - // Create a Service from the router above to handle incoming requests. - let service = RouterService::new(router_builder.build().map_err(|err| anyhow!(err))?).unwrap(); - - // Enter a single-threaded tokio runtime bound to the current thread - let runtime = tokio::runtime::Builder::new_current_thread() - .enable_all() - .build()?; - - let _guard = runtime.enter(); - - let server = Server::from_tcp(listener)? 
- .serve(service) - .with_graceful_shutdown(shutdown_future); - - runtime.block_on(server)?; - - Ok(()) -} #[cfg(test)] mod tests { use super::*; diff --git a/libs/utils/src/http/error.rs b/libs/utils/src/http/error.rs index 3c6023eb80..f9c06453df 100644 --- a/libs/utils/src/http/error.rs +++ b/libs/utils/src/http/error.rs @@ -21,7 +21,7 @@ pub enum ApiError { Conflict(String), #[error("Precondition failed: {0}")] - PreconditionFailed(&'static str), + PreconditionFailed(Box), #[error(transparent)] InternalServerError(anyhow::Error), @@ -83,13 +83,24 @@ impl HttpErrorBody { } } -pub async fn handler(err: routerify::RouteError) -> Response { - let api_error = err - .downcast::() - .expect("handler should always return api error"); +pub async fn route_error_handler(err: routerify::RouteError) -> Response { + match err.downcast::() { + Ok(api_error) => api_error_handler(*api_error), + Err(other_error) => { + // We expect all the request handlers to return an ApiError, so this should + // not be reached. But just in case. + error!("Error processing HTTP request: {other_error:?}"); + HttpErrorBody::response_from_msg_and_status( + other_error.to_string(), + StatusCode::INTERNAL_SERVER_ERROR, + ) + } + } +} +pub fn api_error_handler(api_error: ApiError) -> Response { // Print a stack trace for Internal Server errors - if let ApiError::InternalServerError(_) = api_error.as_ref() { + if let ApiError::InternalServerError(_) = api_error { error!("Error processing HTTP request: {api_error:?}"); } else { error!("Error processing HTTP request: {api_error:#}"); diff --git a/libs/utils/src/http/json.rs b/libs/utils/src/http/json.rs index 8981fdd1dd..9c153033cb 100644 --- a/libs/utils/src/http/json.rs +++ b/libs/utils/src/http/json.rs @@ -8,12 +8,26 @@ use super::error::ApiError; pub async fn json_request Deserialize<'de>>( request: &mut Request, ) -> Result { - let whole_body = hyper::body::aggregate(request.body_mut()) + json_request_or_empty_body(request) + .await? + .context("missing request body") + .map_err(ApiError::BadRequest) +} + +/// Will be removed as part of https://github.com/neondatabase/neon/issues/4282 +pub async fn json_request_or_empty_body Deserialize<'de>>( + request: &mut Request, +) -> Result, ApiError> { + let body = hyper::body::aggregate(request.body_mut()) .await .context("Failed to read request body") .map_err(ApiError::BadRequest)?; - serde_json::from_reader(whole_body.reader()) + if body.remaining() == 0 { + return Ok(None); + } + serde_json::from_reader(body.reader()) .context("Failed to parse json request") + .map(Some) .map_err(ApiError::BadRequest) } diff --git a/libs/utils/src/lib.rs b/libs/utils/src/lib.rs index 4e4f79ab6b..69d3a1b9f2 100644 --- a/libs/utils/src/lib.rs +++ b/libs/utils/src/lib.rs @@ -60,6 +60,9 @@ pub mod tracing_span_assert; pub mod rate_limit; +/// Simple once-barrier and a guard which keeps barrier awaiting. 
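`json_request_or_empty_body` above is the transitional helper for endpoints that used to take no body and now optionally accept JSON. A hedged handler sketch; the `ExampleArgs` type and route are invented for illustration, and such a handler would be registered through the new wrapper, e.g. `.post("/v1/example", |r| request_span(r, example_handler))`:

```rust
use hyper::{Body, Request, Response, StatusCode};
use serde::Deserialize;
use utils::http::error::ApiError;
use utils::http::json::json_request_or_empty_body;

#[derive(Deserialize, Default)]
struct ExampleArgs {
    #[serde(default)]
    force: bool,
}

async fn example_handler(mut request: Request<Body>) -> Result<Response<Body>, ApiError> {
    // Old clients may still send an empty body; new clients may POST JSON.
    let args: ExampleArgs = json_request_or_empty_body(&mut request)
        .await?
        .unwrap_or_default();

    let body = if args.force { "forced" } else { "normal" };
    Ok(Response::builder()
        .status(StatusCode::OK)
        .body(Body::from(body))
        .expect("building a response with a static body should not fail"))
}
```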
+pub mod completion; + mod failpoint_macro_helpers { /// use with fail::cfg("$name", "return(2000)") diff --git a/pageserver/benches/bench_layer_map.rs b/pageserver/benches/bench_layer_map.rs index ee5980212e..45dc9fad4a 100644 --- a/pageserver/benches/bench_layer_map.rs +++ b/pageserver/benches/bench_layer_map.rs @@ -33,7 +33,7 @@ fn build_layer_map(filename_dump: PathBuf) -> LayerMap { min_lsn = min(min_lsn, lsn_range.start); max_lsn = max(max_lsn, Lsn(lsn_range.end.0 - 1)); - updates.insert_historic(Arc::new(layer)); + updates.insert_historic(layer.get_persistent_layer_desc(), Arc::new(layer)); } println!("min: {min_lsn}, max: {max_lsn}"); @@ -215,7 +215,7 @@ fn bench_sequential(c: &mut Criterion) { is_incremental: false, short_id: format!("Layer {}", i), }; - updates.insert_historic(Arc::new(layer)); + updates.insert_historic(layer.get_persistent_layer_desc(), Arc::new(layer)); } updates.flush(); println!("Finished layer map init in {:?}", now.elapsed()); diff --git a/pageserver/ctl/Cargo.toml b/pageserver/ctl/Cargo.toml new file mode 100644 index 0000000000..89e0d0486e --- /dev/null +++ b/pageserver/ctl/Cargo.toml @@ -0,0 +1,18 @@ +[package] +name = "pagectl" +version = "0.1.0" +edition.workspace = true +license.workspace = true + +# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html + +[dependencies] +anyhow.workspace = true +bytes.workspace = true +clap = { workspace = true, features = ["string"] } +git-version.workspace = true +pageserver = { path = ".." } +postgres_ffi.workspace = true +utils.workspace = true +svg_fmt.workspace = true +workspace_hack.workspace = true diff --git a/pageserver/src/bin/draw_timeline_dir.rs b/pageserver/ctl/src/draw_timeline_dir.rs similarity index 97% rename from pageserver/src/bin/draw_timeline_dir.rs rename to pageserver/ctl/src/draw_timeline_dir.rs index da13ee452c..bfde5ba054 100644 --- a/pageserver/src/bin/draw_timeline_dir.rs +++ b/pageserver/ctl/src/draw_timeline_dir.rs @@ -12,7 +12,7 @@ //! Example use: //! ``` //! $ ls test_output/test_pgbench\[neon-45-684\]/repo/tenants/$TENANT/timelines/$TIMELINE | \ -//! $ grep "__" | cargo run --release --bin draw_timeline_dir > out.svg +//! $ grep "__" | cargo run --release --bin pagectl draw-timeline-dir > out.svg //! $ firefox out.svg //! ``` //! 
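With the standalone binaries folded into `pagectl`, invocation moves from positional-argument parsing to clap subcommands like the `draw-timeline-dir` example in the doc comment above. A minimal sketch of how the derive-based CLI maps an invocation onto `AnalyzeLayerMapCmd`-style arguments; the struct names here are illustrative and clap's default kebab-case subcommand naming is assumed:

```rust
use std::path::PathBuf;

use clap::{Parser, Subcommand};

#[derive(Parser)]
struct Cli {
    #[command(subcommand)]
    command: Cmd,
}

#[derive(Subcommand)]
enum Cmd {
    /// Mirrors `AnalyzeLayerMapCmd`: a data path plus an optional hole count.
    AnalyzeLayerMap {
        path: PathBuf,
        max_holes: Option<usize>,
    },
}

fn main() {
    // `AnalyzeLayerMap` becomes the `analyze-layer-map` subcommand by default.
    let cli = Cli::try_parse_from(["pagectl", "analyze-layer-map", ".neon/", "20"])
        .expect("arguments should parse");
    let Cmd::AnalyzeLayerMap { path, max_holes } = cli.command;
    assert_eq!(path, PathBuf::from(".neon/"));
    assert_eq!(max_holes, Some(20));
}
```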
@@ -62,7 +62,7 @@ fn parse_filename(name: &str) -> (Range, Range) { (keys, lsns) } -fn main() -> Result<()> { +pub fn main() -> Result<()> { // Parse layer filenames from stdin let mut ranges: Vec<(Range, Range)> = vec![]; let stdin = io::stdin(); diff --git a/pageserver/src/bin/layer_map_analyzer.rs b/pageserver/ctl/src/layer_map_analyzer.rs similarity index 92% rename from pageserver/src/bin/layer_map_analyzer.rs rename to pageserver/ctl/src/layer_map_analyzer.rs index e740879458..f2ced6154f 100644 --- a/pageserver/src/bin/layer_map_analyzer.rs +++ b/pageserver/ctl/src/layer_map_analyzer.rs @@ -6,7 +6,7 @@ use anyhow::Result; use std::cmp::Ordering; use std::collections::BinaryHeap; use std::ops::Range; -use std::{env, fs, path::Path, path::PathBuf, str, str::FromStr}; +use std::{fs, path::Path, str}; use pageserver::page_cache::PAGE_SZ; use pageserver::repository::{Key, KEY_SIZE}; @@ -18,12 +18,14 @@ use pageserver::virtual_file::VirtualFile; use utils::{bin_ser::BeSer, lsn::Lsn}; +use crate::AnalyzeLayerMapCmd; + const MIN_HOLE_LENGTH: i128 = (128 * 1024 * 1024 / PAGE_SZ) as i128; const DEFAULT_MAX_HOLES: usize = 10; /// Wrapper for key range to provide reverse ordering by range length for BinaryHeap #[derive(PartialEq, Eq)] -struct Hole(Range); +pub struct Hole(Range); impl Ord for Hole { fn cmp(&self, other: &Self) -> Ordering { @@ -39,11 +41,11 @@ impl PartialOrd for Hole { } } -struct LayerFile { - key_range: Range, - lsn_range: Range, - is_delta: bool, - holes: Vec, +pub(crate) struct LayerFile { + pub key_range: Range, + pub lsn_range: Range, + pub is_delta: bool, + pub holes: Vec, } impl LayerFile { @@ -67,7 +69,7 @@ impl LayerFile { } } -fn parse_filename(name: &str) -> Option { +pub(crate) fn parse_filename(name: &str) -> Option { let split: Vec<&str> = name.split("__").collect(); if split.len() != 2 { return None; @@ -127,18 +129,9 @@ fn get_holes(path: &Path, max_holes: usize) -> Result> { Ok(holes) } -fn main() -> Result<()> { - let args: Vec = env::args().collect(); - if args.len() < 2 { - println!("Usage: layer_map_analyzer PAGESERVER_DATA_DIR [MAX_HOLES]"); - return Ok(()); - } - let storage_path = PathBuf::from_str(&args[1])?; - let max_holes = if args.len() > 2 { - args[2].parse::().unwrap() - } else { - DEFAULT_MAX_HOLES - }; +pub(crate) fn main(cmd: &AnalyzeLayerMapCmd) -> Result<()> { + let storage_path = &cmd.path; + let max_holes = cmd.max_holes.unwrap_or(DEFAULT_MAX_HOLES); // Initialize virtual_file (file desriptor cache) and page cache which are needed to access layer persistent B-Tree. 
pageserver::virtual_file::init(10); diff --git a/pageserver/ctl/src/layers.rs b/pageserver/ctl/src/layers.rs new file mode 100644 index 0000000000..d77cf0908c --- /dev/null +++ b/pageserver/ctl/src/layers.rs @@ -0,0 +1,169 @@ +use std::path::{Path, PathBuf}; + +use anyhow::Result; +use clap::Subcommand; +use pageserver::tenant::block_io::BlockCursor; +use pageserver::tenant::disk_btree::DiskBtreeReader; +use pageserver::tenant::storage_layer::delta_layer::{BlobRef, Summary}; +use pageserver::{page_cache, virtual_file}; +use pageserver::{ + repository::{Key, KEY_SIZE}, + tenant::{ + block_io::FileBlockReader, disk_btree::VisitDirection, + storage_layer::delta_layer::DELTA_KEY_SIZE, + }, + virtual_file::VirtualFile, +}; +use std::fs; +use utils::bin_ser::BeSer; + +use crate::layer_map_analyzer::parse_filename; + +#[derive(Subcommand)] +pub(crate) enum LayerCmd { + /// List all tenants and timelines under the pageserver path + /// + /// Example: `cargo run --bin pagectl layer list .neon/` + List { path: PathBuf }, + /// List all layers of a given tenant and timeline + /// + /// Example: `cargo run --bin pagectl layer list .neon/` + ListLayer { + path: PathBuf, + tenant: String, + timeline: String, + }, + /// Dump all information of a layer file + DumpLayer { + path: PathBuf, + tenant: String, + timeline: String, + /// The id from list-layer command + id: usize, + }, +} + +fn read_delta_file(path: impl AsRef) -> Result<()> { + use pageserver::tenant::blob_io::BlobCursor; + use pageserver::tenant::block_io::BlockReader; + + let path = path.as_ref(); + virtual_file::init(10); + page_cache::init(100); + let file = FileBlockReader::new(VirtualFile::open(path)?); + let summary_blk = file.read_blk(0)?; + let actual_summary = Summary::des_prefix(summary_blk.as_ref())?; + let tree_reader = DiskBtreeReader::<_, DELTA_KEY_SIZE>::new( + actual_summary.index_start_blk, + actual_summary.index_root_blk, + &file, + ); + // TODO(chi): dedup w/ `delta_layer.rs` by exposing the API. + let mut all = vec![]; + tree_reader.visit( + &[0u8; DELTA_KEY_SIZE], + VisitDirection::Forwards, + |key, value_offset| { + let curr = Key::from_slice(&key[..KEY_SIZE]); + all.push((curr, BlobRef(value_offset))); + true + }, + )?; + let mut cursor = BlockCursor::new(&file); + for (k, v) in all { + let value = cursor.read_blob(v.pos())?; + println!("key:{} value_len:{}", k, value.len()); + } + // TODO(chi): special handling for last key? + Ok(()) +} + +pub(crate) fn main(cmd: &LayerCmd) -> Result<()> { + match cmd { + LayerCmd::List { path } => { + for tenant in fs::read_dir(path.join("tenants"))? { + let tenant = tenant?; + if !tenant.file_type()?.is_dir() { + continue; + } + println!("tenant {}", tenant.file_name().to_string_lossy()); + for timeline in fs::read_dir(tenant.path().join("timelines"))? { + let timeline = timeline?; + if !timeline.file_type()?.is_dir() { + continue; + } + println!("- timeline {}", timeline.file_name().to_string_lossy()); + } + } + } + LayerCmd::ListLayer { + path, + tenant, + timeline, + } => { + let timeline_path = path + .join("tenants") + .join(tenant) + .join("timelines") + .join(timeline); + let mut idx = 0; + for layer in fs::read_dir(timeline_path)? 
{ + let layer = layer?; + if let Some(layer_file) = parse_filename(&layer.file_name().into_string().unwrap()) + { + println!( + "[{:3}] key:{}-{}\n lsn:{}-{}\n delta:{}", + idx, + layer_file.key_range.start, + layer_file.key_range.end, + layer_file.lsn_range.start, + layer_file.lsn_range.end, + layer_file.is_delta, + ); + idx += 1; + } + } + } + LayerCmd::DumpLayer { + path, + tenant, + timeline, + id, + } => { + let timeline_path = path + .join("tenants") + .join(tenant) + .join("timelines") + .join(timeline); + let mut idx = 0; + for layer in fs::read_dir(timeline_path)? { + let layer = layer?; + if let Some(layer_file) = parse_filename(&layer.file_name().into_string().unwrap()) + { + if *id == idx { + // TODO(chi): dedup code + println!( + "[{:3}] key:{}-{}\n lsn:{}-{}\n delta:{}", + idx, + layer_file.key_range.start, + layer_file.key_range.end, + layer_file.lsn_range.start, + layer_file.lsn_range.end, + layer_file.is_delta, + ); + + if layer_file.is_delta { + read_delta_file(layer.path())?; + } else { + anyhow::bail!("not supported yet :("); + } + + break; + } + idx += 1; + } + } + } + } + Ok(()) +} diff --git a/pageserver/ctl/src/main.rs b/pageserver/ctl/src/main.rs new file mode 100644 index 0000000000..55db9eb7e7 --- /dev/null +++ b/pageserver/ctl/src/main.rs @@ -0,0 +1,179 @@ +//! A helper tool to manage pageserver binary files. +//! Accepts a file as an argument, attempts to parse it with all ways possible +//! and prints its interpreted context. +//! +//! Separate, `metadata` subcommand allows to print and update pageserver's metadata file. + +mod draw_timeline_dir; +mod layer_map_analyzer; +mod layers; + +use clap::{Parser, Subcommand}; +use layers::LayerCmd; +use pageserver::{ + context::{DownloadBehavior, RequestContext}, + page_cache, + task_mgr::TaskKind, + tenant::{dump_layerfile_from_path, metadata::TimelineMetadata}, + virtual_file, +}; +use postgres_ffi::ControlFileData; +use std::path::{Path, PathBuf}; +use utils::{lsn::Lsn, project_git_version}; + +project_git_version!(GIT_VERSION); + +#[derive(Parser)] +#[command( + version = GIT_VERSION, + about = "Neon Pageserver binutils", + long_about = "Reads pageserver (and related) binary files management utility" +)] +#[command(propagate_version = true)] +struct CliOpts { + #[command(subcommand)] + command: Commands, +} + +#[derive(Subcommand)] +enum Commands { + Metadata(MetadataCmd), + PrintLayerFile(PrintLayerFileCmd), + DrawTimeline {}, + AnalyzeLayerMap(AnalyzeLayerMapCmd), + #[command(subcommand)] + Layer(LayerCmd), +} + +/// Read and update pageserver metadata file +#[derive(Parser)] +struct MetadataCmd { + /// Input metadata file path + metadata_path: PathBuf, + /// Replace disk consistent Lsn + disk_consistent_lsn: Option, + /// Replace previous record Lsn + prev_record_lsn: Option, + /// Replace latest gc cuttoff + latest_gc_cuttoff: Option, +} + +#[derive(Parser)] +struct PrintLayerFileCmd { + /// Pageserver data path + path: PathBuf, +} + +#[derive(Parser)] +struct AnalyzeLayerMapCmd { + /// Pageserver data path + path: PathBuf, + /// Max holes + max_holes: Option, +} + +fn main() -> anyhow::Result<()> { + let cli = CliOpts::parse(); + + match cli.command { + Commands::Layer(cmd) => { + layers::main(&cmd)?; + } + Commands::Metadata(cmd) => { + handle_metadata(&cmd)?; + } + Commands::DrawTimeline {} => { + draw_timeline_dir::main()?; + } + Commands::AnalyzeLayerMap(cmd) => { + layer_map_analyzer::main(&cmd)?; + } + Commands::PrintLayerFile(cmd) => { + if let Err(e) = read_pg_control_file(&cmd.path) { + println!( + 
"Failed to read input file as a pg control one: {e:#}\n\ + Attempting to read it as layer file" + ); + print_layerfile(&cmd.path)?; + } + } + }; + Ok(()) +} + +fn read_pg_control_file(control_file_path: &Path) -> anyhow::Result<()> { + let control_file = ControlFileData::decode(&std::fs::read(control_file_path)?)?; + println!("{control_file:?}"); + let control_file_initdb = Lsn(control_file.checkPoint); + println!( + "pg_initdb_lsn: {}, aligned: {}", + control_file_initdb, + control_file_initdb.align() + ); + Ok(()) +} + +fn print_layerfile(path: &Path) -> anyhow::Result<()> { + // Basic initialization of things that don't change after startup + virtual_file::init(10); + page_cache::init(100); + let ctx = RequestContext::new(TaskKind::DebugTool, DownloadBehavior::Error); + dump_layerfile_from_path(path, true, &ctx) +} + +fn handle_metadata( + MetadataCmd { + metadata_path: path, + disk_consistent_lsn, + prev_record_lsn, + latest_gc_cuttoff, + }: &MetadataCmd, +) -> Result<(), anyhow::Error> { + let metadata_bytes = std::fs::read(path)?; + let mut meta = TimelineMetadata::from_bytes(&metadata_bytes)?; + println!("Current metadata:\n{meta:?}"); + let mut update_meta = false; + if let Some(disk_consistent_lsn) = disk_consistent_lsn { + meta = TimelineMetadata::new( + *disk_consistent_lsn, + meta.prev_record_lsn(), + meta.ancestor_timeline(), + meta.ancestor_lsn(), + meta.latest_gc_cutoff_lsn(), + meta.initdb_lsn(), + meta.pg_version(), + ); + update_meta = true; + } + if let Some(prev_record_lsn) = prev_record_lsn { + meta = TimelineMetadata::new( + meta.disk_consistent_lsn(), + Some(*prev_record_lsn), + meta.ancestor_timeline(), + meta.ancestor_lsn(), + meta.latest_gc_cutoff_lsn(), + meta.initdb_lsn(), + meta.pg_version(), + ); + update_meta = true; + } + if let Some(latest_gc_cuttoff) = latest_gc_cuttoff { + meta = TimelineMetadata::new( + meta.disk_consistent_lsn(), + meta.prev_record_lsn(), + meta.ancestor_timeline(), + meta.ancestor_lsn(), + *latest_gc_cuttoff, + meta.initdb_lsn(), + meta.pg_version(), + ); + update_meta = true; + } + + if update_meta { + let metadata_bytes = meta.to_bytes()?; + std::fs::write(path, metadata_bytes)?; + } + + Ok(()) +} diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs index d843b01ed7..1fa5e4ab3b 100644 --- a/pageserver/src/bin/pageserver.rs +++ b/pageserver/src/bin/pageserver.rs @@ -9,6 +9,7 @@ use clap::{Arg, ArgAction, Command}; use fail::FailScenario; use metrics::launch_timestamp::{set_launch_timestamp_metric, LaunchTimestamp}; use pageserver::disk_usage_eviction_task::{self, launch_disk_usage_global_eviction_task}; +use pageserver::task_mgr::WALRECEIVER_RUNTIME; use remote_storage::GenericRemoteStorage; use tracing::*; @@ -18,9 +19,7 @@ use pageserver::{ context::{DownloadBehavior, RequestContext}, http, page_cache, page_service, task_mgr, task_mgr::TaskKind, - task_mgr::{ - BACKGROUND_RUNTIME, COMPUTE_REQUEST_RUNTIME, MGMT_REQUEST_RUNTIME, WALRECEIVER_RUNTIME, - }, + task_mgr::{BACKGROUND_RUNTIME, COMPUTE_REQUEST_RUNTIME, MGMT_REQUEST_RUNTIME}, tenant::mgr, virtual_file, }; @@ -276,7 +275,18 @@ fn start_pageserver( let pageserver_listener = tcp_listener::bind(pg_addr)?; // Launch broker client - WALRECEIVER_RUNTIME.block_on(pageserver::broker_client::init_broker_client(conf))?; + // The storage_broker::connect call needs to happen inside a tokio runtime thread. + let broker_client = WALRECEIVER_RUNTIME + .block_on(async { + // Note: we do not attempt connecting here (but validate endpoints sanity). 
+ storage_broker::connect(conf.broker_endpoint.clone(), conf.broker_keepalive_interval) + }) + .with_context(|| { + format!( + "create broker client for uri={:?} keepalive_interval={:?}", + &conf.broker_endpoint, conf.broker_keepalive_interval, + ) + })?; // Initialize authentication for incoming connections let http_auth; @@ -325,8 +335,118 @@ fn start_pageserver( // Set up remote storage client let remote_storage = create_remote_storage_client(conf)?; + // Startup staging or optimizing: + // + // We want to minimize downtime for `page_service` connections, and trying not to overload + // BACKGROUND_RUNTIME by doing initial compactions and initial logical sizes at the same time. + // + // init_done_rx will notify when all initial load operations have completed. + // + // background_jobs_can_start (same name used to hold off background jobs from starting at + // consumer side) will be dropped once we can start the background jobs. Currently it is behind + // completing all initial logical size calculations (init_logical_size_done_rx) and a timeout + // (background_task_maximum_delay). + let (init_done_tx, init_done_rx) = utils::completion::channel(); + + let (init_logical_size_done_tx, init_logical_size_done_rx) = utils::completion::channel(); + + let (background_jobs_can_start, background_jobs_barrier) = utils::completion::channel(); + + let order = pageserver::InitializationOrder { + initial_tenant_load: Some(init_done_tx), + initial_logical_size_can_start: init_done_rx.clone(), + initial_logical_size_attempt: init_logical_size_done_tx, + background_jobs_can_start: background_jobs_barrier.clone(), + }; + // Scan the local 'tenants/' directory and start loading the tenants - BACKGROUND_RUNTIME.block_on(mgr::init_tenant_mgr(conf, remote_storage.clone()))?; + let init_started_at = std::time::Instant::now(); + let shutdown_pageserver = tokio_util::sync::CancellationToken::new(); + + BACKGROUND_RUNTIME.block_on(mgr::init_tenant_mgr( + conf, + broker_client.clone(), + remote_storage.clone(), + order, + ))?; + + BACKGROUND_RUNTIME.spawn({ + let init_done_rx = init_done_rx; + let shutdown_pageserver = shutdown_pageserver.clone(); + let drive_init = async move { + // NOTE: unlike many futures in pageserver, this one is cancellation-safe + let guard = scopeguard::guard_on_success((), |_| tracing::info!("Cancelled before initial load completed")); + + init_done_rx.wait().await; + // initial logical sizes can now start, as they were waiting on init_done_rx. + + scopeguard::ScopeGuard::into_inner(guard); + + let init_done = std::time::Instant::now(); + let elapsed = init_done - init_started_at; + + tracing::info!( + elapsed_millis = elapsed.as_millis(), + "Initial load completed" + ); + + let mut init_sizes_done = std::pin::pin!(init_logical_size_done_rx.wait()); + + let timeout = conf.background_task_maximum_delay; + + let guard = scopeguard::guard_on_success((), |_| tracing::info!("Cancelled before initial logical sizes completed")); + + let init_sizes_done = tokio::select! 
{ + _ = &mut init_sizes_done => { + let now = std::time::Instant::now(); + tracing::info!( + from_init_done_millis = (now - init_done).as_millis(), + from_init_millis = (now - init_started_at).as_millis(), + "Initial logical sizes completed" + ); + None + } + _ = tokio::time::sleep(timeout) => { + tracing::info!( + timeout_millis = timeout.as_millis(), + "Initial logical size timeout elapsed; starting background jobs" + ); + Some(init_sizes_done) + } + }; + + scopeguard::ScopeGuard::into_inner(guard); + + // allow background jobs to start + drop(background_jobs_can_start); + + if let Some(init_sizes_done) = init_sizes_done { + // ending up here is not a bug; at the latest logical sizes will be queried by + // consumption metrics. + let guard = scopeguard::guard_on_success((), |_| tracing::info!("Cancelled before initial logical sizes completed")); + init_sizes_done.await; + + scopeguard::ScopeGuard::into_inner(guard); + + let now = std::time::Instant::now(); + tracing::info!( + from_init_done_millis = (now - init_done).as_millis(), + from_init_millis = (now - init_started_at).as_millis(), + "Initial logical sizes completed after timeout (background jobs already started)" + ); + + } + }; + + async move { + let mut drive_init = std::pin::pin!(drive_init); + // just race these tasks + tokio::select! { + _ = shutdown_pageserver.cancelled() => {}, + _ = &mut drive_init => {}, + } + } + }); // shared state between the disk-usage backed eviction background task and the http endpoint // that allows triggering disk-usage based eviction manually. note that the http endpoint @@ -339,6 +459,7 @@ fn start_pageserver( conf, remote_storage.clone(), disk_usage_eviction_state.clone(), + background_jobs_barrier.clone(), )?; } @@ -351,6 +472,7 @@ fn start_pageserver( conf, launch_ts, http_auth, + broker_client.clone(), remote_storage, disk_usage_eviction_state, )? @@ -375,6 +497,7 @@ fn start_pageserver( ); if let Some(metric_collection_endpoint) = &conf.metric_collection_endpoint { + let background_jobs_barrier = background_jobs_barrier; let metrics_ctx = RequestContext::todo_child( TaskKind::MetricsCollection, // This task itself shouldn't download anything. @@ -390,6 +513,18 @@ fn start_pageserver( "consumption metrics collection", true, async move { + // first wait until background jobs are cleared to launch. + // + // this is because we only process active tenants and timelines, and the + // Timeline::get_current_logical_size will spawn the logical size calculation, + // which will not be rate-limited. + let cancel = task_mgr::shutdown_token(); + + tokio::select! { + _ = cancel.cancelled() => { return Ok(()); }, + _ = background_jobs_barrier.wait() => {} + }; + pageserver::consumption_metrics::collect_metrics( metric_collection_endpoint, conf.metric_collection_interval, @@ -427,6 +562,7 @@ fn start_pageserver( async move { page_service::libpq_listener_main( conf, + broker_client, pg_auth, pageserver_listener, conf.pg_auth_type, @@ -437,6 +573,8 @@ fn start_pageserver( ); } + let mut shutdown_pageserver = Some(shutdown_pageserver.drop_guard()); + // All started up! Now just sit and wait for shutdown signal. ShutdownSignals::handle(|signal| match signal { Signal::Quit => { @@ -452,6 +590,11 @@ fn start_pageserver( "Got {}. Terminating gracefully in fast shutdown mode", signal.name() ); + + // This cancels the `shutdown_pageserver` cancellation tree. + // Right now that tree doesn't reach very far, and `task_mgr` is used instead. + // The plan is to change that over time. 
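// A minimal illustration of the completion/barrier pattern used in the startup staging
// above: `utils::completion::channel()` hands out a completion object plus a cloneable
// barrier, and simply dropping the completion (`drop(background_jobs_can_start)`)
// releases every task parked on `barrier.wait()`. This sketch is an assumption about
// one possible shape, built on tokio::sync::watch — it is not the actual
// `utils::completion` implementation.
use tokio::sync::watch;

pub struct Completion(#[allow(dead_code)] watch::Sender<()>);

#[derive(Clone)]
pub struct Barrier(watch::Receiver<()>);

pub fn channel() -> (Completion, Barrier) {
    let (tx, rx) = watch::channel(());
    (Completion(tx), Barrier(rx))
}

impl Barrier {
    pub async fn wait(mut self) {
        // `changed()` returns Err only once the paired sender has been dropped.
        while self.0.changed().await.is_ok() {}
    }
}

#[tokio::main]
async fn main() {
    let (background_jobs_can_start, background_jobs_barrier) = channel();
    let waiter = tokio::spawn(background_jobs_barrier.clone().wait());
    drop(background_jobs_can_start); // "allow background jobs to start"
    waiter.await.unwrap(); // resolves promptly after the drop
}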
+ shutdown_pageserver.take(); BACKGROUND_RUNTIME.block_on(pageserver::shutdown_pageserver(0)); unreachable!() } diff --git a/pageserver/src/bin/pageserver_binutils.rs b/pageserver/src/bin/pageserver_binutils.rs deleted file mode 100644 index 5e2d39d685..0000000000 --- a/pageserver/src/bin/pageserver_binutils.rs +++ /dev/null @@ -1,174 +0,0 @@ -//! A helper tool to manage pageserver binary files. -//! Accepts a file as an argument, attempts to parse it with all ways possible -//! and prints its interpreted context. -//! -//! Separate, `metadata` subcommand allows to print and update pageserver's metadata file. -use std::{ - path::{Path, PathBuf}, - str::FromStr, -}; - -use anyhow::Context; -use clap::{value_parser, Arg, Command}; - -use pageserver::{ - context::{DownloadBehavior, RequestContext}, - page_cache, - task_mgr::TaskKind, - tenant::{dump_layerfile_from_path, metadata::TimelineMetadata}, - virtual_file, -}; -use postgres_ffi::ControlFileData; -use utils::{lsn::Lsn, project_git_version}; - -project_git_version!(GIT_VERSION); - -const METADATA_SUBCOMMAND: &str = "metadata"; - -fn main() -> anyhow::Result<()> { - let arg_matches = cli().get_matches(); - - match arg_matches.subcommand() { - Some((subcommand_name, subcommand_matches)) => { - let path = subcommand_matches - .get_one::("metadata_path") - .context("'metadata_path' argument is missing")? - .to_path_buf(); - anyhow::ensure!( - subcommand_name == METADATA_SUBCOMMAND, - "Unknown subcommand {subcommand_name}" - ); - handle_metadata(&path, subcommand_matches)?; - } - None => { - let path = arg_matches - .get_one::("path") - .context("'path' argument is missing")? - .to_path_buf(); - println!( - "No subcommand specified, attempting to guess the format for file {}", - path.display() - ); - if let Err(e) = read_pg_control_file(&path) { - println!( - "Failed to read input file as a pg control one: {e:#}\n\ - Attempting to read it as layer file" - ); - print_layerfile(&path)?; - } - } - }; - Ok(()) -} - -fn read_pg_control_file(control_file_path: &Path) -> anyhow::Result<()> { - let control_file = ControlFileData::decode(&std::fs::read(control_file_path)?)?; - println!("{control_file:?}"); - let control_file_initdb = Lsn(control_file.checkPoint); - println!( - "pg_initdb_lsn: {}, aligned: {}", - control_file_initdb, - control_file_initdb.align() - ); - Ok(()) -} - -fn print_layerfile(path: &Path) -> anyhow::Result<()> { - // Basic initialization of things that don't change after startup - virtual_file::init(10); - page_cache::init(100); - let ctx = RequestContext::new(TaskKind::DebugTool, DownloadBehavior::Error); - dump_layerfile_from_path(path, true, &ctx) -} - -fn handle_metadata(path: &Path, arg_matches: &clap::ArgMatches) -> Result<(), anyhow::Error> { - let metadata_bytes = std::fs::read(path)?; - let mut meta = TimelineMetadata::from_bytes(&metadata_bytes)?; - println!("Current metadata:\n{meta:?}"); - let mut update_meta = false; - if let Some(disk_consistent_lsn) = arg_matches.get_one::("disk_consistent_lsn") { - meta = TimelineMetadata::new( - Lsn::from_str(disk_consistent_lsn)?, - meta.prev_record_lsn(), - meta.ancestor_timeline(), - meta.ancestor_lsn(), - meta.latest_gc_cutoff_lsn(), - meta.initdb_lsn(), - meta.pg_version(), - ); - update_meta = true; - } - if let Some(prev_record_lsn) = arg_matches.get_one::("prev_record_lsn") { - meta = TimelineMetadata::new( - meta.disk_consistent_lsn(), - Some(Lsn::from_str(prev_record_lsn)?), - meta.ancestor_timeline(), - meta.ancestor_lsn(), - meta.latest_gc_cutoff_lsn(), - 
meta.initdb_lsn(), - meta.pg_version(), - ); - update_meta = true; - } - if let Some(latest_gc_cuttoff) = arg_matches.get_one::("latest_gc_cuttoff") { - meta = TimelineMetadata::new( - meta.disk_consistent_lsn(), - meta.prev_record_lsn(), - meta.ancestor_timeline(), - meta.ancestor_lsn(), - Lsn::from_str(latest_gc_cuttoff)?, - meta.initdb_lsn(), - meta.pg_version(), - ); - update_meta = true; - } - - if update_meta { - let metadata_bytes = meta.to_bytes()?; - std::fs::write(path, metadata_bytes)?; - } - - Ok(()) -} - -fn cli() -> Command { - Command::new("Neon Pageserver binutils") - .about("Reads pageserver (and related) binary files management utility") - .version(GIT_VERSION) - .arg( - Arg::new("path") - .help("Input file path") - .value_parser(value_parser!(PathBuf)) - .required(false), - ) - .subcommand( - Command::new(METADATA_SUBCOMMAND) - .about("Read and update pageserver metadata file") - .arg( - Arg::new("metadata_path") - .help("Input metadata file path") - .value_parser(value_parser!(PathBuf)) - .required(false), - ) - .arg( - Arg::new("disk_consistent_lsn") - .long("disk_consistent_lsn") - .help("Replace disk consistent Lsn"), - ) - .arg( - Arg::new("prev_record_lsn") - .long("prev_record_lsn") - .help("Replace previous record Lsn"), - ) - .arg( - Arg::new("latest_gc_cuttoff") - .long("latest_gc_cuttoff") - .help("Replace latest gc cuttoff"), - ), - ) -} - -#[test] -fn verify_cli() { - cli().debug_assert(); -} diff --git a/pageserver/src/broker_client.rs b/pageserver/src/broker_client.rs deleted file mode 100644 index 6c92967ca3..0000000000 --- a/pageserver/src/broker_client.rs +++ /dev/null @@ -1,48 +0,0 @@ -//! The broker client instance of the pageserver, created during pageserver startup. -//! Used by each timelines' [`walreceiver`]. - -use crate::config::PageServerConf; - -use anyhow::Context; -use once_cell::sync::OnceCell; -use storage_broker::BrokerClientChannel; -use tracing::*; - -static BROKER_CLIENT: OnceCell = OnceCell::new(); - -/// -/// Initialize the broker client. This must be called once at page server startup. -/// -pub async fn init_broker_client(conf: &'static PageServerConf) -> anyhow::Result<()> { - let broker_endpoint = conf.broker_endpoint.clone(); - - // Note: we do not attempt connecting here (but validate endpoints sanity). - let broker_client = - storage_broker::connect(broker_endpoint.clone(), conf.broker_keepalive_interval).context( - format!( - "Failed to create broker client to {}", - &conf.broker_endpoint - ), - )?; - - if BROKER_CLIENT.set(broker_client).is_err() { - panic!("broker already initialized"); - } - - info!( - "Initialized broker client with endpoints: {}", - broker_endpoint - ); - Ok(()) -} - -/// -/// Get a handle to the broker client -/// -pub fn get_broker_client() -> &'static BrokerClientChannel { - BROKER_CLIENT.get().expect("broker client not initialized") -} - -pub fn is_broker_client_initialized() -> bool { - BROKER_CLIENT.get().is_some() -} diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs index 88a7f15b21..17e6e3fb2a 100644 --- a/pageserver/src/config.rs +++ b/pageserver/src/config.rs @@ -63,6 +63,7 @@ pub mod defaults { pub const DEFAULT_CACHED_METRIC_COLLECTION_INTERVAL: &str = "1 hour"; pub const DEFAULT_METRIC_COLLECTION_ENDPOINT: Option = None; pub const DEFAULT_SYNTHETIC_SIZE_CALCULATION_INTERVAL: &str = "10 min"; + pub const DEFAULT_BACKGROUND_TASK_MAXIMUM_DELAY: &str = "10s"; /// /// Default built-in configuration file. 
@@ -91,9 +92,10 @@ pub mod defaults { #cached_metric_collection_interval = '{DEFAULT_CACHED_METRIC_COLLECTION_INTERVAL}' #synthetic_size_calculation_interval = '{DEFAULT_SYNTHETIC_SIZE_CALCULATION_INTERVAL}' - #disk_usage_based_eviction = {{ max_usage_pct = .., min_avail_bytes = .., period = "10s"}} +#background_task_maximum_delay = '{DEFAULT_BACKGROUND_TASK_MAXIMUM_DELAY}' + # [tenant_config] #checkpoint_distance = {DEFAULT_CHECKPOINT_DISTANCE} # in bytes #checkpoint_timeout = {DEFAULT_CHECKPOINT_TIMEOUT} @@ -108,7 +110,7 @@ pub mod defaults { #min_resident_size_override = .. # in bytes #evictions_low_residence_duration_metric_threshold = '{DEFAULT_EVICTIONS_LOW_RESIDENCE_DURATION_METRIC_THRESHOLD}' - +#gc_feedback = false # [remote_storage] "### @@ -187,6 +189,15 @@ pub struct PageServerConf { pub test_remote_failures: u64, pub ondemand_download_behavior_treat_error_as_warn: bool, + + /// How long will background tasks be delayed at most after initial load of tenants. + /// + /// Our largest initialization completions are in the range of 100-200s, so perhaps 10s works + /// as we now isolate initial loading, initial logical size calculation and background tasks. + /// Smaller nodes will have background tasks "not running" for this long unless every timeline + /// has it's initial logical size calculated. Not running background tasks for some seconds is + /// not terrible. + pub background_task_maximum_delay: Duration, } /// We do not want to store this in a PageServerConf because the latter may be logged @@ -259,6 +270,8 @@ struct PageServerConfigBuilder { test_remote_failures: BuilderValue, ondemand_download_behavior_treat_error_as_warn: BuilderValue, + + background_task_maximum_delay: BuilderValue, } impl Default for PageServerConfigBuilder { @@ -316,6 +329,11 @@ impl Default for PageServerConfigBuilder { test_remote_failures: Set(0), ondemand_download_behavior_treat_error_as_warn: Set(false), + + background_task_maximum_delay: Set(humantime::parse_duration( + DEFAULT_BACKGROUND_TASK_MAXIMUM_DELAY, + ) + .unwrap()), } } } @@ -440,6 +458,10 @@ impl PageServerConfigBuilder { BuilderValue::Set(ondemand_download_behavior_treat_error_as_warn); } + pub fn background_task_maximum_delay(&mut self, delay: Duration) { + self.background_task_maximum_delay = BuilderValue::Set(delay); + } + pub fn build(self) -> anyhow::Result { let concurrent_tenant_size_logical_size_queries = self .concurrent_tenant_size_logical_size_queries @@ -522,6 +544,9 @@ impl PageServerConfigBuilder { .ok_or(anyhow!( "missing ondemand_download_behavior_treat_error_as_warn" ))?, + background_task_maximum_delay: self + .background_task_maximum_delay + .ok_or(anyhow!("missing background_task_maximum_delay"))?, }) } } @@ -710,6 +735,7 @@ impl PageServerConf { ) }, "ondemand_download_behavior_treat_error_as_warn" => builder.ondemand_download_behavior_treat_error_as_warn(parse_toml_bool(key, item)?), + "background_task_maximum_delay" => builder.background_task_maximum_delay(parse_toml_duration(key, item)?), _ => bail!("unrecognized pageserver option '{key}'"), } } @@ -828,6 +854,14 @@ impl PageServerConf { )?); } + if let Some(gc_feedback) = item.get("gc_feedback") { + t_conf.gc_feedback = Some( + gc_feedback + .as_bool() + .with_context(|| "configure option gc_feedback is not a bool".to_string())?, + ); + } + Ok(t_conf) } @@ -869,6 +903,7 @@ impl PageServerConf { disk_usage_based_eviction: None, test_remote_failures: 0, ondemand_download_behavior_treat_error_as_warn: false, + background_task_maximum_delay: Duration::ZERO, } } } 
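// A quick sketch of what the new `background_task_maximum_delay` option accepts: the
// builder above feeds the TOML string through humantime::parse_duration, so a value
// like the '10s' default becomes a std::time::Duration. Standalone example, value is
// illustrative.
use std::time::Duration;

fn main() {
    let delay = humantime::parse_duration("10s").expect("valid humantime duration");
    assert_eq!(delay, Duration::from_secs(10));
}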
@@ -1028,6 +1063,7 @@ metric_collection_endpoint = 'http://localhost:80/metrics' synthetic_size_calculation_interval = '333 s' log_format = 'json' +background_task_maximum_delay = '334 s' "#; @@ -1086,6 +1122,9 @@ log_format = 'json' disk_usage_based_eviction: None, test_remote_failures: 0, ondemand_download_behavior_treat_error_as_warn: false, + background_task_maximum_delay: humantime::parse_duration( + defaults::DEFAULT_BACKGROUND_TASK_MAXIMUM_DELAY + )?, }, "Correct defaults should be used when no config values are provided" ); @@ -1140,6 +1179,7 @@ log_format = 'json' disk_usage_based_eviction: None, test_remote_failures: 0, ondemand_download_behavior_treat_error_as_warn: false, + background_task_maximum_delay: Duration::from_secs(334), }, "Should be able to parse all basic config values correctly" ); diff --git a/pageserver/src/context.rs b/pageserver/src/context.rs index e826d28e6d..f53b7736ab 100644 --- a/pageserver/src/context.rs +++ b/pageserver/src/context.rs @@ -88,6 +88,7 @@ use crate::task_mgr::TaskKind; // The main structure of this module, see module-level comment. +#[derive(Clone, Debug)] pub struct RequestContext { task_kind: TaskKind, download_behavior: DownloadBehavior, @@ -95,7 +96,7 @@ pub struct RequestContext { /// Desired behavior if the operation requires an on-demand download /// to proceed. -#[derive(Clone, Copy, PartialEq, Eq)] +#[derive(Clone, Copy, PartialEq, Eq, Debug)] pub enum DownloadBehavior { /// Download the layer file. It can take a while. Download, diff --git a/pageserver/src/disk_usage_eviction_task.rs b/pageserver/src/disk_usage_eviction_task.rs index f4a0f3f18e..ce5f81c44b 100644 --- a/pageserver/src/disk_usage_eviction_task.rs +++ b/pageserver/src/disk_usage_eviction_task.rs @@ -54,6 +54,7 @@ use serde::{Deserialize, Serialize}; use tokio::time::Instant; use tokio_util::sync::CancellationToken; use tracing::{debug, error, info, instrument, warn, Instrument}; +use utils::completion; use utils::serde_percent::Percent; use crate::{ @@ -82,6 +83,7 @@ pub fn launch_disk_usage_global_eviction_task( conf: &'static PageServerConf, storage: GenericRemoteStorage, state: Arc, + background_jobs_barrier: completion::Barrier, ) -> anyhow::Result<()> { let Some(task_config) = &conf.disk_usage_based_eviction else { info!("disk usage based eviction task not configured"); @@ -98,14 +100,16 @@ pub fn launch_disk_usage_global_eviction_task( "disk usage based eviction", false, async move { - disk_usage_eviction_task( - &state, - task_config, - storage, - &conf.tenants_path(), - task_mgr::shutdown_token(), - ) - .await; + let cancel = task_mgr::shutdown_token(); + + // wait until initial load is complete, because we cannot evict from loading tenants. + tokio::select! 
{ + _ = cancel.cancelled() => { return Ok(()); }, + _ = background_jobs_barrier.wait() => { } + }; + + disk_usage_eviction_task(&state, task_config, storage, &conf.tenants_path(), cancel) + .await; info!("disk usage based eviction task finishing"); Ok(()) }, diff --git a/pageserver/src/http/openapi_spec.yml b/pageserver/src/http/openapi_spec.yml index f62828be21..50614653be 100644 --- a/pageserver/src/http/openapi_spec.yml +++ b/pageserver/src/http/openapi_spec.yml @@ -215,7 +215,7 @@ paths: schema: $ref: "#/components/schemas/NotFoundError" "412": - description: Tenant is missing + description: Tenant is missing, or timeline has children content: application/json: schema: @@ -363,11 +363,30 @@ paths: * MUST NOT ASSUME that the request has been lost, based on the observation that a subsequent tenant status request returns 404. The request may still be in flight. It must be retried. + + The client SHOULD supply a `TenantConfig` for the tenant in the request body. + Settings specified in the config override the pageserver's defaults. + It is guaranteed that the config settings are applied before the pageserver + starts operating on the tenant. E.g., if the config specifies a specific + PITR interval for a tenant, then that setting will be in effect before the + pageserver starts the garbage collection loop. This enables a client to + guarantee a specific PITR setting across detach/attach cycles. + The pageserver will reject the request if it cannot parse the config, or + if there are any unknown fields in it. + + If the client does not supply a config, the pageserver will use its defaults. + This behavior is deprecated: https://github.com/neondatabase/neon/issues/4282 + requestBody: + required: false + content: + application/json: + schema: + $ref: "#/components/schemas/TenantAttachRequest" responses: "202": description: Tenant attaching scheduled "400": - description: Error when no tenant id found in path parameters + description: Bad Request content: application/json: schema: @@ -660,6 +679,8 @@ paths: application/json: schema: type: object + required: + - new_timeline_id properties: new_timeline_id: type: string @@ -908,12 +929,28 @@ components: writing to the tenant's S3 state, so, DO NOT ATTACH the tenant to any other pageserver, or we risk split-brain. - `attached` means that the attach operation has completed, - maybe successfully, maybe not. Perform a health check at - the Postgres level to determine healthiness of the tenant. + successfully + - `failed` means that attach has failed. For reason check corresponding `reason` failed. + `failed` is the terminal state, retrying attach call wont resolve the issue. + For example this can be caused by s3 being unreachable. The retry may be implemented + with call to detach, though it would be better to not automate it and inspec failed state + manually before proceeding with a retry. See the tenant `/attach` endpoint for more information. 
- type: string - enum: [ "maybe", "attached" ] + type: object + required: + - slug + - data + properties: + slug: + type: string + enum: [ "maybe", "attached", "failed" ] + data: + type: object + properties: + reason: + type: string + TenantCreateRequest: allOf: - $ref: '#/components/schemas/TenantConfig' @@ -924,6 +961,13 @@ components: new_tenant_id: type: string format: hex + TenantAttachRequest: + type: object + required: + - config + properties: + config: + $ref: '#/components/schemas/TenantConfig' TenantConfigRequest: allOf: - $ref: '#/components/schemas/TenantConfig' diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index 3f3c86da6b..6f014161f2 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -1,3 +1,6 @@ +//! +//! Management HTTP API +//! use std::collections::HashMap; use std::sync::Arc; @@ -5,12 +8,14 @@ use anyhow::{anyhow, Context, Result}; use hyper::StatusCode; use hyper::{Body, Request, Response, Uri}; use metrics::launch_timestamp::LaunchTimestamp; -use pageserver_api::models::DownloadRemoteLayersTaskSpawnRequest; +use pageserver_api::models::{DownloadRemoteLayersTaskSpawnRequest, TenantAttachRequest}; use remote_storage::GenericRemoteStorage; +use storage_broker::BrokerClientChannel; use tenant_size_model::{SizeResult, StorageModel}; use tokio_util::sync::CancellationToken; use tracing::*; -use utils::http::endpoint::RequestSpan; +use utils::http::endpoint::request_span; +use utils::http::json::json_request_or_empty_body; use utils::http::request::{get_request_param, must_get_query_param, parse_query_param}; use super::models::{ @@ -23,7 +28,9 @@ use crate::metrics::{StorageTimeOperation, STORAGE_TIME_GLOBAL}; use crate::pgdatadir_mapping::LsnForTimestamp; use crate::task_mgr::TaskKind; use crate::tenant::config::TenantConfOpt; -use crate::tenant::mgr::{TenantMapInsertError, TenantStateError}; +use crate::tenant::mgr::{ + GetTenantError, SetNewTenantConfigError, TenantMapInsertError, TenantStateError, +}; use crate::tenant::size::ModelInputs; use crate::tenant::storage_layer::LayerAccessStatsReset; use crate::tenant::{LogicalSizeCalculationCause, PageReconstructError, Timeline}; @@ -42,7 +49,6 @@ use utils::{ }; // Imports only used for testing APIs -#[cfg(feature = "testing")] use super::models::ConfigureFailpointsRequest; struct State { @@ -50,6 +56,7 @@ struct State { auth: Option>, allowlist_routes: Vec, remote_storage: Option, + broker_client: storage_broker::BrokerClientChannel, disk_usage_eviction_state: Arc, } @@ -58,6 +65,7 @@ impl State { conf: &'static PageServerConf, auth: Option>, remote_storage: Option, + broker_client: storage_broker::BrokerClientChannel, disk_usage_eviction_state: Arc, ) -> anyhow::Result { let allowlist_routes = ["/v1/status", "/v1/doc", "/swagger.yml"] @@ -69,6 +77,7 @@ impl State { auth, allowlist_routes, remote_storage, + broker_client, disk_usage_eviction_state, }) } @@ -139,14 +148,45 @@ impl From for ApiError { } } +impl From for ApiError { + fn from(tse: GetTenantError) -> ApiError { + match tse { + GetTenantError::NotFound(tid) => ApiError::NotFound(anyhow!("tenant {}", tid)), + e @ GetTenantError::NotActive(_) => { + // Why is this not `ApiError::NotFound`? + // Because we must be careful to never return 404 for a tenant if it does + // in fact exist locally. If we did, the caller could draw the conclusion + // that it can attach the tenant to another PS and we'd be in split-brain. + // + // (We can produce this variant only in `mgr::get_tenant(..., active=true)` calls). 
+ ApiError::InternalServerError(anyhow::Error::new(e)) + } + } + } +} + +impl From for ApiError { + fn from(e: SetNewTenantConfigError) -> ApiError { + match e { + SetNewTenantConfigError::GetTenant(tid) => { + ApiError::NotFound(anyhow!("tenant {}", tid)) + } + e @ SetNewTenantConfigError::Persist(_) => { + ApiError::InternalServerError(anyhow::Error::new(e)) + } + } + } +} + impl From for ApiError { fn from(value: crate::tenant::DeleteTimelineError) -> Self { use crate::tenant::DeleteTimelineError::*; match value { NotFound => ApiError::NotFound(anyhow::anyhow!("timeline not found")), - HasChildren => ApiError::BadRequest(anyhow::anyhow!( - "Cannot delete timeline which has child timelines" - )), + HasChildren(children) => ApiError::PreconditionFailed( + format!("Cannot delete timeline which has child timelines: {children:?}") + .into_boxed_str(), + ), Other(e) => ApiError::InternalServerError(e), } } @@ -158,9 +198,9 @@ impl From for ApiError { match value { // Report Precondition failed so client can distinguish between // "tenant is missing" case from "timeline is missing" - Tenant(TenantStateError::NotFound(..)) => { - ApiError::PreconditionFailed("Requested tenant is missing") - } + Tenant(GetTenantError::NotFound(..)) => ApiError::PreconditionFailed( + "Requested tenant is missing".to_owned().into_boxed_str(), + ), Tenant(t) => ApiError::from(t), Timeline(t) => ApiError::from(t), } @@ -253,23 +293,29 @@ fn build_timeline_info_common( } // healthcheck handler -async fn status_handler(request: Request) -> Result, ApiError> { +async fn status_handler( + request: Request, + _cancel: CancellationToken, +) -> Result, ApiError> { check_permission(&request, None)?; let config = get_config(&request); json_response(StatusCode::OK, StatusResponse { id: config.id }) } -async fn timeline_create_handler(mut request: Request) -> Result, ApiError> { +async fn timeline_create_handler( + mut request: Request, + _cancel: CancellationToken, +) -> Result, ApiError> { let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?; let request_data: TimelineCreateRequest = json_request(&mut request).await?; check_permission(&request, Some(tenant_id))?; - let new_timeline_id = request_data - .new_timeline_id - .unwrap_or_else(TimelineId::generate); + let new_timeline_id = request_data.new_timeline_id; let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Error); + let state = get_state(&request); + async { let tenant = mgr::get_tenant(tenant_id, true).await?; match tenant.create_timeline( @@ -277,6 +323,7 @@ async fn timeline_create_handler(mut request: Request) -> Result) -> Result Err(ApiError::InternalServerError(err)), } } - .instrument(info_span!("timeline_create", tenant = %tenant_id, new_timeline = ?request_data.new_timeline_id, timeline_id = %new_timeline_id, lsn=?request_data.ancestor_start_lsn, pg_version=?request_data.pg_version)) + .instrument(info_span!("timeline_create", tenant = %tenant_id, timeline_id = %new_timeline_id, lsn=?request_data.ancestor_start_lsn, pg_version=?request_data.pg_version)) .await } -async fn timeline_list_handler(request: Request) -> Result, ApiError> { +async fn timeline_list_handler( + request: Request, + _cancel: CancellationToken, +) -> Result, ApiError> { let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?; let include_non_incremental_logical_size: Option = parse_query_param(&request, "include-non-incremental-logical-size")?; @@ -328,7 +378,10 @@ async fn timeline_list_handler(request: Request) -> Result, 
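// Illustrative only: the rough shape of a tenant attach request now that the handler
// above accepts an optional config body. The endpoint path placeholder and the
// `pitr_interval` field name/value are assumptions made for this example — the OpenAPI
// text above only says that a PITR setting can be carried across detach/attach this way.
fn main() {
    let body = serde_json::json!({
        "config": {
            "pitr_interval": "7 days"
        }
    });
    println!("POST /v1/tenant/<tenant_id>/attach");
    println!("{}", serde_json::to_string_pretty(&body).unwrap());
}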
json_response(StatusCode::OK, response_data) } -async fn timeline_detail_handler(request: Request) -> Result, ApiError> { +async fn timeline_detail_handler( + request: Request, + _cancel: CancellationToken, +) -> Result, ApiError> { let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?; let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?; let include_non_incremental_logical_size: Option = @@ -362,7 +415,10 @@ async fn timeline_detail_handler(request: Request) -> Result) -> Result, ApiError> { +async fn get_lsn_by_timestamp_handler( + request: Request, + _cancel: CancellationToken, +) -> Result, ApiError> { let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?; check_permission(&request, Some(tenant_id))?; @@ -386,11 +442,19 @@ async fn get_lsn_by_timestamp_handler(request: Request) -> Result) -> Result, ApiError> { +async fn tenant_attach_handler( + mut request: Request, + _cancel: CancellationToken, +) -> Result, ApiError> { let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?; check_permission(&request, Some(tenant_id))?; + let maybe_body: Option = json_request_or_empty_body(&mut request).await?; + let tenant_conf = match maybe_body { + Some(request) => TenantConfOpt::try_from(&*request.config).map_err(ApiError::BadRequest)?, + None => TenantConfOpt::default(), + }; + let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Warn); info!("Handling tenant attach {tenant_id}"); @@ -401,9 +465,8 @@ async fn tenant_attach_handler(request: Request) -> Result, mgr::attach_tenant( state.conf, tenant_id, - // XXX: Attach should provide the config, especially during tenant migration. - // See https://github.com/neondatabase/neon/issues/1555 - TenantConfOpt::default(), + tenant_conf, + state.broker_client.clone(), remote_storage.clone(), &ctx, ) @@ -418,7 +481,10 @@ async fn tenant_attach_handler(request: Request) -> Result, json_response(StatusCode::ACCEPTED, ()) } -async fn timeline_delete_handler(request: Request) -> Result, ApiError> { +async fn timeline_delete_handler( + request: Request, + _cancel: CancellationToken, +) -> Result, ApiError> { let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?; let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?; check_permission(&request, Some(tenant_id))?; @@ -429,10 +495,14 @@ async fn timeline_delete_handler(request: Request) -> Result) -> Result, ApiError> { +async fn tenant_detach_handler( + request: Request, + _cancel: CancellationToken, +) -> Result, ApiError> { let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?; check_permission(&request, Some(tenant_id))?; let detach_ignored: Option = parse_query_param(&request, "detach_ignored")?; @@ -446,21 +516,33 @@ async fn tenant_detach_handler(request: Request) -> Result, json_response(StatusCode::OK, ()) } -async fn tenant_load_handler(request: Request) -> Result, ApiError> { +async fn tenant_load_handler( + request: Request, + _cancel: CancellationToken, +) -> Result, ApiError> { let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?; check_permission(&request, Some(tenant_id))?; let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Warn); let state = get_state(&request); - mgr::load_tenant(state.conf, tenant_id, state.remote_storage.clone(), &ctx) - .instrument(info_span!("load", tenant = %tenant_id)) - .await?; + mgr::load_tenant( + state.conf, + tenant_id, + state.broker_client.clone(), + state.remote_storage.clone(), + &ctx, + 
) + .instrument(info_span!("load", tenant = %tenant_id)) + .await?; json_response(StatusCode::ACCEPTED, ()) } -async fn tenant_ignore_handler(request: Request) -> Result, ApiError> { +async fn tenant_ignore_handler( + request: Request, + _cancel: CancellationToken, +) -> Result, ApiError> { let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?; check_permission(&request, Some(tenant_id))?; @@ -473,7 +555,10 @@ async fn tenant_ignore_handler(request: Request) -> Result, json_response(StatusCode::OK, ()) } -async fn tenant_list_handler(request: Request) -> Result, ApiError> { +async fn tenant_list_handler( + request: Request, + _cancel: CancellationToken, +) -> Result, ApiError> { check_permission(&request, None)?; let response_data = mgr::list_tenants() @@ -493,7 +578,10 @@ async fn tenant_list_handler(request: Request) -> Result, A json_response(StatusCode::OK, response_data) } -async fn tenant_status(request: Request) -> Result, ApiError> { +async fn tenant_status( + request: Request, + _cancel: CancellationToken, +) -> Result, ApiError> { let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?; check_permission(&request, Some(tenant_id))?; @@ -507,7 +595,7 @@ async fn tenant_status(request: Request) -> Result, ApiErro } let state = tenant.current_state(); - Ok(TenantInfo { + Result::<_, ApiError>::Ok(TenantInfo { id: tenant_id, state: state.clone(), current_physical_size: Some(current_physical_size), @@ -515,8 +603,7 @@ async fn tenant_status(request: Request) -> Result, ApiErro }) } .instrument(info_span!("tenant_status_handler", tenant = %tenant_id)) - .await - .map_err(ApiError::InternalServerError)?; + .await?; json_response(StatusCode::OK, tenant_info) } @@ -534,7 +621,10 @@ async fn tenant_status(request: Request) -> Result, ApiErro /// Note: we don't update the cached size and prometheus metric here. /// The retention period might be different, and it's nice to have a method to just calculate it /// without modifying anything anyway. 
-async fn tenant_size_handler(request: Request) -> Result, ApiError> { +async fn tenant_size_handler( + request: Request, + _cancel: CancellationToken, +) -> Result, ApiError> { let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?; check_permission(&request, Some(tenant_id))?; let inputs_only: Option = parse_query_param(&request, "inputs_only")?; @@ -599,7 +689,10 @@ async fn tenant_size_handler(request: Request) -> Result, A ) } -async fn layer_map_info_handler(request: Request) -> Result, ApiError> { +async fn layer_map_info_handler( + request: Request, + _cancel: CancellationToken, +) -> Result, ApiError> { let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?; let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?; let reset: LayerAccessStatsReset = @@ -613,7 +706,10 @@ async fn layer_map_info_handler(request: Request) -> Result json_response(StatusCode::OK, layer_map_info) } -async fn layer_download_handler(request: Request) -> Result, ApiError> { +async fn layer_download_handler( + request: Request, + _cancel: CancellationToken, +) -> Result, ApiError> { let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?; check_permission(&request, Some(tenant_id))?; let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?; @@ -636,7 +732,10 @@ async fn layer_download_handler(request: Request) -> Result } } -async fn evict_timeline_layer_handler(request: Request) -> Result, ApiError> { +async fn evict_timeline_layer_handler( + request: Request, + _cancel: CancellationToken, +) -> Result, ApiError> { let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?; check_permission(&request, Some(tenant_id))?; let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?; @@ -714,7 +813,10 @@ pub fn html_response(status: StatusCode, data: String) -> Result, Ok(response) } -async fn tenant_create_handler(mut request: Request) -> Result, ApiError> { +async fn tenant_create_handler( + mut request: Request, + _cancel: CancellationToken, +) -> Result, ApiError> { let request_data: TenantCreateRequest = json_request(&mut request).await?; let target_tenant_id = request_data.new_tenant_id; check_permission(&request, Some(target_tenant_id))?; @@ -735,6 +837,7 @@ async fn tenant_create_handler(mut request: Request) -> Result) -> Result) -> Result, ApiError> { +async fn get_tenant_config_handler( + request: Request, + _cancel: CancellationToken, +) -> Result, ApiError> { let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?; check_permission(&request, Some(tenant_id))?; @@ -786,6 +892,7 @@ async fn get_tenant_config_handler(request: Request) -> Result, + _cancel: CancellationToken, ) -> Result, ApiError> { let request_data: TenantConfigRequest = json_request(&mut request).await?; let tenant_id = request_data.tenant_id; @@ -803,21 +910,25 @@ async fn update_tenant_config_handler( } /// Testing helper to transition a tenant to [`crate::tenant::TenantState::Broken`]. 
-#[cfg(feature = "testing")] -async fn handle_tenant_break(r: Request) -> Result, ApiError> { +async fn handle_tenant_break( + r: Request, + _cancel: CancellationToken, +) -> Result, ApiError> { let tenant_id: TenantId = parse_request_param(&r, "tenant_id")?; let tenant = crate::tenant::mgr::get_tenant(tenant_id, true) .await .map_err(|_| ApiError::Conflict(String::from("no active tenant found")))?; - tenant.set_broken("broken from test".to_owned()); + tenant.set_broken("broken from test".to_owned()).await; json_response(StatusCode::OK, ()) } -#[cfg(feature = "testing")] -async fn failpoints_handler(mut request: Request) -> Result, ApiError> { +async fn failpoints_handler( + mut request: Request, + _cancel: CancellationToken, +) -> Result, ApiError> { if !fail::has_failpoints() { return Err(ApiError::BadRequest(anyhow!( "Cannot manage failpoints because pageserver was compiled without failpoints support" @@ -850,7 +961,10 @@ async fn failpoints_handler(mut request: Request) -> Result } // Run GC immediately on given timeline. -async fn timeline_gc_handler(mut request: Request) -> Result, ApiError> { +async fn timeline_gc_handler( + mut request: Request, + _cancel: CancellationToken, +) -> Result, ApiError> { let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?; let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?; check_permission(&request, Some(tenant_id))?; @@ -869,8 +983,10 @@ async fn timeline_gc_handler(mut request: Request) -> Result) -> Result, ApiError> { +async fn timeline_compact_handler( + request: Request, + _cancel: CancellationToken, +) -> Result, ApiError> { let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?; let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?; check_permission(&request, Some(tenant_id))?; @@ -891,8 +1007,10 @@ async fn timeline_compact_handler(request: Request) -> Result) -> Result, ApiError> { +async fn timeline_checkpoint_handler( + request: Request, + _cancel: CancellationToken, +) -> Result, ApiError> { let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?; let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?; check_permission(&request, Some(tenant_id))?; @@ -916,6 +1034,7 @@ async fn timeline_checkpoint_handler(request: Request) -> Result, + _cancel: CancellationToken, ) -> Result, ApiError> { let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?; let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?; @@ -931,6 +1050,7 @@ async fn timeline_download_remote_layers_handler_post( async fn timeline_download_remote_layers_handler_get( request: Request, + _cancel: CancellationToken, ) -> Result, ApiError> { let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?; check_permission(&request, Some(tenant_id))?; @@ -954,7 +1074,10 @@ async fn active_timeline_of_active_tenant( .map_err(ApiError::NotFound) } -async fn always_panic_handler(req: Request) -> Result, ApiError> { +async fn always_panic_handler( + req: Request, + _cancel: CancellationToken, +) -> Result, ApiError> { // Deliberately cause a panic to exercise the panic hook registered via std::panic::set_hook(). // For pageserver, the relevant panic hook is `tracing_panic_hook` , and the `sentry` crate's wrapper around it. // Use catch_unwind to ensure that tokio nor hyper are distracted by our panic. 
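// A small standalone sketch (assumed, not pageserver code) of the fail-crate mechanics
// that the `/v1/failpoints` handler above drives at runtime: a failpoint is declared
// with `fail_point!` and its behaviour is configured with `fail::cfg`. The failpoint
// name and return values here are made up for the example.
fn flaky_op() -> Result<u32, String> {
    fail::fail_point!("flaky-op", |_| Err("injected failure".to_owned()));
    Ok(42)
}

fn main() {
    // Failpoints only fire when the crate's `failpoints` feature is compiled in,
    // which is exactly what the handler above checks with `has_failpoints()`.
    if fail::has_failpoints() {
        assert_eq!(flaky_op(), Ok(42));
        fail::cfg("flaky-op", "return").unwrap();
        assert!(flaky_op().is_err());
        fail::cfg("flaky-op", "off").unwrap();
        assert_eq!(flaky_op(), Ok(42));
    }
}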
@@ -965,7 +1088,10 @@ async fn always_panic_handler(req: Request) -> Result, ApiE json_response(StatusCode::NO_CONTENT, ()) } -async fn disk_usage_eviction_run(mut r: Request) -> Result, ApiError> { +async fn disk_usage_eviction_run( + mut r: Request, + _cancel: CancellationToken, +) -> Result, ApiError> { check_permission(&r, None)?; #[derive(Debug, Clone, Copy, serde::Serialize, serde::Deserialize)] @@ -1055,8 +1181,10 @@ async fn handler_404(_: Request) -> Result, ApiError> { ) } -#[cfg(feature = "testing")] -async fn post_tracing_event_handler(mut r: Request) -> Result, ApiError> { +async fn post_tracing_event_handler( + mut r: Request, + _cancel: CancellationToken, +) -> Result, ApiError> { #[derive(Debug, serde::Deserialize)] #[serde(rename_all = "lowercase")] enum Level { @@ -1086,10 +1214,90 @@ async fn post_tracing_event_handler(mut r: Request) -> Result(request: Request, handler: H) -> Result, ApiError> +where + R: std::future::Future, ApiError>> + Send + 'static, + H: FnOnce(Request, CancellationToken) -> R + Send + Sync + 'static, +{ + // Spawn a new task to handle the request, to protect the handler from unexpected + // async cancellations. Most pageserver functions are not async cancellation safe. + // We arm a drop-guard, so that if Hyper drops the Future, we signal the task + // with the cancellation token. + let token = CancellationToken::new(); + let cancel_guard = token.clone().drop_guard(); + let result = request_span(request, move |r| async { + let handle = tokio::spawn( + async { + let token_cloned = token.clone(); + let result = handler(r, token).await; + if token_cloned.is_cancelled() { + info!("Cancelled request finished"); + } + result + } + .in_current_span(), + ); + + match handle.await { + Ok(result) => result, + Err(e) => { + // The handler task panicked. We have a global panic handler that logs the + // panic with its backtrace, so no need to log that here. Only log a brief + // message to make it clear that we returned the error to the client. + error!("HTTP request handler task panicked: {e:#}"); + + // Don't return an Error here, because then fallback error handler that was + // installed in make_router() will print the error. Instead, construct the + // HTTP error response and return that. + Ok( + ApiError::InternalServerError(anyhow!("HTTP request handler task panicked")) + .into_response(), + ) + } + } + }) + .await; + + cancel_guard.disarm(); + + result +} + +/// Like api_handler, but returns an error response if the server is built without +/// the 'testing' feature. +async fn testing_api_handler( + desc: &str, + request: Request, + handler: H, +) -> Result, ApiError> +where + R: std::future::Future, ApiError>> + Send + 'static, + H: FnOnce(Request, CancellationToken) -> R + Send + Sync + 'static, +{ + if cfg!(feature = "testing") { + api_handler(request, handler).await + } else { + std::future::ready(Err(ApiError::BadRequest(anyhow!( + "Cannot {desc} because pageserver was compiled without testing APIs", + )))) + .await + } +} + pub fn make_router( conf: &'static PageServerConf, launch_ts: &'static LaunchTimestamp, auth: Option>, + broker_client: BrokerClientChannel, remote_storage: Option, disk_usage_eviction_state: Arc, ) -> anyhow::Result> { @@ -1114,121 +1322,99 @@ pub fn make_router( .expect("construct launch timestamp header middleware"), ); - macro_rules! testing_api { - ($handler_desc:literal, $handler:path $(,)?) 
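// A minimal illustration of the tokio_util CancellationToken + DropGuard pattern that
// `api_handler` above relies on: if the wrapping future is dropped (e.g. hyper drops
// the request mid-flight), the guard's Drop impl cancels the token so the spawned
// handler can notice; `disarm()` on the normal path prevents that cancellation.
// Standalone sketch, not the handler itself.
use tokio_util::sync::CancellationToken;

fn main() {
    // Dropping the guard cancels the token...
    let token = CancellationToken::new();
    let guard = token.clone().drop_guard();
    drop(guard);
    assert!(token.is_cancelled());

    // ...while disarming it first leaves the token untouched.
    let token = CancellationToken::new();
    let guard = token.clone().drop_guard();
    let _token_back = guard.disarm();
    assert!(!token.is_cancelled());
}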
=> {{ - #[cfg(not(feature = "testing"))] - async fn cfg_disabled(_req: Request) -> Result, ApiError> { - Err(ApiError::BadRequest(anyhow!(concat!( - "Cannot ", - $handler_desc, - " because pageserver was compiled without testing APIs", - )))) - } - - #[cfg(feature = "testing")] - let handler = $handler; - #[cfg(not(feature = "testing"))] - let handler = cfg_disabled; - - move |r| RequestSpan(handler).handle(r) - }}; - } - Ok(router .data(Arc::new( - State::new(conf, auth, remote_storage, disk_usage_eviction_state) - .context("Failed to initialize router state")?, + State::new( + conf, + auth, + remote_storage, + broker_client, + disk_usage_eviction_state, + ) + .context("Failed to initialize router state")?, )) - .get("/v1/status", |r| RequestSpan(status_handler).handle(r)) - .put( - "/v1/failpoints", - testing_api!("manage failpoints", failpoints_handler), - ) - .get("/v1/tenant", |r| RequestSpan(tenant_list_handler).handle(r)) - .post("/v1/tenant", |r| { - RequestSpan(tenant_create_handler).handle(r) - }) - .get("/v1/tenant/:tenant_id", |r| { - RequestSpan(tenant_status).handle(r) + .get("/v1/status", |r| api_handler(r, status_handler)) + .put("/v1/failpoints", |r| { + testing_api_handler("manage failpoints", r, failpoints_handler) }) + .get("/v1/tenant", |r| api_handler(r, tenant_list_handler)) + .post("/v1/tenant", |r| api_handler(r, tenant_create_handler)) + .get("/v1/tenant/:tenant_id", |r| api_handler(r, tenant_status)) .get("/v1/tenant/:tenant_id/synthetic_size", |r| { - RequestSpan(tenant_size_handler).handle(r) + api_handler(r, tenant_size_handler) }) .put("/v1/tenant/config", |r| { - RequestSpan(update_tenant_config_handler).handle(r) + api_handler(r, update_tenant_config_handler) }) .get("/v1/tenant/:tenant_id/config", |r| { - RequestSpan(get_tenant_config_handler).handle(r) + api_handler(r, get_tenant_config_handler) }) .get("/v1/tenant/:tenant_id/timeline", |r| { - RequestSpan(timeline_list_handler).handle(r) + api_handler(r, timeline_list_handler) }) .post("/v1/tenant/:tenant_id/timeline", |r| { - RequestSpan(timeline_create_handler).handle(r) + api_handler(r, timeline_create_handler) }) .post("/v1/tenant/:tenant_id/attach", |r| { - RequestSpan(tenant_attach_handler).handle(r) + api_handler(r, tenant_attach_handler) }) .post("/v1/tenant/:tenant_id/detach", |r| { - RequestSpan(tenant_detach_handler).handle(r) + api_handler(r, tenant_detach_handler) }) .post("/v1/tenant/:tenant_id/load", |r| { - RequestSpan(tenant_load_handler).handle(r) + api_handler(r, tenant_load_handler) }) .post("/v1/tenant/:tenant_id/ignore", |r| { - RequestSpan(tenant_ignore_handler).handle(r) + api_handler(r, tenant_ignore_handler) }) .get("/v1/tenant/:tenant_id/timeline/:timeline_id", |r| { - RequestSpan(timeline_detail_handler).handle(r) + api_handler(r, timeline_detail_handler) }) .get( "/v1/tenant/:tenant_id/timeline/:timeline_id/get_lsn_by_timestamp", - |r| RequestSpan(get_lsn_by_timestamp_handler).handle(r), + |r| api_handler(r, get_lsn_by_timestamp_handler), ) .put("/v1/tenant/:tenant_id/timeline/:timeline_id/do_gc", |r| { - RequestSpan(timeline_gc_handler).handle(r) + api_handler(r, timeline_gc_handler) + }) + .put("/v1/tenant/:tenant_id/timeline/:timeline_id/compact", |r| { + testing_api_handler("run timeline compaction", r, timeline_compact_handler) }) - .put( - "/v1/tenant/:tenant_id/timeline/:timeline_id/compact", - testing_api!("run timeline compaction", timeline_compact_handler), - ) .put( "/v1/tenant/:tenant_id/timeline/:timeline_id/checkpoint", - testing_api!("run timeline checkpoint", 
timeline_checkpoint_handler), + |r| testing_api_handler("run timeline checkpoint", r, timeline_checkpoint_handler), ) .post( "/v1/tenant/:tenant_id/timeline/:timeline_id/download_remote_layers", - |r| RequestSpan(timeline_download_remote_layers_handler_post).handle(r), + |r| api_handler(r, timeline_download_remote_layers_handler_post), ) .get( "/v1/tenant/:tenant_id/timeline/:timeline_id/download_remote_layers", - |r| RequestSpan(timeline_download_remote_layers_handler_get).handle(r), + |r| api_handler(r, timeline_download_remote_layers_handler_get), ) .delete("/v1/tenant/:tenant_id/timeline/:timeline_id", |r| { - RequestSpan(timeline_delete_handler).handle(r) + api_handler(r, timeline_delete_handler) }) .get("/v1/tenant/:tenant_id/timeline/:timeline_id/layer", |r| { - RequestSpan(layer_map_info_handler).handle(r) + api_handler(r, layer_map_info_handler) }) .get( "/v1/tenant/:tenant_id/timeline/:timeline_id/layer/:layer_file_name", - |r| RequestSpan(layer_download_handler).handle(r), + |r| api_handler(r, layer_download_handler), ) .delete( "/v1/tenant/:tenant_id/timeline/:timeline_id/layer/:layer_file_name", - |r| RequestSpan(evict_timeline_layer_handler).handle(r), + |r| api_handler(r, evict_timeline_layer_handler), ) .put("/v1/disk_usage_eviction/run", |r| { - RequestSpan(disk_usage_eviction_run).handle(r) + api_handler(r, disk_usage_eviction_run) + }) + .put("/v1/tenant/:tenant_id/break", |r| { + testing_api_handler("set tenant state to broken", r, handle_tenant_break) + }) + .get("/v1/panic", |r| api_handler(r, always_panic_handler)) + .post("/v1/tracing/event", |r| { + testing_api_handler("emit a tracing event", r, post_tracing_event_handler) }) - .put( - "/v1/tenant/:tenant_id/break", - testing_api!("set tenant state to broken", handle_tenant_break), - ) - .get("/v1/panic", |r| RequestSpan(always_panic_handler).handle(r)) - .post( - "/v1/tracing/event", - testing_api!("emit a tracing event", post_tracing_event_handler), - ) .any(handler_404)) } diff --git a/pageserver/src/keyspace.rs b/pageserver/src/keyspace.rs index 64024a2d8d..20e6df9c7b 100644 --- a/pageserver/src/keyspace.rs +++ b/pageserver/src/keyspace.rs @@ -5,7 +5,7 @@ use std::ops::Range; /// /// Represents a set of Keys, in a compact form. /// -#[derive(Clone, Debug)] +#[derive(Clone, Debug, Default)] pub struct KeySpace { /// Contiguous ranges of keys that belong to the key space. In key order, /// and with no overlap. @@ -61,6 +61,18 @@ impl KeySpace { KeyPartitioning { parts } } + + /// + /// Check if key space contains overlapping range + /// + pub fn overlaps(&self, range: &Range) -> bool { + match self.ranges.binary_search_by_key(&range.end, |r| r.start) { + Ok(0) => false, + Err(0) => false, + Ok(index) => self.ranges[index - 1].end > range.start, + Err(index) => self.ranges[index - 1].end > range.start, + } + } } /// @@ -129,3 +141,226 @@ impl KeySpaceAccum { } } } + +/// +/// A helper object, to collect a set of keys and key ranges into a KeySpace +/// object. Key ranges may be inserted in any order and can overlap. 
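The `overlaps` check added to `KeySpace` above leans on `binary_search_by_key` over the sorted, non-overlapping range starts: only the last stored range that starts before `range.end` can possibly intersect the query. A minimal sketch of the same technique, using plain `u64` ranges instead of the pageserver's `Key` type (the `Ranges` struct and `main` below are illustrative, not part of the patch):

```rust
use std::ops::Range;

/// Sorted, non-overlapping ranges, analogous to KeySpace::ranges but over u64.
struct Ranges {
    ranges: Vec<Range<u64>>,
}

impl Ranges {
    /// Same idea as KeySpace::overlaps: locate the last stored range that
    /// starts before `range.end`; the query can only overlap that neighbour.
    fn overlaps(&self, range: &Range<u64>) -> bool {
        match self.ranges.binary_search_by_key(&range.end, |r| r.start) {
            // Every stored range starts at or after `range.end`: no overlap possible.
            Ok(0) | Err(0) => false,
            // `index - 1` is the closest range starting before `range.end`;
            // it overlaps iff it extends past the query's start.
            Ok(index) | Err(index) => self.ranges[index - 1].end > range.start,
        }
    }
}

fn main() {
    let ks = Ranges { ranges: vec![10..20, 30..40] };
    assert!(ks.overlaps(&(15..25)));
    assert!(!ks.overlaps(&(20..30))); // half-open ranges: touching is not overlapping
    println!("overlap checks passed");
}
```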
+/// +#[derive(Clone, Debug, Default)] +pub struct KeySpaceRandomAccum { + ranges: Vec>, +} + +impl KeySpaceRandomAccum { + pub fn new() -> Self { + Self { ranges: Vec::new() } + } + + pub fn add_key(&mut self, key: Key) { + self.add_range(singleton_range(key)) + } + + pub fn add_range(&mut self, range: Range) { + self.ranges.push(range); + } + + pub fn to_keyspace(mut self) -> KeySpace { + let mut ranges = Vec::new(); + if !self.ranges.is_empty() { + self.ranges.sort_by_key(|r| r.start); + let mut start = self.ranges.first().unwrap().start; + let mut end = self.ranges.first().unwrap().end; + for r in self.ranges { + assert!(r.start >= start); + if r.start > end { + ranges.push(start..end); + start = r.start; + end = r.end; + } else if r.end > end { + end = r.end; + } + } + ranges.push(start..end); + } + KeySpace { ranges } + } +} + +#[cfg(test)] +mod tests { + use super::*; + use std::fmt::Write; + + // Helper function to create a key range. + // + // Make the tests below less verbose. + fn kr(irange: Range) -> Range { + Key::from_i128(irange.start)..Key::from_i128(irange.end) + } + + #[allow(dead_code)] + fn dump_keyspace(ks: &KeySpace) { + for r in ks.ranges.iter() { + println!(" {}..{}", r.start.to_i128(), r.end.to_i128()); + } + } + + fn assert_ks_eq(actual: &KeySpace, expected: Vec>) { + if actual.ranges != expected { + let mut msg = String::new(); + + writeln!(msg, "expected:").unwrap(); + for r in &expected { + writeln!(msg, " {}..{}", r.start.to_i128(), r.end.to_i128()).unwrap(); + } + writeln!(msg, "got:").unwrap(); + for r in &actual.ranges { + writeln!(msg, " {}..{}", r.start.to_i128(), r.end.to_i128()).unwrap(); + } + panic!("{}", msg); + } + } + + #[test] + fn keyspace_add_range() { + // two separate ranges + // + // ##### + // ##### + let mut ks = KeySpaceRandomAccum::default(); + ks.add_range(kr(0..10)); + ks.add_range(kr(20..30)); + assert_ks_eq(&ks.to_keyspace(), vec![kr(0..10), kr(20..30)]); + + // two separate ranges, added in reverse order + // + // ##### + // ##### + let mut ks = KeySpaceRandomAccum::default(); + ks.add_range(kr(20..30)); + ks.add_range(kr(0..10)); + + // add range that is adjacent to the end of an existing range + // + // ##### + // ##### + ks.add_range(kr(0..10)); + ks.add_range(kr(10..30)); + assert_ks_eq(&ks.to_keyspace(), vec![kr(0..30)]); + + // add range that is adjacent to the start of an existing range + // + // ##### + // ##### + let mut ks = KeySpaceRandomAccum::default(); + ks.add_range(kr(10..30)); + ks.add_range(kr(0..10)); + assert_ks_eq(&ks.to_keyspace(), vec![kr(0..30)]); + + // add range that overlaps with the end of an existing range + // + // ##### + // ##### + let mut ks = KeySpaceRandomAccum::default(); + ks.add_range(kr(0..10)); + ks.add_range(kr(5..30)); + assert_ks_eq(&ks.to_keyspace(), vec![kr(0..30)]); + + // add range that overlaps with the start of an existing range + // + // ##### + // ##### + let mut ks = KeySpaceRandomAccum::default(); + ks.add_range(kr(5..30)); + ks.add_range(kr(0..10)); + assert_ks_eq(&ks.to_keyspace(), vec![kr(0..30)]); + + // add range that is fully covered by an existing range + // + // ######### + // ##### + let mut ks = KeySpaceRandomAccum::default(); + ks.add_range(kr(0..30)); + ks.add_range(kr(10..20)); + assert_ks_eq(&ks.to_keyspace(), vec![kr(0..30)]); + + // add range that extends an existing range from both ends + // + // ##### + // ######### + let mut ks = KeySpaceRandomAccum::default(); + ks.add_range(kr(10..20)); + ks.add_range(kr(0..30)); + assert_ks_eq(&ks.to_keyspace(), 
vec![kr(0..30)]); + + // add a range that overlaps with two existing ranges, joining them + // + // ##### ##### + // ####### + let mut ks = KeySpaceRandomAccum::default(); + ks.add_range(kr(0..10)); + ks.add_range(kr(20..30)); + ks.add_range(kr(5..25)); + assert_ks_eq(&ks.to_keyspace(), vec![kr(0..30)]); + } + + #[test] + fn keyspace_overlaps() { + let mut ks = KeySpaceRandomAccum::default(); + ks.add_range(kr(10..20)); + ks.add_range(kr(30..40)); + let ks = ks.to_keyspace(); + + // ##### ##### + // xxxx + assert!(!ks.overlaps(&kr(0..5))); + + // ##### ##### + // xxxx + assert!(!ks.overlaps(&kr(5..9))); + + // ##### ##### + // xxxx + assert!(!ks.overlaps(&kr(5..10))); + + // ##### ##### + // xxxx + assert!(ks.overlaps(&kr(5..11))); + + // ##### ##### + // xxxx + assert!(ks.overlaps(&kr(10..15))); + + // ##### ##### + // xxxx + assert!(ks.overlaps(&kr(15..20))); + + // ##### ##### + // xxxx + assert!(ks.overlaps(&kr(15..25))); + + // ##### ##### + // xxxx + assert!(!ks.overlaps(&kr(22..28))); + + // ##### ##### + // xxxx + assert!(!ks.overlaps(&kr(25..30))); + + // ##### ##### + // xxxx + assert!(ks.overlaps(&kr(35..35))); + + // ##### ##### + // xxxx + assert!(!ks.overlaps(&kr(40..45))); + + // ##### ##### + // xxxx + assert!(!ks.overlaps(&kr(45..50))); + + // ##### ##### + // xxxxxxxxxxx + assert!(ks.overlaps(&kr(0..30))); // XXXXX This fails currently! + } +} diff --git a/pageserver/src/lib.rs b/pageserver/src/lib.rs index 04863886cb..5831091098 100644 --- a/pageserver/src/lib.rs +++ b/pageserver/src/lib.rs @@ -1,6 +1,5 @@ mod auth; pub mod basebackup; -pub mod broker_client; pub mod config; pub mod consumption_metrics; pub mod context; @@ -36,7 +35,7 @@ use tracing::info; /// backwards-compatible changes to the metadata format. pub const STORAGE_FORMAT_VERSION: u16 = 3; -pub const DEFAULT_PG_VERSION: u32 = 14; +pub const DEFAULT_PG_VERSION: u32 = 15; // Magic constants used to identify different kinds of files pub const IMAGE_FILE_MAGIC: u16 = 0x5A60; @@ -46,6 +45,7 @@ static ZERO_PAGE: bytes::Bytes = bytes::Bytes::from_static(&[0u8; 8192]); pub use crate::metrics::preinitialize_metrics; +#[tracing::instrument] pub async fn shutdown_pageserver(exit_code: i32) { // Shut down the libpq endpoint task. This prevents new connections from // being accepted. @@ -58,12 +58,6 @@ pub async fn shutdown_pageserver(exit_code: i32) { // the checkpoint and GC tasks. tenant::mgr::shutdown_all_tenants().await; - // Stop syncing with remote storage. - // - // FIXME: Does this wait for the sync tasks to finish syncing what's queued up? - // Should it? - task_mgr::shutdown_tasks(Some(TaskKind::RemoteUploadTask), None, None).await; - // Shut down the HTTP endpoint last, so that you can still check the server's // status while it's shutting down. // FIXME: We should probably stop accepting commands like attach/detach earlier. @@ -138,6 +132,29 @@ pub fn is_uninit_mark(path: &Path) -> bool { } } +/// During pageserver startup, we need to order operations not to exhaust tokio worker threads by +/// blocking. +/// +/// The instances of this value exist only during startup, otherwise `None` is provided, meaning no +/// delaying is needed. +#[derive(Clone)] +pub struct InitializationOrder { + /// Each initial tenant load task carries this until completion. + pub initial_tenant_load: Option, + + /// Barrier for when we can start initial logical size calculations. 
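`InitializationOrder` above gates startup work with the internal `utils::completion` Barrier/Completion pair, whose exact API is not shown in this diff. As a rough analogy only, the sketch below expresses the same idea with plain tokio channels: each startup task holds a sender (the "completion"), and background work waits until every sender has been dropped (the "barrier"). All names here are illustrative.

```rust
use tokio::sync::mpsc;

#[tokio::main]
async fn main() {
    // Each startup task holds a clone of `completion`; dropping it signals "done".
    let (completion, mut all_done) = mpsc::channel::<()>(1);

    for i in 0..3 {
        let completion = completion.clone();
        tokio::spawn(async move {
            // ... initial tenant load work would happen here ...
            println!("startup task {i} finished");
            drop(completion); // release our share of the "barrier"
        });
    }
    // Drop the original sender so only the spawned tasks keep the channel open.
    drop(completion);

    // The "barrier": recv() returns None once every sender is gone.
    assert!(all_done.recv().await.is_none());
    println!("all startup tasks done, background jobs can start");
}
```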
+ pub initial_logical_size_can_start: utils::completion::Barrier, + + /// Each timeline owns a clone of this to be consumed on the initial logical size calculation + /// attempt. It is important to drop this once the attempt has completed. + pub initial_logical_size_attempt: utils::completion::Completion, + + /// Barrier for when we can start any background jobs. + /// + /// This can be broken up later on, but right now there is just one class of a background job. + pub background_jobs_can_start: utils::completion::Barrier, +} + #[cfg(test)] mod backoff_defaults_tests { use super::*; diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs index 75bea9dbab..cc444c479a 100644 --- a/pageserver/src/metrics.rs +++ b/pageserver/src/metrics.rs @@ -84,6 +84,16 @@ pub static STORAGE_TIME_GLOBAL: Lazy = Lazy::new(|| { .expect("failed to define a metric") }); +static READ_NUM_FS_LAYERS: Lazy = Lazy::new(|| { + register_histogram_vec!( + "pageserver_read_num_fs_layers", + "Number of persistent layers accessed for processing a read request, including those in the cache", + &["tenant_id", "timeline_id"], + vec![1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 10.0, 20.0, 50.0, 100.0], + ) + .expect("failed to define a metric") +}); + // Metrics collected on operations on the storage repository. static RECONSTRUCT_TIME: Lazy = Lazy::new(|| { register_histogram_vec!( @@ -95,6 +105,25 @@ static RECONSTRUCT_TIME: Lazy = Lazy::new(|| { .expect("failed to define a metric") }); +static MATERIALIZED_PAGE_CACHE_HIT_DIRECT: Lazy = Lazy::new(|| { + register_int_counter_vec!( + "pageserver_materialized_cache_hits_direct_total", + "Number of cache hits from materialized page cache without redo", + &["tenant_id", "timeline_id"] + ) + .expect("failed to define a metric") +}); + +static GET_RECONSTRUCT_DATA_TIME: Lazy = Lazy::new(|| { + register_histogram_vec!( + "pageserver_getpage_get_reconstruct_data_seconds", + "Time spent in get_reconstruct_value_data", + &["tenant_id", "timeline_id"], + CRITICAL_OP_BUCKETS.into(), + ) + .expect("failed to define a metric") +}); + static MATERIALIZED_PAGE_CACHE_HIT: Lazy = Lazy::new(|| { register_int_counter_vec!( "pageserver_materialized_cache_hits_total", @@ -354,6 +383,7 @@ const STORAGE_IO_TIME_BUCKETS: &[f64] = &[ 0.001000, // 1000 usec 0.030, // 30 ms 1.000, // 1000 ms + 30.000, // 30000 ms ]; const STORAGE_IO_TIME_OPERATIONS: &[&str] = &[ @@ -622,7 +652,7 @@ pub static WAL_REDO_TIME: Lazy = Lazy::new(|| { pub static WAL_REDO_WAIT_TIME: Lazy = Lazy::new(|| { register_histogram!( "pageserver_wal_redo_wait_seconds", - "Time spent waiting for access to the WAL redo process", + "Time spent waiting for access to the Postgres WAL redo process", redo_histogram_time_buckets!(), ) .expect("failed to define a metric") @@ -631,7 +661,7 @@ pub static WAL_REDO_WAIT_TIME: Lazy = Lazy::new(|| { pub static WAL_REDO_RECORDS_HISTOGRAM: Lazy = Lazy::new(|| { register_histogram!( "pageserver_wal_redo_records_histogram", - "Histogram of number of records replayed per redo", + "Histogram of number of records replayed per redo in the Postgres WAL redo process", redo_histogram_count_buckets!(), ) .expect("failed to define a metric") @@ -640,7 +670,7 @@ pub static WAL_REDO_RECORDS_HISTOGRAM: Lazy = Lazy::new(|| { pub static WAL_REDO_BYTES_HISTOGRAM: Lazy = Lazy::new(|| { register_histogram!( "pageserver_wal_redo_bytes_histogram", - "Histogram of number of records replayed per redo", + "Histogram of number of records replayed per redo sent to Postgres", redo_bytes_histogram_count_buckets!(), ) 
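The new metrics (`pageserver_read_num_fs_layers`, `pageserver_getpage_get_reconstruct_data_seconds`, `pageserver_materialized_cache_hits_direct_total`) follow the file's existing pattern: a lazily registered labeled vector, a per-timeline handle resolved via `get_metric_with_label_values`, and `remove_label_values` on drop. Below is a self-contained sketch of that pattern with the `prometheus` crate; the metric name, buckets, and label values are made up for illustration.

```rust
use once_cell::sync::Lazy;
use prometheus::{register_histogram_vec, HistogramVec};

// Registered once, on first use, in the default registry.
static DEMO_READ_LAYERS: Lazy<HistogramVec> = Lazy::new(|| {
    register_histogram_vec!(
        "demo_read_num_layers",              // illustrative metric name
        "Number of layers visited per read", // help text
        &["tenant_id", "timeline_id"],       // label names
        vec![1.0, 2.0, 4.0, 8.0, 16.0]       // bucket upper bounds
    )
    .expect("failed to define a metric")
});

fn main() {
    // Per-timeline handle, resolved once and then observed on the hot path.
    let histo = DEMO_READ_LAYERS
        .get_metric_with_label_values(&["tenant-a", "timeline-1"])
        .unwrap();
    histo.observe(3.0);

    // On timeline drop, the label pair is removed so stale series don't linger.
    let _ = DEMO_READ_LAYERS.remove_label_values(&["tenant-a", "timeline-1"]);
}
```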
.expect("failed to define a metric") @@ -723,7 +753,9 @@ pub struct TimelineMetrics { tenant_id: String, timeline_id: String, pub reconstruct_time_histo: Histogram, + pub get_reconstruct_data_time_histo: Histogram, pub materialized_page_cache_hit_counter: GenericCounter, + pub materialized_page_cache_hit_upon_request_counter: GenericCounter, pub flush_time_histo: StorageTimeMetrics, pub compact_time_histo: StorageTimeMetrics, pub create_images_time_histo: StorageTimeMetrics, @@ -734,6 +766,7 @@ pub struct TimelineMetrics { pub last_record_gauge: IntGauge, pub wait_lsn_time_histo: Histogram, pub resident_physical_size_gauge: UIntGauge, + pub read_num_fs_layers: Histogram, /// copy of LayeredTimeline.current_logical_size pub current_logical_size_gauge: UIntGauge, pub num_persistent_files_created: IntCounter, @@ -753,6 +786,9 @@ impl TimelineMetrics { let reconstruct_time_histo = RECONSTRUCT_TIME .get_metric_with_label_values(&[&tenant_id, &timeline_id]) .unwrap(); + let get_reconstruct_data_time_histo = GET_RECONSTRUCT_DATA_TIME + .get_metric_with_label_values(&[&tenant_id, &timeline_id]) + .unwrap(); let materialized_page_cache_hit_counter = MATERIALIZED_PAGE_CACHE_HIT .get_metric_with_label_values(&[&tenant_id, &timeline_id]) .unwrap(); @@ -794,6 +830,12 @@ impl TimelineMetrics { let evictions = EVICTIONS .get_metric_with_label_values(&[&tenant_id, &timeline_id]) .unwrap(); + let read_num_fs_layers = READ_NUM_FS_LAYERS + .get_metric_with_label_values(&[&tenant_id, &timeline_id]) + .unwrap(); + let materialized_page_cache_hit_upon_request_counter = MATERIALIZED_PAGE_CACHE_HIT_DIRECT + .get_metric_with_label_values(&[&tenant_id, &timeline_id]) + .unwrap(); let evictions_with_low_residence_duration = evictions_with_low_residence_duration_builder.build(&tenant_id, &timeline_id); @@ -801,7 +843,9 @@ impl TimelineMetrics { tenant_id, timeline_id, reconstruct_time_histo, + get_reconstruct_data_time_histo, materialized_page_cache_hit_counter, + materialized_page_cache_hit_upon_request_counter, flush_time_histo, compact_time_histo, create_images_time_histo, @@ -819,6 +863,7 @@ impl TimelineMetrics { evictions_with_low_residence_duration: std::sync::RwLock::new( evictions_with_low_residence_duration, ), + read_num_fs_layers, } } } @@ -828,7 +873,9 @@ impl Drop for TimelineMetrics { let tenant_id = &self.tenant_id; let timeline_id = &self.timeline_id; let _ = RECONSTRUCT_TIME.remove_label_values(&[tenant_id, timeline_id]); + let _ = GET_RECONSTRUCT_DATA_TIME.remove_label_values(&[tenant_id, timeline_id]); let _ = MATERIALIZED_PAGE_CACHE_HIT.remove_label_values(&[tenant_id, timeline_id]); + let _ = MATERIALIZED_PAGE_CACHE_HIT_DIRECT.remove_label_values(&[tenant_id, timeline_id]); let _ = LAST_RECORD_LSN.remove_label_values(&[tenant_id, timeline_id]); let _ = WAIT_LSN_TIME.remove_label_values(&[tenant_id, timeline_id]); let _ = RESIDENT_PHYSICAL_SIZE.remove_label_values(&[tenant_id, timeline_id]); @@ -836,6 +883,8 @@ impl Drop for TimelineMetrics { let _ = NUM_PERSISTENT_FILES_CREATED.remove_label_values(&[tenant_id, timeline_id]); let _ = PERSISTENT_BYTES_WRITTEN.remove_label_values(&[tenant_id, timeline_id]); let _ = EVICTIONS.remove_label_values(&[tenant_id, timeline_id]); + let _ = READ_NUM_FS_LAYERS.remove_label_values(&[tenant_id, timeline_id]); + self.evictions_with_low_residence_duration .write() .unwrap() diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index bd3ece2dfc..9e9285a009 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ 
-50,7 +50,9 @@ use crate::import_datadir::import_wal_from_tar; use crate::metrics::{LIVE_CONNECTIONS_COUNT, SMGR_QUERY_TIME}; use crate::task_mgr; use crate::task_mgr::TaskKind; +use crate::tenant; use crate::tenant::mgr; +use crate::tenant::mgr::GetTenantError; use crate::tenant::{Tenant, Timeline}; use crate::trace::Tracer; @@ -172,6 +174,7 @@ async fn read_tar_eof(mut reader: (impl AsyncRead + Unpin)) -> anyhow::Result<() /// pub async fn libpq_listener_main( conf: &'static PageServerConf, + broker_client: storage_broker::BrokerClientChannel, auth: Option>, listener: TcpListener, auth_type: AuthType, @@ -213,7 +216,14 @@ pub async fn libpq_listener_main( None, "serving compute connection task", false, - page_service_conn_main(conf, local_auth, socket, auth_type, connection_ctx), + page_service_conn_main( + conf, + broker_client.clone(), + local_auth, + socket, + auth_type, + connection_ctx, + ), ); } Err(err) => { @@ -230,6 +240,7 @@ pub async fn libpq_listener_main( async fn page_service_conn_main( conf: &'static PageServerConf, + broker_client: storage_broker::BrokerClientChannel, auth: Option>, socket: tokio::net::TcpStream, auth_type: AuthType, @@ -266,7 +277,7 @@ async fn page_service_conn_main( // and create a child per-query context when it invokes process_query. // But it's in a shared crate, so, we store connection_ctx inside PageServerHandler // and create the per-query context in process_query ourselves. - let mut conn_handler = PageServerHandler::new(conf, auth, connection_ctx); + let mut conn_handler = PageServerHandler::new(conf, broker_client, auth, connection_ctx); let pgbackend = PostgresBackend::new_from_io(socket, peer_addr, auth_type, None)?; match pgbackend @@ -324,6 +335,7 @@ impl PageRequestMetrics { struct PageServerHandler { _conf: &'static PageServerConf, + broker_client: storage_broker::BrokerClientChannel, auth: Option>, claims: Option, @@ -337,11 +349,13 @@ struct PageServerHandler { impl PageServerHandler { pub fn new( conf: &'static PageServerConf, + broker_client: storage_broker::BrokerClientChannel, auth: Option>, connection_ctx: RequestContext, ) -> Self { PageServerHandler { _conf: conf, + broker_client, auth, claims: None, connection_ctx, @@ -494,7 +508,12 @@ impl PageServerHandler { let mut copyin_reader = pin!(StreamReader::new(copyin_stream(pgb))); timeline - .import_basebackup_from_tar(&mut copyin_reader, base_lsn, &ctx) + .import_basebackup_from_tar( + &mut copyin_reader, + base_lsn, + self.broker_client.clone(), + &ctx, + ) .await?; // Read the end of the tar archive. @@ -1131,7 +1150,9 @@ enum GetActiveTenantError { wait_time: Duration, }, #[error(transparent)] - Other(#[from] anyhow::Error), + NotFound(GetTenantError), + #[error(transparent)] + WaitTenantActive(tenant::WaitToBecomeActiveError), } impl From for QueryError { @@ -1140,7 +1161,8 @@ impl From for QueryError { GetActiveTenantError::WaitForActiveTimeout { .. 
} => QueryError::Disconnected( ConnectionError::Io(io::Error::new(io::ErrorKind::TimedOut, e.to_string())), ), - GetActiveTenantError::Other(e) => QueryError::Other(e), + GetActiveTenantError::WaitTenantActive(e) => QueryError::Other(anyhow::Error::new(e)), + GetActiveTenantError::NotFound(e) => QueryError::Other(anyhow::Error::new(e)), } } } @@ -1156,13 +1178,16 @@ async fn get_active_tenant_with_timeout( ) -> Result, GetActiveTenantError> { let tenant = match mgr::get_tenant(tenant_id, false).await { Ok(tenant) => tenant, - Err(e) => return Err(GetActiveTenantError::Other(e.into())), + Err(e @ GetTenantError::NotFound(_)) => return Err(GetActiveTenantError::NotFound(e)), + Err(GetTenantError::NotActive(_)) => { + unreachable!("we're calling get_tenant with active=false") + } }; let wait_time = Duration::from_secs(30); match tokio::time::timeout(wait_time, tenant.wait_to_become_active()).await { Ok(Ok(())) => Ok(tenant), // no .context(), the error message is good enough and some tests depend on it - Ok(Err(wait_error)) => Err(GetActiveTenantError::Other(wait_error)), + Ok(Err(e)) => Err(GetActiveTenantError::WaitTenantActive(e)), Err(_) => { let latest_state = tenant.current_state(); if latest_state == TenantState::Active { @@ -1177,13 +1202,34 @@ async fn get_active_tenant_with_timeout( } } +#[derive(Debug, thiserror::Error)] +enum GetActiveTimelineError { + #[error(transparent)] + Tenant(GetActiveTenantError), + #[error(transparent)] + Timeline(anyhow::Error), +} + +impl From for QueryError { + fn from(e: GetActiveTimelineError) -> Self { + match e { + GetActiveTimelineError::Tenant(e) => e.into(), + GetActiveTimelineError::Timeline(e) => QueryError::Other(e), + } + } +} + /// Shorthand for getting a reference to a Timeline of an Active tenant. async fn get_active_tenant_timeline( tenant_id: TenantId, timeline_id: TimelineId, ctx: &RequestContext, -) -> Result, GetActiveTenantError> { - let tenant = get_active_tenant_with_timeout(tenant_id, ctx).await?; - let timeline = tenant.get_timeline(timeline_id, true)?; +) -> Result, GetActiveTimelineError> { + let tenant = get_active_tenant_with_timeout(tenant_id, ctx) + .await + .map_err(GetActiveTimelineError::Tenant)?; + let timeline = tenant + .get_timeline(timeline_id, true) + .map_err(GetActiveTimelineError::Timeline)?; Ok(timeline) } diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs index 67f37ee519..186209dfcf 100644 --- a/pageserver/src/pgdatadir_mapping.rs +++ b/pageserver/src/pgdatadir_mapping.rs @@ -1600,9 +1600,7 @@ pub fn create_test_timeline( pg_version: u32, ctx: &RequestContext, ) -> anyhow::Result> { - let tline = tenant - .create_empty_timeline(timeline_id, Lsn(8), pg_version, ctx)? 
- .initialize(ctx)?; + let tline = tenant.create_test_timeline(timeline_id, Lsn(8), pg_version, ctx)?; let mut m = tline.begin_modification(Lsn(8)); m.init_empty()?; m.commit()?; diff --git a/pageserver/src/task_mgr.rs b/pageserver/src/task_mgr.rs index 82aebc6c07..d8db12a113 100644 --- a/pageserver/src/task_mgr.rs +++ b/pageserver/src/task_mgr.rs @@ -257,6 +257,9 @@ pub enum TaskKind { // task that handles attaching a tenant Attach, + // Used mostly for background deletion from s3 + TimelineDeletionWorker, + // task that handhes metrics collection MetricsCollection, @@ -476,18 +479,35 @@ pub async fn shutdown_tasks( && (timeline_id.is_none() || task_mut.timeline_id == timeline_id) { task.cancel.cancel(); - victim_tasks.push(Arc::clone(task)); + victim_tasks.push(( + Arc::clone(task), + task.kind, + task_mut.tenant_id, + task_mut.timeline_id, + )); } } } - for task in victim_tasks { + let log_all = kind.is_none() && tenant_id.is_none() && timeline_id.is_none(); + + for (task, task_kind, tenant_id, timeline_id) in victim_tasks { let join_handle = { let mut task_mut = task.mutable.lock().unwrap(); task_mut.join_handle.take() }; if let Some(mut join_handle) = join_handle { + if log_all { + if tenant_id.is_none() { + // there are quite few of these + info!(name = task.name, kind = ?task_kind, "stopping global task"); + } else { + // warn to catch these in tests; there shouldn't be any + warn!(name = task.name, tenant_id = ?tenant_id, timeline_id = ?timeline_id, kind = ?task_kind, "stopping left-over"); + } + } let completed = tokio::select! { + biased; _ = &mut join_handle => { true }, _ = tokio::time::sleep(std::time::Duration::from_secs(1)) => { // allow some time to elapse before logging to cut down the number of log diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 8349e1993f..4beb2664a5 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -16,9 +16,12 @@ use futures::FutureExt; use pageserver_api::models::TimelineState; use remote_storage::DownloadError; use remote_storage::GenericRemoteStorage; +use storage_broker::BrokerClientChannel; use tokio::sync::watch; +use tokio::sync::OwnedMutexGuard; use tokio::task::JoinSet; use tracing::*; +use utils::completion; use utils::crashsafe::path_with_suffix_extension; use std::cmp::min; @@ -63,6 +66,7 @@ use crate::tenant::remote_timeline_client::PersistIndexPartWithDeletedFlagError; use crate::tenant::storage_layer::DeltaLayer; use crate::tenant::storage_layer::ImageLayer; use crate::tenant::storage_layer::Layer; +use crate::InitializationOrder; use crate::virtual_file::VirtualFile; use crate::walredo::PostgresRedoManager; @@ -77,11 +81,12 @@ use utils::{ lsn::{Lsn, RecordLsn}, }; -mod blob_io; +pub mod blob_io; pub mod block_io; pub mod disk_btree; pub(crate) mod ephemeral_file; pub mod layer_map; +pub mod manifest; pub mod metadata; mod par_fsync; @@ -184,24 +189,14 @@ impl UninitializedTimeline<'_> { /// Ensures timeline data is valid, loads it into pageserver's memory and removes /// uninit mark file on success. /// - /// The new timeline is initialized in Active state, and its background jobs are - /// started - pub fn initialize(self, ctx: &RequestContext) -> anyhow::Result> { - let mut timelines = self.owning_tenant.timelines.lock().unwrap(); - self.initialize_with_lock(ctx, &mut timelines, true, true) - } - - /// Like `initialize`, but the caller is already holding lock on Tenant::timelines. 
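The `shutdown_tasks` change above waits on each victim task's join handle but logs after one second if the task is slow to stop, using `tokio::select!` with `biased;` so an already-finished task never trips the timeout branch. A standalone sketch of that wait-with-timeout pattern (the task body and durations are invented for the example):

```rust
use std::time::Duration;

#[tokio::main]
async fn main() {
    let mut join_handle = tokio::spawn(async {
        // Pretend this is a task that is slow to react to cancellation.
        tokio::time::sleep(Duration::from_secs(2)).await;
    });

    // `biased` makes select! poll the join handle first, so a completed task
    // is reported as finished instead of racing the sleep.
    let completed = tokio::select! {
        biased;
        _ = &mut join_handle => true,
        _ = tokio::time::sleep(Duration::from_secs(1)) => {
            println!("task is slow to stop, waiting for it anyway");
            false
        }
    };
    if !completed {
        // Fall back to an unbounded wait, mirroring the log-then-wait approach.
        let _ = join_handle.await;
    }
    println!("task stopped");
}
```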
- /// If `launch_wal_receiver` is false, the WAL receiver not launched, even though - /// timeline is initialized in Active state. This is used during tenant load and - /// attach, where the WAL receivers are launched only after all the timelines have - /// been initialized. + /// This function launches the flush loop if not already done. + /// + /// The caller is responsible for activating the timeline (function `.activate()`). fn initialize_with_lock( mut self, - ctx: &RequestContext, + _ctx: &RequestContext, timelines: &mut HashMap>, load_layer_map: bool, - activate: bool, ) -> anyhow::Result> { let timeline_id = self.timeline_id; let tenant_id = self.owning_tenant.tenant_id; @@ -237,12 +232,6 @@ impl UninitializedTimeline<'_> { v.insert(Arc::clone(&new_timeline)); new_timeline.maybe_spawn_flush_loop(); - - if activate { - new_timeline - .activate(ctx) - .context("initializing timeline activation")?; - } } } @@ -254,6 +243,7 @@ impl UninitializedTimeline<'_> { self, copyin_read: &mut (impl tokio::io::AsyncRead + Send + Sync + Unpin), base_lsn: Lsn, + broker_client: storage_broker::BrokerClientChannel, ctx: &RequestContext, ) -> anyhow::Result> { let raw_timeline = self.raw_timeline()?; @@ -279,7 +269,9 @@ impl UninitializedTimeline<'_> { // Initialize without loading the layer map. We started with an empty layer map, and already // updated it for the layers that we created during the import. let mut timelines = self.owning_tenant.timelines.lock().unwrap(); - self.initialize_with_lock(ctx, &mut timelines, false, true) + let tl = self.initialize_with_lock(ctx, &mut timelines, false)?; + tl.activate(broker_client, None, ctx); + Ok(tl) } fn raw_timeline(&self) -> anyhow::Result<&Arc> { @@ -454,16 +446,53 @@ pub enum DeleteTimelineError { #[error("NotFound")] NotFound, #[error("HasChildren")] - HasChildren, + HasChildren(Vec), #[error(transparent)] Other(#[from] anyhow::Error), } +pub enum SetStoppingError { + AlreadyStopping, + Broken, +} + struct RemoteStartupData { index_part: IndexPart, remote_metadata: TimelineMetadata, } +#[derive(Debug, thiserror::Error)] +pub(crate) enum WaitToBecomeActiveError { + WillNotBecomeActive { + tenant_id: TenantId, + state: TenantState, + }, + TenantDropped { + tenant_id: TenantId, + }, +} + +impl std::fmt::Display for WaitToBecomeActiveError { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + WaitToBecomeActiveError::WillNotBecomeActive { tenant_id, state } => { + write!( + f, + "Tenant {} will not become active. Current state: {:?}", + tenant_id, state + ) + } + WaitToBecomeActiveError::TenantDropped { tenant_id } => { + write!(f, "Tenant {tenant_id} will not become active (dropped)") + } + } + } +} + +pub(crate) enum ShutdownError { + AlreadyStopping, +} + impl Tenant { /// Yet another helper for timeline initialization. /// Contains the common part of `load_local_timeline` and `load_remote_timeline`. @@ -484,6 +513,7 @@ impl Tenant { local_metadata: Option, ancestor: Option>, first_save: bool, + init_order: Option<&InitializationOrder>, ctx: &RequestContext, ) -> anyhow::Result<()> { let tenant_id = self.tenant_id; @@ -509,6 +539,7 @@ impl Tenant { up_to_date_metadata, ancestor.clone(), remote_client, + init_order, )?; let timeline = UninitializedTimeline { @@ -519,7 +550,7 @@ impl Tenant { // Do not start walreceiver here. 
We do need loaded layer map for reconcile_with_remote // But we shouldnt start walreceiver before we have all the data locally, because working walreceiver // will ingest data which may require looking at the layers which are not yet available locally - match timeline.initialize_with_lock(ctx, &mut timelines_accessor, true, false) { + match timeline.initialize_with_lock(ctx, &mut timelines_accessor, true) { Ok(new_timeline) => new_timeline, Err(e) => { error!("Failed to initialize timeline {tenant_id}/{timeline_id}: {e:?}"); @@ -534,11 +565,12 @@ impl Tenant { up_to_date_metadata, ancestor.clone(), None, + None, ) .with_context(|| { format!("creating broken timeline data for {tenant_id}/{timeline_id}") })?; - broken_timeline.set_state(TimelineState::Broken); + broken_timeline.set_broken(e.to_string()); timelines_accessor.insert(timeline_id, broken_timeline); return Err(e); } @@ -599,6 +631,7 @@ impl Tenant { pub(crate) fn spawn_attach( conf: &'static PageServerConf, tenant_id: TenantId, + broker_client: storage_broker::BrokerClientChannel, remote_storage: GenericRemoteStorage, ctx: &RequestContext, ) -> anyhow::Result> { @@ -628,15 +661,26 @@ impl Tenant { "attach tenant", false, async move { - match tenant_clone.attach(ctx).await { - Ok(_) => {} + match tenant_clone.attach(&ctx).await { + Ok(()) => { + info!("attach finished, activating"); + tenant_clone.activate(broker_client, None, &ctx); + } Err(e) => { - tenant_clone.set_broken(e.to_string()); - error!("error attaching tenant: {:?}", e); + error!("attach failed, setting tenant state to Broken: {:?}", e); + tenant_clone.state.send_modify(|state| { + assert_eq!(*state, TenantState::Attaching, "the attach task owns the tenant state until activation is complete"); + *state = TenantState::broken_from_reason(e.to_string()); + }); } } Ok(()) - }, + } + .instrument({ + let span = tracing::info_span!(parent: None, "attach", tenant_id=%tenant_id); + span.follows_from(Span::current()); + span + }), ); Ok(tenant) } @@ -644,8 +688,11 @@ impl Tenant { /// /// Background task that downloads all data for a tenant and brings it to Active state. /// - #[instrument(skip_all, fields(tenant_id=%self.tenant_id))] - async fn attach(self: &Arc, ctx: RequestContext) -> anyhow::Result<()> { + /// No background tasks are started as part of this routine. 
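`spawn_attach` (and `spawn_load` further below) now instrument the background future with a span created via `parent: None` and linked back with `follows_from`, so the long-lived task becomes a trace root of its own while still correlating with the request that started it. A minimal sketch of that tracing pattern; the span names, field values, and the `tracing_subscriber::fmt()` setup are assumptions for the demo.

```rust
use tracing::{info, info_span, Instrument, Span};

#[tokio::main]
async fn main() {
    // Assumes tracing-subscriber as a (dev-)dependency, only to see the output.
    tracing_subscriber::fmt().init();

    let request_span = info_span!("http_request", tenant_id = %"1234");

    // Build a detached root span while the request span is current, and link
    // it back with follows_from so traces can still be correlated.
    let task_span = request_span.in_scope(|| {
        let span = info_span!(parent: None, "attach", tenant_id = %"1234");
        span.follows_from(Span::current());
        span
    });

    tokio::spawn(
        async {
            info!("attach finished, activating");
        }
        .instrument(task_span),
    )
    .await
    .unwrap();
}
```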
+ /// + async fn attach(self: &Arc, ctx: &RequestContext) -> anyhow::Result<()> { + debug_assert_current_span_has_tenant_id(); + let marker_file = self.conf.tenant_attaching_mark_file_path(&self.tenant_id); if !tokio::fs::try_exists(&marker_file) .await @@ -718,7 +765,7 @@ impl Tenant { ); remote_index_and_client.insert(timeline_id, (index_part, client)); } - MaybeDeletedIndexPart::Deleted => { + MaybeDeletedIndexPart::Deleted(_) => { info!("timeline {} is deleted, skipping", timeline_id); continue; } @@ -735,20 +782,14 @@ impl Tenant { .expect("just put it in above"); // TODO again handle early failure - self.load_remote_timeline( - timeline_id, - index_part, - remote_metadata, - remote_client, - &ctx, - ) - .await - .with_context(|| { - format!( - "failed to load remote timeline {} for tenant {}", - timeline_id, self.tenant_id - ) - })?; + self.load_remote_timeline(timeline_id, index_part, remote_metadata, remote_client, ctx) + .await + .with_context(|| { + format!( + "failed to load remote timeline {} for tenant {}", + timeline_id, self.tenant_id + ) + })?; } std::fs::remove_file(&marker_file) @@ -758,10 +799,6 @@ impl Tenant { utils::failpoint_sleep_millis_async!("attach-before-activate"); - // Start background operations and open the tenant for business. - // The loops will shut themselves down when they notice that the tenant is inactive. - self.activate(&ctx)?; - info!("Done"); Ok(()) @@ -827,6 +864,7 @@ impl Tenant { local_metadata, ancestor, true, + None, ctx, ) .await @@ -852,7 +890,6 @@ impl Tenant { )) } - /// /// Load a tenant that's available on local disk /// /// This is used at pageserver startup, to rebuild the in-memory @@ -862,14 +899,17 @@ impl Tenant { /// /// If the loading fails for some reason, the Tenant will go into Broken /// state. - /// - #[instrument(skip(conf, remote_storage, ctx), fields(tenant_id=%tenant_id))] + #[instrument(skip_all, fields(tenant_id=%tenant_id))] pub fn spawn_load( conf: &'static PageServerConf, tenant_id: TenantId, + broker_client: storage_broker::BrokerClientChannel, remote_storage: Option, + init_order: Option, ctx: &RequestContext, ) -> Arc { + debug_assert_current_span_has_tenant_id(); + let tenant_conf = match Self::load_tenant_config(conf, tenant_id) { Ok(conf) => conf, Err(e) => { @@ -901,20 +941,35 @@ impl Tenant { "initial tenant load", false, async move { - match tenant_clone.load(&ctx).await { - Ok(()) => {} + let mut init_order = init_order; + + // take the completion because initial tenant loading will complete when all of + // these tasks complete. 
+ let _completion = init_order.as_mut().and_then(|x| x.initial_tenant_load.take()); + + match tenant_clone.load(init_order.as_ref(), &ctx).await { + Ok(()) => { + debug!("load finished, activating"); + let background_jobs_can_start = init_order.as_ref().map(|x| &x.background_jobs_can_start); + tenant_clone.activate(broker_client, background_jobs_can_start, &ctx); + } Err(err) => { - tenant_clone.set_broken(err.to_string()); - error!("could not load tenant {tenant_id}: {err:?}"); + error!("load failed, setting tenant state to Broken: {err:?}"); + tenant_clone.state.send_modify(|state| { + assert_eq!(*state, TenantState::Loading, "the loading task owns the tenant state until activation is complete"); + *state = TenantState::broken_from_reason(err.to_string()); + }); } } - info!("initial load for tenant {tenant_id} finished!"); - Ok(()) - }, + Ok(()) + } + .instrument({ + let span = tracing::info_span!(parent: None, "load", tenant_id=%tenant_id); + span.follows_from(Span::current()); + span + }), ); - info!("spawned load into background"); - tenant } @@ -922,9 +977,15 @@ impl Tenant { /// Background task to load in-memory data structures for this tenant, from /// files on disk. Used at pageserver startup. /// - #[instrument(skip(self, ctx), fields(tenant_id=%self.tenant_id))] - async fn load(self: &Arc, ctx: &RequestContext) -> anyhow::Result<()> { - info!("loading tenant task"); + /// No background tasks are started as part of this routine. + async fn load( + self: &Arc, + init_order: Option<&InitializationOrder>, + ctx: &RequestContext, + ) -> anyhow::Result<()> { + debug_assert_current_span_has_tenant_id(); + + debug!("loading tenant task"); utils::failpoint_sleep_millis_async!("before-loading-tenant"); @@ -934,116 +995,119 @@ impl Tenant { // // Scan the directory, peek into the metadata file of each timeline, and // collect a list of timelines and their ancestors. - let mut timelines_to_load: HashMap = HashMap::new(); - let timelines_dir = self.conf.timelines_path(&self.tenant_id); - for entry in std::fs::read_dir(&timelines_dir).with_context(|| { - format!( - "Failed to list timelines directory for tenant {}", - self.tenant_id - ) - })? { - let entry = entry.with_context(|| { - format!("cannot read timeline dir entry for {}", self.tenant_id) - })?; - let timeline_dir = entry.path(); + let tenant_id = self.tenant_id; + let conf = self.conf; + let span = info_span!("blocking"); - if crate::is_temporary(&timeline_dir) { - info!( - "Found temporary timeline directory, removing: {}", - timeline_dir.display() - ); - if let Err(e) = std::fs::remove_dir_all(&timeline_dir) { - error!( - "Failed to remove temporary directory '{}': {:?}", - timeline_dir.display(), - e + let sorted_timelines: Vec<(_, _)> = tokio::task::spawn_blocking(move || { + let _g = span.entered(); + let mut timelines_to_load: HashMap = HashMap::new(); + let timelines_dir = conf.timelines_path(&tenant_id); + + for entry in + std::fs::read_dir(&timelines_dir).context("list timelines directory for tenant")? 
+ { + let entry = entry.context("read timeline dir entry")?; + let timeline_dir = entry.path(); + + if crate::is_temporary(&timeline_dir) { + info!( + "Found temporary timeline directory, removing: {}", + timeline_dir.display() ); - } - } else if is_uninit_mark(&timeline_dir) { - let timeline_uninit_mark_file = &timeline_dir; - info!( - "Found an uninit mark file {}, removing the timeline and its uninit mark", - timeline_uninit_mark_file.display() - ); - let timeline_id = timeline_uninit_mark_file - .file_stem() - .and_then(OsStr::to_str) - .unwrap_or_default() - .parse::() - .with_context(|| { - format!( + if let Err(e) = std::fs::remove_dir_all(&timeline_dir) { + error!( + "Failed to remove temporary directory '{}': {:?}", + timeline_dir.display(), + e + ); + } + } else if is_uninit_mark(&timeline_dir) { + let timeline_uninit_mark_file = &timeline_dir; + info!( + "Found an uninit mark file {}, removing the timeline and its uninit mark", + timeline_uninit_mark_file.display() + ); + let timeline_id = timeline_uninit_mark_file + .file_stem() + .and_then(OsStr::to_str) + .unwrap_or_default() + .parse::() + .with_context(|| { + format!( "Could not parse timeline id out of the timeline uninit mark name {}", timeline_uninit_mark_file.display() ) - })?; - let timeline_dir = self.conf.timeline_path(&timeline_id, &self.tenant_id); - if let Err(e) = - remove_timeline_and_uninit_mark(&timeline_dir, timeline_uninit_mark_file) - { - error!("Failed to clean up uninit marked timeline: {e:?}"); - } - } else { - let timeline_id = timeline_dir - .file_name() - .and_then(OsStr::to_str) - .unwrap_or_default() - .parse::() - .with_context(|| { - format!( - "Could not parse timeline id out of the timeline dir name {}", - timeline_dir.display() - ) - })?; - let timeline_uninit_mark_file = self - .conf - .timeline_uninit_mark_file_path(self.tenant_id, timeline_id); - if timeline_uninit_mark_file.exists() { - info!( - "Found an uninit mark file for timeline {}/{}, removing the timeline and its uninit mark", - self.tenant_id, timeline_id - ); + })?; + let timeline_dir = conf.timeline_path(&timeline_id, &tenant_id); if let Err(e) = - remove_timeline_and_uninit_mark(&timeline_dir, &timeline_uninit_mark_file) + remove_timeline_and_uninit_mark(&timeline_dir, timeline_uninit_mark_file) { error!("Failed to clean up uninit marked timeline: {e:?}"); } - continue; - } - - let file_name = entry.file_name(); - if let Ok(timeline_id) = - file_name.to_str().unwrap_or_default().parse::() - { - let metadata = load_metadata(self.conf, timeline_id, self.tenant_id) - .context("failed to load metadata")?; - timelines_to_load.insert(timeline_id, metadata); } else { - // A file or directory that doesn't look like a timeline ID - warn!( - "unexpected file or directory in timelines directory: {}", - file_name.to_string_lossy() - ); + let timeline_id = timeline_dir + .file_name() + .and_then(OsStr::to_str) + .unwrap_or_default() + .parse::() + .with_context(|| { + format!( + "Could not parse timeline id out of the timeline dir name {}", + timeline_dir.display() + ) + })?; + let timeline_uninit_mark_file = + conf.timeline_uninit_mark_file_path(tenant_id, timeline_id); + if timeline_uninit_mark_file.exists() { + info!( + %timeline_id, + "Found an uninit mark file, removing the timeline and its uninit mark", + ); + if let Err(e) = remove_timeline_and_uninit_mark( + &timeline_dir, + &timeline_uninit_mark_file, + ) { + error!("Failed to clean up uninit marked timeline: {e:?}"); + } + continue; + } + + let file_name = entry.file_name(); + if 
let Ok(timeline_id) = + file_name.to_str().unwrap_or_default().parse::() + { + let metadata = load_metadata(conf, timeline_id, tenant_id) + .context("failed to load metadata")?; + timelines_to_load.insert(timeline_id, metadata); + } else { + // A file or directory that doesn't look like a timeline ID + warn!( + "unexpected file or directory in timelines directory: {}", + file_name.to_string_lossy() + ); + } } } - } - // Sort the array of timeline IDs into tree-order, so that parent comes before - // all its children. - let sorted_timelines = tree_sort_timelines(timelines_to_load)?; + // Sort the array of timeline IDs into tree-order, so that parent comes before + // all its children. + tree_sort_timelines(timelines_to_load) + }) + .await + .context("load spawn_blocking") + .and_then(|res| res)?; + // FIXME original collect_timeline_files contained one more check: // 1. "Timeline has no ancestor and no layer files" for (timeline_id, local_metadata) in sorted_timelines { - self.load_local_timeline(timeline_id, local_metadata, ctx) + self.load_local_timeline(timeline_id, local_metadata, init_order, ctx) .await .with_context(|| format!("load local timeline {timeline_id}"))?; } - // Start background operations and open the tenant for business. - // The loops will shut themselves down when they notice that the tenant is inactive. - self.activate(ctx)?; - - info!("Done"); + trace!("Done"); Ok(()) } @@ -1051,11 +1115,12 @@ impl Tenant { /// Subroutine of `load_tenant`, to load an individual timeline /// /// NB: The parent is assumed to be already loaded! - #[instrument(skip_all, fields(timeline_id))] + #[instrument(skip(self, local_metadata, init_order, ctx))] async fn load_local_timeline( - &self, + self: &Arc, timeline_id: TimelineId, local_metadata: TimelineMetadata, + init_order: Option<&InitializationOrder>, ctx: &RequestContext, ) -> anyhow::Result<()> { debug_assert_current_span_has_tenant_id(); @@ -1069,12 +1134,20 @@ impl Tenant { ) }); - let remote_startup_data = match &remote_client { + let ancestor = if let Some(ancestor_timeline_id) = local_metadata.ancestor_timeline() { + let ancestor_timeline = self.get_timeline(ancestor_timeline_id, false) + .with_context(|| anyhow::anyhow!("cannot find ancestor timeline {ancestor_timeline_id} for timeline {timeline_id}"))?; + Some(ancestor_timeline) + } else { + None + }; + + let (remote_startup_data, remote_client) = match remote_client { Some(remote_client) => match remote_client.download_index_file().await { Ok(index_part) => { let index_part = match index_part { MaybeDeletedIndexPart::IndexPart(index_part) => index_part, - MaybeDeletedIndexPart::Deleted => { + MaybeDeletedIndexPart::Deleted(index_part) => { // TODO: we won't reach here if remote storage gets de-configured after start of the deletion operation. // Example: // start deletion operation @@ -1085,37 +1158,59 @@ impl Tenant { // // We don't really anticipate remote storage to be de-configured, so, for now, this is fine. // Also, maybe we'll remove that option entirely in the future, see https://github.com/neondatabase/neon/issues/4099. 
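The `Tenant::load` refactor above moves the blocking `std::fs::read_dir` walk onto `tokio::task::spawn_blocking`, then turns the outer `JoinError` into an error with `.context(...)` and flattens the nested result with `.and_then(|res| res)`. A reduced, hedged sketch of that shape (the directory and error messages are placeholders):

```rust
use anyhow::Context;

#[tokio::main]
async fn main() -> anyhow::Result<()> {
    let dir = std::env::temp_dir();

    // The blocking filesystem walk runs on the blocking pool, not a worker thread.
    let entries: Vec<String> = tokio::task::spawn_blocking(move || {
        let mut names = Vec::new();
        for entry in std::fs::read_dir(&dir).context("list directory")? {
            let entry = entry.context("read dir entry")?;
            names.push(entry.file_name().to_string_lossy().into_owned());
        }
        anyhow::Ok(names)
    })
    .await
    .context("scan spawn_blocking") // JoinError (panic/cancel) becomes anyhow
    .and_then(|res| res)?; // then flatten the scan's own Result

    println!("found {} entries", entries.len());
    Ok(())
}
```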
- info!("is_deleted is set on remote, resuming removal of local data originally done by timeline deletion handler"); - std::fs::remove_dir_all( - self.conf.timeline_path(&timeline_id, &self.tenant_id), - ) - .context("remove_dir_all")?; + info!("is_deleted is set on remote, resuming removal of timeline data originally done by timeline deletion handler"); + + remote_client + .init_upload_queue_stopped_to_continue_deletion(&index_part)?; + + let timeline = self + .create_timeline_data( + timeline_id, + &local_metadata, + ancestor, + Some(remote_client), + init_order, + ) + .context("create_timeline_data")?; + + let guard = Arc::clone(&timeline.delete_lock).lock_owned().await; + + // Note: here we even skip populating layer map. Timeline is essentially uninitialized. + // RemoteTimelineClient is the only functioning part. + timeline.set_state(TimelineState::Stopping); + // We meed to do this because when console retries delete request we shouldnt answer with 404 + // because 404 means successful deletion. + // FIXME consider TimelineState::Deleting. + let mut locked = self.timelines.lock().unwrap(); + locked.insert(timeline_id, Arc::clone(&timeline)); + + Tenant::schedule_delete_timeline( + Arc::clone(self), + timeline_id, + timeline, + guard, + ); return Ok(()); } }; let remote_metadata = index_part.parse_metadata().context("parse_metadata")?; - Some(RemoteStartupData { - index_part, - remote_metadata, - }) + ( + Some(RemoteStartupData { + index_part, + remote_metadata, + }), + Some(remote_client), + ) } Err(DownloadError::NotFound) => { info!("no index file was found on the remote"); - None + (None, Some(remote_client)) } Err(e) => return Err(anyhow::anyhow!(e)), }, - None => None, - }; - - let ancestor = if let Some(ancestor_timeline_id) = local_metadata.ancestor_timeline() { - let ancestor_timeline = self.get_timeline(ancestor_timeline_id, false) - .with_context(|| anyhow::anyhow!("cannot find ancestor timeline {ancestor_timeline_id} for timeline {timeline_id}"))?; - Some(ancestor_timeline) - } else { - None + None => (None, remote_client), }; self.timeline_init_and_sync( @@ -1125,6 +1220,7 @@ impl Tenant { Some(local_metadata), ancestor, false, + init_order, ctx, ) .await @@ -1206,6 +1302,27 @@ impl Tenant { ) } + /// Helper for unit tests to create an emtpy timeline. + /// + /// The timeline is has state value `Active` but its background loops are not running. + // This makes the various functions which anyhow::ensure! for Active state work in tests. + // Our current tests don't need the background loops. + #[cfg(test)] + pub fn create_test_timeline( + &self, + new_timeline_id: TimelineId, + initdb_lsn: Lsn, + pg_version: u32, + ctx: &RequestContext, + ) -> anyhow::Result> { + let uninit_tl = self.create_empty_timeline(new_timeline_id, initdb_lsn, pg_version, ctx)?; + let mut timelines = self.timelines.lock().unwrap(); + let tl = uninit_tl.initialize_with_lock(ctx, &mut timelines, true)?; + // The non-test code would call tl.activate() here. + tl.set_state(TimelineState::Active); + Ok(tl) + } + /// Create a new timeline. /// /// Returns the new timeline ID and reference to its Timeline object. 
@@ -1219,6 +1336,7 @@ impl Tenant { ancestor_timeline_id: Option, mut ancestor_start_lsn: Option, pg_version: u32, + broker_client: storage_broker::BrokerClientChannel, ctx: &RequestContext, ) -> anyhow::Result>> { anyhow::ensure!( @@ -1285,6 +1403,8 @@ impl Tenant { } }; + loaded_timeline.activate(broker_client, None, ctx); + if let Some(remote_client) = loaded_timeline.remote_client.as_ref() { // Wait for the upload of the 'index_part.json` file to finish, so that when we return // Ok, the timeline is durable in remote storage. @@ -1319,6 +1439,7 @@ impl Tenant { pitr: Duration, ctx: &RequestContext, ) -> anyhow::Result { + // there is a global allowed_error for this anyhow::ensure!( self.is_active(), "Cannot run GC iteration on inactive tenant" @@ -1362,127 +1483,81 @@ impl Tenant { Ok(()) } - /// Flush all in-memory data to disk. + /// Flush all in-memory data to disk and remote storage, if any. /// /// Used at graceful shutdown. - /// - pub async fn freeze_and_flush(&self) -> anyhow::Result<()> { - // Scan through the hashmap and collect a list of all the timelines, - // while holding the lock. Then drop the lock and actually perform the - // flushing. We don't want to block everything else while the - // flushing is performed. - let timelines_to_flush = { + async fn freeze_and_flush_on_shutdown(&self) { + let mut js = tokio::task::JoinSet::new(); + + // execute on each timeline on the JoinSet, join after. + let per_timeline = |timeline_id: TimelineId, timeline: Arc| { + async move { + debug_assert_current_span_has_tenant_and_timeline_id(); + + match timeline.freeze_and_flush().await { + Ok(()) => {} + Err(e) => { + warn!("failed to freeze and flush: {e:#}"); + return; + } + } + + let res = if let Some(client) = timeline.remote_client.as_ref() { + // if we did not wait for completion here, it might be our shutdown process + // didn't wait for remote uploads to complete at all, as new tasks can forever + // be spawned. + // + // what is problematic is the shutting down of RemoteTimelineClient, because + // obviously it does not make sense to stop while we wait for it, but what + // about corner cases like s3 suddenly hanging up? + client.wait_completion().await + } else { + Ok(()) + }; + + if let Err(e) = res { + warn!("failed to await for frozen and flushed uploads: {e:#}"); + } + } + .instrument(tracing::info_span!("freeze_and_flush_on_shutdown", %timeline_id)) + }; + + { let timelines = self.timelines.lock().unwrap(); timelines .iter() - .map(|(_id, timeline)| Arc::clone(timeline)) - .collect::>() + .map(|(id, tl)| (*id, Arc::clone(tl))) + .for_each(|(timeline_id, timeline)| { + js.spawn(per_timeline(timeline_id, timeline)); + }) }; - for timeline in &timelines_to_flush { - timeline.freeze_and_flush().await?; - } - - Ok(()) - } - - /// Removes timeline-related in-memory data - pub async fn delete_timeline( - &self, - timeline_id: TimelineId, - _ctx: &RequestContext, - ) -> Result<(), DeleteTimelineError> { - timeline::debug_assert_current_span_has_tenant_and_timeline_id(); - - // Transition the timeline into TimelineState::Stopping. - // This should prevent new operations from starting. 
- let timeline = { - let mut timelines = self.timelines.lock().unwrap(); - - // Ensure that there are no child timelines **attached to that pageserver**, - // because detach removes files, which will break child branches - let children_exist = timelines - .iter() - .any(|(_, entry)| entry.get_ancestor_timeline_id() == Some(timeline_id)); - - if children_exist { - return Err(DeleteTimelineError::HasChildren); - } - - let timeline_entry = match timelines.entry(timeline_id) { - Entry::Occupied(e) => e, - Entry::Vacant(_) => return Err(DeleteTimelineError::NotFound), - }; - - let timeline = Arc::clone(timeline_entry.get()); - timeline.set_state(TimelineState::Stopping); - - drop(timelines); - timeline - }; - - // Now that the Timeline is in Stopping state, request all the related tasks to - // shut down. - // - // NB: If you call delete_timeline multiple times concurrently, they will - // all go through the motions here. Make sure the code here is idempotent, - // and don't error out if some of the shutdown tasks have already been - // completed! - - // Stop the walreceiver first. - debug!("waiting for wal receiver to shutdown"); - timeline.walreceiver.stop().await; - debug!("wal receiver shutdown confirmed"); - - // Prevent new uploads from starting. - if let Some(remote_client) = timeline.remote_client.as_ref() { - let res = remote_client.stop(); + while let Some(res) = js.join_next().await { match res { Ok(()) => {} - Err(e) => match e { - remote_timeline_client::StopError::QueueUninitialized => { - // This case shouldn't happen currently because the - // load and attach code bails out if _any_ of the timeline fails to fetch its IndexPart. - // That is, before we declare the Tenant as Active. - // But we only allow calls to delete_timeline on Active tenants. - return Err(DeleteTimelineError::Other(anyhow::anyhow!("upload queue is uninitialized, likely the timeline was in Broken state prior to this call because it failed to fetch IndexPart during load or attach, check the logs"))); - } - }, - } - } - - // Stop & wait for the remaining timeline tasks, including upload tasks. - // NB: This and other delete_timeline calls do not run as a task_mgr task, - // so, they are not affected by this shutdown_tasks() call. - info!("waiting for timeline tasks to shutdown"); - task_mgr::shutdown_tasks(None, Some(self.tenant_id), Some(timeline_id)).await; - - // Mark timeline as deleted in S3 so we won't pick it up next time - // during attach or pageserver restart. - // See comment in persist_index_part_with_deleted_flag. - if let Some(remote_client) = timeline.remote_client.as_ref() { - match remote_client.persist_index_part_with_deleted_flag().await { - // If we (now, or already) marked it successfully as deleted, we can proceed - Ok(()) | Err(PersistIndexPartWithDeletedFlagError::AlreadyDeleted(_)) => (), - // Bail out otherwise - Err(e @ PersistIndexPartWithDeletedFlagError::AlreadyInProgress(_)) - | Err(e @ PersistIndexPartWithDeletedFlagError::Other(_)) => { - return Err(DeleteTimelineError::Other(anyhow::anyhow!(e))); - } + Err(je) if je.is_cancelled() => unreachable!("no cancelling used"), + Err(je) if je.is_panic() => { /* logged already */ } + Err(je) => warn!("unexpected JoinError: {je:?}"), } } + } + /// Shuts down a timeline's tasks, removes its in-memory structures, and deletes its + /// data from both disk and s3. + async fn delete_timeline( + &self, + timeline_id: TimelineId, + timeline: Arc, + ) -> anyhow::Result<()> { { // Grab the layer_removal_cs lock, and actually perform the deletion. 
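`freeze_and_flush_on_shutdown` above fans out one future per timeline on a `tokio::task::JoinSet` and then drains `join_next`, treating a panicked task as already logged and anything else as unexpected. A trimmed-down sketch of that fan-out/join pattern with placeholder timeline IDs:

```rust
use tokio::task::JoinSet;

#[tokio::main]
async fn main() {
    let timeline_ids = vec!["tl-1", "tl-2", "tl-3"]; // stand-ins for TimelineId

    let mut js = JoinSet::new();
    for id in timeline_ids {
        js.spawn(async move {
            // Stand-in for timeline.freeze_and_flush() plus waiting out remote uploads.
            println!("{id}: frozen, flushed and uploads drained");
        });
    }

    // Drain the set; per-task failures would already have been logged inside the task.
    while let Some(res) = js.join_next().await {
        match res {
            Ok(()) => {}
            Err(je) if je.is_cancelled() => unreachable!("no cancellation used"),
            Err(je) if je.is_panic() => { /* panic hook already printed the message */ }
            Err(je) => eprintln!("unexpected JoinError: {je:?}"),
        }
    }
    println!("all timelines flushed");
}
```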
// - // This lock prevents multiple concurrent delete_timeline calls from - // stepping on each other's toes, while deleting the files. It also - // prevents GC or compaction from running at the same time. + // This lock prevents prevents GC or compaction from running at the same time. + // The GC task doesn't register itself with the timeline it's operating on, + // so it might still be running even though we called `shutdown_tasks`. // // Note that there are still other race conditions between - // GC, compaction and timeline deletion. GC task doesn't - // register itself properly with the timeline it's - // operating on. See + // GC, compaction and timeline deletion. See // https://github.com/neondatabase/neon/issues/2671 // // No timeout here, GC & Compaction should be responsive to the @@ -1494,7 +1569,9 @@ impl Tenant { // NB: storage_sync upload tasks that reference these layers have been cancelled // by the caller. - let local_timeline_directory = self.conf.timeline_path(&timeline_id, &self.tenant_id); + let local_timeline_directory = self + .conf + .timeline_path(&timeline.timeline_id, &self.tenant_id); fail::fail_point!("timeline-delete-before-rm", |_| { Err(anyhow::anyhow!("failpoint: timeline-delete-before-rm"))? @@ -1543,42 +1620,197 @@ impl Tenant { Err(anyhow::anyhow!("failpoint: timeline-delete-after-rm"))? }); - // Remove the timeline from the map. - let mut timelines = self.timelines.lock().unwrap(); - let children_exist = timelines - .iter() - .any(|(_, entry)| entry.get_ancestor_timeline_id() == Some(timeline_id)); - // XXX this can happen because `branch_timeline` doesn't check `TimelineState::Stopping`. - // We already deleted the layer files, so it's probably best to panic. - // (Ideally, above remove_dir_all is atomic so we don't see this timeline after a restart) - if children_exist { - panic!("Timeline grew children while we removed layer files"); + { + // Remove the timeline from the map. + let mut timelines = self.timelines.lock().unwrap(); + let children_exist = timelines + .iter() + .any(|(_, entry)| entry.get_ancestor_timeline_id() == Some(timeline_id)); + // XXX this can happen because `branch_timeline` doesn't check `TimelineState::Stopping`. + // We already deleted the layer files, so it's probably best to panic. + // (Ideally, above remove_dir_all is atomic so we don't see this timeline after a restart) + if children_exist { + panic!("Timeline grew children while we removed layer files"); + } + + timelines.remove(&timeline_id).expect( + "timeline that we were deleting was concurrently removed from 'timelines' map", + ); + + drop(timelines); } - let removed_timeline = timelines.remove(&timeline_id); - if removed_timeline.is_none() { - // This can legitimately happen if there's a concurrent call to this function. - // T1 T2 - // lock - // unlock - // lock - // unlock - // remove files - // lock - // remove from map - // unlock - // return - // remove files - // lock - // remove from map observes empty map - // unlock - // return - debug!("concurrent call to this function won the race"); - } - drop(timelines); + + let remote_client = match &timeline.remote_client { + Some(remote_client) => remote_client, + None => return Ok(()), + }; + + remote_client.delete_all().await?; Ok(()) } + /// Removes timeline-related in-memory data and schedules removal from remote storage. 
+ #[instrument(skip(self, _ctx))] + pub async fn prepare_and_schedule_delete_timeline( + self: Arc, + timeline_id: TimelineId, + _ctx: &RequestContext, + ) -> Result<(), DeleteTimelineError> { + timeline::debug_assert_current_span_has_tenant_and_timeline_id(); + + // Transition the timeline into TimelineState::Stopping. + // This should prevent new operations from starting. + // + // Also grab the Timeline's delete_lock to prevent another deletion from starting. + let timeline; + let delete_lock_guard; + { + let mut timelines = self.timelines.lock().unwrap(); + + // Ensure that there are no child timelines **attached to that pageserver**, + // because detach removes files, which will break child branches + let children: Vec = timelines + .iter() + .filter_map(|(id, entry)| { + if entry.get_ancestor_timeline_id() == Some(timeline_id) { + Some(*id) + } else { + None + } + }) + .collect(); + + if !children.is_empty() { + return Err(DeleteTimelineError::HasChildren(children)); + } + + let timeline_entry = match timelines.entry(timeline_id) { + Entry::Occupied(e) => e, + Entry::Vacant(_) => return Err(DeleteTimelineError::NotFound), + }; + + timeline = Arc::clone(timeline_entry.get()); + + // Prevent two tasks from trying to delete the timeline at the same time. + // + // XXX: We should perhaps return an HTTP "202 Accepted" to signal that the caller + // needs to poll until the operation has finished. But for now, we return an + // error, because the control plane knows to retry errors. + + delete_lock_guard = + Arc::clone(&timeline.delete_lock) + .try_lock_owned() + .map_err(|_| { + DeleteTimelineError::Other(anyhow::anyhow!( + "timeline deletion is already in progress" + )) + })?; + + // If another task finished the deletion just before we acquired the lock, + // return success. + if *delete_lock_guard { + return Ok(()); + } + + timeline.set_state(TimelineState::Stopping); + + drop(timelines); + } + + // Now that the Timeline is in Stopping state, request all the related tasks to + // shut down. + // + // NB: If this fails half-way through, and is retried, the retry will go through + // all the same steps again. Make sure the code here is idempotent, and don't + // error out if some of the shutdown tasks have already been completed! + + // Stop the walreceiver first. + debug!("waiting for wal receiver to shutdown"); + let maybe_started_walreceiver = { timeline.walreceiver.lock().unwrap().take() }; + if let Some(walreceiver) = maybe_started_walreceiver { + walreceiver.stop().await; + } + debug!("wal receiver shutdown confirmed"); + + // Prevent new uploads from starting. + if let Some(remote_client) = timeline.remote_client.as_ref() { + let res = remote_client.stop(); + match res { + Ok(()) => {} + Err(e) => match e { + remote_timeline_client::StopError::QueueUninitialized => { + // This case shouldn't happen currently because the + // load and attach code bails out if _any_ of the timeline fails to fetch its IndexPart. + // That is, before we declare the Tenant as Active. + // But we only allow calls to delete_timeline on Active tenants. + return Err(DeleteTimelineError::Other(anyhow::anyhow!("upload queue is uninitialized, likely the timeline was in Broken state prior to this call because it failed to fetch IndexPart during load or attach, check the logs"))); + } + }, + } + } + + // Stop & wait for the remaining timeline tasks, including upload tasks. + // NB: This and other delete_timeline calls do not run as a task_mgr task, + // so, they are not affected by this shutdown_tasks() call. 
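// Editorial sketch, not part of the diff: the `delete_lock` idiom used above, shown
// standalone. It assumes `delete_lock` is an `Arc<tokio::sync::Mutex<bool>>` whose inner
// `bool` records "deletion already completed"; the exact layout in `Timeline` may differ.

use std::sync::Arc;
use tokio::sync::{Mutex, OwnedMutexGuard};

fn try_begin_delete(
    delete_lock: Arc<Mutex<bool>>,
) -> Result<Option<OwnedMutexGuard<bool>>, &'static str> {
    // Losing the race for the lock means another deletion is already in flight.
    let guard = delete_lock
        .try_lock_owned()
        .map_err(|_| "timeline deletion is already in progress")?;
    if *guard {
        // A previous call ran the deletion to completion: succeed idempotently.
        return Ok(None);
    }
    // Otherwise hold the owned guard across the whole async deletion;
    // dropping it without setting the flag lets a retry start over.
    Ok(Some(guard))
}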
+ info!("waiting for timeline tasks to shutdown"); + task_mgr::shutdown_tasks(None, Some(self.tenant_id), Some(timeline_id)).await; + + // Mark timeline as deleted in S3 so we won't pick it up next time + // during attach or pageserver restart. + // See comment in persist_index_part_with_deleted_flag. + if let Some(remote_client) = timeline.remote_client.as_ref() { + match remote_client.persist_index_part_with_deleted_flag().await { + // If we (now, or already) marked it successfully as deleted, we can proceed + Ok(()) | Err(PersistIndexPartWithDeletedFlagError::AlreadyDeleted(_)) => (), + // Bail out otherwise + // + // AlreadyInProgress shouldn't happen, because the 'delete_lock' prevents + // two tasks from performing the deletion at the same time. The first task + // that starts deletion should run it to completion. + Err(e @ PersistIndexPartWithDeletedFlagError::AlreadyInProgress(_)) + | Err(e @ PersistIndexPartWithDeletedFlagError::Other(_)) => { + return Err(DeleteTimelineError::Other(anyhow::anyhow!(e))); + } + } + } + self.schedule_delete_timeline(timeline_id, timeline, delete_lock_guard); + + Ok(()) + } + + fn schedule_delete_timeline( + self: Arc, + timeline_id: TimelineId, + timeline: Arc, + _guard: OwnedMutexGuard, + ) { + let tenant_id = self.tenant_id; + let timeline_clone = Arc::clone(&timeline); + + task_mgr::spawn( + task_mgr::BACKGROUND_RUNTIME.handle(), + TaskKind::TimelineDeletionWorker, + Some(self.tenant_id), + Some(timeline_id), + "timeline_delete", + false, + async move { + if let Err(err) = self.delete_timeline(timeline_id, timeline).await { + error!("Error: {err:#}"); + timeline_clone.set_broken(err.to_string()) + }; + Ok(()) + } + .instrument({ + let span = + tracing::info_span!(parent: None, "delete_timeline", tenant_id=%tenant_id, timeline_id=%timeline_id); + span.follows_from(Span::current()); + span + }), + ); + } + pub fn current_state(&self) -> TenantState { self.state.borrow().clone() } @@ -1588,147 +1820,257 @@ impl Tenant { } /// Changes tenant status to active, unless shutdown was already requested. - fn activate(&self, ctx: &RequestContext) -> anyhow::Result<()> { + /// + /// `background_jobs_can_start` is an optional barrier set to a value during pageserver startup + /// to delay background jobs. Background jobs can be started right away when None is given. + fn activate( + self: &Arc, + broker_client: BrokerClientChannel, + background_jobs_can_start: Option<&completion::Barrier>, + ctx: &RequestContext, + ) { debug_assert_current_span_has_tenant_id(); - let mut result = Ok(()); + let mut activating = false; self.state.send_modify(|current_state| { + use pageserver_api::models::ActivatingFrom; match &*current_state { - TenantState::Active => { - // activate() was called on an already Active tenant. Shouldn't happen. - result = Err(anyhow::anyhow!("Tenant is already active")); + TenantState::Activating(_) | TenantState::Active | TenantState::Broken { .. } | TenantState::Stopping => { + panic!("caller is responsible for calling activate() only on Loading / Attaching tenants, got {state:?}", state = current_state); } - TenantState::Broken { reason, .. } => { - // This shouldn't happen either - result = Err(anyhow::anyhow!( - "Could not activate tenant because it is in broken state due to: {reason}", - )); + TenantState::Loading => { + *current_state = TenantState::Activating(ActivatingFrom::Loading); } - TenantState::Stopping => { - // The tenant was detached, or system shutdown was requested, while we were - // loading or attaching the tenant. 
- info!("Tenant is already in Stopping state, skipping activation"); - } - TenantState::Loading | TenantState::Attaching => { - *current_state = TenantState::Active; - - debug!(tenant_id = %self.tenant_id, "Activating tenant"); - - let timelines_accessor = self.timelines.lock().unwrap(); - let not_broken_timelines = timelines_accessor - .values() - .filter(|timeline| timeline.current_state() != TimelineState::Broken); - - // Spawn gc and compaction loops. The loops will shut themselves - // down when they notice that the tenant is inactive. - tasks::start_background_loops(self.tenant_id); - - let mut activated_timelines = 0; - let mut timelines_broken_during_activation = 0; - - for timeline in not_broken_timelines { - match timeline - .activate(ctx) - .context("timeline activation for activating tenant") - { - Ok(()) => { - activated_timelines += 1; - } - Err(e) => { - error!( - "Failed to activate timeline {}: {:#}", - timeline.timeline_id, e - ); - timeline.set_state(TimelineState::Broken); - *current_state = TenantState::broken_from_reason(format!( - "failed to activate timeline {}: {}", - timeline.timeline_id, e - )); - - timelines_broken_during_activation += 1; - } - } - } - - let elapsed = self.loading_started_at.elapsed(); - let total_timelines = timelines_accessor.len(); - - // log a lot of stuff, because some tenants sometimes suffer from user-visible - // times to activate. see https://github.com/neondatabase/neon/issues/4025 - info!( - since_creation_millis = elapsed.as_millis(), - tenant_id = %self.tenant_id, - activated_timelines, - timelines_broken_during_activation, - total_timelines, - post_state = <&'static str>::from(&*current_state), - "activation attempt finished" - ); + TenantState::Attaching => { + *current_state = TenantState::Activating(ActivatingFrom::Attaching); } } + debug!(tenant_id = %self.tenant_id, "Activating tenant"); + activating = true; + // Continue outside the closure. We need to grab timelines.lock() + // and we plan to turn it into a tokio::sync::Mutex in a future patch. }); - result + + if activating { + let timelines_accessor = self.timelines.lock().unwrap(); + let timelines_to_activate = timelines_accessor + .values() + .filter(|timeline| !(timeline.is_broken() || timeline.is_stopping())); + + // Spawn gc and compaction loops. The loops will shut themselves + // down when they notice that the tenant is inactive. + tasks::start_background_loops(self, background_jobs_can_start); + + let mut activated_timelines = 0; + + for timeline in timelines_to_activate { + timeline.activate(broker_client.clone(), background_jobs_can_start, ctx); + activated_timelines += 1; + } + + self.state.send_modify(move |current_state| { + assert!( + matches!(current_state, TenantState::Activating(_)), + "set_stopping and set_broken wait for us to leave Activating state", + ); + *current_state = TenantState::Active; + + let elapsed = self.loading_started_at.elapsed(); + let total_timelines = timelines_accessor.len(); + + // log a lot of stuff, because some tenants sometimes suffer from user-visible + // times to activate. 
see https://github.com/neondatabase/neon/issues/4025
+               info!(
+                   since_creation_millis = elapsed.as_millis(),
+                   tenant_id = %self.tenant_id,
+                   activated_timelines,
+                   total_timelines,
+                   post_state = <&'static str>::from(&*current_state),
+                   "activation attempt finished"
+               );
+           });
+       }
    }

-   /// Change tenant status to Stopping, to mark that it is being shut down
-   pub fn set_stopping(&self) {
-       self.state.send_modify(|current_state| {
-           match current_state {
-               TenantState::Active | TenantState::Loading | TenantState::Attaching => {
-                   *current_state = TenantState::Stopping;
+   /// Shutdown the tenant and join all of the spawned tasks.
+   ///
+   /// The method caters for all use-cases:
+   /// - pageserver shutdown (freeze_and_flush == true)
+   /// - detach + ignore (freeze_and_flush == false)
+   ///
+   /// This will attempt to shut down even if the tenant is broken.
+   pub(crate) async fn shutdown(&self, freeze_and_flush: bool) -> Result<(), ShutdownError> {
+       debug_assert_current_span_has_tenant_id();
+       // Set the tenant (and its timelines) to Stopping state.
+       //
+       // Since we can only transition into Stopping state after activation is complete,
+       // run it in a JoinSet so all tenants have a chance to stop before we get SIGKILLed.
+       //
+       // Transitioning tenants to Stopping state has a couple of non-obvious side effects:
+       // 1. Lock out any new requests to the tenants.
+       // 2. Signal cancellation to WAL receivers (we wait on it below).
+       // 3. Signal cancellation for other tenant background loops.
+       // 4. ???
+       //
+       // The waiting for the cancellation is not done uniformly.
+       // We certainly wait for WAL receivers to shut down.
+       // That is necessary so that no new data comes in before the freeze_and_flush.
+       // But the tenant background loops are joined-on in our caller.
+       // It's messed up.
+       // We just ignore the failure to stop.
+       match self.set_stopping().await {
+           Ok(()) => {}
+           Err(SetStoppingError::Broken) => {
+               // assume that this is acceptable
+           }
+           Err(SetStoppingError::AlreadyStopping) => return Err(ShutdownError::AlreadyStopping),
+       };

-               // FIXME: If the tenant is still Loading or Attaching, new timelines
-               // might be created after this. That's harmless, as the Timelines
-               // won't be accessible to anyone, when the Tenant is in Stopping
-               // state.
-               let timelines_accessor = self.timelines.lock().unwrap();
-               let not_broken_timelines = timelines_accessor
-                   .values()
-                   .filter(|timeline| timeline.current_state() != TimelineState::Broken);
-               for timeline in not_broken_timelines {
-                   timeline.set_state(TimelineState::Stopping);
-               }
-           }
-           TenantState::Broken { reason, .. } => {
-               info!("Cannot set tenant to Stopping state, it is in Broken state due to: {reason}");
-           }
-           TenantState::Stopping => {
-               // The tenant was detached, or system shutdown was requested, while we were
-               // loading or attaching the tenant.
-               info!("Tenant is already in Stopping state");
-           }
+       if freeze_and_flush {
+           // The walreceivers have already begun to shut down with TenantState::Stopping,
+           // but we still need to wait for them to stop.
+           task_mgr::shutdown_tasks(
+               Some(TaskKind::WalReceiverManager),
+               Some(self.tenant_id),
+               None,
+           )
+           .await;
+
+           // this will wait for uploads to complete; in the past, it was done outside tenant
+           // shutdown in pageserver::shutdown_pageserver.
+           self.freeze_and_flush_on_shutdown().await;
+       }
+
+       // shutdown all tenant and timeline tasks: gc, compaction, page service
+       // No new tasks will be started for this tenant because it's in `Stopping` state.
+ // + // this will additionally shutdown and await all timeline tasks. + task_mgr::shutdown_tasks(None, Some(self.tenant_id), None).await; + + Ok(()) + } + + /// Change tenant status to Stopping, to mark that it is being shut down. + /// + /// This function waits for the tenant to become active if it isn't already, before transitioning it into Stopping state. + /// + /// This function is not cancel-safe! + async fn set_stopping(&self) -> Result<(), SetStoppingError> { + let mut rx = self.state.subscribe(); + + // cannot stop before we're done activating, so wait out until we're done activating + rx.wait_for(|state| match state { + TenantState::Activating(_) | TenantState::Loading | TenantState::Attaching => { + info!( + "waiting for {} to turn Active|Broken|Stopping", + <&'static str>::from(state) + ); + false + } + TenantState::Active | TenantState::Broken { .. } | TenantState::Stopping {} => true, + }) + .await + .expect("cannot drop self.state while on a &self method"); + + // we now know we're done activating, let's see whether this task is the winner to transition into Stopping + let mut err = None; + let stopping = self.state.send_if_modified(|current_state| match current_state { + TenantState::Activating(_) | TenantState::Loading | TenantState::Attaching => { + unreachable!("we ensured above that we're done with activation, and, there is no re-activation") + } + TenantState::Active => { + // FIXME: due to time-of-check vs time-of-use issues, it can happen that new timelines + // are created after the transition to Stopping. That's harmless, as the Timelines + // won't be accessible to anyone afterwards, because the Tenant is in Stopping state. + *current_state = TenantState::Stopping; + // Continue stopping outside the closure. We need to grab timelines.lock() + // and we plan to turn it into a tokio::sync::Mutex in a future patch. + true + } + TenantState::Broken { reason, .. } => { + info!( + "Cannot set tenant to Stopping state, it is in Broken state due to: {reason}" + ); + err = Some(SetStoppingError::Broken); + false + } + TenantState::Stopping => { + info!("Tenant is already in Stopping state"); + err = Some(SetStoppingError::AlreadyStopping); + false } }); + match (stopping, err) { + (true, None) => {} // continue + (false, Some(err)) => return Err(err), + (true, Some(_)) => unreachable!( + "send_if_modified closure must error out if not transitioning to Stopping" + ), + (false, None) => unreachable!( + "send_if_modified closure must return true if transitioning to Stopping" + ), + } + + let timelines_accessor = self.timelines.lock().unwrap(); + let not_broken_timelines = timelines_accessor + .values() + .filter(|timeline| !timeline.is_broken()); + for timeline in not_broken_timelines { + timeline.set_state(TimelineState::Stopping); + } + Ok(()) } - pub fn set_broken(&self, reason: String) { + /// Method for tenant::mgr to transition us into Broken state in case of a late failure in + /// `remove_tenant_from_memory` + /// + /// This function waits for the tenant to become active if it isn't already, before transitioning it into Stopping state. + /// + /// In tests, we also use this to set tenants to Broken state on purpose. + pub(crate) async fn set_broken(&self, reason: String) { + let mut rx = self.state.subscribe(); + + // The load & attach routines own the tenant state until it has reached `Active`. + // So, wait until it's done. 
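// Editorial sketch, not part of the diff: the watch-channel pattern that `set_stopping`
// and `set_broken` rely on, reduced to a toy state machine. `State` stands in for
// `TenantState`; it assumes a tokio version providing `watch::Receiver::wait_for`,
// which the code above already uses.

use tokio::sync::watch;

#[derive(Clone, Debug)]
enum State {
    Activating,
    Active,
    Stopping,
}

async fn stop_once_activation_settles(
    tx: &watch::Sender<State>,
    rx: &mut watch::Receiver<State>,
) -> bool {
    // 1. Block until activation has finished one way or another.
    rx.wait_for(|s| !matches!(s, State::Activating))
        .await
        .expect("sender dropped");
    // 2. Race to perform the transition: send_if_modified returns true only for
    //    the caller whose closure actually changed (and published) the value.
    tx.send_if_modified(|s| {
        if matches!(s, State::Active) {
            *s = State::Stopping;
            true
        } else {
            false
        }
    })
}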
+ rx.wait_for(|state| match state { + TenantState::Activating(_) | TenantState::Loading | TenantState::Attaching => { + info!( + "waiting for {} to turn Active|Broken|Stopping", + <&'static str>::from(state) + ); + false + } + TenantState::Active | TenantState::Broken { .. } | TenantState::Stopping {} => true, + }) + .await + .expect("cannot drop self.state while on a &self method"); + + // we now know we're done activating, let's see whether this task is the winner to transition into Broken self.state.send_modify(|current_state| { match *current_state { + TenantState::Activating(_) | TenantState::Loading | TenantState::Attaching => { + unreachable!("we ensured above that we're done with activation, and, there is no re-activation") + } TenantState::Active => { - // Broken tenants can currently only used for fatal errors that happen - // while loading or attaching a tenant. A tenant that has already been - // activated should never be marked as broken. We cope with it the best - // we can, but it shouldn't happen. - warn!("Changing Active tenant to Broken state, reason: {}", reason); - *current_state = TenantState::broken_from_reason(reason); + if cfg!(feature = "testing") { + warn!("Changing Active tenant to Broken state, reason: {}", reason); + *current_state = TenantState::broken_from_reason(reason); + } else { + unreachable!("not allowed to call set_broken on Active tenants in non-testing builds") + } } TenantState::Broken { .. } => { - // This shouldn't happen either warn!("Tenant is already in Broken state"); } + // This is the only "expected" path, any other path is a bug. TenantState::Stopping => { - // This shouldn't happen either warn!( "Marking Stopping tenant as Broken state, reason: {}", reason ); *current_state = TenantState::broken_from_reason(reason); } - TenantState::Loading | TenantState::Attaching => { - info!("Setting tenant as Broken state, reason: {}", reason); - *current_state = TenantState::broken_from_reason(reason); - } - } + } }); } @@ -1736,25 +2078,30 @@ impl Tenant { self.state.subscribe() } - pub async fn wait_to_become_active(&self) -> anyhow::Result<()> { + pub(crate) async fn wait_to_become_active(&self) -> Result<(), WaitToBecomeActiveError> { let mut receiver = self.state.subscribe(); loop { let current_state = receiver.borrow_and_update().clone(); match current_state { - TenantState::Loading | TenantState::Attaching => { + TenantState::Loading | TenantState::Attaching | TenantState::Activating(_) => { // in these states, there's a chance that we can reach ::Active - receiver.changed().await?; + receiver.changed().await.map_err( + |_e: tokio::sync::watch::error::RecvError| { + WaitToBecomeActiveError::TenantDropped { + tenant_id: self.tenant_id, + } + }, + )?; } TenantState::Active { .. } => { return Ok(()); } TenantState::Broken { .. } | TenantState::Stopping => { // There's no chance the tenant can transition back into ::Active - anyhow::bail!( - "Tenant {} will not become active. 
Current state: {:?}", - self.tenant_id, - ¤t_state, - ); + return Err(WaitToBecomeActiveError::WillNotBecomeActive { + tenant_id: self.tenant_id, + state: current_state, + }); } } } @@ -1908,6 +2255,7 @@ impl Tenant { new_metadata: &TimelineMetadata, ancestor: Option>, remote_client: Option, + init_order: Option<&InitializationOrder>, ) -> anyhow::Result> { if let Some(ancestor_timeline_id) = new_metadata.ancestor_timeline() { anyhow::ensure!( @@ -1916,6 +2264,9 @@ impl Tenant { ) } + let initial_logical_size_can_start = init_order.map(|x| &x.initial_logical_size_can_start); + let initial_logical_size_attempt = init_order.map(|x| &x.initial_logical_size_attempt); + let pg_version = new_metadata.pg_version(); Ok(Timeline::new( self.conf, @@ -1927,6 +2278,8 @@ impl Tenant { Arc::clone(&self.walredo_mgr), remote_client, pg_version, + initial_logical_size_can_start.cloned(), + initial_logical_size_attempt.cloned(), )) } @@ -2278,13 +2631,45 @@ impl Tenant { Ok(gc_timelines) } - /// Branch an existing timeline + /// A substitute for `branch_timeline` for use in unit tests. + /// The returned timeline will have state value `Active` to make various `anyhow::ensure!()` + /// calls pass, but, we do not actually call `.activate()` under the hood. So, none of the + /// timeline background tasks are launched, except the flush loop. + #[cfg(test)] + async fn branch_timeline_test( + &self, + src_timeline: &Arc, + dst_id: TimelineId, + start_lsn: Option, + ctx: &RequestContext, + ) -> anyhow::Result> { + let tl = self + .branch_timeline_impl(src_timeline, dst_id, start_lsn, ctx) + .await?; + tl.set_state(TimelineState::Active); + Ok(tl) + } + + /// Branch an existing timeline. + /// + /// The caller is responsible for activating the returned timeline. async fn branch_timeline( &self, src_timeline: &Arc, dst_id: TimelineId, start_lsn: Option, ctx: &RequestContext, + ) -> anyhow::Result> { + self.branch_timeline_impl(src_timeline, dst_id, start_lsn, ctx) + .await + } + + async fn branch_timeline_impl( + &self, + src_timeline: &Arc, + dst_id: TimelineId, + start_lsn: Option, + ctx: &RequestContext, ) -> anyhow::Result> { let src_id = src_timeline.timeline_id; @@ -2378,7 +2763,7 @@ impl Tenant { false, Some(Arc::clone(src_timeline)), )? - .initialize_with_lock(ctx, &mut timelines, true, true)? + .initialize_with_lock(ctx, &mut timelines, true)? }; // Root timeline gets its layers during creation and uploads them along with the metadata. @@ -2399,6 +2784,8 @@ impl Tenant { /// - run initdb to init temporary instance and get bootstrap data /// - after initialization complete, remove the temp dir. + /// + /// The caller is responsible for activating the returned timeline. async fn bootstrap_timeline( &self, timeline_id: TimelineId, @@ -2493,7 +2880,7 @@ impl Tenant { // map above, when we imported the datadir. let timeline = { let mut timelines = self.timelines.lock().unwrap(); - raw_timeline.initialize_with_lock(ctx, &mut timelines, false, true)? + raw_timeline.initialize_with_lock(ctx, &mut timelines, false)? 
}; info!( @@ -2568,7 +2955,7 @@ impl Tenant { remote_client: Option, ) -> anyhow::Result> { let timeline_data = self - .create_timeline_data(new_timeline_id, new_metadata, ancestor, remote_client) + .create_timeline_data(new_timeline_id, new_metadata, ancestor, remote_client, None) .context("Failed to create timeline data structure")?; crashsafe::create_dir_all(timeline_path).context("Failed to create timeline directory")?; @@ -3027,6 +3414,7 @@ pub mod harness { evictions_low_residence_duration_metric_threshold: Some( tenant_conf.evictions_low_residence_duration_metric_threshold, ), + gc_feedback: Some(tenant_conf.gc_feedback), } } } @@ -3134,8 +3522,14 @@ pub mod harness { let timeline_metadata = load_metadata(self.conf, timeline_id, self.tenant_id)?; timelines_to_load.insert(timeline_id, timeline_metadata); } - // FIXME starts background jobs - tenant.load(ctx).await?; + tenant + .load(None, ctx) + .instrument(info_span!("try_load", tenant_id=%self.tenant_id)) + .await?; + tenant.state.send_replace(TenantState::Active); + for timeline in tenant.timelines.lock().unwrap().values() { + timeline.set_state(TimelineState::Active); + } Ok(tenant) } @@ -3193,8 +3587,7 @@ mod tests { #[tokio::test] async fn test_basic() -> anyhow::Result<()> { let (tenant, ctx) = TenantHarness::create("test_basic")?.load().await; - let tline = tenant.create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION, &ctx)?; - let tline = tline.initialize(&ctx)?; + let tline = tenant.create_test_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION, &ctx)?; let writer = tline.writer(); writer.put(*TEST_KEY, Lsn(0x10), &Value::Image(TEST_IMG("foo at 0x10")))?; @@ -3227,9 +3620,7 @@ mod tests { let (tenant, ctx) = TenantHarness::create("no_duplicate_timelines")? .load() .await; - let timeline = - tenant.create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION, &ctx)?; - let _ = timeline.initialize(&ctx)?; + let _ = tenant.create_test_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION, &ctx)?; match tenant.create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION, &ctx) { Ok(_) => panic!("duplicate timeline creation should fail"), @@ -3260,8 +3651,7 @@ mod tests { use std::str::from_utf8; let (tenant, ctx) = TenantHarness::create("test_branch")?.load().await; - let tline = tenant.create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION, &ctx)?; - let tline = tline.initialize(&ctx)?; + let tline = tenant.create_test_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION, &ctx)?; let writer = tline.writer(); #[allow(non_snake_case)] @@ -3283,7 +3673,7 @@ mod tests { // Branch the history, modify relation differently on the new timeline tenant - .branch_timeline(&tline, NEW_TIMELINE_ID, Some(Lsn(0x30)), &ctx) + .branch_timeline_test(&tline, NEW_TIMELINE_ID, Some(Lsn(0x30)), &ctx) .await?; let newtline = tenant .get_timeline(NEW_TIMELINE_ID, true) @@ -3358,8 +3748,7 @@ mod tests { TenantHarness::create("test_prohibit_branch_creation_on_garbage_collected_data")? 
.load() .await; - let tline = tenant.create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION, &ctx)?; - let tline = tline.initialize(&ctx)?; + let tline = tenant.create_test_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION, &ctx)?; make_some_layers(tline.as_ref(), Lsn(0x20)).await?; // this removes layers before lsn 40 (50 minus 10), so there are two remaining layers, image and delta for 31-50 @@ -3372,7 +3761,7 @@ mod tests { // try to branch at lsn 25, should fail because we already garbage collected the data match tenant - .branch_timeline(&tline, NEW_TIMELINE_ID, Some(Lsn(0x25)), &ctx) + .branch_timeline_test(&tline, NEW_TIMELINE_ID, Some(Lsn(0x25)), &ctx) .await { Ok(_) => panic!("branching should have failed"), @@ -3396,12 +3785,11 @@ mod tests { .load() .await; - let tline = tenant - .create_empty_timeline(TIMELINE_ID, Lsn(0x50), DEFAULT_PG_VERSION, &ctx)? - .initialize(&ctx)?; + let tline = + tenant.create_test_timeline(TIMELINE_ID, Lsn(0x50), DEFAULT_PG_VERSION, &ctx)?; // try to branch at lsn 0x25, should fail because initdb lsn is 0x50 match tenant - .branch_timeline(&tline, NEW_TIMELINE_ID, Some(Lsn(0x25)), &ctx) + .branch_timeline_test(&tline, NEW_TIMELINE_ID, Some(Lsn(0x25)), &ctx) .await { Ok(_) => panic!("branching should have failed"), @@ -3447,13 +3835,11 @@ mod tests { TenantHarness::create("test_get_branchpoints_from_an_inactive_timeline")? .load() .await; - let tline = tenant - .create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION, &ctx)? - .initialize(&ctx)?; + let tline = tenant.create_test_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION, &ctx)?; make_some_layers(tline.as_ref(), Lsn(0x20)).await?; tenant - .branch_timeline(&tline, NEW_TIMELINE_ID, Some(Lsn(0x40)), &ctx) + .branch_timeline_test(&tline, NEW_TIMELINE_ID, Some(Lsn(0x40)), &ctx) .await?; let newtline = tenant .get_timeline(NEW_TIMELINE_ID, true) @@ -3461,7 +3847,7 @@ mod tests { make_some_layers(newtline.as_ref(), Lsn(0x60)).await?; - tline.set_state(TimelineState::Broken); + tline.set_broken("test".to_owned()); tenant .gc_iteration(Some(TIMELINE_ID), 0x10, Duration::ZERO, &ctx) @@ -3497,12 +3883,11 @@ mod tests { TenantHarness::create("test_retain_data_in_parent_which_is_needed_for_child")? .load() .await; - let tline = tenant.create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION, &ctx)?; - let tline = tline.initialize(&ctx)?; + let tline = tenant.create_test_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION, &ctx)?; make_some_layers(tline.as_ref(), Lsn(0x20)).await?; tenant - .branch_timeline(&tline, NEW_TIMELINE_ID, Some(Lsn(0x40)), &ctx) + .branch_timeline_test(&tline, NEW_TIMELINE_ID, Some(Lsn(0x40)), &ctx) .await?; let newtline = tenant .get_timeline(NEW_TIMELINE_ID, true) @@ -3521,12 +3906,11 @@ mod tests { TenantHarness::create("test_parent_keeps_data_forever_after_branching")? 
.load() .await; - let tline = tenant.create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION, &ctx)?; - let tline = tline.initialize(&ctx)?; + let tline = tenant.create_test_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION, &ctx)?; make_some_layers(tline.as_ref(), Lsn(0x20)).await?; tenant - .branch_timeline(&tline, NEW_TIMELINE_ID, Some(Lsn(0x40)), &ctx) + .branch_timeline_test(&tline, NEW_TIMELINE_ID, Some(Lsn(0x40)), &ctx) .await?; let newtline = tenant .get_timeline(NEW_TIMELINE_ID, true) @@ -3555,8 +3939,7 @@ mod tests { { let (tenant, ctx) = harness.load().await; let tline = - tenant.create_empty_timeline(TIMELINE_ID, Lsn(0x8000), DEFAULT_PG_VERSION, &ctx)?; - let tline = tline.initialize(&ctx)?; + tenant.create_test_timeline(TIMELINE_ID, Lsn(0x8000), DEFAULT_PG_VERSION, &ctx)?; make_some_layers(tline.as_ref(), Lsn(0x8000)).await?; } @@ -3576,14 +3959,14 @@ mod tests { { let (tenant, ctx) = harness.load().await; let tline = - tenant.create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION, &ctx)?; - let tline = tline.initialize(&ctx)?; + tenant.create_test_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION, &ctx)?; make_some_layers(tline.as_ref(), Lsn(0x20)).await?; - tenant - .branch_timeline(&tline, NEW_TIMELINE_ID, Some(Lsn(0x40)), &ctx) + let child_tline = tenant + .branch_timeline_test(&tline, NEW_TIMELINE_ID, Some(Lsn(0x40)), &ctx) .await?; + child_tline.set_state(TimelineState::Active); let newtline = tenant .get_timeline(NEW_TIMELINE_ID, true) @@ -3613,9 +3996,8 @@ mod tests { let harness = TenantHarness::create(TEST_NAME)?; let (tenant, ctx) = harness.load().await; - tenant - .create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION, &ctx)? - .initialize(&ctx)?; + let tline = tenant.create_test_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION, &ctx)?; + drop(tline); drop(tenant); let metadata_path = harness.timeline_path(&TIMELINE_ID).join(METADATA_FILE_NAME); @@ -3652,8 +4034,7 @@ mod tests { #[tokio::test] async fn test_images() -> anyhow::Result<()> { let (tenant, ctx) = TenantHarness::create("test_images")?.load().await; - let tline = tenant.create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION, &ctx)?; - let tline = tline.initialize(&ctx)?; + let tline = tenant.create_test_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION, &ctx)?; let writer = tline.writer(); writer.put(*TEST_KEY, Lsn(0x10), &Value::Image(TEST_IMG("foo at 0x10")))?; @@ -3718,8 +4099,7 @@ mod tests { #[tokio::test] async fn test_bulk_insert() -> anyhow::Result<()> { let (tenant, ctx) = TenantHarness::create("test_bulk_insert")?.load().await; - let tline = tenant.create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION, &ctx)?; - let tline = tline.initialize(&ctx)?; + let tline = tenant.create_test_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION, &ctx)?; let mut lsn = Lsn(0x10); @@ -3761,8 +4141,7 @@ mod tests { #[tokio::test] async fn test_random_updates() -> anyhow::Result<()> { let (tenant, ctx) = TenantHarness::create("test_random_updates")?.load().await; - let tline = tenant.create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION, &ctx)?; - let tline = tline.initialize(&ctx)?; + let tline = tenant.create_test_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION, &ctx)?; const NUM_KEYS: usize = 1000; @@ -3835,9 +4214,8 @@ mod tests { let (tenant, ctx) = TenantHarness::create("test_traverse_branches")? .load() .await; - let mut tline = tenant - .create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION, &ctx)? 
- .initialize(&ctx)?; + let mut tline = + tenant.create_test_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION, &ctx)?; const NUM_KEYS: usize = 1000; @@ -3870,7 +4248,7 @@ mod tests { for _ in 0..50 { let new_tline_id = TimelineId::generate(); tenant - .branch_timeline(&tline, new_tline_id, Some(lsn), &ctx) + .branch_timeline_test(&tline, new_tline_id, Some(lsn), &ctx) .await?; tline = tenant .get_timeline(new_tline_id, true) @@ -3919,9 +4297,8 @@ mod tests { let (tenant, ctx) = TenantHarness::create("test_traverse_ancestors")? .load() .await; - let mut tline = tenant - .create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION, &ctx)? - .initialize(&ctx)?; + let mut tline = + tenant.create_test_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION, &ctx)?; const NUM_KEYS: usize = 100; const NUM_TLINES: usize = 50; @@ -3936,7 +4313,7 @@ mod tests { for idx in 0..NUM_TLINES { let new_tline_id = TimelineId::generate(); tenant - .branch_timeline(&tline, new_tline_id, Some(lsn), &ctx) + .branch_timeline_test(&tline, new_tline_id, Some(lsn), &ctx) .await?; tline = tenant .get_timeline(new_tline_id, true) diff --git a/pageserver/src/tenant/config.rs b/pageserver/src/tenant/config.rs index 50de316bc4..80d153661a 100644 --- a/pageserver/src/tenant/config.rs +++ b/pageserver/src/tenant/config.rs @@ -99,6 +99,7 @@ pub struct TenantConf { // See the corresponding metric's help string. #[serde(with = "humantime_serde")] pub evictions_low_residence_duration_metric_threshold: Duration, + pub gc_feedback: bool, } /// Same as TenantConf, but this struct preserves the information about @@ -175,6 +176,10 @@ pub struct TenantConfOpt { #[serde(with = "humantime_serde")] #[serde(default)] pub evictions_low_residence_duration_metric_threshold: Option, + + #[serde(skip_serializing_if = "Option::is_none")] + #[serde(default)] + pub gc_feedback: Option, } #[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)] @@ -242,6 +247,7 @@ impl TenantConfOpt { evictions_low_residence_duration_metric_threshold: self .evictions_low_residence_duration_metric_threshold .unwrap_or(global_conf.evictions_low_residence_duration_metric_threshold), + gc_feedback: self.gc_feedback.unwrap_or(global_conf.gc_feedback), } } } @@ -278,6 +284,7 @@ impl Default for TenantConf { DEFAULT_EVICTIONS_LOW_RESIDENCE_DURATION_METRIC_THRESHOLD, ) .expect("cannot parse default evictions_low_residence_duration_metric_threshold"), + gc_feedback: false, } } } @@ -372,6 +379,7 @@ impl TryFrom<&'_ models::TenantConfig> for TenantConfOpt { ))?, ); } + tenant_conf.gc_feedback = request_data.gc_feedback; Ok(tenant_conf) } diff --git a/pageserver/src/tenant/layer_map.rs b/pageserver/src/tenant/layer_map.rs index 8d06ccd565..ca1a71b623 100644 --- a/pageserver/src/tenant/layer_map.rs +++ b/pageserver/src/tenant/layer_map.rs @@ -51,7 +51,9 @@ use crate::keyspace::KeyPartitioning; use crate::repository::Key; use crate::tenant::storage_layer::InMemoryLayer; use crate::tenant::storage_layer::Layer; +use anyhow::Context; use anyhow::Result; +use std::collections::HashMap; use std::collections::VecDeque; use std::ops::Range; use std::sync::Arc; @@ -61,6 +63,8 @@ use historic_layer_coverage::BufferedHistoricLayerCoverage; pub use historic_layer_coverage::Replacement; use super::storage_layer::range_eq; +use super::storage_layer::PersistentLayerDesc; +use super::storage_layer::PersistentLayerKey; /// /// LayerMap tracks what layers exist on a timeline. 
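// Editorial sketch, not part of the diff: how the new `gc_feedback` flag participates in
// the per-tenant override scheme above (`TenantConfOpt` layered over the global
// `TenantConf`). Struct and method names here are simplified stand-ins.

#[derive(Clone, Copy)]
struct GlobalConf {
    gc_feedback: bool,
}

#[derive(Default, Clone, Copy)]
struct TenantOverrides {
    gc_feedback: Option<bool>,
}

impl TenantOverrides {
    // Mirrors `self.gc_feedback.unwrap_or(global_conf.gc_feedback)` in the diff:
    // an unset per-tenant value falls back to the global default (false).
    fn effective_gc_feedback(&self, global: GlobalConf) -> bool {
        self.gc_feedback.unwrap_or(global.gc_feedback)
    }
}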
@@ -86,11 +90,16 @@ pub struct LayerMap { pub frozen_layers: VecDeque>, /// Index of the historic layers optimized for search - historic: BufferedHistoricLayerCoverage>, + historic: BufferedHistoricLayerCoverage>, /// L0 layers have key range Key::MIN..Key::MAX, and locating them using R-Tree search is very inefficient. /// So L0 layers are held in l0_delta_layers vector, in addition to the R-tree. - l0_delta_layers: Vec>, + l0_delta_layers: Vec>, + + /// Mapping from persistent layer key to the actual layer object. Currently, it stores delta, image, and + /// remote layers. In future refactors, this will be eventually moved out of LayerMap into Timeline, and + /// RemoteLayer will be removed. + mapping: HashMap>, } impl Default for LayerMap { @@ -101,6 +110,7 @@ impl Default for LayerMap { frozen_layers: VecDeque::default(), l0_delta_layers: Vec::default(), historic: BufferedHistoricLayerCoverage::default(), + mapping: HashMap::default(), } } } @@ -125,8 +135,9 @@ where /// /// Insert an on-disk layer. /// - pub fn insert_historic(&mut self, layer: Arc) { - self.layer_map.insert_historic_noflush(layer) + // TODO remove the `layer` argument when `mapping` is refactored out of `LayerMap` + pub fn insert_historic(&mut self, layer_desc: PersistentLayerDesc, layer: Arc) { + self.layer_map.insert_historic_noflush(layer_desc, layer) } /// @@ -134,8 +145,8 @@ where /// /// This should be called when the corresponding file on disk has been deleted. /// - pub fn remove_historic(&mut self, layer: Arc) { - self.layer_map.remove_historic_noflush(layer) + pub fn remove_historic(&mut self, layer_desc: PersistentLayerDesc, layer: Arc) { + self.layer_map.remove_historic_noflush(layer_desc, layer) } /// Replaces existing layer iff it is the `expected`. @@ -150,12 +161,15 @@ where /// that we can replace values only by updating a hashmap. pub fn replace_historic( &mut self, + expected_desc: PersistentLayerDesc, expected: &Arc, + new_desc: PersistentLayerDesc, new: Arc, ) -> anyhow::Result>> { fail::fail_point!("layermap-replace-notfound", |_| Ok(Replacement::NotFound)); - self.layer_map.replace_historic_noflush(expected, new) + self.layer_map + .replace_historic_noflush(expected_desc, expected, new_desc, new) } // We will flush on drop anyway, but this method makes it @@ -230,6 +244,7 @@ where (None, None) => None, (None, Some(image)) => { let lsn_floor = image.get_lsn_range().start; + let image = self.get_layer_from_mapping(&image.key()).clone(); Some(SearchResult { layer: image, lsn_floor, @@ -237,6 +252,7 @@ where } (Some(delta), None) => { let lsn_floor = delta.get_lsn_range().start; + let delta = self.get_layer_from_mapping(&delta.key()).clone(); Some(SearchResult { layer: delta, lsn_floor, @@ -247,6 +263,7 @@ where let image_is_newer = image.get_lsn_range().end >= delta.get_lsn_range().end; let image_exact_match = img_lsn + 1 == end_lsn; if image_is_newer || image_exact_match { + let image = self.get_layer_from_mapping(&image.key()).clone(); Some(SearchResult { layer: image, lsn_floor: img_lsn, @@ -254,6 +271,7 @@ where } else { let lsn_floor = std::cmp::max(delta.get_lsn_range().start, image.get_lsn_range().start + 1); + let delta = self.get_layer_from_mapping(&delta.key()).clone(); Some(SearchResult { layer: delta, lsn_floor, @@ -273,16 +291,33 @@ where /// /// Helper function for BatchedUpdates::insert_historic /// - pub(self) fn insert_historic_noflush(&mut self, layer: Arc) { + /// TODO(chi): remove L generic so that we do not need to pass layer object. 
+ pub(self) fn insert_historic_noflush( + &mut self, + layer_desc: PersistentLayerDesc, + layer: Arc, + ) { + self.mapping.insert(layer_desc.key(), layer.clone()); + // TODO: See #3869, resulting #4088, attempted fix and repro #4094 - self.historic.insert( - historic_layer_coverage::LayerKey::from(&*layer), - Arc::clone(&layer), - ); if Self::is_l0(&layer) { - self.l0_delta_layers.push(layer); + self.l0_delta_layers.push(layer_desc.clone().into()); } + + self.historic.insert( + historic_layer_coverage::LayerKey::from(&*layer), + layer_desc.into(), + ); + } + + fn get_layer_from_mapping(&self, key: &PersistentLayerKey) -> &Arc { + let layer = self + .mapping + .get(key) + .with_context(|| format!("{key:?}")) + .expect("inconsistent layer mapping"); + layer } /// @@ -290,14 +325,16 @@ where /// /// Helper function for BatchedUpdates::remove_historic /// - pub fn remove_historic_noflush(&mut self, layer: Arc) { + pub fn remove_historic_noflush(&mut self, layer_desc: PersistentLayerDesc, layer: Arc) { self.historic .remove(historic_layer_coverage::LayerKey::from(&*layer)); - if Self::is_l0(&layer) { let len_before = self.l0_delta_layers.len(); - self.l0_delta_layers - .retain(|other| !Self::compare_arced_layers(other, &layer)); + let mut l0_delta_layers = std::mem::take(&mut self.l0_delta_layers); + l0_delta_layers.retain(|other| { + !Self::compare_arced_layers(self.get_layer_from_mapping(&other.key()), &layer) + }); + self.l0_delta_layers = l0_delta_layers; // this assertion is related to use of Arc::ptr_eq in Self::compare_arced_layers, // there's a chance that the comparison fails at runtime due to it comparing (pointer, // vtable) pairs. @@ -307,11 +344,14 @@ where "failed to locate removed historic layer from l0_delta_layers" ); } + self.mapping.remove(&layer_desc.key()); } pub(self) fn replace_historic_noflush( &mut self, + expected_desc: PersistentLayerDesc, expected: &Arc, + new_desc: PersistentLayerDesc, new: Arc, ) -> anyhow::Result>> { let key = historic_layer_coverage::LayerKey::from(&**expected); @@ -332,10 +372,9 @@ where let l0_index = if expected_l0 { // find the index in case replace worked, we need to replace that as well - let pos = self - .l0_delta_layers - .iter() - .position(|slot| Self::compare_arced_layers(slot, expected)); + let pos = self.l0_delta_layers.iter().position(|slot| { + Self::compare_arced_layers(self.get_layer_from_mapping(&slot.key()), expected) + }); if pos.is_none() { return Ok(Replacement::NotFound); @@ -345,16 +384,28 @@ where None }; - let replaced = self.historic.replace(&key, new.clone(), |existing| { - Self::compare_arced_layers(existing, expected) + let new_desc = Arc::new(new_desc); + let replaced = self.historic.replace(&key, new_desc.clone(), |existing| { + **existing == expected_desc }); if let Replacement::Replaced { .. 
} = &replaced { + self.mapping.remove(&expected_desc.key()); + self.mapping.insert(new_desc.key(), new); if let Some(index) = l0_index { - self.l0_delta_layers[index] = new; + self.l0_delta_layers[index] = new_desc; } } + let replaced = match replaced { + Replacement::Replaced { in_buffered } => Replacement::Replaced { in_buffered }, + Replacement::NotFound => Replacement::NotFound, + Replacement::RemovalBuffered => Replacement::RemovalBuffered, + Replacement::Unexpected(x) => { + Replacement::Unexpected(self.get_layer_from_mapping(&x.key()).clone()) + } + }; + Ok(replaced) } @@ -383,7 +434,7 @@ where let start = key.start.to_i128(); let end = key.end.to_i128(); - let layer_covers = |layer: Option>| match layer { + let layer_covers = |layer: Option>| match layer { Some(layer) => layer.get_lsn_range().start >= lsn.start, None => false, }; @@ -404,7 +455,9 @@ where } pub fn iter_historic_layers(&self) -> impl '_ + Iterator> { - self.historic.iter() + self.historic + .iter() + .map(|x| self.get_layer_from_mapping(&x.key()).clone()) } /// @@ -436,14 +489,24 @@ where // Loop through the change events and push intervals for (change_key, change_val) in version.image_coverage.range(start..end) { let kr = Key::from_i128(current_key)..Key::from_i128(change_key); - coverage.push((kr, current_val.take())); + coverage.push(( + kr, + current_val + .take() + .map(|l| self.get_layer_from_mapping(&l.key()).clone()), + )); current_key = change_key; current_val = change_val.clone(); } // Add the final interval let kr = Key::from_i128(current_key)..Key::from_i128(end); - coverage.push((kr, current_val.take())); + coverage.push(( + kr, + current_val + .take() + .map(|l| self.get_layer_from_mapping(&l.key()).clone()), + )); Ok(coverage) } @@ -532,7 +595,9 @@ where let kr = Key::from_i128(current_key)..Key::from_i128(change_key); let lr = lsn.start..val.get_lsn_range().start; if !kr.is_empty() { - let base_count = Self::is_reimage_worthy(&val, key) as usize; + let base_count = + Self::is_reimage_worthy(self.get_layer_from_mapping(&val.key()), key) + as usize; let new_limit = limit.map(|l| l - base_count); let max_stacked_deltas_underneath = self.count_deltas(&kr, &lr, new_limit)?; @@ -555,7 +620,9 @@ where let lr = lsn.start..val.get_lsn_range().start; if !kr.is_empty() { - let base_count = Self::is_reimage_worthy(&val, key) as usize; + let base_count = + Self::is_reimage_worthy(self.get_layer_from_mapping(&val.key()), key) + as usize; let new_limit = limit.map(|l| l - base_count); let max_stacked_deltas_underneath = self.count_deltas(&kr, &lr, new_limit)?; max_stacked_deltas = std::cmp::max( @@ -706,7 +773,11 @@ where /// Return all L0 delta layers pub fn get_level0_deltas(&self) -> Result>> { - Ok(self.l0_delta_layers.clone()) + Ok(self + .l0_delta_layers + .iter() + .map(|x| self.get_layer_from_mapping(&x.key()).clone()) + .collect()) } /// debugging function to print out the contents of the layer map @@ -809,12 +880,17 @@ mod tests { let layer = LayerDescriptor::from(layer); // same skeletan construction; see scenario below - let not_found: Arc = Arc::new(layer.clone()); - let new_version: Arc = Arc::new(layer); + let not_found = Arc::new(layer.clone()); + let new_version = Arc::new(layer); let mut map = LayerMap::default(); - let res = map.batch_update().replace_historic(¬_found, new_version); + let res = map.batch_update().replace_historic( + not_found.get_persistent_layer_desc(), + ¬_found, + new_version.get_persistent_layer_desc(), + new_version, + ); assert!(matches!(res, Ok(Replacement::NotFound)), 
"{res:?}"); } @@ -823,8 +899,8 @@ mod tests { let name = LayerFileName::from_str(layer_name).unwrap(); let skeleton = LayerDescriptor::from(name); - let remote: Arc = Arc::new(skeleton.clone()); - let downloaded: Arc = Arc::new(skeleton); + let remote = Arc::new(skeleton.clone()); + let downloaded = Arc::new(skeleton); let mut map = LayerMap::default(); @@ -834,12 +910,18 @@ mod tests { let expected_in_counts = (1, usize::from(expected_l0)); - map.batch_update().insert_historic(remote.clone()); + map.batch_update() + .insert_historic(remote.get_persistent_layer_desc(), remote.clone()); assert_eq!(count_layer_in(&map, &remote), expected_in_counts); let replaced = map .batch_update() - .replace_historic(&remote, downloaded.clone()) + .replace_historic( + remote.get_persistent_layer_desc(), + &remote, + downloaded.get_persistent_layer_desc(), + downloaded.clone(), + ) .expect("name derived attributes are the same"); assert!( matches!(replaced, Replacement::Replaced { .. }), @@ -847,11 +929,12 @@ mod tests { ); assert_eq!(count_layer_in(&map, &downloaded), expected_in_counts); - map.batch_update().remove_historic(downloaded.clone()); + map.batch_update() + .remove_historic(downloaded.get_persistent_layer_desc(), downloaded.clone()); assert_eq!(count_layer_in(&map, &downloaded), (0, 0)); } - fn count_layer_in(map: &LayerMap, layer: &Arc) -> (usize, usize) { + fn count_layer_in(map: &LayerMap, layer: &Arc) -> (usize, usize) { let historic = map .iter_historic_layers() .filter(|x| LayerMap::compare_arced_layers(x, layer)) diff --git a/pageserver/src/tenant/layer_map/historic_layer_coverage.rs b/pageserver/src/tenant/layer_map/historic_layer_coverage.rs index b63c361314..49dcbc63c2 100644 --- a/pageserver/src/tenant/layer_map/historic_layer_coverage.rs +++ b/pageserver/src/tenant/layer_map/historic_layer_coverage.rs @@ -204,6 +204,35 @@ fn test_off_by_one() { assert_eq!(version.image_coverage.query(5), None); } +/// White-box regression test, checking for incorrect removal of node at key.end +#[test] +fn test_regression() { + let mut map = HistoricLayerCoverage::::new(); + map.insert( + LayerKey { + key: 0..5, + lsn: 0..5, + is_image: false, + }, + "Layer 1".to_string(), + ); + map.insert( + LayerKey { + key: 0..5, + lsn: 1..2, + is_image: false, + }, + "Layer 2".to_string(), + ); + + // If an insertion operation improperly deletes the endpoint of a previous layer + // (which is more likely to happen with layers that collide on key.end), we will + // end up with an infinite layer, covering the entire keyspace. Here we assert + // that there's no layer at key 100 because we didn't insert any layer there. + let version = map.get_version(100).unwrap(); + assert_eq!(version.delta_coverage.query(100), None); +} + /// Cover edge cases where layers begin or end on the same key #[test] fn test_key_collision() { diff --git a/pageserver/src/tenant/layer_map/layer_coverage.rs b/pageserver/src/tenant/layer_map/layer_coverage.rs index 4e3b4516dc..47aace97a5 100644 --- a/pageserver/src/tenant/layer_map/layer_coverage.rs +++ b/pageserver/src/tenant/layer_map/layer_coverage.rs @@ -1,8 +1,8 @@ use std::ops::Range; -// TODO the `im` crate has 20x more downloads and also has -// persistent/immutable BTree. It also runs a bit faster but -// results are not the same on some tests. +// NOTE the `im` crate has 20x more downloads and also has +// persistent/immutable BTree. 
But it's bugged so rpds is a +// better choice https://github.com/neondatabase/neon/issues/3395 use rpds::RedBlackTreeMapSync; /// Data structure that can efficiently: @@ -10,19 +10,22 @@ use rpds::RedBlackTreeMapSync; /// - iterate the latest layers in a key range /// - insert layers in non-decreasing lsn.start order /// -/// The struct is parameterized over Value for easier -/// testing, but in practice it's some sort of layer. +/// For a detailed explanation and justification of this approach, see: +/// https://neon.tech/blog/persistent-structures-in-neons-wal-indexing +/// +/// NOTE The struct is parameterized over Value for easier +/// testing, but in practice it's some sort of layer. pub struct LayerCoverage { /// For every change in coverage (as we sweep the key space) /// we store (lsn.end, value). /// - /// We use an immutable/persistent tree so that we can keep historic - /// versions of this coverage without cloning the whole thing and - /// incurring quadratic memory cost. See HistoricLayerCoverage. + /// NOTE We use an immutable/persistent tree so that we can keep historic + /// versions of this coverage without cloning the whole thing and + /// incurring quadratic memory cost. See HistoricLayerCoverage. /// - /// We use the Sync version of the map because we want Self to - /// be Sync. Using nonsync might be faster, if we can work with - /// that. + /// NOTE We use the Sync version of the map because we want Self to + /// be Sync. Using nonsync might be faster, if we can work with + /// that. nodes: RedBlackTreeMapSync>, } @@ -41,6 +44,13 @@ impl LayerCoverage { /// Helper function to subdivide the key range without changing any values /// + /// This operation has no semantic effect by itself. It only helps us pin in + /// place the part of the coverage we don't want to change when inserting. + /// + /// As an analogy, think of a polygon. If you add a vertex along one of the + /// segments, the polygon is still the same, but it behaves differently when + /// we move or delete one of the other points. + /// /// Complexity: O(log N) fn add_node(&mut self, key: i128) { let value = match self.nodes.range(..=key).last() { @@ -74,7 +84,7 @@ impl LayerCoverage { let mut to_update = Vec::new(); let mut to_remove = Vec::new(); let mut prev_covered = false; - for (k, node) in self.nodes.range(key.clone()) { + for (k, node) in self.nodes.range(key) { let needs_cover = match node { None => true, Some((h, _)) => h < &lsn.end, @@ -87,9 +97,8 @@ impl LayerCoverage { } prev_covered = needs_cover; } - if !prev_covered { - to_remove.push(key.end); - } + // TODO check if the nodes inserted at key.start and key.end are safe + // to remove. It's fine to keep them but they could be redundant. for k in to_update { self.nodes.insert_mut(k, Some((lsn.end, value.clone()))); } diff --git a/pageserver/src/tenant/manifest.rs b/pageserver/src/tenant/manifest.rs new file mode 100644 index 0000000000..745437dfbd --- /dev/null +++ b/pageserver/src/tenant/manifest.rs @@ -0,0 +1,325 @@ +//! This module contains the encoding and decoding of the local manifest file. +//! +//! MANIFEST is a write-ahead log which is stored locally to each timeline. It +//! records the state of the storage engine. It contains a snapshot of the +//! state and all operations proceeding that snapshot. The file begins with a +//! header recording MANIFEST version number. After that, it contains a snapshot. +//! The snapshot is followed by a list of operations. Each operation is a list +//! of records. 
Each record is either an addition or a removal of a layer. +//! +//! With MANIFEST, we can: +//! +//! 1. recover state quickly by reading the file, potentially boosting the +//! startup speed. +//! 2. ensure all operations are atomic and avoid corruption, solving issues +//! like redundant image layer and preparing us for future compaction +//! strategies. +//! +//! There is also a format for storing all layer files on S3, called +//! `index_part.json`. Compared with index_part, MANIFEST is an WAL which +//! records all operations as logs, and therefore we can easily replay the +//! operations when recovering from crash, while ensuring those operations +//! are atomic upon restart. +//! +//! Currently, this is not used in the system. Future refactors will ensure +//! the storage state will be recorded in this file, and the system can be +//! recovered from this file. This is tracked in +//! https://github.com/neondatabase/neon/issues/4418 + +use std::io::{self, Read, Write}; + +use crate::virtual_file::VirtualFile; +use anyhow::Result; +use bytes::{Buf, BufMut, Bytes, BytesMut}; +use crc32c::crc32c; +use serde::{Deserialize, Serialize}; +use tracing::log::warn; +use utils::lsn::Lsn; + +use super::storage_layer::PersistentLayerDesc; + +pub struct Manifest { + file: VirtualFile, +} + +#[derive(Clone, Serialize, Deserialize, PartialEq, Eq, Debug)] +pub struct Snapshot { + pub layers: Vec, +} + +/// serde by default encode this in tagged enum, and therefore it will be something +/// like `{ "AddLayer": { ... } }`. +#[derive(Clone, Serialize, Deserialize, PartialEq, Eq, Debug)] +pub enum Record { + AddLayer(PersistentLayerDesc), + RemoveLayer(PersistentLayerDesc), +} + +/// `echo neon.manifest | sha1sum` and take the leading 8 bytes. +const MANIFEST_MAGIC_NUMBER: u64 = 0xf5c44592b806109c; +const MANIFEST_VERSION: u64 = 1; + +#[derive(Clone, Serialize, Deserialize, PartialEq, Eq, Debug)] +pub struct ManifestHeader { + magic_number: u64, + version: u64, +} + +const MANIFEST_HEADER_LEN: usize = 16; + +impl ManifestHeader { + fn encode(&self) -> BytesMut { + let mut buf = BytesMut::with_capacity(MANIFEST_HEADER_LEN); + buf.put_u64(self.magic_number); + buf.put_u64(self.version); + buf + } + + fn decode(mut buf: &[u8]) -> Self { + assert!(buf.len() == MANIFEST_HEADER_LEN, "invalid header"); + Self { + magic_number: buf.get_u64(), + version: buf.get_u64(), + } + } +} + +#[derive(Clone, Serialize, Deserialize, PartialEq, Eq, Debug)] +pub enum Operation { + /// A snapshot of the current state. + /// + /// Lsn field represents the LSN that is persisted to disk for this snapshot. + Snapshot(Snapshot, Lsn), + /// An atomic operation that changes the state. + /// + /// Lsn field represents the LSN that is persisted to disk after the operation is done. + /// This will only change when new L0 is flushed to the disk. 
+ Operation(Vec, Lsn), +} + +struct RecordHeader { + size: u32, + checksum: u32, +} + +const RECORD_HEADER_LEN: usize = 8; + +impl RecordHeader { + fn encode(&self) -> BytesMut { + let mut buf = BytesMut::with_capacity(RECORD_HEADER_LEN); + buf.put_u32(self.size); + buf.put_u32(self.checksum); + buf + } + + fn decode(mut buf: &[u8]) -> Self { + assert!(buf.len() == RECORD_HEADER_LEN, "invalid header"); + Self { + size: buf.get_u32(), + checksum: buf.get_u32(), + } + } +} + +#[derive(Debug, thiserror::Error)] +pub enum ManifestLoadError { + #[error("manifest header is corrupted")] + CorruptedManifestHeader, + #[error("unsupported manifest version: got {0}, expected {1}")] + UnsupportedVersion(u64, u64), + #[error("error when decoding record: {0}")] + DecodeRecord(serde_json::Error), + #[error("I/O error: {0}")] + Io(io::Error), +} + +#[must_use = "Should check if the manifest is partially corrupted"] +pub struct ManifestPartiallyCorrupted(bool); + +impl Manifest { + /// Create a new manifest by writing the manifest header and a snapshot record to the given file. + pub fn init(file: VirtualFile, snapshot: Snapshot, lsn: Lsn) -> Result { + let mut manifest = Self { file }; + manifest.append_manifest_header(ManifestHeader { + magic_number: MANIFEST_MAGIC_NUMBER, + version: MANIFEST_VERSION, + })?; + manifest.append_operation(Operation::Snapshot(snapshot, lsn))?; + Ok(manifest) + } + + /// Load a manifest. Returns the manifest and a list of operations. If the manifest is corrupted, + /// the bool flag will be set to true and the user is responsible to reconstruct a new manifest and + /// backup the current one. + pub fn load( + mut file: VirtualFile, + ) -> Result<(Self, Vec, ManifestPartiallyCorrupted), ManifestLoadError> { + let mut buf = vec![]; + file.read_to_end(&mut buf).map_err(ManifestLoadError::Io)?; + + // Read manifest header + let mut buf = Bytes::from(buf); + if buf.remaining() < MANIFEST_HEADER_LEN { + return Err(ManifestLoadError::CorruptedManifestHeader); + } + let header = ManifestHeader::decode(&buf[..MANIFEST_HEADER_LEN]); + buf.advance(MANIFEST_HEADER_LEN); + if header.version != MANIFEST_VERSION { + return Err(ManifestLoadError::UnsupportedVersion( + header.version, + MANIFEST_VERSION, + )); + } + + // Read operations + let mut operations = Vec::new(); + let corrupted = loop { + if buf.remaining() == 0 { + break false; + } + if buf.remaining() < RECORD_HEADER_LEN { + warn!("incomplete header when decoding manifest, could be corrupted"); + break true; + } + let RecordHeader { size, checksum } = RecordHeader::decode(&buf[..RECORD_HEADER_LEN]); + let size = size as usize; + buf.advance(RECORD_HEADER_LEN); + if buf.remaining() < size { + warn!("incomplete data when decoding manifest, could be corrupted"); + break true; + } + let data = &buf[..size]; + if crc32c(data) != checksum { + warn!("checksum mismatch when decoding manifest, could be corrupted"); + break true; + } + // if the following decode fails, we cannot use the manifest or safely ignore any record. 
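// Editorial sketch, not part of the diff: the record framing that `RecordHeader` and
// `append_data` implement, as a standalone round-trip over an in-memory buffer. It uses
// the same `bytes` and `crc32c` crates as the diff; payloads are the JSON-encoded
// `Operation` values.

use bytes::{Buf, BufMut, BytesMut};
use crc32c::crc32c;

// Frame one record: [u32 payload length][u32 crc32c(payload)][payload bytes].
fn frame_record(payload: &[u8]) -> BytesMut {
    let mut buf = BytesMut::with_capacity(8 + payload.len());
    buf.put_u32(payload.len() as u32);
    buf.put_u32(crc32c(payload));
    buf.put_slice(payload);
    buf
}

// Read one record back; None means the tail is truncated or fails the checksum,
// which `Manifest::load` treats as "partially corrupted, stop replaying here".
fn read_record(mut buf: &[u8]) -> Option<&[u8]> {
    if buf.remaining() < 8 {
        return None;
    }
    let size = buf.get_u32() as usize;
    let checksum = buf.get_u32();
    if buf.remaining() < size || crc32c(&buf[..size]) != checksum {
        return None;
    }
    Some(&buf[..size])
}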
+
+#[cfg(test)]
+mod tests {
+    use std::fs::OpenOptions;
+
+    use crate::repository::Key;
+
+    use super::*;
+
+    #[test]
+    fn test_read_manifest() {
+        let testdir = crate::config::PageServerConf::test_repo_dir("test_read_manifest");
+        std::fs::create_dir_all(&testdir).unwrap();
+        let file = VirtualFile::create(&testdir.join("MANIFEST")).unwrap();
+        let layer1 = PersistentLayerDesc::new_test(Key::from_i128(0)..Key::from_i128(233));
+        let layer2 = PersistentLayerDesc::new_test(Key::from_i128(233)..Key::from_i128(2333));
+        let layer3 = PersistentLayerDesc::new_test(Key::from_i128(2333)..Key::from_i128(23333));
+        let layer4 = PersistentLayerDesc::new_test(Key::from_i128(23333)..Key::from_i128(233333));
+
+        // Write a manifest with a snapshot and some operations
+        let snapshot = Snapshot {
+            layers: vec![layer1, layer2],
+        };
+        let mut manifest = Manifest::init(file, snapshot.clone(), Lsn::from(0)).unwrap();
+        manifest
+            .append_operation(Operation::Operation(
+                vec![Record::AddLayer(layer3.clone())],
+                Lsn::from(1),
+            ))
+            .unwrap();
+        drop(manifest);
+
+        // Open the second time and write
+        let file = VirtualFile::open_with_options(
+            &testdir.join("MANIFEST"),
+            OpenOptions::new()
+                .read(true)
+                .write(true)
+                .create_new(false)
+                .truncate(false),
+        )
+        .unwrap();
+        let (mut manifest, operations, corrupted) = Manifest::load(file).unwrap();
+        assert!(!corrupted.0);
+        assert_eq!(operations.len(), 2);
+        assert_eq!(
+            &operations[0],
+            &Operation::Snapshot(snapshot.clone(), Lsn::from(0))
+        );
+        assert_eq!(
+            &operations[1],
+            &Operation::Operation(vec![Record::AddLayer(layer3.clone())], Lsn::from(1))
+        );
+        manifest
+            .append_operation(Operation::Operation(
+                vec![
+                    Record::RemoveLayer(layer3.clone()),
+                    Record::AddLayer(layer4.clone()),
+                ],
+                Lsn::from(2),
+            ))
+            .unwrap();
+        drop(manifest);
+
+        // Open the third time and verify
+        let file = VirtualFile::open_with_options(
+            &testdir.join("MANIFEST"),
+            OpenOptions::new()
+                .read(true)
+                .write(true)
+                .create_new(false)
+                .truncate(false),
+        )
+        .unwrap();
+        let (_manifest, operations, corrupted) = Manifest::load(file).unwrap();
+        assert!(!corrupted.0);
+        assert_eq!(operations.len(), 3);
+        assert_eq!(&operations[0], &Operation::Snapshot(snapshot, Lsn::from(0)));
+        assert_eq!(
+            &operations[1],
+            &Operation::Operation(vec![Record::AddLayer(layer3.clone())], Lsn::from(1))
+        );
+        assert_eq!(
+            &operations[2],
+            &Operation::Operation(
+                vec![Record::RemoveLayer(layer3), Record::AddLayer(layer4)],
+                Lsn::from(2)
+            )
+        );
+    }
+}
diff --git
a/pageserver/src/tenant/mgr.rs b/pageserver/src/tenant/mgr.rs index 53d69a15dc..7e123c3fbd 100644 --- a/pageserver/src/tenant/mgr.rs +++ b/pageserver/src/tenant/mgr.rs @@ -10,6 +10,7 @@ use tokio::fs; use anyhow::Context; use once_cell::sync::Lazy; use tokio::sync::RwLock; +use tokio::task::JoinSet; use tracing::*; use remote_storage::GenericRemoteStorage; @@ -20,7 +21,7 @@ use crate::context::{DownloadBehavior, RequestContext}; use crate::task_mgr::{self, TaskKind}; use crate::tenant::config::TenantConfOpt; use crate::tenant::{create_tenant_files, CreateTenantFilesMode, Tenant, TenantState}; -use crate::IGNORED_TENANT_FILE_NAME; +use crate::{InitializationOrder, IGNORED_TENANT_FILE_NAME}; use utils::fs_ext::PathExt; use utils::id::{TenantId, TimelineId}; @@ -58,10 +59,12 @@ static TENANTS: Lazy> = Lazy::new(|| RwLock::new(TenantsMap:: /// Initialize repositories with locally available timelines. /// Timelines that are only partially available locally (remote storage has more data than this pageserver) /// are scheduled for download and added to the tenant once download is completed. -#[instrument(skip(conf, remote_storage))] +#[instrument(skip_all)] pub async fn init_tenant_mgr( conf: &'static PageServerConf, + broker_client: storage_broker::BrokerClientChannel, remote_storage: Option, + init_order: InitializationOrder, ) -> anyhow::Result<()> { // Scan local filesystem for attached tenants let tenants_dir = conf.tenants_path(); @@ -116,7 +119,9 @@ pub async fn init_tenant_mgr( match schedule_local_tenant_processing( conf, &tenant_dir_path, + broker_client.clone(), remote_storage.clone(), + Some(init_order.clone()), &ctx, ) { Ok(tenant) => { @@ -150,7 +155,9 @@ pub async fn init_tenant_mgr( pub fn schedule_local_tenant_processing( conf: &'static PageServerConf, tenant_path: &Path, + broker_client: storage_broker::BrokerClientChannel, remote_storage: Option, + init_order: Option, ctx: &RequestContext, ) -> anyhow::Result> { anyhow::ensure!( @@ -186,7 +193,7 @@ pub fn schedule_local_tenant_processing( let tenant = if conf.tenant_attaching_mark_file_path(&tenant_id).exists() { info!("tenant {tenant_id} has attaching mark file, resuming its attach operation"); if let Some(remote_storage) = remote_storage { - match Tenant::spawn_attach(conf, tenant_id, remote_storage, ctx) { + match Tenant::spawn_attach(conf, tenant_id, broker_client, remote_storage, ctx) { Ok(tenant) => tenant, Err(e) => { error!("Failed to spawn_attach tenant {tenant_id}, reason: {e:#}"); @@ -204,7 +211,14 @@ pub fn schedule_local_tenant_processing( } else { info!("tenant {tenant_id} is assumed to be loadable, starting load operation"); // Start loading the tenant into memory. It will initially be in Loading state. - Tenant::spawn_load(conf, tenant_id, remote_storage, ctx) + Tenant::spawn_load( + conf, + tenant_id, + broker_client, + remote_storage, + init_order, + ctx, + ) }; Ok(tenant) } @@ -219,6 +233,7 @@ pub fn schedule_local_tenant_processing( /// That could be easily misinterpreted by control plane, the consumer of the /// management API. For example, it could attach the tenant on a different pageserver. /// We would then be in split-brain once this pageserver restarts. +#[instrument] pub async fn shutdown_all_tenants() { // Prevent new tenants from being created. let tenants_to_shut_down = { @@ -235,39 +250,51 @@ pub async fn shutdown_all_tenants() { tenants_clone } TenantsMap::ShuttingDown(_) => { + // TODO: it is possible that detach and shutdown happen at the same time. 
as a + // result, during shutdown we do not wait for detach. error!("already shutting down, this function isn't supposed to be called more than once"); return; } } }; - let mut tenants_to_freeze_and_flush = Vec::with_capacity(tenants_to_shut_down.len()); - for (_, tenant) in tenants_to_shut_down { - if tenant.is_active() { - // updates tenant state, forbidding new GC and compaction iterations from starting - tenant.set_stopping(); - tenants_to_freeze_and_flush.push(tenant); + let mut join_set = JoinSet::new(); + for (tenant_id, tenant) in tenants_to_shut_down { + join_set.spawn( + async move { + let freeze_and_flush = true; + + match tenant.shutdown(freeze_and_flush).await { + Ok(()) => debug!("tenant successfully stopped"), + Err(super::ShutdownError::AlreadyStopping) => { + warn!("tenant was already shutting down") + } + } + } + .instrument(info_span!("shutdown", %tenant_id)), + ); + } + + let mut panicked = 0; + + while let Some(res) = join_set.join_next().await { + match res { + Ok(()) => {} + Err(join_error) if join_error.is_cancelled() => { + unreachable!("we are not cancelling any of the futures"); + } + Err(join_error) if join_error.is_panic() => { + // cannot really do anything, as this panic is likely a bug + panicked += 1; + } + Err(join_error) => { + warn!("unknown kind of JoinError: {join_error}"); + } } } - // Shut down all existing walreceiver connections and stop accepting the new ones. - task_mgr::shutdown_tasks(Some(TaskKind::WalReceiverManager), None, None).await; - - // Ok, no background tasks running anymore. Flush any remaining data in - // memory to disk. - // - // We assume that any incoming connections that might request pages from - // the tenant have already been terminated by the caller, so there - // should be no more activity in any of the repositories. - // - // On error, log it but continue with the shutdown for other tenants. - for tenant in tenants_to_freeze_and_flush { - let tenant_id = tenant.tenant_id(); - debug!("shutdown tenant {tenant_id}"); - - if let Err(err) = tenant.freeze_and_flush().await { - error!("Could not checkpoint tenant {tenant_id} during shutdown: {err:?}"); - } + if panicked > 0 { + warn!(panicked, "observed panicks while shutting down tenants"); } } @@ -275,6 +302,7 @@ pub async fn create_tenant( conf: &'static PageServerConf, tenant_conf: TenantConfOpt, tenant_id: TenantId, + broker_client: storage_broker::BrokerClientChannel, remote_storage: Option, ctx: &RequestContext, ) -> Result, TenantMapInsertError> { @@ -287,7 +315,7 @@ pub async fn create_tenant( // See https://github.com/neondatabase/neon/issues/4233 let created_tenant = - schedule_local_tenant_processing(conf, &tenant_directory, remote_storage, ctx)?; + schedule_local_tenant_processing(conf, &tenant_directory, broker_client, remote_storage, None, ctx)?; // TODO: tenant object & its background loops remain, untracked in tenant map, if we fail here. 
// See https://github.com/neondatabase/neon/issues/4233 @@ -300,11 +328,19 @@ pub async fn create_tenant( }).await } +#[derive(Debug, thiserror::Error)] +pub enum SetNewTenantConfigError { + #[error(transparent)] + GetTenant(#[from] GetTenantError), + #[error(transparent)] + Persist(anyhow::Error), +} + pub async fn set_new_tenant_config( conf: &'static PageServerConf, new_tenant_conf: TenantConfOpt, tenant_id: TenantId, -) -> Result<(), TenantStateError> { +) -> Result<(), SetNewTenantConfigError> { info!("configuring tenant {tenant_id}"); let tenant = get_tenant(tenant_id, true).await?; @@ -314,23 +350,32 @@ pub async fn set_new_tenant_config( &tenant_config_path, new_tenant_conf, false, - )?; + ) + .map_err(SetNewTenantConfigError::Persist)?; tenant.set_new_tenant_config(new_tenant_conf); Ok(()) } +#[derive(Debug, thiserror::Error)] +pub enum GetTenantError { + #[error("Tenant {0} not found")] + NotFound(TenantId), + #[error("Tenant {0} is not active")] + NotActive(TenantId), +} + /// Gets the tenant from the in-memory data, erroring if it's absent or is not fitting to the query. /// `active_only = true` allows to query only tenants that are ready for operations, erroring on other kinds of tenants. pub async fn get_tenant( tenant_id: TenantId, active_only: bool, -) -> Result, TenantStateError> { +) -> Result, GetTenantError> { let m = TENANTS.read().await; let tenant = m .get(&tenant_id) - .ok_or(TenantStateError::NotFound(tenant_id))?; + .ok_or(GetTenantError::NotFound(tenant_id))?; if active_only && !tenant.is_active() { - Err(TenantStateError::NotActive(tenant_id)) + Err(GetTenantError::NotActive(tenant_id)) } else { Ok(Arc::clone(tenant)) } @@ -339,7 +384,7 @@ pub async fn get_tenant( #[derive(Debug, thiserror::Error)] pub enum DeleteTimelineError { #[error("Tenant {0}")] - Tenant(#[from] TenantStateError), + Tenant(#[from] GetTenantError), #[error("Timeline {0}")] Timeline(#[from] crate::tenant::DeleteTimelineError), @@ -351,7 +396,9 @@ pub async fn delete_timeline( ctx: &RequestContext, ) -> Result<(), DeleteTimelineError> { let tenant = get_tenant(tenant_id, true).await?; - tenant.delete_timeline(timeline_id, ctx).await?; + tenant + .prepare_and_schedule_delete_timeline(timeline_id, ctx) + .await?; Ok(()) } @@ -404,6 +451,7 @@ pub async fn detach_tenant( pub async fn load_tenant( conf: &'static PageServerConf, tenant_id: TenantId, + broker_client: storage_broker::BrokerClientChannel, remote_storage: Option, ctx: &RequestContext, ) -> Result<(), TenantMapInsertError> { @@ -415,7 +463,7 @@ pub async fn load_tenant( .with_context(|| format!("Failed to remove tenant ignore mark {tenant_ignore_mark:?} during tenant loading"))?; } - let new_tenant = schedule_local_tenant_processing(conf, &tenant_path, remote_storage, ctx) + let new_tenant = schedule_local_tenant_processing(conf, &tenant_path, broker_client, remote_storage, None, ctx) .with_context(|| { format!("Failed to schedule tenant processing in path {tenant_path:?}") })?; @@ -472,6 +520,7 @@ pub async fn attach_tenant( conf: &'static PageServerConf, tenant_id: TenantId, tenant_conf: TenantConfOpt, + broker_client: storage_broker::BrokerClientChannel, remote_storage: GenericRemoteStorage, ctx: &RequestContext, ) -> Result<(), TenantMapInsertError> { @@ -487,7 +536,7 @@ pub async fn attach_tenant( .context("check for attach marker file existence")?; anyhow::ensure!(marker_file_exists, "create_tenant_files should have created the attach marker file"); - let attached_tenant = schedule_local_tenant_processing(conf, &tenant_dir, 
Some(remote_storage), ctx)?; + let attached_tenant = schedule_local_tenant_processing(conf, &tenant_dir, broker_client, Some(remote_storage), None, ctx)?; // TODO: tenant object & its background loops remain, untracked in tenant map, if we fail here. // See https://github.com/neondatabase/neon/issues/4233 @@ -563,25 +612,26 @@ where // The exclusive lock here ensures we don't miss the tenant state updates before trying another removal. // tenant-wde cleanup operations may take some time (removing the entire tenant directory), we want to // avoid holding the lock for the entire process. - { - let tenants_accessor = TENANTS.write().await; - match tenants_accessor.get(&tenant_id) { - Some(tenant) => match tenant.current_state() { - TenantState::Attaching - | TenantState::Loading - | TenantState::Broken { .. } - | TenantState::Active => tenant.set_stopping(), - TenantState::Stopping => return Err(TenantStateError::IsStopping(tenant_id)), - }, - None => return Err(TenantStateError::NotFound(tenant_id)), + let tenant = { + TENANTS + .write() + .await + .get(&tenant_id) + .cloned() + .ok_or(TenantStateError::NotFound(tenant_id))? + }; + + let freeze_and_flush = false; + + // shutdown is sure to transition tenant to stopping, and wait for all tasks to complete, so + // that we can continue safely to cleanup. + match tenant.shutdown(freeze_and_flush).await { + Ok(()) => {} + Err(super::ShutdownError::AlreadyStopping) => { + return Err(TenantStateError::IsStopping(tenant_id)) } } - // shutdown all tenant and timeline tasks: gc, compaction, page service) - // No new tasks will be started for this tenant because it's in `Stopping` state. - // Hence, once we're done here, the `tenant_cleanup` callback can mutate tenant on-disk state freely. - task_mgr::shutdown_tasks(None, Some(tenant_id), None).await; - match tenant_cleanup .await .with_context(|| format!("Failed to run cleanup for tenant {tenant_id}")) @@ -597,7 +647,7 @@ where let tenants_accessor = TENANTS.read().await; match tenants_accessor.get(&tenant_id) { Some(tenant) => { - tenant.set_broken(e.to_string()); + tenant.set_broken(e.to_string()).await; } None => { warn!("Tenant {tenant_id} got removed from memory"); @@ -663,7 +713,6 @@ pub async fn immediate_gc( Ok(wait_task_done) } -#[cfg(feature = "testing")] pub async fn immediate_compact( tenant_id: TenantId, timeline_id: TimelineId, diff --git a/pageserver/src/tenant/par_fsync.rs b/pageserver/src/tenant/par_fsync.rs index 0b0217ab58..3cbcfe8774 100644 --- a/pageserver/src/tenant/par_fsync.rs +++ b/pageserver/src/tenant/par_fsync.rs @@ -19,14 +19,8 @@ fn parallel_worker(paths: &[PathBuf], next_path_idx: &AtomicUsize) -> io::Result Ok(()) } -pub fn par_fsync(paths: &[PathBuf]) -> io::Result<()> { - const PARALLEL_PATH_THRESHOLD: usize = 1; - if paths.len() <= PARALLEL_PATH_THRESHOLD { - for path in paths { - fsync_path(path)?; - } - return Ok(()); - } +fn fsync_in_thread_pool(paths: &[PathBuf]) -> io::Result<()> { + // TODO: remove this function in favor of `par_fsync_async` once we asyncify everything. /// Use at most this number of threads. /// Increasing this limit will @@ -36,11 +30,11 @@ pub fn par_fsync(paths: &[PathBuf]) -> io::Result<()> { let num_threads = paths.len().min(MAX_NUM_THREADS); let next_path_idx = AtomicUsize::new(0); - crossbeam_utils::thread::scope(|s| -> io::Result<()> { + std::thread::scope(|s| -> io::Result<()> { let mut handles = vec![]; // Spawn `num_threads - 1`, as the current thread is also a worker. 
for _ in 1..num_threads { - handles.push(s.spawn(|_| parallel_worker(paths, &next_path_idx))); + handles.push(s.spawn(|| parallel_worker(paths, &next_path_idx))); } parallel_worker(paths, &next_path_idx)?; @@ -51,5 +45,41 @@ pub fn par_fsync(paths: &[PathBuf]) -> io::Result<()> { Ok(()) }) - .unwrap() +} + +/// Parallel fsync all files. Can be used in non-async context as it is using rayon thread pool. +pub fn par_fsync(paths: &[PathBuf]) -> io::Result<()> { + if paths.len() == 1 { + fsync_path(&paths[0])?; + return Ok(()); + } + + fsync_in_thread_pool(paths) +} + +/// Parallel fsync asynchronously. If number of files are less than PARALLEL_PATH_THRESHOLD, fsync is done in the current +/// execution thread. Otherwise, we will spawn_blocking and run it in tokio. +pub async fn par_fsync_async(paths: &[PathBuf]) -> io::Result<()> { + const MAX_CONCURRENT_FSYNC: usize = 64; + let mut next = paths.iter().peekable(); + let mut js = tokio::task::JoinSet::new(); + loop { + while js.len() < MAX_CONCURRENT_FSYNC && next.peek().is_some() { + let next = next.next().expect("just peeked"); + let next = next.to_owned(); + js.spawn_blocking(move || fsync_path(&next)); + } + + // now the joinset has been filled up, wait for next to complete + if let Some(res) = js.join_next().await { + res??; + } else { + // last item had already completed + assert!( + next.peek().is_none(), + "joinset emptied, we shouldn't have more work" + ); + return Ok(()); + } + } } diff --git a/pageserver/src/tenant/remote_timeline_client.rs b/pageserver/src/tenant/remote_timeline_client.rs index 96aabd7945..2936e7a4af 100644 --- a/pageserver/src/tenant/remote_timeline_client.rs +++ b/pageserver/src/tenant/remote_timeline_client.rs @@ -210,13 +210,15 @@ use chrono::{NaiveDateTime, Utc}; pub use download::{is_temp_download_file, list_remote_timelines}; use scopeguard::ScopeGuard; +use std::collections::{HashMap, VecDeque}; +use std::path::Path; use std::sync::atomic::{AtomicU32, Ordering}; use std::sync::{Arc, Mutex}; -use remote_storage::{DownloadError, GenericRemoteStorage}; +use remote_storage::{DownloadError, GenericRemoteStorage, RemotePath}; use std::ops::DerefMut; use tokio::runtime::Runtime; -use tracing::{debug, error, info, warn}; +use tracing::{debug, error, info, instrument, warn}; use tracing::{info_span, Instrument}; use utils::lsn::Lsn; @@ -225,7 +227,9 @@ use crate::metrics::{ RemoteTimelineClientMetricsCallTrackSize, REMOTE_ONDEMAND_DOWNLOADED_BYTES, REMOTE_ONDEMAND_DOWNLOADED_LAYERS, }; +use crate::tenant::debug_assert_current_span_has_tenant_and_timeline_id; use crate::tenant::remote_timeline_client::index::LayerFileMetadata; +use crate::tenant::upload_queue::Delete; use crate::{ config::PageServerConf, task_mgr, @@ -259,7 +263,7 @@ const FAILED_UPLOAD_WARN_THRESHOLD: u32 = 3; pub enum MaybeDeletedIndexPart { IndexPart(IndexPart), - Deleted, + Deleted(IndexPart), } /// Errors that can arise when calling [`RemoteTimelineClient::stop`]. @@ -361,11 +365,42 @@ impl RemoteTimelineClient { Ok(()) } + /// Initialize the queue in stopped state. Used in startup path + /// to continue deletion operation interrupted by pageserver crash or restart. + pub fn init_upload_queue_stopped_to_continue_deletion( + &self, + index_part: &IndexPart, + ) -> anyhow::Result<()> { + // FIXME: consider newtype for DeletedIndexPart. 
+ let deleted_at = index_part.deleted_at.ok_or(anyhow::anyhow!( + "bug: it is responsibility of the caller to provide index part from MaybeDeletedIndexPart::Deleted" + ))?; + + { + let mut upload_queue = self.upload_queue.lock().unwrap(); + upload_queue.initialize_with_current_remote_index_part(index_part)?; + self.update_remote_physical_size_gauge(Some(index_part)); + } + // also locks upload queue, without dropping the guard above it will be a deadlock + self.stop().expect("initialized line above"); + + let mut upload_queue = self.upload_queue.lock().unwrap(); + + upload_queue + .stopped_mut() + .expect("stopped above") + .deleted_at = SetDeletedFlagProgress::Successful(deleted_at); + + Ok(()) + } + pub fn last_uploaded_consistent_lsn(&self) -> Option { match &*self.upload_queue.lock().unwrap() { UploadQueue::Uninitialized => None, UploadQueue::Initialized(q) => Some(q.last_uploaded_consistent_lsn), - UploadQueue::Stopped(q) => Some(q.last_uploaded_consistent_lsn), + UploadQueue::Stopped(q) => { + Some(q.upload_queue_for_deletion.last_uploaded_consistent_lsn) + } } } @@ -420,7 +455,7 @@ impl RemoteTimelineClient { .await?; if index_part.deleted_at.is_some() { - Ok(MaybeDeletedIndexPart::Deleted) + Ok(MaybeDeletedIndexPart::Deleted(index_part)) } else { Ok(MaybeDeletedIndexPart::IndexPart(index_part)) } @@ -622,7 +657,11 @@ impl RemoteTimelineClient { // schedule the actual deletions for name in names { - let op = UploadOp::Delete(RemoteOpFileKind::Layer, name.clone()); + let op = UploadOp::Delete(Delete { + file_kind: RemoteOpFileKind::Layer, + layer_file_name: name.clone(), + scheduled_from_timeline_delete: false, + }); self.calls_unfinished_metric_begin(&op); upload_queue.queued_operations.push_back(op); info!("scheduled layer file deletion {}", name.file_name()); @@ -639,18 +678,11 @@ impl RemoteTimelineClient { /// Wait for all previously scheduled uploads/deletions to complete /// pub async fn wait_completion(self: &Arc) -> anyhow::Result<()> { - let (sender, mut receiver) = tokio::sync::watch::channel(()); - let barrier_op = UploadOp::Barrier(sender); - - { + let mut receiver = { let mut guard = self.upload_queue.lock().unwrap(); let upload_queue = guard.initialized_mut()?; - upload_queue.queued_operations.push_back(barrier_op); - // Don't count this kind of operation! - - // Launch the task immediately, if possible - self.launch_queued_tasks(upload_queue); - } + self.schedule_barrier(upload_queue) + }; if receiver.changed().await.is_err() { anyhow::bail!("wait_completion aborted because upload queue was stopped"); @@ -658,6 +690,22 @@ impl RemoteTimelineClient { Ok(()) } + fn schedule_barrier( + self: &Arc, + upload_queue: &mut UploadQueueInitialized, + ) -> tokio::sync::watch::Receiver<()> { + let (sender, receiver) = tokio::sync::watch::channel(()); + let barrier_op = UploadOp::Barrier(sender); + + upload_queue.queued_operations.push_back(barrier_op); + // Don't count this kind of operation! + + // Launch the task immediately, if possible + self.launch_queued_tasks(upload_queue); + + receiver + } + /// Set the deleted_at field in the remote index file. /// /// This fails if the upload queue has not been `stop()`ed. @@ -665,6 +713,7 @@ impl RemoteTimelineClient { /// The caller is responsible for calling `stop()` AND for waiting /// for any ongoing upload tasks to finish after `stop()` has succeeded. /// Check method [`RemoteTimelineClient::stop`] for details. 
+ #[instrument(skip_all)] pub(crate) async fn persist_index_part_with_deleted_flag( self: &Arc, ) -> Result<(), PersistIndexPartWithDeletedFlagError> { @@ -674,15 +723,7 @@ impl RemoteTimelineClient { // We must be in stopped state because otherwise // we can have inprogress index part upload that can overwrite the file // with missing is_deleted flag that we going to set below - let stopped = match &mut *locked { - UploadQueue::Uninitialized => { - return Err(anyhow::anyhow!("is not Stopped but Uninitialized").into()) - } - UploadQueue::Initialized(_) => { - return Err(anyhow::anyhow!("is not Stopped but Initialized").into()) - } - UploadQueue::Stopped(stopped) => stopped, - }; + let stopped = locked.stopped_mut()?; match stopped.deleted_at { SetDeletedFlagProgress::NotRunning => (), // proceed @@ -696,27 +737,17 @@ impl RemoteTimelineClient { let deleted_at = Utc::now().naive_utc(); stopped.deleted_at = SetDeletedFlagProgress::InProgress(deleted_at); - let mut index_part = IndexPart::new( - stopped.latest_files.clone(), - stopped.last_uploaded_consistent_lsn, - stopped - .latest_metadata - .to_bytes() - .context("serialize metadata")?, - ); + let mut index_part = IndexPart::try_from(&stopped.upload_queue_for_deletion) + .context("IndexPart serialize")?; index_part.deleted_at = Some(deleted_at); index_part }; let undo_deleted_at = scopeguard::guard(Arc::clone(self), |self_clone| { let mut locked = self_clone.upload_queue.lock().unwrap(); - let stopped = match &mut *locked { - UploadQueue::Uninitialized | UploadQueue::Initialized(_) => unreachable!( - "there's no way out of Stopping, and we checked it's Stopping above: {:?}", - locked.as_str(), - ), - UploadQueue::Stopped(stopped) => stopped, - }; + let stopped = locked + .stopped_mut() + .expect("there's no way out of Stopping, and we checked it's Stopping above"); stopped.deleted_at = SetDeletedFlagProgress::NotRunning; }); @@ -751,13 +782,10 @@ impl RemoteTimelineClient { ScopeGuard::into_inner(undo_deleted_at); { let mut locked = self.upload_queue.lock().unwrap(); - let stopped = match &mut *locked { - UploadQueue::Uninitialized | UploadQueue::Initialized(_) => unreachable!( - "there's no way out of Stopping, and we checked it's Stopping above: {:?}", - locked.as_str(), - ), - UploadQueue::Stopped(stopped) => stopped, - }; + + let stopped = locked + .stopped_mut() + .expect("there's no way out of Stopping, and we checked it's Stopping above"); stopped.deleted_at = SetDeletedFlagProgress::Successful( index_part_with_deleted_at .deleted_at @@ -768,6 +796,92 @@ impl RemoteTimelineClient { Ok(()) } + /// Prerequisites: UploadQueue should be in stopped state and deleted_at should be successfuly set. + /// The function deletes layer files one by one, then lists the prefix to see if we leaked something + /// deletes leaked files if any and proceeds with deletion of index file at the end. 
+ pub(crate) async fn delete_all(self: &Arc) -> anyhow::Result<()> { + debug_assert_current_span_has_tenant_and_timeline_id(); + + let (mut receiver, deletions_queued) = { + let mut deletions_queued = 0; + + let mut locked = self.upload_queue.lock().unwrap(); + let stopped = locked.stopped_mut()?; + + if !matches!(stopped.deleted_at, SetDeletedFlagProgress::Successful(_)) { + anyhow::bail!("deleted_at is not set") + } + + debug_assert!(stopped.upload_queue_for_deletion.no_pending_work()); + + stopped + .upload_queue_for_deletion + .queued_operations + .reserve(stopped.upload_queue_for_deletion.latest_files.len()); + + // schedule the actual deletions + for name in stopped.upload_queue_for_deletion.latest_files.keys() { + let op = UploadOp::Delete(Delete { + file_kind: RemoteOpFileKind::Layer, + layer_file_name: name.clone(), + scheduled_from_timeline_delete: true, + }); + self.calls_unfinished_metric_begin(&op); + stopped + .upload_queue_for_deletion + .queued_operations + .push_back(op); + + info!("scheduled layer file deletion {}", name.file_name()); + deletions_queued += 1; + } + + self.launch_queued_tasks(&mut stopped.upload_queue_for_deletion); + + ( + self.schedule_barrier(&mut stopped.upload_queue_for_deletion), + deletions_queued, + ) + }; + + receiver.changed().await?; + + // Do not delete index part yet, it is needed for possible retry. If we remove it first + // and retry will arrive to different pageserver there wont be any traces of it on remote storage + let timeline_path = self.conf.timeline_path(&self.timeline_id, &self.tenant_id); + let timeline_storage_path = self.conf.remote_path(&timeline_path)?; + + let remaining = self + .storage_impl + .list_prefixes(Some(&timeline_storage_path)) + .await?; + + let remaining: Vec = remaining + .into_iter() + .filter(|p| p.object_name() != Some(IndexPart::FILE_NAME)) + .collect(); + + if !remaining.is_empty() { + warn!( + "Found {} files not bound to index_file.json, proceeding with their deletion", + remaining.len() + ); + for file in remaining { + warn!("Removing {}", file.object_name().unwrap_or_default()); + self.storage_impl.delete(&file).await?; + } + } + + let index_file_path = timeline_storage_path.join(Path::new(IndexPart::FILE_NAME)); + + debug!("deleting index part"); + self.storage_impl.delete(&index_file_path).await?; + + info!(deletions_queued, "done deleting, including index_part.json"); + + Ok(()) + } + /// /// Pick next tasks from the queue, and start as many of them as possible without violating /// the ordering constraints. @@ -786,7 +900,7 @@ impl RemoteTimelineClient { // have finished. upload_queue.inprogress_tasks.is_empty() } - UploadOp::Delete(_, _) => { + UploadOp::Delete(_) => { // Wait for preceding uploads to finish. Concurrent deletions are OK, though. 
upload_queue.num_inprogress_deletions == upload_queue.inprogress_tasks.len() } @@ -817,7 +931,7 @@ impl RemoteTimelineClient { UploadOp::UploadMetadata(_, _) => { upload_queue.num_inprogress_metadata_uploads += 1; } - UploadOp::Delete(_, _) => { + UploadOp::Delete(_) => { upload_queue.num_inprogress_deletions += 1; } UploadOp::Barrier(sender) => { @@ -891,7 +1005,6 @@ impl RemoteTimelineClient { unreachable!("we never launch an upload task if the queue is uninitialized, and once it is initialized, we never go back") } } - self.calls_unfinished_metric_end(&task.op); return; } @@ -937,16 +1050,16 @@ impl RemoteTimelineClient { } res } - UploadOp::Delete(metric_file_kind, ref layer_file_name) => { + UploadOp::Delete(delete) => { let path = &self .conf .timeline_path(&self.timeline_id, &self.tenant_id) - .join(layer_file_name.file_name()); + .join(delete.layer_file_name.file_name()); delete::delete_layer(self.conf, &self.storage_impl, path) .measure_remote_op( self.tenant_id, self.timeline_id, - *metric_file_kind, + delete.file_kind, RemoteOpKind::Delete, Arc::clone(&self.metrics), ) @@ -1012,11 +1125,24 @@ impl RemoteTimelineClient { let mut upload_queue_guard = self.upload_queue.lock().unwrap(); let upload_queue = match upload_queue_guard.deref_mut() { UploadQueue::Uninitialized => panic!("callers are responsible for ensuring this is only called on an initialized queue"), - UploadQueue::Stopped(_) => { + UploadQueue::Stopped(stopped) => { + // Special care is needed for deletions, if it was an earlier deletion (not scheduled from deletion) + // then stop() took care of it so we just return. + // For deletions that come from delete_all we still want to maintain metrics, launch following tasks, etc. + match &task.op { + UploadOp::Delete(delete) if delete.scheduled_from_timeline_delete => Some(&mut stopped.upload_queue_for_deletion), + _ => None + } + }, + UploadQueue::Initialized(qi) => { Some(qi) } + }; + + let upload_queue = match upload_queue { + Some(upload_queue) => upload_queue, + None => { info!("another concurrent task already stopped the queue"); return; - }, // nothing to do - UploadQueue::Initialized(qi) => { qi } + } }; upload_queue.inprogress_tasks.remove(&task.task_id); @@ -1029,7 +1155,7 @@ impl RemoteTimelineClient { upload_queue.num_inprogress_metadata_uploads -= 1; upload_queue.last_uploaded_consistent_lsn = lsn; // XXX monotonicity check? } - UploadOp::Delete(_, _) => { + UploadOp::Delete(_) => { upload_queue.num_inprogress_deletions -= 1; } UploadOp::Barrier(_) => unreachable!(), @@ -1063,8 +1189,8 @@ impl RemoteTimelineClient { reason: "metadata uploads are tiny", }, ), - UploadOp::Delete(file_kind, _) => ( - *file_kind, + UploadOp::Delete(delete) => ( + delete.file_kind, RemoteOpKind::Delete, DontTrackSize { reason: "should we track deletes? positive or negative sign?", @@ -1111,32 +1237,36 @@ impl RemoteTimelineClient { info!("another concurrent task already shut down the queue"); Ok(()) } - UploadQueue::Initialized(UploadQueueInitialized { - latest_files, - latest_metadata, - last_uploaded_consistent_lsn, - .. - }) => { + UploadQueue::Initialized(initialized) => { info!("shutting down upload queue"); // Replace the queue with the Stopped state, taking ownership of the old // Initialized queue. We will do some checks on it, and then drop it. 
let qi = { - // take or clone what we need - let latest_files = std::mem::take(latest_files); - let last_uploaded_consistent_lsn = *last_uploaded_consistent_lsn; - // this could be Copy - let latest_metadata = latest_metadata.clone(); - - let stopped = UploadQueueStopped { - latest_files, - last_uploaded_consistent_lsn, - latest_metadata, - deleted_at: SetDeletedFlagProgress::NotRunning, + // Here we preserve working version of the upload queue for possible use during deletions. + // In-place replace of Initialized to Stopped can be done with the help of https://github.com/Sgeo/take_mut + // but for this use case it doesnt really makes sense to bring unsafe code only for this usage point. + // Deletion is not really perf sensitive so there shouldnt be any problems with cloning a fraction of it. + let upload_queue_for_deletion = UploadQueueInitialized { + task_counter: 0, + latest_files: initialized.latest_files.clone(), + latest_files_changes_since_metadata_upload_scheduled: 0, + latest_metadata: initialized.latest_metadata.clone(), + last_uploaded_consistent_lsn: initialized.last_uploaded_consistent_lsn, + num_inprogress_layer_uploads: 0, + num_inprogress_metadata_uploads: 0, + num_inprogress_deletions: 0, + inprogress_tasks: HashMap::default(), + queued_operations: VecDeque::default(), }; - let upload_queue = - std::mem::replace(&mut *guard, UploadQueue::Stopped(stopped)); + let upload_queue = std::mem::replace( + &mut *guard, + UploadQueue::Stopped(UploadQueueStopped { + upload_queue_for_deletion, + deleted_at: SetDeletedFlagProgress::NotRunning, + }), + ); if let UploadQueue::Initialized(qi) = upload_queue { qi } else { @@ -1144,8 +1274,6 @@ impl RemoteTimelineClient { } }; - assert!(qi.latest_files.is_empty(), "do not use this anymore"); - // consistency check assert_eq!( qi.num_inprogress_layer_uploads @@ -1264,9 +1392,7 @@ mod tests { let harness = TenantHarness::create(test_name)?; let (tenant, ctx) = runtime.block_on(harness.load()); // create an empty timeline directory - let timeline = - tenant.create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION, &ctx)?; - let _ = timeline.initialize(&ctx).unwrap(); + let _ = tenant.create_test_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION, &ctx)?; let remote_fs_dir = harness.conf.workdir.join("remote_fs"); std::fs::create_dir_all(remote_fs_dir)?; @@ -1410,7 +1536,7 @@ mod tests { // Download back the index.json, and check that the list of files is correct let index_part = match runtime.block_on(client.download_index_file())? 
{ MaybeDeletedIndexPart::IndexPart(index_part) => index_part, - MaybeDeletedIndexPart::Deleted => panic!("unexpectedly got deleted index part"), + MaybeDeletedIndexPart::Deleted(_) => panic!("unexpectedly got deleted index part"), }; assert_file_list( diff --git a/pageserver/src/tenant/remote_timeline_client/index.rs b/pageserver/src/tenant/remote_timeline_client/index.rs index 7a06e57a6b..c3f6dcadec 100644 --- a/pageserver/src/tenant/remote_timeline_client/index.rs +++ b/pageserver/src/tenant/remote_timeline_client/index.rs @@ -7,9 +7,11 @@ use std::collections::{HashMap, HashSet}; use chrono::NaiveDateTime; use serde::{Deserialize, Serialize}; use serde_with::{serde_as, DisplayFromStr}; +use utils::bin_ser::SerializeError; use crate::tenant::metadata::TimelineMetadata; use crate::tenant::storage_layer::LayerFileName; +use crate::tenant::upload_queue::UploadQueueInitialized; use utils::lsn::Lsn; @@ -115,6 +117,21 @@ impl IndexPart { } } +impl TryFrom<&UploadQueueInitialized> for IndexPart { + type Error = SerializeError; + + fn try_from(upload_queue: &UploadQueueInitialized) -> Result { + let disk_consistent_lsn = upload_queue.latest_metadata.disk_consistent_lsn(); + let metadata_bytes = upload_queue.latest_metadata.to_bytes()?; + + Ok(Self::new( + upload_queue.latest_files.clone(), + disk_consistent_lsn, + metadata_bytes, + )) + } +} + /// Serialized form of [`LayerFileMetadata`]. #[derive(Debug, PartialEq, Eq, Clone, Serialize, Deserialize, Default)] pub struct IndexLayerMetadata { diff --git a/pageserver/src/tenant/storage_layer.rs b/pageserver/src/tenant/storage_layer.rs index d30d6c5c6e..6ac4fd9470 100644 --- a/pageserver/src/tenant/storage_layer.rs +++ b/pageserver/src/tenant/storage_layer.rs @@ -4,6 +4,7 @@ pub mod delta_layer; mod filename; mod image_layer; mod inmemory_layer; +mod layer_desc; mod remote_layer; use crate::config::PageServerConf; @@ -37,6 +38,7 @@ pub use delta_layer::{DeltaLayer, DeltaLayerWriter}; pub use filename::{DeltaFileName, ImageFileName, LayerFileName}; pub use image_layer::{ImageLayer, ImageLayerWriter}; pub use inmemory_layer::InMemoryLayer; +pub use layer_desc::{PersistentLayerDesc, PersistentLayerKey}; pub use remote_layer::RemoteLayer; use super::layer_map::BatchedUpdates; @@ -406,14 +408,23 @@ pub type LayerKeyIter<'i> = Box + 'i>; /// An image layer is a snapshot of all the data in a key-range, at a single /// LSN. pub trait PersistentLayer: Layer { - fn get_tenant_id(&self) -> TenantId; + /// Get the layer descriptor. + fn layer_desc(&self) -> &PersistentLayerDesc; + + fn get_tenant_id(&self) -> TenantId { + self.layer_desc().tenant_id + } /// Identify the timeline this layer belongs to - fn get_timeline_id(&self) -> TimelineId; + fn get_timeline_id(&self) -> TimelineId { + self.layer_desc().timeline_id + } /// File name used for this layer, both in the pageserver's local filesystem /// state as well as in the remote storage. - fn filename(&self) -> LayerFileName; + fn filename(&self) -> LayerFileName { + self.layer_desc().filename() + } // Path to the layer file in the local filesystem. // `None` for `RemoteLayer`. @@ -443,7 +454,9 @@ pub trait PersistentLayer: Layer { /// /// Should not change over the lifetime of the layer object because /// current_physical_size is computed as the som of this value. 
- fn file_size(&self) -> u64; + fn file_size(&self) -> u64 { + self.layer_desc().file_size + } fn info(&self, reset: LayerAccessStatsReset) -> HistoricLayerInfo; @@ -472,6 +485,20 @@ pub struct LayerDescriptor { pub short_id: String, } +impl LayerDescriptor { + /// `LayerDescriptor` is only used for testing purpose so it does not matter whether it is image / delta, + /// and the tenant / timeline id does not matter. + pub fn get_persistent_layer_desc(&self) -> PersistentLayerDesc { + PersistentLayerDesc::new_delta( + TenantId::from_array([0; 16]), + TimelineId::from_array([0; 16]), + self.key.clone(), + self.lsn.clone(), + 233, + ) + } +} + impl Layer for LayerDescriptor { fn get_key_range(&self) -> Range { self.key.clone() @@ -542,7 +569,7 @@ impl From for LayerDescriptor { /// /// This is used by DeltaLayer and ImageLayer. Normally, this holds a reference to the /// global config, and paths to layer files are constructed using the tenant/timeline -/// path from the config. But in the 'pageserver_binutils' binary, we need to construct a Layer +/// path from the config. But in the 'pagectl' binary, we need to construct a Layer /// struct for a file on disk, without having a page server running, so that we have no /// config. In that case, we use the Path variant to hold the full path to the file on /// disk. diff --git a/pageserver/src/tenant/storage_layer/delta_layer.rs b/pageserver/src/tenant/storage_layer/delta_layer.rs index ba3ab6dd4c..624fe8dac4 100644 --- a/pageserver/src/tenant/storage_layer/delta_layer.rs +++ b/pageserver/src/tenant/storage_layer/delta_layer.rs @@ -56,8 +56,8 @@ use utils::{ }; use super::{ - DeltaFileName, Layer, LayerAccessStats, LayerAccessStatsReset, LayerFileName, LayerIter, - LayerKeyIter, PathOrConf, + DeltaFileName, Layer, LayerAccessStats, LayerAccessStatsReset, LayerIter, LayerKeyIter, + PathOrConf, PersistentLayerDesc, }; /// @@ -89,10 +89,10 @@ impl From<&DeltaLayer> for Summary { magic: DELTA_FILE_MAGIC, format_version: STORAGE_FORMAT_VERSION, - tenant_id: layer.tenant_id, - timeline_id: layer.timeline_id, - key_range: layer.key_range.clone(), - lsn_range: layer.lsn_range.clone(), + tenant_id: layer.desc.tenant_id, + timeline_id: layer.desc.timeline_id, + key_range: layer.desc.key_range.clone(), + lsn_range: layer.desc.lsn_range.clone(), index_start_blk: 0, index_root_blk: 0, @@ -110,7 +110,7 @@ const WILL_INIT: u64 = 1; /// reading/deserializing records themselves. 
/// #[derive(Debug, Serialize, Deserialize, Copy, Clone)] -struct BlobRef(u64); +pub struct BlobRef(pub u64); impl BlobRef { pub fn will_init(&self) -> bool { @@ -180,12 +180,7 @@ impl DeltaKey { pub struct DeltaLayer { path_or_conf: PathOrConf, - pub tenant_id: TenantId, - pub timeline_id: TimelineId, - pub key_range: Range, - pub lsn_range: Range, - - pub file_size: u64, + pub desc: PersistentLayerDesc, access_stats: LayerAccessStats, @@ -197,9 +192,9 @@ impl std::fmt::Debug for DeltaLayer { use super::RangeDisplayDebug; f.debug_struct("DeltaLayer") - .field("key_range", &RangeDisplayDebug(&self.key_range)) - .field("lsn_range", &self.lsn_range) - .field("file_size", &self.file_size) + .field("key_range", &RangeDisplayDebug(&self.desc.key_range)) + .field("lsn_range", &self.desc.lsn_range) + .field("file_size", &self.desc.file_size) .field("inner", &self.inner) .finish() } @@ -228,30 +223,16 @@ impl std::fmt::Debug for DeltaLayerInner { } impl Layer for DeltaLayer { - fn get_key_range(&self) -> Range { - self.key_range.clone() - } - - fn get_lsn_range(&self) -> Range { - self.lsn_range.clone() - } - fn is_incremental(&self) -> bool { - true - } - - fn short_id(&self) -> String { - self.filename().file_name() - } /// debugging function to print out the contents of the layer fn dump(&self, verbose: bool, ctx: &RequestContext) -> Result<()> { println!( "----- delta layer for ten {} tli {} keys {}-{} lsn {}-{} ----", - self.tenant_id, - self.timeline_id, - self.key_range.start, - self.key_range.end, - self.lsn_range.start, - self.lsn_range.end + self.desc.tenant_id, + self.desc.timeline_id, + self.desc.key_range.start, + self.desc.key_range.end, + self.desc.lsn_range.start, + self.desc.lsn_range.end ); if !verbose { @@ -324,10 +305,10 @@ impl Layer for DeltaLayer { reconstruct_state: &mut ValueReconstructState, ctx: &RequestContext, ) -> anyhow::Result { - ensure!(lsn_range.start >= self.lsn_range.start); + ensure!(lsn_range.start >= self.desc.lsn_range.start); let mut need_image = true; - ensure!(self.key_range.contains(&key)); + ensure!(self.desc.key_range.contains(&key)); { // Open the file and lock the metadata in memory @@ -402,19 +383,31 @@ impl Layer for DeltaLayer { Ok(ValueReconstructResult::Complete) } } + + /// Boilerplate to implement the Layer trait, always use layer_desc for persistent layers. + fn get_key_range(&self) -> Range { + self.layer_desc().key_range.clone() + } + + /// Boilerplate to implement the Layer trait, always use layer_desc for persistent layers. + fn get_lsn_range(&self) -> Range { + self.layer_desc().lsn_range.clone() + } + + /// Boilerplate to implement the Layer trait, always use layer_desc for persistent layers. + fn is_incremental(&self) -> bool { + self.layer_desc().is_incremental + } + + /// Boilerplate to implement the Layer trait, always use layer_desc for persistent layers. 
+ fn short_id(&self) -> String { + self.layer_desc().short_id() + } } impl PersistentLayer for DeltaLayer { - fn get_tenant_id(&self) -> TenantId { - self.tenant_id - } - - fn get_timeline_id(&self) -> TimelineId { - self.timeline_id - } - - fn filename(&self) -> LayerFileName { - self.layer_name().into() + fn layer_desc(&self) -> &PersistentLayerDesc { + &self.desc } fn local_path(&self) -> Option { @@ -444,10 +437,6 @@ impl PersistentLayer for DeltaLayer { Ok(()) } - fn file_size(&self) -> u64 { - self.file_size - } - fn info(&self, reset: LayerAccessStatsReset) -> HistoricLayerInfo { let layer_file_name = self.filename().file_name(); let lsn_range = self.get_lsn_range(); @@ -456,7 +445,7 @@ impl PersistentLayer for DeltaLayer { HistoricLayerInfo::Delta { layer_file_name, - layer_file_size: self.file_size, + layer_file_size: self.desc.file_size, lsn_start: lsn_range.start, lsn_end: lsn_range.end, remote: false, @@ -602,11 +591,13 @@ impl DeltaLayer { ) -> DeltaLayer { DeltaLayer { path_or_conf: PathOrConf::Conf(conf), - timeline_id, - tenant_id, - key_range: filename.key_range.clone(), - lsn_range: filename.lsn_range.clone(), - file_size, + desc: PersistentLayerDesc::new_delta( + tenant_id, + timeline_id, + filename.key_range.clone(), + filename.lsn_range.clone(), + file_size, + ), access_stats, inner: RwLock::new(DeltaLayerInner { loaded: false, @@ -619,7 +610,7 @@ impl DeltaLayer { /// Create a DeltaLayer struct representing an existing file on disk. /// - /// This variant is only used for debugging purposes, by the 'pageserver_binutils' binary. + /// This variant is only used for debugging purposes, by the 'pagectl' binary. pub fn new_for_path(path: &Path, file: File) -> Result { let mut summary_buf = Vec::new(); summary_buf.resize(PAGE_SZ, 0); @@ -632,11 +623,13 @@ impl DeltaLayer { Ok(DeltaLayer { path_or_conf: PathOrConf::Path(path.to_path_buf()), - timeline_id: summary.timeline_id, - tenant_id: summary.tenant_id, - key_range: summary.key_range, - lsn_range: summary.lsn_range, - file_size: metadata.len(), + desc: PersistentLayerDesc::new_delta( + summary.tenant_id, + summary.timeline_id, + summary.key_range, + summary.lsn_range, + metadata.len(), + ), access_stats: LayerAccessStats::empty_will_record_residence_event_later(), inner: RwLock::new(DeltaLayerInner { loaded: false, @@ -648,18 +641,14 @@ impl DeltaLayer { } fn layer_name(&self) -> DeltaFileName { - DeltaFileName { - key_range: self.key_range.clone(), - lsn_range: self.lsn_range.clone(), - } + self.desc.delta_file_name() } - /// Path to the layer file in pageserver workdir. pub fn path(&self) -> PathBuf { Self::path_for( &self.path_or_conf, - self.timeline_id, - self.tenant_id, + self.desc.timeline_id, + self.desc.tenant_id, &self.layer_name(), ) } @@ -803,11 +792,13 @@ impl DeltaLayerWriterInner { // set inner.file here. The first read will have to re-open it. 
let layer = DeltaLayer { path_or_conf: PathOrConf::Conf(self.conf), - tenant_id: self.tenant_id, - timeline_id: self.timeline_id, - key_range: self.key_start..key_end, - lsn_range: self.lsn_range.clone(), - file_size: metadata.len(), + desc: PersistentLayerDesc::new_delta( + self.tenant_id, + self.timeline_id, + self.key_start..key_end, + self.lsn_range.clone(), + metadata.len(), + ), access_stats: LayerAccessStats::empty_will_record_residence_event_later(), inner: RwLock::new(DeltaLayerInner { loaded: false, diff --git a/pageserver/src/tenant/storage_layer/filename.rs b/pageserver/src/tenant/storage_layer/filename.rs index e2112fc388..5dcd54689e 100644 --- a/pageserver/src/tenant/storage_layer/filename.rs +++ b/pageserver/src/tenant/storage_layer/filename.rs @@ -9,6 +9,8 @@ use std::str::FromStr; use utils::lsn::Lsn; +use super::PersistentLayerDesc; + // Note: Timeline::load_layer_map() relies on this sort order #[derive(PartialEq, Eq, Clone, Hash)] pub struct DeltaFileName { @@ -153,7 +155,7 @@ impl Ord for ImageFileName { impl ImageFileName { pub fn lsn_as_range(&self) -> Range { // Saves from having to copypaste this all over - self.lsn..(self.lsn + 1) + PersistentLayerDesc::image_layer_lsn_range(self.lsn) } } diff --git a/pageserver/src/tenant/storage_layer/image_layer.rs b/pageserver/src/tenant/storage_layer/image_layer.rs index d298b3e852..07a16a7de2 100644 --- a/pageserver/src/tenant/storage_layer/image_layer.rs +++ b/pageserver/src/tenant/storage_layer/image_layer.rs @@ -52,8 +52,8 @@ use utils::{ lsn::Lsn, }; -use super::filename::{ImageFileName, LayerFileName}; -use super::{Layer, LayerAccessStatsReset, LayerIter, PathOrConf}; +use super::filename::ImageFileName; +use super::{Layer, LayerAccessStatsReset, LayerIter, PathOrConf, PersistentLayerDesc}; /// /// Header stored in the beginning of the file @@ -84,9 +84,9 @@ impl From<&ImageLayer> for Summary { Self { magic: IMAGE_FILE_MAGIC, format_version: STORAGE_FORMAT_VERSION, - tenant_id: layer.tenant_id, - timeline_id: layer.timeline_id, - key_range: layer.key_range.clone(), + tenant_id: layer.desc.tenant_id, + timeline_id: layer.desc.timeline_id, + key_range: layer.desc.key_range.clone(), lsn: layer.lsn, index_start_blk: 0, @@ -104,12 +104,9 @@ impl From<&ImageLayer> for Summary { /// and it needs to be loaded before using it in queries. 
pub struct ImageLayer { path_or_conf: PathOrConf, - pub tenant_id: TenantId, - pub timeline_id: TimelineId, - pub key_range: Range, - pub file_size: u64, - // This entry contains an image of all pages as of this LSN + pub desc: PersistentLayerDesc, + // This entry contains an image of all pages as of this LSN, should be the same as desc.lsn pub lsn: Lsn, access_stats: LayerAccessStats, @@ -122,8 +119,8 @@ impl std::fmt::Debug for ImageLayer { use super::RangeDisplayDebug; f.debug_struct("ImageLayer") - .field("key_range", &RangeDisplayDebug(&self.key_range)) - .field("file_size", &self.file_size) + .field("key_range", &RangeDisplayDebug(&self.desc.key_range)) + .field("file_size", &self.desc.file_size) .field("lsn", &self.lsn) .field("inner", &self.inner) .finish() @@ -153,27 +150,15 @@ impl std::fmt::Debug for ImageLayerInner { } impl Layer for ImageLayer { - fn get_key_range(&self) -> Range { - self.key_range.clone() - } - - fn get_lsn_range(&self) -> Range { - // End-bound is exclusive - self.lsn..(self.lsn + 1) - } - fn is_incremental(&self) -> bool { - false - } - - fn short_id(&self) -> String { - self.filename().file_name() - } - /// debugging function to print out the contents of the layer fn dump(&self, verbose: bool, ctx: &RequestContext) -> Result<()> { println!( "----- image layer for ten {} tli {} key {}-{} at {} ----", - self.tenant_id, self.timeline_id, self.key_range.start, self.key_range.end, self.lsn + self.desc.tenant_id, + self.desc.timeline_id, + self.desc.key_range.start, + self.desc.key_range.end, + self.lsn ); if !verbose { @@ -203,7 +188,7 @@ impl Layer for ImageLayer { reconstruct_state: &mut ValueReconstructState, ctx: &RequestContext, ) -> anyhow::Result { - assert!(self.key_range.contains(&key)); + assert!(self.desc.key_range.contains(&key)); assert!(lsn_range.start >= self.lsn); assert!(lsn_range.end >= self.lsn); @@ -230,24 +215,37 @@ impl Layer for ImageLayer { Ok(ValueReconstructResult::Missing) } } + + /// Boilerplate to implement the Layer trait, always use layer_desc for persistent layers. + fn get_key_range(&self) -> Range { + self.layer_desc().key_range.clone() + } + + /// Boilerplate to implement the Layer trait, always use layer_desc for persistent layers. + fn get_lsn_range(&self) -> Range { + self.layer_desc().lsn_range.clone() + } + + /// Boilerplate to implement the Layer trait, always use layer_desc for persistent layers. + fn is_incremental(&self) -> bool { + self.layer_desc().is_incremental + } + + /// Boilerplate to implement the Layer trait, always use layer_desc for persistent layers. 
+ fn short_id(&self) -> String { + self.layer_desc().short_id() + } } impl PersistentLayer for ImageLayer { - fn filename(&self) -> LayerFileName { - self.layer_name().into() + fn layer_desc(&self) -> &PersistentLayerDesc { + &self.desc } fn local_path(&self) -> Option { Some(self.path()) } - fn get_tenant_id(&self) -> TenantId { - self.tenant_id - } - - fn get_timeline_id(&self) -> TimelineId { - self.timeline_id - } fn iter(&self, _ctx: &RequestContext) -> Result> { unimplemented!(); } @@ -258,17 +256,13 @@ impl PersistentLayer for ImageLayer { Ok(()) } - fn file_size(&self) -> u64 { - self.file_size - } - fn info(&self, reset: LayerAccessStatsReset) -> HistoricLayerInfo { let layer_file_name = self.filename().file_name(); let lsn_range = self.get_lsn_range(); HistoricLayerInfo::Image { layer_file_name, - layer_file_size: self.file_size, + layer_file_size: self.desc.file_size, lsn_start: lsn_range.start, remote: false, access_stats: self.access_stats.as_api_model(reset), @@ -405,11 +399,15 @@ impl ImageLayer { ) -> ImageLayer { ImageLayer { path_or_conf: PathOrConf::Conf(conf), - timeline_id, - tenant_id, - key_range: filename.key_range.clone(), + desc: PersistentLayerDesc::new_img( + tenant_id, + timeline_id, + filename.key_range.clone(), + filename.lsn, + false, + file_size, + ), // Now we assume image layer ALWAYS covers the full range. This may change in the future. lsn: filename.lsn, - file_size, access_stats, inner: RwLock::new(ImageLayerInner { loaded: false, @@ -422,7 +420,7 @@ impl ImageLayer { /// Create an ImageLayer struct representing an existing file on disk. /// - /// This variant is only used for debugging purposes, by the 'pageserver_binutils' binary. + /// This variant is only used for debugging purposes, by the 'pagectl' binary. pub fn new_for_path(path: &Path, file: File) -> Result { let mut summary_buf = Vec::new(); summary_buf.resize(PAGE_SZ, 0); @@ -433,11 +431,15 @@ impl ImageLayer { .context("get file metadata to determine size")?; Ok(ImageLayer { path_or_conf: PathOrConf::Path(path.to_path_buf()), - timeline_id: summary.timeline_id, - tenant_id: summary.tenant_id, - key_range: summary.key_range, + desc: PersistentLayerDesc::new_img( + summary.tenant_id, + summary.timeline_id, + summary.key_range, + summary.lsn, + false, + metadata.len(), + ), // Now we assume image layer ALWAYS covers the full range. This may change in the future. lsn: summary.lsn, - file_size: metadata.len(), access_stats: LayerAccessStats::empty_will_record_residence_event_later(), inner: RwLock::new(ImageLayerInner { file: None, @@ -449,18 +451,15 @@ impl ImageLayer { } fn layer_name(&self) -> ImageFileName { - ImageFileName { - key_range: self.key_range.clone(), - lsn: self.lsn, - } + self.desc.image_file_name() } /// Path to the layer file in pageserver workdir. pub fn path(&self) -> PathBuf { Self::path_for( &self.path_or_conf, - self.timeline_id, - self.tenant_id, + self.desc.timeline_id, + self.desc.tenant_id, &self.layer_name(), ) } @@ -484,6 +483,7 @@ struct ImageLayerWriterInner { tenant_id: TenantId, key_range: Range, lsn: Lsn, + is_incremental: bool, blob_writer: WriteBlobWriter, tree: DiskBtreeBuilder, @@ -499,6 +499,7 @@ impl ImageLayerWriterInner { tenant_id: TenantId, key_range: &Range, lsn: Lsn, + is_incremental: bool, ) -> anyhow::Result { // Create the file initially with a temporary filename. // We'll atomically rename it to the final name when we're done. 
@@ -533,6 +534,7 @@ impl ImageLayerWriterInner { lsn, tree: tree_builder, blob_writer, + is_incremental, }; Ok(writer) @@ -588,16 +590,22 @@ impl ImageLayerWriterInner { .metadata() .context("get metadata to determine file size")?; + let desc = PersistentLayerDesc::new_img( + self.tenant_id, + self.timeline_id, + self.key_range.clone(), + self.lsn, + self.is_incremental, // for now, image layer ALWAYS covers the full range + metadata.len(), + ); + // Note: Because we open the file in write-only mode, we cannot // reuse the same VirtualFile for reading later. That's why we don't // set inner.file here. The first read will have to re-open it. let layer = ImageLayer { path_or_conf: PathOrConf::Conf(self.conf), - timeline_id: self.timeline_id, - tenant_id: self.tenant_id, - key_range: self.key_range.clone(), + desc, lsn: self.lsn, - file_size: metadata.len(), access_stats: LayerAccessStats::empty_will_record_residence_event_later(), inner: RwLock::new(ImageLayerInner { loaded: false, @@ -667,6 +675,7 @@ impl ImageLayerWriter { tenant_id: TenantId, key_range: &Range, lsn: Lsn, + is_incremental: bool, ) -> anyhow::Result { Ok(Self { inner: Some(ImageLayerWriterInner::new( @@ -675,6 +684,7 @@ impl ImageLayerWriter { tenant_id, key_range, lsn, + is_incremental, )?), }) } diff --git a/pageserver/src/tenant/storage_layer/layer_desc.rs b/pageserver/src/tenant/storage_layer/layer_desc.rs new file mode 100644 index 0000000000..5ed548909e --- /dev/null +++ b/pageserver/src/tenant/storage_layer/layer_desc.rs @@ -0,0 +1,191 @@ +use anyhow::Result; +use std::ops::Range; +use utils::{ + id::{TenantId, TimelineId}, + lsn::Lsn, +}; + +use crate::{context::RequestContext, repository::Key}; + +use super::{DeltaFileName, ImageFileName, LayerFileName}; + +use serde::{Deserialize, Serialize}; + +/// A unique identifier of a persistent layer. This is different from `LayerDescriptor`, which is only used in the +/// benchmarks. This struct contains all necessary information to find the image / delta layer. It also provides +/// a unified way to generate layer information like file name. +#[derive(Debug, PartialEq, Eq, Clone, Serialize, Deserialize)] +pub struct PersistentLayerDesc { + pub tenant_id: TenantId, + pub timeline_id: TimelineId, + pub key_range: Range, + /// For image layer, this is `[lsn, lsn+1)`. + pub lsn_range: Range, + /// Whether this is a delta layer. + pub is_delta: bool, + /// Whether this layer only contains page images for part of the keys in the range. In the current implementation, this should + /// always be equal to `is_delta`. If we land the partial image layer PR someday, image layer could also be + /// incremental. + pub is_incremental: bool, + /// File size + pub file_size: u64, +} + +/// A unique identifier of a persistent layer within the context of one timeline. 
+#[derive(Debug, PartialEq, Eq, Clone, Hash)] +pub struct PersistentLayerKey { + pub key_range: Range, + pub lsn_range: Range, + pub is_delta: bool, +} + +impl PersistentLayerDesc { + pub fn key(&self) -> PersistentLayerKey { + PersistentLayerKey { + key_range: self.key_range.clone(), + lsn_range: self.lsn_range.clone(), + is_delta: self.is_delta, + } + } + + pub fn short_id(&self) -> String { + self.filename().file_name() + } + + #[cfg(test)] + pub fn new_test(key_range: Range) -> Self { + Self { + tenant_id: TenantId::generate(), + timeline_id: TimelineId::generate(), + key_range, + lsn_range: Lsn(0)..Lsn(1), + is_delta: false, + is_incremental: false, + file_size: 0, + } + } + + pub fn new_img( + tenant_id: TenantId, + timeline_id: TimelineId, + key_range: Range, + lsn: Lsn, + is_incremental: bool, + file_size: u64, + ) -> Self { + Self { + tenant_id, + timeline_id, + key_range, + lsn_range: Self::image_layer_lsn_range(lsn), + is_delta: false, + is_incremental, + file_size, + } + } + + pub fn new_delta( + tenant_id: TenantId, + timeline_id: TimelineId, + key_range: Range, + lsn_range: Range, + file_size: u64, + ) -> Self { + Self { + tenant_id, + timeline_id, + key_range, + lsn_range, + is_delta: true, + is_incremental: true, + file_size, + } + } + + /// Get the LSN that the image layer covers. + pub fn image_layer_lsn(&self) -> Lsn { + assert!(!self.is_delta); + assert!(self.lsn_range.start + 1 == self.lsn_range.end); + self.lsn_range.start + } + + /// Get the LSN range corresponding to a single image layer LSN. + pub fn image_layer_lsn_range(lsn: Lsn) -> Range { + lsn..(lsn + 1) + } + + /// Get a delta file name for this layer. + /// + /// Panic: if this is not a delta layer. + pub fn delta_file_name(&self) -> DeltaFileName { + assert!(self.is_delta); + DeltaFileName { + key_range: self.key_range.clone(), + lsn_range: self.lsn_range.clone(), + } + } + + /// Get a delta file name for this layer. + /// + /// Panic: if this is not an image layer, or the lsn range is invalid + pub fn image_file_name(&self) -> ImageFileName { + assert!(!self.is_delta); + assert!(self.lsn_range.start + 1 == self.lsn_range.end); + ImageFileName { + key_range: self.key_range.clone(), + lsn: self.lsn_range.start, + } + } + + pub fn filename(&self) -> LayerFileName { + if self.is_delta { + self.delta_file_name().into() + } else { + self.image_file_name().into() + } + } + + // TODO: remove this in the future once we refactor timeline APIs. 
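// Editorial sketch: how a `PersistentLayerDesc` ties the constructors and
// accessors above together. This assumes it runs inside the pageserver crate
// (the import paths are an assumption); the LSNs and file size are made-up
// example values, not real data.
//
//     use std::ops::Range;
//     use utils::{id::{TenantId, TimelineId}, lsn::Lsn};
//     use crate::repository::Key;
//
//     fn example_delta_desc(key_range: Range<Key>) -> PersistentLayerDesc {
//         let desc = PersistentLayerDesc::new_delta(
//             TenantId::generate(),
//             TimelineId::generate(),
//             key_range,
//             Lsn(0x10)..Lsn(0x20),
//             4096, // file size in bytes
//         );
//         assert!(desc.is_delta() && desc.is_incremental());
//         // Naming and identity now come from the descriptor itself:
//         let _name = desc.filename().file_name(); // same string as short_id()
//         let _key = desc.key();                   // (key_range, lsn_range, is_delta)
//         desc
//     }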
+ + pub fn get_lsn_range(&self) -> Range { + self.lsn_range.clone() + } + + pub fn get_key_range(&self) -> Range { + self.key_range.clone() + } + + pub fn get_timeline_id(&self) -> TimelineId { + self.timeline_id + } + + pub fn get_tenant_id(&self) -> TenantId { + self.tenant_id + } + + pub fn is_incremental(&self) -> bool { + self.is_incremental + } + + pub fn is_delta(&self) -> bool { + self.is_delta + } + + pub fn dump(&self, _verbose: bool, _ctx: &RequestContext) -> Result<()> { + println!( + "----- layer for ten {} tli {} keys {}-{} lsn {}-{} ----", + self.tenant_id, + self.timeline_id, + self.key_range.start, + self.key_range.end, + self.lsn_range.start, + self.lsn_range.end + ); + + Ok(()) + } + + pub fn file_size(&self) -> u64 { + self.file_size + } +} diff --git a/pageserver/src/tenant/storage_layer/remote_layer.rs b/pageserver/src/tenant/storage_layer/remote_layer.rs index 2106587ab2..387bae5b1f 100644 --- a/pageserver/src/tenant/storage_layer/remote_layer.rs +++ b/pageserver/src/tenant/storage_layer/remote_layer.rs @@ -18,11 +18,10 @@ use utils::{ lsn::Lsn, }; -use super::filename::{DeltaFileName, ImageFileName, LayerFileName}; -use super::image_layer::ImageLayer; +use super::filename::{DeltaFileName, ImageFileName}; use super::{ - DeltaLayer, LayerAccessStats, LayerAccessStatsReset, LayerIter, LayerKeyIter, - LayerResidenceStatus, PersistentLayer, + DeltaLayer, ImageLayer, LayerAccessStats, LayerAccessStatsReset, LayerIter, LayerKeyIter, + LayerResidenceStatus, PersistentLayer, PersistentLayerDesc, }; /// RemoteLayer is a not yet downloaded [`ImageLayer`] or @@ -34,19 +33,10 @@ use super::{ /// /// See: [`crate::context::RequestContext`] for authorization to download pub struct RemoteLayer { - tenantid: TenantId, - timelineid: TimelineId, - key_range: Range, - lsn_range: Range, - - pub file_name: LayerFileName, + pub desc: PersistentLayerDesc, pub layer_metadata: LayerFileMetadata, - is_delta: bool, - - is_incremental: bool, - access_stats: LayerAccessStats, pub(crate) ongoing_download: Arc, @@ -66,22 +56,14 @@ pub struct RemoteLayer { impl std::fmt::Debug for RemoteLayer { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { f.debug_struct("RemoteLayer") - .field("file_name", &self.file_name) + .field("file_name", &self.desc.filename()) .field("layer_metadata", &self.layer_metadata) - .field("is_incremental", &self.is_incremental) + .field("is_incremental", &self.desc.is_incremental) .finish() } } impl Layer for RemoteLayer { - fn get_key_range(&self) -> Range { - self.key_range.clone() - } - - fn get_lsn_range(&self) -> Range { - self.lsn_range.clone() - } - fn get_value_reconstruct_data( &self, _key: Key, @@ -95,53 +77,45 @@ impl Layer for RemoteLayer { ); } - fn is_incremental(&self) -> bool { - self.is_incremental - } - /// debugging function to print out the contents of the layer fn dump(&self, _verbose: bool, _ctx: &RequestContext) -> Result<()> { println!( "----- remote layer for ten {} tli {} keys {}-{} lsn {}-{} ----", - self.tenantid, - self.timelineid, - self.key_range.start, - self.key_range.end, - self.lsn_range.start, - self.lsn_range.end + self.desc.tenant_id, + self.desc.timeline_id, + self.desc.key_range.start, + self.desc.key_range.end, + self.desc.lsn_range.start, + self.desc.lsn_range.end ); Ok(()) } + /// Boilerplate to implement the Layer trait, always use layer_desc for persistent layers. 
+ fn get_key_range(&self) -> Range { + self.layer_desc().key_range.clone() + } + + /// Boilerplate to implement the Layer trait, always use layer_desc for persistent layers. + fn get_lsn_range(&self) -> Range { + self.layer_desc().lsn_range.clone() + } + + /// Boilerplate to implement the Layer trait, always use layer_desc for persistent layers. + fn is_incremental(&self) -> bool { + self.layer_desc().is_incremental + } + + /// Boilerplate to implement the Layer trait, always use layer_desc for persistent layers. fn short_id(&self) -> String { - self.filename().file_name() + self.layer_desc().short_id() } } impl PersistentLayer for RemoteLayer { - fn get_tenant_id(&self) -> TenantId { - self.tenantid - } - - fn get_timeline_id(&self) -> TimelineId { - self.timelineid - } - - fn filename(&self) -> LayerFileName { - if self.is_delta { - DeltaFileName { - key_range: self.key_range.clone(), - lsn_range: self.lsn_range.clone(), - } - .into() - } else { - ImageFileName { - key_range: self.key_range.clone(), - lsn: self.lsn_range.start, - } - .into() - } + fn layer_desc(&self) -> &PersistentLayerDesc { + &self.desc } fn local_path(&self) -> Option { @@ -168,15 +142,11 @@ impl PersistentLayer for RemoteLayer { true } - fn file_size(&self) -> u64 { - self.layer_metadata.file_size() - } - fn info(&self, reset: LayerAccessStatsReset) -> HistoricLayerInfo { let layer_file_name = self.filename().file_name(); let lsn_range = self.get_lsn_range(); - if self.is_delta { + if self.desc.is_delta { HistoricLayerInfo::Delta { layer_file_name, layer_file_size: self.layer_metadata.file_size(), @@ -210,13 +180,14 @@ impl RemoteLayer { access_stats: LayerAccessStats, ) -> RemoteLayer { RemoteLayer { - tenantid, - timelineid, - key_range: fname.key_range.clone(), - lsn_range: fname.lsn_as_range(), - is_delta: false, - is_incremental: false, - file_name: fname.to_owned().into(), + desc: PersistentLayerDesc::new_img( + tenantid, + timelineid, + fname.key_range.clone(), + fname.lsn, + false, + layer_metadata.file_size(), + ), layer_metadata: layer_metadata.clone(), ongoing_download: Arc::new(tokio::sync::Semaphore::new(1)), download_replacement_failure: std::sync::atomic::AtomicBool::default(), @@ -232,13 +203,13 @@ impl RemoteLayer { access_stats: LayerAccessStats, ) -> RemoteLayer { RemoteLayer { - tenantid, - timelineid, - key_range: fname.key_range.clone(), - lsn_range: fname.lsn_range.clone(), - is_delta: true, - is_incremental: true, - file_name: fname.to_owned().into(), + desc: PersistentLayerDesc::new_delta( + tenantid, + timelineid, + fname.key_range.clone(), + fname.lsn_range.clone(), + layer_metadata.file_size(), + ), layer_metadata: layer_metadata.clone(), ongoing_download: Arc::new(tokio::sync::Semaphore::new(1)), download_replacement_failure: std::sync::atomic::AtomicBool::default(), @@ -256,15 +227,12 @@ impl RemoteLayer { where L: ?Sized + Layer, { - if self.is_delta { - let fname = DeltaFileName { - key_range: self.key_range.clone(), - lsn_range: self.lsn_range.clone(), - }; + if self.desc.is_delta { + let fname = self.desc.delta_file_name(); Arc::new(DeltaLayer::new( conf, - self.timelineid, - self.tenantid, + self.desc.timeline_id, + self.desc.tenant_id, &fname, file_size, self.access_stats.clone_for_residence_change( @@ -273,14 +241,11 @@ impl RemoteLayer { ), )) } else { - let fname = ImageFileName { - key_range: self.key_range.clone(), - lsn: self.lsn_range.start, - }; + let fname = self.desc.image_file_name(); Arc::new(ImageLayer::new( conf, - self.timelineid, - self.tenantid, + 
self.desc.timeline_id, + self.desc.tenant_id, &fname, file_size, self.access_stats.clone_for_residence_change( diff --git a/pageserver/src/tenant/tasks.rs b/pageserver/src/tenant/tasks.rs index 6bf26f1da1..360818b5a7 100644 --- a/pageserver/src/tenant/tasks.rs +++ b/pageserver/src/tenant/tasks.rs @@ -9,13 +9,17 @@ use crate::context::{DownloadBehavior, RequestContext}; use crate::metrics::TENANT_TASK_EVENTS; use crate::task_mgr; use crate::task_mgr::{TaskKind, BACKGROUND_RUNTIME}; -use crate::tenant::mgr; use crate::tenant::{Tenant, TenantState}; use tokio_util::sync::CancellationToken; use tracing::*; -use utils::id::TenantId; +use utils::completion; -pub fn start_background_loops(tenant_id: TenantId) { +/// Start per tenant background loops: compaction and gc. +pub fn start_background_loops( + tenant: &Arc, + background_jobs_can_start: Option<&completion::Barrier>, +) { + let tenant_id = tenant.tenant_id; task_mgr::spawn( BACKGROUND_RUNTIME.handle(), TaskKind::Compaction, @@ -23,11 +27,20 @@ pub fn start_background_loops(tenant_id: TenantId) { None, &format!("compactor for tenant {tenant_id}"), false, - async move { - compaction_loop(tenant_id) - .instrument(info_span!("compaction_loop", tenant_id = %tenant_id)) - .await; - Ok(()) + { + let tenant = Arc::clone(tenant); + let background_jobs_can_start = background_jobs_can_start.cloned(); + async move { + let cancel = task_mgr::shutdown_token(); + tokio::select! { + _ = cancel.cancelled() => { return Ok(()) }, + _ = completion::Barrier::maybe_wait(background_jobs_can_start) => {} + }; + compaction_loop(tenant, cancel) + .instrument(info_span!("compaction_loop", tenant_id = %tenant_id)) + .await; + Ok(()) + } }, ); task_mgr::spawn( @@ -37,11 +50,20 @@ pub fn start_background_loops(tenant_id: TenantId) { None, &format!("garbage collector for tenant {tenant_id}"), false, - async move { - gc_loop(tenant_id) - .instrument(info_span!("gc_loop", tenant_id = %tenant_id)) - .await; - Ok(()) + { + let tenant = Arc::clone(tenant); + let background_jobs_can_start = background_jobs_can_start.cloned(); + async move { + let cancel = task_mgr::shutdown_token(); + tokio::select! { + _ = cancel.cancelled() => { return Ok(()) }, + _ = completion::Barrier::maybe_wait(background_jobs_can_start) => {} + }; + gc_loop(tenant, cancel) + .instrument(info_span!("gc_loop", tenant_id = %tenant_id)) + .await; + Ok(()) + } }, ); } @@ -49,27 +71,26 @@ pub fn start_background_loops(tenant_id: TenantId) { /// /// Compaction task's main loop /// -async fn compaction_loop(tenant_id: TenantId) { +async fn compaction_loop(tenant: Arc, cancel: CancellationToken) { let wait_duration = Duration::from_secs(2); info!("starting"); TENANT_TASK_EVENTS.with_label_values(&["start"]).inc(); async { - let cancel = task_mgr::shutdown_token(); let ctx = RequestContext::todo_child(TaskKind::Compaction, DownloadBehavior::Download); let mut first = true; loop { trace!("waking up"); - let tenant = tokio::select! { + tokio::select! 
{ _ = cancel.cancelled() => { info!("received cancellation request"); return; }, - tenant_wait_result = wait_for_active_tenant(tenant_id, wait_duration) => match tenant_wait_result { + tenant_wait_result = wait_for_active_tenant(&tenant) => match tenant_wait_result { ControlFlow::Break(()) => return, - ControlFlow::Continue(tenant) => tenant, + ControlFlow::Continue(()) => (), }, - }; + } let period = tenant.get_compaction_period(); @@ -119,29 +140,29 @@ async fn compaction_loop(tenant_id: TenantId) { /// /// GC task's main loop /// -async fn gc_loop(tenant_id: TenantId) { +async fn gc_loop(tenant: Arc, cancel: CancellationToken) { let wait_duration = Duration::from_secs(2); info!("starting"); TENANT_TASK_EVENTS.with_label_values(&["start"]).inc(); async { - let cancel = task_mgr::shutdown_token(); // GC might require downloading, to find the cutoff LSN that corresponds to the // cutoff specified as time. - let ctx = RequestContext::todo_child(TaskKind::GarbageCollector, DownloadBehavior::Download); + let ctx = + RequestContext::todo_child(TaskKind::GarbageCollector, DownloadBehavior::Download); let mut first = true; loop { trace!("waking up"); - let tenant = tokio::select! { + tokio::select! { _ = cancel.cancelled() => { info!("received cancellation request"); return; }, - tenant_wait_result = wait_for_active_tenant(tenant_id, wait_duration) => match tenant_wait_result { + tenant_wait_result = wait_for_active_tenant(&tenant) => match tenant_wait_result { ControlFlow::Break(()) => return, - ControlFlow::Continue(tenant) => tenant, + ControlFlow::Continue(()) => (), }, - }; + } let period = tenant.get_gc_period(); @@ -161,7 +182,9 @@ async fn gc_loop(tenant_id: TenantId) { Duration::from_secs(10) } else { // Run gc - let res = tenant.gc_iteration(None, gc_horizon, tenant.get_pitr_interval(), &ctx).await; + let res = tenant + .gc_iteration(None, gc_horizon, tenant.get_pitr_interval(), &ctx) + .await; if let Err(e) = res { error!("Gc failed, retrying in {:?}: {e:?}", wait_duration); wait_duration @@ -187,23 +210,10 @@ async fn gc_loop(tenant_id: TenantId) { trace!("GC loop stopped."); } -async fn wait_for_active_tenant( - tenant_id: TenantId, - wait: Duration, -) -> ControlFlow<(), Arc> { - let tenant = loop { - match mgr::get_tenant(tenant_id, false).await { - Ok(tenant) => break tenant, - Err(e) => { - error!("Failed to get a tenant {tenant_id}: {e:#}"); - tokio::time::sleep(wait).await; - } - } - }; - +async fn wait_for_active_tenant(tenant: &Arc) -> ControlFlow<()> { // if the tenant has a proper status already, no need to wait for anything if tenant.current_state() == TenantState::Active { - ControlFlow::Continue(tenant) + ControlFlow::Continue(()) } else { let mut tenant_state_updates = tenant.subscribe_for_state_updates(); loop { @@ -213,7 +223,7 @@ async fn wait_for_active_tenant( match new_state { TenantState::Active => { debug!("Tenant state changed to active, continuing the task loop"); - return ControlFlow::Continue(tenant); + return ControlFlow::Continue(()); } state => { debug!("Not running the task loop, tenant is not active: {state:?}"); diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index c47f4444f5..71f83bf127 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -22,8 +22,7 @@ use tracing::*; use utils::id::TenantTimelineId; use std::cmp::{max, min, Ordering}; -use std::collections::BinaryHeap; -use std::collections::HashMap; +use std::collections::{BinaryHeap, HashMap}; use std::fs; use 
std::ops::{Deref, Range}; use std::path::{Path, PathBuf}; @@ -32,7 +31,6 @@ use std::sync::atomic::{AtomicI64, Ordering as AtomicOrdering}; use std::sync::{Arc, Mutex, MutexGuard, RwLock, Weak}; use std::time::{Duration, Instant, SystemTime}; -use crate::broker_client::{get_broker_client, is_broker_client_initialized}; use crate::context::{DownloadBehavior, RequestContext}; use crate::tenant::remote_timeline_client::{self, index::LayerFileMetadata}; use crate::tenant::storage_layer::{ @@ -48,7 +46,7 @@ use crate::tenant::{ }; use crate::config::PageServerConf; -use crate::keyspace::{KeyPartitioning, KeySpace}; +use crate::keyspace::{KeyPartitioning, KeySpace, KeySpaceRandomAccum}; use crate::metrics::{TimelineMetrics, UNEXPECTED_ONDEMAND_DOWNLOADS}; use crate::pgdatadir_mapping::LsnForTimestamp; use crate::pgdatadir_mapping::{is_rel_fsm_block_key, is_rel_vm_block_key}; @@ -59,6 +57,7 @@ use pageserver_api::reltag::RelTag; use postgres_connection::PgConnectionConfig; use postgres_ffi::to_pg_timestamp; use utils::{ + completion, id::{TenantId, TimelineId}, lsn::{AtomicLsn, Lsn, RecordLsn}, seqwait::SeqWait, @@ -123,6 +122,17 @@ pub struct Timeline { pub(super) layers: RwLock>, + /// Set of key ranges which should be covered by image layers to + /// allow GC to remove old layers. This set is created by GC and its cutoff LSN is also stored. + /// It is used by compaction task when it checks if new image layer should be created. + /// Newly created image layer doesn't help to remove the delta layer, until the + /// newly created image layer falls off the PITR horizon. So on next GC cycle, + /// gc_timeline may still want the new image layer to be created. To avoid redundant + /// image layers creation we should check if image layer exists but beyond PITR horizon. + /// This is why we need remember GC cutoff LSN. + /// + wanted_image_layers: Mutex>, + last_freeze_at: AtomicLsn, // Atomic would be more appropriate here. last_freeze_ts: RwLock, @@ -186,8 +196,9 @@ pub struct Timeline { /// Layer removal lock. /// A lock to ensure that no layer of the timeline is removed concurrently by other tasks. /// This lock is acquired in [`Timeline::gc`], [`Timeline::compact`], - /// and [`Tenant::delete_timeline`]. - pub(super) layer_removal_cs: tokio::sync::Mutex<()>, + /// and [`Tenant::delete_timeline`]. This is an `Arc` lock because we need an owned + /// lock guard in functions that will be spawned to tokio I/O pool (which requires `'static`). + pub(super) layer_removal_cs: Arc>, // Needed to ensure that we can't create a branch at a point that was already garbage collected pub latest_gc_cutoff_lsn: Rcu, @@ -217,7 +228,7 @@ pub struct Timeline { /// or None if WAL receiver has not received anything for this timeline /// yet. pub last_received_wal: Mutex>, - pub walreceiver: WalReceiver, + pub walreceiver: Mutex>, /// Relation size cache pub rel_size_cache: RwLock>, @@ -226,7 +237,18 @@ pub struct Timeline { state: watch::Sender, + /// Prevent two tasks from deleting the timeline at the same time. If held, the + /// timeline is being deleted. If 'true', the timeline has already been deleted. + pub delete_lock: Arc>, + eviction_task_timeline_state: tokio::sync::Mutex, + + /// Barrier to wait before doing initial logical size calculation. Used only during startup. + initial_logical_size_can_start: Option, + + /// Completion shared between all timelines loaded during startup; used to delay heavier + /// background tasks until some logical sizes have been calculated. 
+ initial_logical_size_attempt: Mutex>, } /// Internal structure to hold all data needed for logical size calculation. @@ -511,7 +533,12 @@ impl Timeline { Some((cached_lsn, cached_img)) => { match cached_lsn.cmp(&lsn) { Ordering::Less => {} // there might be WAL between cached_lsn and lsn, we need to check - Ordering::Equal => return Ok(cached_img), // exact LSN match, return the image + Ordering::Equal => { + self.metrics + .materialized_page_cache_hit_upon_request_counter + .inc(); + return Ok(cached_img); // exact LSN match, return the image + } Ordering::Greater => { unreachable!("the returned lsn should never be after the requested lsn") } @@ -526,8 +553,10 @@ impl Timeline { img: cached_page_img, }; + let timer = self.metrics.get_reconstruct_data_time_histo.start_timer(); self.get_reconstruct_data(key, lsn, &mut reconstruct_state, ctx) .await?; + timer.stop_and_record(); self.metrics .reconstruct_time_histo @@ -612,17 +641,27 @@ impl Timeline { .await { Ok(()) => Ok(()), - seqwait_error => { + Err(e) => { + // don't count the time spent waiting for lock below, and also in walreceiver.status(), towards the wait_lsn_time_histo drop(_timer); - let walreceiver_status = self.walreceiver.status().await; - seqwait_error.with_context(|| format!( - "Timed out while waiting for WAL record at LSN {} to arrive, last_record_lsn {} disk consistent LSN={}, {}", - lsn, - self.get_last_record_lsn(), - self.get_disk_consistent_lsn(), - walreceiver_status.map(|status| status.to_human_readable_string()) - .unwrap_or_else(|| "WalReceiver status: Not active".to_string()), - )) + let walreceiver_status = { + match &*self.walreceiver.lock().unwrap() { + None => "stopping or stopped".to_string(), + Some(walreceiver) => match walreceiver.status() { + Some(status) => status.to_human_readable_string(), + None => "Not active".to_string(), + }, + } + }; + Err(anyhow::Error::new(e).context({ + format!( + "Timed out while waiting for WAL record at LSN {} to arrive, last_record_lsn {} disk consistent LSN={}, WalReceiver status: {}", + lsn, + self.get_last_record_lsn(), + self.get_disk_consistent_lsn(), + walreceiver_status, + ) + })) } } } @@ -650,7 +689,7 @@ impl Timeline { } /// Outermost timeline compaction operation; downloads needed layers. - pub async fn compact(&self, ctx: &RequestContext) -> anyhow::Result<()> { + pub async fn compact(self: &Arc, ctx: &RequestContext) -> anyhow::Result<()> { const ROUNDS: usize = 2; let last_record_lsn = self.get_last_record_lsn(); @@ -739,7 +778,7 @@ impl Timeline { } /// Compaction which might need to be retried after downloading remote layers. - async fn compact_inner(&self, ctx: &RequestContext) -> Result<(), CompactionError> { + async fn compact_inner(self: &Arc, ctx: &RequestContext) -> Result<(), CompactionError> { // // High level strategy for compaction / image creation: // @@ -774,10 +813,9 @@ impl Timeline { // Below are functions compact_level0() and create_image_layers() // but they are a bit ad hoc and don't quite work like it's explained // above. Rewrite it. - let layer_removal_cs = self.layer_removal_cs.lock().await; + let layer_removal_cs = Arc::new(self.layer_removal_cs.clone().lock_owned().await); // Is the timeline being deleted? - let state = *self.state.borrow(); - if state == TimelineState::Stopping { + if self.is_stopping() { return Err(anyhow::anyhow!("timeline is Stopping").into()); } @@ -808,7 +846,7 @@ impl Timeline { // 3. 
Compact let timer = self.metrics.compact_time_histo.start_timer(); - self.compact_level0(&layer_removal_cs, target_file_size, ctx) + self.compact_level0(layer_removal_cs.clone(), target_file_size, ctx) .await?; timer.stop_and_record(); } @@ -897,18 +935,15 @@ impl Timeline { Ok(()) } - pub fn activate(self: &Arc, ctx: &RequestContext) -> anyhow::Result<()> { - if is_broker_client_initialized() { - self.launch_wal_receiver(ctx, get_broker_client().clone())?; - } else if cfg!(test) { - info!("not launching WAL receiver because broker client hasn't been initialized"); - } else { - anyhow::bail!("broker client not initialized"); - } - + pub fn activate( + self: &Arc, + broker_client: BrokerClientChannel, + background_jobs_can_start: Option<&completion::Barrier>, + ctx: &RequestContext, + ) { + self.launch_wal_receiver(ctx, broker_client); self.set_state(TimelineState::Active); - self.launch_eviction_task(); - Ok(()) + self.launch_eviction_task(background_jobs_can_start); } pub fn set_state(&self, new_state: TimelineState) { @@ -919,26 +954,54 @@ impl Timeline { (st, TimelineState::Loading) => { error!("ignoring transition from {st:?} into Loading state"); } - (TimelineState::Broken, _) => { - error!("Ignoring state update {new_state:?} for broken tenant"); + (TimelineState::Broken { .. }, new_state) => { + error!("Ignoring state update {new_state:?} for broken timeline"); } (TimelineState::Stopping, TimelineState::Active) => { error!("Not activating a Stopping timeline"); } (_, new_state) => { + if matches!( + new_state, + TimelineState::Stopping | TimelineState::Broken { .. } + ) { + // drop the copmletion guard, if any; it might be holding off the completion + // forever needlessly + self.initial_logical_size_attempt + .lock() + .unwrap_or_else(|e| e.into_inner()) + .take(); + } self.state.send_replace(new_state); } } } + pub fn set_broken(&self, reason: String) { + let backtrace_str: String = format!("{}", std::backtrace::Backtrace::force_capture()); + let broken_state = TimelineState::Broken { + reason, + backtrace: backtrace_str, + }; + self.set_state(broken_state) + } + pub fn current_state(&self) -> TimelineState { - *self.state.borrow() + self.state.borrow().clone() + } + + pub fn is_broken(&self) -> bool { + matches!(&*self.state.borrow(), TimelineState::Broken { .. }) } pub fn is_active(&self) -> bool { self.current_state() == TimelineState::Active } + pub fn is_stopping(&self) -> bool { + self.current_state() == TimelineState::Stopping + } + pub fn subscribe_for_state_updates(&self) -> watch::Receiver { self.state.subscribe() } @@ -949,7 +1012,7 @@ impl Timeline { ) -> Result<(), TimelineState> { let mut receiver = self.state.subscribe(); loop { - let current_state = *receiver.borrow_and_update(); + let current_state = receiver.borrow().clone(); match current_state { TimelineState::Loading => { receiver @@ -1167,7 +1230,12 @@ impl Timeline { ), }); - let replaced = match batch_updates.replace_historic(local_layer, new_remote_layer)? { + let replaced = match batch_updates.replace_historic( + local_layer.layer_desc().clone(), + local_layer, + new_remote_layer.layer_desc().clone(), + new_remote_layer, + )? { Replacement::Replaced { .. 
} => { if let Err(e) = local_layer.delete_resident_layer_file() { error!("failed to remove layer file on evict after replacement: {e:#?}"); @@ -1275,6 +1343,13 @@ impl Timeline { .unwrap_or(default_tenant_conf.evictions_low_residence_duration_metric_threshold) } + fn get_gc_feedback(&self) -> bool { + let tenant_conf = self.tenant_conf.read().unwrap(); + tenant_conf + .gc_feedback + .unwrap_or(self.conf.default_tenant_conf.gc_feedback) + } + pub(super) fn tenant_conf_updated(&self) { // NB: Most tenant conf options are read by background loops, so, // changes will automatically be picked up. @@ -1309,6 +1384,8 @@ impl Timeline { walredo_mgr: Arc, remote_client: Option, pg_version: u32, + initial_logical_size_can_start: Option, + initial_logical_size_attempt: Option, ) -> Arc { let disk_consistent_lsn = metadata.disk_consistent_lsn(); let (state, _) = watch::channel(TimelineState::Loading); @@ -1317,15 +1394,7 @@ impl Timeline { let (layer_flush_done_tx, _) = tokio::sync::watch::channel((0, Ok(()))); let tenant_conf_guard = tenant_conf.read().unwrap(); - let wal_connect_timeout = tenant_conf_guard - .walreceiver_connect_timeout - .unwrap_or(conf.default_tenant_conf.walreceiver_connect_timeout); - let lagging_wal_timeout = tenant_conf_guard - .lagging_wal_timeout - .unwrap_or(conf.default_tenant_conf.lagging_wal_timeout); - let max_lsn_wal_lag = tenant_conf_guard - .max_lsn_wal_lag - .unwrap_or(conf.default_tenant_conf.max_lsn_wal_lag); + let evictions_low_residence_duration_metric_threshold = Self::get_evictions_low_residence_duration_metric_threshold( &tenant_conf_guard, @@ -1334,18 +1403,6 @@ impl Timeline { drop(tenant_conf_guard); Arc::new_cyclic(|myself| { - let walreceiver = WalReceiver::new( - TenantTimelineId::new(tenant_id, timeline_id), - Weak::clone(myself), - WalReceiverConf { - wal_connect_timeout, - lagging_wal_timeout, - max_lsn_wal_lag, - auth_token: crate::config::SAFEKEEPER_AUTH_TOKEN.get().cloned(), - availability_zone: conf.availability_zone.clone(), - }, - ); - let mut result = Timeline { conf, tenant_conf, @@ -1354,9 +1411,10 @@ impl Timeline { tenant_id, pg_version, layers: RwLock::new(LayerMap::default()), + wanted_image_layers: Mutex::new(None), walredo_mgr, - walreceiver, + walreceiver: Mutex::new(None), remote_client: remote_client.map(Arc::new), @@ -1421,6 +1479,10 @@ impl Timeline { eviction_task_timeline_state: tokio::sync::Mutex::new( EvictionTaskTimelineState::default(), ), + delete_lock: Arc::new(tokio::sync::Mutex::new(false)), + + initial_logical_size_can_start, + initial_logical_size_attempt: Mutex::new(initial_logical_size_attempt), }; result.repartition_threshold = result.get_checkpoint_distance() / 10; result @@ -1476,17 +1538,49 @@ impl Timeline { *flush_loop_state = FlushLoopState::Running; } - pub(super) fn launch_wal_receiver( - &self, + /// Creates and starts the wal receiver. + /// + /// This function is expected to be called at most once per Timeline's lifecycle + /// when the timeline is activated. 
+ fn launch_wal_receiver( + self: &Arc, ctx: &RequestContext, broker_client: BrokerClientChannel, - ) -> anyhow::Result<()> { + ) { info!( "launching WAL receiver for timeline {} of tenant {}", self.timeline_id, self.tenant_id ); - self.walreceiver.start(ctx, broker_client)?; - Ok(()) + + let tenant_conf_guard = self.tenant_conf.read().unwrap(); + let wal_connect_timeout = tenant_conf_guard + .walreceiver_connect_timeout + .unwrap_or(self.conf.default_tenant_conf.walreceiver_connect_timeout); + let lagging_wal_timeout = tenant_conf_guard + .lagging_wal_timeout + .unwrap_or(self.conf.default_tenant_conf.lagging_wal_timeout); + let max_lsn_wal_lag = tenant_conf_guard + .max_lsn_wal_lag + .unwrap_or(self.conf.default_tenant_conf.max_lsn_wal_lag); + drop(tenant_conf_guard); + + let mut guard = self.walreceiver.lock().unwrap(); + assert!( + guard.is_none(), + "multiple launches / re-launches of WAL receiver are not supported" + ); + *guard = Some(WalReceiver::start( + Arc::clone(self), + WalReceiverConf { + wal_connect_timeout, + lagging_wal_timeout, + max_lsn_wal_lag, + auth_token: crate::config::SAFEKEEPER_AUTH_TOKEN.get().cloned(), + availability_zone: self.conf.availability_zone.clone(), + }, + broker_client, + ctx, + )); } /// @@ -1537,7 +1631,7 @@ impl Timeline { trace!("found layer {}", layer.path().display()); total_physical_size += file_size; - updates.insert_historic(Arc::new(layer)); + updates.insert_historic(layer.layer_desc().clone(), Arc::new(layer)); num_layers += 1; } else if let Some(deltafilename) = DeltaFileName::parse_str(&fname) { // Create a DeltaLayer struct for each delta file. @@ -1569,7 +1663,7 @@ impl Timeline { trace!("found layer {}", layer.path().display()); total_physical_size += file_size; - updates.insert_historic(Arc::new(layer)); + updates.insert_historic(layer.layer_desc().clone(), Arc::new(layer)); num_layers += 1; } else if fname == METADATA_FILE_NAME || fname.ends_with(".old") { // ignore these @@ -1668,7 +1762,7 @@ impl Timeline { anyhow::bail!("could not rename file {local_layer_path:?}: {err:?}"); } else { self.metrics.resident_physical_size_gauge.sub(local_size); - updates.remove_historic(local_layer); + updates.remove_historic(local_layer.layer_desc().clone(), local_layer); // fall-through to adding the remote layer } } else { @@ -1707,7 +1801,7 @@ impl Timeline { ); let remote_layer = Arc::new(remote_layer); - updates.insert_historic(remote_layer); + updates.insert_historic(remote_layer.layer_desc().clone(), remote_layer); } LayerFileName::Delta(deltafilename) => { // Create a RemoteLayer for the delta file. @@ -1734,7 +1828,7 @@ impl Timeline { ), ); let remote_layer = Arc::new(remote_layer); - updates.insert_historic(remote_layer); + updates.insert_historic(remote_layer.layer_desc().clone(), remote_layer); } } } @@ -1877,9 +1971,30 @@ impl Timeline { false, // NB: don't log errors here, task_mgr will do that. async move { - // no cancellation here, because nothing really waits for this to complete compared + + let cancel = task_mgr::shutdown_token(); + + // in case we were created during pageserver initialization, wait for + // initialization to complete before proceeding. startup time init runs on the same + // runtime. + tokio::select! { + _ = cancel.cancelled() => { return Ok(()); }, + _ = completion::Barrier::maybe_wait(self_clone.initial_logical_size_can_start.clone()) => {} + }; + + // hold off background tasks from starting until all timelines get to try at least + // once initial logical size calculation; though retry will rarely be useful. 
+ // holding off is done because heavier tasks execute blockingly on the same + // runtime. + // + // dropping this at every outcome is probably better than trying to cling on to it, + // delay will be terminated by a timeout regardless. + let _completion = { self_clone.initial_logical_size_attempt.lock().expect("unexpected initial_logical_size_attempt poisoned").take() }; + + // no extra cancellation here, because nothing really waits for this to complete compared // to spawn_ondemand_logical_size_calculation. let cancel = CancellationToken::new(); + let calculated_size = match self_clone .logical_size_calculation_task(lsn, LogicalSizeCalculationCause::Initial, &background_ctx, cancel) .await @@ -2005,11 +2120,11 @@ impl Timeline { loop { match timeline_state_updates.changed().await { Ok(()) => { - let new_state = *timeline_state_updates.borrow(); + let new_state = timeline_state_updates.borrow().clone(); match new_state { // we're running this job for active timelines only TimelineState::Active => continue, - TimelineState::Broken + TimelineState::Broken { .. } | TimelineState::Stopping | TimelineState::Loading => { break format!("aborted because timeline became inactive (new state: {new_state:?})") @@ -2144,7 +2259,7 @@ impl Timeline { fn delete_historic_layer( &self, // we cannot remove layers otherwise, since gc and compaction will race - _layer_removal_cs: &tokio::sync::MutexGuard<'_, ()>, + _layer_removal_cs: Arc>, layer: Arc, updates: &mut BatchedUpdates<'_, dyn PersistentLayer>, ) -> anyhow::Result<()> { @@ -2161,7 +2276,7 @@ impl Timeline { // won't be needed for page reconstruction for this timeline, // and mark what we can't delete yet as deleted from the layer // map index without actually rebuilding the index. - updates.remove_historic(layer); + updates.remove_historic(layer.layer_desc().clone(), layer); Ok(()) } @@ -2223,6 +2338,9 @@ impl Timeline { let mut timeline_owned; let mut timeline = self; + let mut read_count = + scopeguard::guard(0, |cnt| self.metrics.read_num_fs_layers.observe(cnt as f64)); + // For debugging purposes, collect the path of layers that we traversed // through. It's included in the error message if we fail to find the key. let mut traversal_path = Vec::::new(); @@ -2357,6 +2475,7 @@ impl Timeline { Err(e) => return Err(PageReconstructError::from(e)), }; cont_lsn = lsn_floor; + // metrics: open_layer does not count as fs access, so we are not updating `read_count` traversal_path.push(( result, cont_lsn, @@ -2383,6 +2502,7 @@ impl Timeline { Err(e) => return Err(PageReconstructError::from(e)), }; cont_lsn = lsn_floor; + // metrics: open_layer does not count as fs access, so we are not updating `read_count` traversal_path.push(( result, cont_lsn, @@ -2417,6 +2537,7 @@ impl Timeline { Err(e) => return Err(PageReconstructError::from(e)), }; cont_lsn = lsn_floor; + *read_count += 1; traversal_path.push(( result, cont_lsn, @@ -2482,7 +2603,7 @@ impl Timeline { (DownloadBehavior::Error, false) => { return Err(PageReconstructError::NeedsDownload( TenantTimelineId::new(self.tenant_id, self.timeline_id), - remote_layer.file_name.clone(), + remote_layer.filename(), )) } } @@ -2608,7 +2729,7 @@ impl Timeline { /// Layer flusher task's main loop. async fn flush_loop( - &self, + self: &Arc, mut layer_flush_start_rx: tokio::sync::watch::Receiver, ctx: &RequestContext, ) { @@ -2697,9 +2818,9 @@ impl Timeline { } /// Flush one frozen in-memory layer to disk, as a new delta layer. 
- #[instrument(skip(self, frozen_layer, ctx), fields(tenant_id=%self.tenant_id, timeline_id=%self.timeline_id, layer=%frozen_layer.short_id()))] + #[instrument(skip_all, fields(tenant_id=%self.tenant_id, timeline_id=%self.timeline_id, layer=%frozen_layer.short_id()))] async fn flush_frozen_layer( - &self, + self: &Arc, frozen_layer: Arc, ctx: &RequestContext, ) -> anyhow::Result<()> { @@ -2719,7 +2840,16 @@ impl Timeline { .await? } else { // normal case, write out a L0 delta layer file. - let (delta_path, metadata) = self.create_delta_layer(&frozen_layer)?; + let this = self.clone(); + let frozen_layer = frozen_layer.clone(); + let span = tracing::info_span!("blocking"); + let (delta_path, metadata) = tokio::task::spawn_blocking(move || { + let _g = span.entered(); + this.create_delta_layer(&frozen_layer) + }) + .await + .context("create_delta_layer spawn_blocking") + .and_then(|res| res)?; HashMap::from([(delta_path, metadata)]) }; @@ -2823,7 +2953,7 @@ impl Timeline { // Write out the given frozen in-memory layer as a new L0 delta file fn create_delta_layer( - &self, + self: &Arc, frozen_layer: &InMemoryLayer, ) -> anyhow::Result<(LayerFileName, LayerFileMetadata)> { // Write it out @@ -2839,10 +2969,13 @@ impl Timeline { // TODO: If we're running inside 'flush_frozen_layers' and there are multiple // files to flush, it might be better to first write them all, and then fsync // them all in parallel. - par_fsync::par_fsync(&[ - new_delta_path.clone(), - self.conf.timeline_path(&self.timeline_id, &self.tenant_id), - ])?; + + // First sync the delta layer. We still use par_fsync here to keep everything consistent. Feel free to replace + // this with a single fsync in future refactors. + par_fsync::par_fsync(&[new_delta_path.clone()]).context("fsync of delta layer")?; + // Then sync the parent directory. + par_fsync::par_fsync(&[self.conf.timeline_path(&self.timeline_id, &self.tenant_id)]) + .context("fsync of timeline dir")?; // Add it to the layer map let l = Arc::new(new_delta); @@ -2853,7 +2986,7 @@ impl Timeline { LayerResidenceStatus::Resident, LayerResidenceEventReason::LayerCreate, ); - batch_updates.insert_historic(l); + batch_updates.insert_historic(l.layer_desc().clone(), l); batch_updates.flush(); // update the timeline's physical size @@ -2904,6 +3037,30 @@ impl Timeline { let layers = self.layers.read().unwrap(); let mut max_deltas = 0; + { + let wanted_image_layers = self.wanted_image_layers.lock().unwrap(); + if let Some((cutoff_lsn, wanted)) = &*wanted_image_layers { + let img_range = + partition.ranges.first().unwrap().start..partition.ranges.last().unwrap().end; + if wanted.overlaps(&img_range) { + // + // gc_timeline only pays attention to image layers that are older than the GC cutoff, + // but create_image_layers creates image layers at last-record-lsn. + // So it's possible that gc_timeline wants a new image layer to be created for a key range, + // but the range is already covered by image layers at more recent LSNs. Before we + // create a new image layer, check if the range is already covered at more recent LSNs. + if !layers + .image_layer_exists(&img_range, &(Lsn::min(lsn, *cutoff_lsn)..lsn + 1))? 
+ { + debug!( + "Force generation of layer {}-{} wanted by GC, cutoff={}, lsn={})", + img_range.start, img_range.end, cutoff_lsn, lsn + ); + return Ok(true); + } + } + } + } for part_range in &partition.ranges { let image_coverage = layers.image_coverage(part_range, lsn)?; @@ -2979,6 +3136,7 @@ impl Timeline { self.tenant_id, &img_range, lsn, + false, // image layer always covers the full range )?; fail_point!("image-layer-writer-fail-before-finish", |_| { @@ -3023,6 +3181,12 @@ impl Timeline { image_layers.push(image_layer); } } + // All layers that the GC wanted us to create have now been created. + // + // It's possible that another GC cycle happened while we were compacting, and added + // something new to wanted_image_layers, and we now clear that before processing it. + // That's OK, because the next GC iteration will put it back in. + *self.wanted_image_layers.lock().unwrap() = None; // Sync the new layer to disk before adding it to the layer map, to make sure // we don't garbage collect something based on the new layer, before it has @@ -3036,17 +3200,22 @@ impl Timeline { let all_paths = image_layers .iter() .map(|layer| layer.path()) - .chain(std::iter::once( - self.conf.timeline_path(&self.timeline_id, &self.tenant_id), - )) .collect::>(); - par_fsync::par_fsync(&all_paths).context("fsync of newly created layer files")?; + + par_fsync::par_fsync_async(&all_paths) + .await + .context("fsync of newly created layer files")?; + + par_fsync::par_fsync_async(&[self.conf.timeline_path(&self.timeline_id, &self.tenant_id)]) + .await + .context("fsync of timeline dir")?; let mut layer_paths_to_upload = HashMap::with_capacity(image_layers.len()); let mut layers = self.layers.write().unwrap(); let mut updates = layers.batch_update(); let timeline_path = self.conf.timeline_path(&self.timeline_id, &self.tenant_id); + for l in image_layers { let path = l.filename(); let metadata = timeline_path @@ -3065,7 +3234,7 @@ impl Timeline { LayerResidenceStatus::Resident, LayerResidenceEventReason::LayerCreate, ); - updates.insert_historic(l); + updates.insert_historic(l.layer_desc().clone(), l); } updates.flush(); drop(layers); @@ -3105,9 +3274,9 @@ impl Timeline { /// This method takes the `_layer_removal_cs` guard to highlight it required downloads are /// returned as an error. If the `layer_removal_cs` boundary is changed not to be taken in the /// start of level0 files compaction, the on-demand download should be revisited as well. - async fn compact_level0_phase1( + fn compact_level0_phase1( &self, - _layer_removal_cs: &tokio::sync::MutexGuard<'_, ()>, + _layer_removal_cs: Arc>, target_file_size: u64, ctx: &RequestContext, ) -> Result { @@ -3420,13 +3589,13 @@ impl Timeline { if !new_layers.is_empty() { let mut layer_paths: Vec = new_layers.iter().map(|l| l.path()).collect(); - // also sync the directory - layer_paths.push(self.conf.timeline_path(&self.timeline_id, &self.tenant_id)); - // Fsync all the layer files and directory using multiple threads to // minimize latency. par_fsync::par_fsync(&layer_paths).context("fsync all new layers")?; + par_fsync::par_fsync(&[self.conf.timeline_path(&self.timeline_id, &self.tenant_id)]) + .context("fsync of timeline dir")?; + layer_paths.pop().unwrap(); } @@ -3443,17 +3612,26 @@ impl Timeline { /// as Level 1 files. 
/// async fn compact_level0( - &self, - layer_removal_cs: &tokio::sync::MutexGuard<'_, ()>, + self: &Arc, + layer_removal_cs: Arc>, target_file_size: u64, ctx: &RequestContext, ) -> Result<(), CompactionError> { + let this = self.clone(); + let ctx_inner = ctx.clone(); + let layer_removal_cs_inner = layer_removal_cs.clone(); + let span = tracing::info_span!("blocking"); let CompactLevel0Phase1Result { new_layers, deltas_to_compact, - } = self - .compact_level0_phase1(layer_removal_cs, target_file_size, ctx) - .await?; + } = tokio::task::spawn_blocking(move || { + let _g = span.entered(); + this.compact_level0_phase1(layer_removal_cs_inner, target_file_size, &ctx_inner) + }) + .await + .context("compact_level0_phase1 spawn_blocking") + .map_err(CompactionError::Other) + .and_then(|res| res)?; if new_layers.is_empty() && deltas_to_compact.is_empty() { // nothing to do @@ -3503,7 +3681,7 @@ impl Timeline { LayerResidenceStatus::Resident, LayerResidenceEventReason::LayerCreate, ); - updates.insert_historic(x); + updates.insert_historic(x.layer_desc().clone(), x); } // Now that we have reshuffled the data to set of new delta layers, we can @@ -3511,7 +3689,7 @@ impl Timeline { let mut layer_names_to_delete = Vec::with_capacity(deltas_to_compact.len()); for l in deltas_to_compact { layer_names_to_delete.push(l.filename()); - self.delete_historic_layer(layer_removal_cs, l, &mut updates)?; + self.delete_historic_layer(layer_removal_cs.clone(), l, &mut updates)?; } updates.flush(); drop(layers); @@ -3631,10 +3809,9 @@ impl Timeline { fail_point!("before-timeline-gc"); - let layer_removal_cs = self.layer_removal_cs.lock().await; + let layer_removal_cs = Arc::new(self.layer_removal_cs.clone().lock_owned().await); // Is the timeline being deleted? - let state = *self.state.borrow(); - if state == TimelineState::Stopping { + if self.is_stopping() { anyhow::bail!("timeline is Stopping"); } @@ -3651,7 +3828,7 @@ impl Timeline { let res = self .gc_timeline( - &layer_removal_cs, + layer_removal_cs.clone(), horizon_cutoff, pitr_cutoff, retain_lsns, @@ -3670,7 +3847,7 @@ impl Timeline { async fn gc_timeline( &self, - layer_removal_cs: &tokio::sync::MutexGuard<'_, ()>, + layer_removal_cs: Arc>, horizon_cutoff: Lsn, pitr_cutoff: Lsn, retain_lsns: Vec, @@ -3720,6 +3897,7 @@ impl Timeline { } let mut layers_to_remove = Vec::new(); + let mut wanted_image_layers = KeySpaceRandomAccum::default(); // Scan all layers in the timeline (remote or on-disk). // @@ -3803,6 +3981,15 @@ impl Timeline { "keeping {} because it is the latest layer", l.filename().file_name() ); + // Collect delta key ranges that need image layers to allow garbage + // collecting the layers. + // It is not so obvious whether we need to propagate information only about + // delta layers. Image layers can form "stairs" preventing old image from been deleted. + // But image layers are in any case less sparse than delta layers. Also we need some + // protection from replacing recent image layers with new one after each GC iteration. 
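// Editorial sketch of the GC -> compaction feedback loop described in the
// comments above, reduced to a self-contained model. The real code stores a
// (cutoff LSN, KeySpace) pair in `Timeline::wanted_image_layers`; the types
// here are simplified stand-ins for illustration only.
//
//     struct WantedImageLayers {
//         gc_cutoff_lsn: u64,                // cutoff the ranges were computed against
//         ranges: Vec<std::ops::Range<u64>>, // key ranges GC could not collect
//     }
//
//     impl WantedImageLayers {
//         // GC side: remember a delta layer's key range that blocks collection.
//         fn add(&mut self, range: std::ops::Range<u64>) {
//             self.ranges.push(range);
//         }
//         // Compaction side: does a candidate image range overlap a wanted range?
//         fn wants(&self, candidate: &std::ops::Range<u64>) -> bool {
//             self.ranges
//                 .iter()
//                 .any(|r| r.start < candidate.end && candidate.start < r.end)
//         }
//     }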
+ if self.get_gc_feedback() && l.is_incremental() && !LayerMap::is_l0(&*l) { + wanted_image_layers.add_range(l.get_key_range()); + } result.layers_not_updated += 1; continue 'outer; } @@ -3815,6 +4002,10 @@ impl Timeline { ); layers_to_remove.push(Arc::clone(&l)); } + self.wanted_image_layers + .lock() + .unwrap() + .replace((new_gc_cutoff, wanted_image_layers.to_keyspace())); let mut updates = layers.batch_update(); if !layers_to_remove.is_empty() { @@ -3829,7 +4020,11 @@ impl Timeline { { for doomed_layer in layers_to_remove { layer_names_to_delete.push(doomed_layer.filename()); - self.delete_historic_layer(layer_removal_cs, doomed_layer, &mut updates)?; // FIXME: schedule succeeded deletions before returning? + self.delete_historic_layer( + layer_removal_cs.clone(), + doomed_layer, + &mut updates, + )?; // FIXME: schedule succeeded deletions before returning? result.layers_removed += 1; } } @@ -4001,7 +4196,7 @@ impl Timeline { // Does retries + exponential back-off internally. // When this fails, don't layer further retry attempts here. let result = remote_client - .download_layer_file(&remote_layer.file_name, &remote_layer.layer_metadata) + .download_layer_file(&remote_layer.filename(), &remote_layer.layer_metadata) .await; if let Ok(size) = &result { @@ -4019,7 +4214,7 @@ impl Timeline { { use crate::tenant::layer_map::Replacement; let l: Arc = remote_layer.clone(); - let failure = match updates.replace_historic(&l, new_layer) { + let failure = match updates.replace_historic(l.layer_desc().clone(), &l, new_layer.layer_desc().clone(), new_layer) { Ok(Replacement::Replaced { .. }) => false, Ok(Replacement::NotFound) => { // TODO: the downloaded file should probably be removed, otherwise diff --git a/pageserver/src/tenant/timeline/eviction_task.rs b/pageserver/src/tenant/timeline/eviction_task.rs index 558600692e..1040dff63d 100644 --- a/pageserver/src/tenant/timeline/eviction_task.rs +++ b/pageserver/src/tenant/timeline/eviction_task.rs @@ -34,6 +34,8 @@ use crate::{ }, }; +use utils::completion; + use super::Timeline; #[derive(Default)] @@ -47,8 +49,12 @@ pub struct EvictionTaskTenantState { } impl Timeline { - pub(super) fn launch_eviction_task(self: &Arc) { + pub(super) fn launch_eviction_task( + self: &Arc, + background_tasks_can_start: Option<&completion::Barrier>, + ) { let self_clone = Arc::clone(self); + let background_tasks_can_start = background_tasks_can_start.cloned(); task_mgr::spawn( BACKGROUND_RUNTIME.handle(), TaskKind::Eviction, @@ -57,7 +63,13 @@ impl Timeline { &format!("layer eviction for {}/{}", self.tenant_id, self.timeline_id), false, async move { - self_clone.eviction_task(task_mgr::shutdown_token()).await; + let cancel = task_mgr::shutdown_token(); + tokio::select! 
{ + _ = cancel.cancelled() => { return Ok(()); } + _ = completion::Barrier::maybe_wait(background_tasks_can_start) => {} + }; + + self_clone.eviction_task(cancel).await; info!("eviction task finishing"); Ok(()) }, diff --git a/pageserver/src/tenant/timeline/walreceiver.rs b/pageserver/src/tenant/timeline/walreceiver.rs index 91f7208194..ccff735c3c 100644 --- a/pageserver/src/tenant/timeline/walreceiver.rs +++ b/pageserver/src/tenant/timeline/walreceiver.rs @@ -25,20 +25,19 @@ mod walreceiver_connection; use crate::context::{DownloadBehavior, RequestContext}; use crate::task_mgr::{self, TaskKind, WALRECEIVER_RUNTIME}; +use crate::tenant::debug_assert_current_span_has_tenant_and_timeline_id; use crate::tenant::timeline::walreceiver::connection_manager::{ connection_manager_loop_step, ConnectionManagerState, }; -use anyhow::Context; use std::future::Future; use std::num::NonZeroU64; use std::ops::ControlFlow; -use std::sync::atomic::{self, AtomicBool}; -use std::sync::{Arc, Weak}; +use std::sync::Arc; use std::time::Duration; use storage_broker::BrokerClientChannel; use tokio::select; -use tokio::sync::{watch, RwLock}; +use tokio::sync::watch; use tokio_util::sync::CancellationToken; use tracing::*; @@ -62,46 +61,23 @@ pub struct WalReceiverConf { pub struct WalReceiver { timeline: TenantTimelineId, - timeline_ref: Weak, - conf: WalReceiverConf, - started: AtomicBool, - manager_status: Arc>>, + manager_status: Arc>>, } impl WalReceiver { - pub fn new( - timeline: TenantTimelineId, - timeline_ref: Weak, - conf: WalReceiverConf, - ) -> Self { - Self { - timeline, - timeline_ref, - conf, - started: AtomicBool::new(false), - manager_status: Arc::new(RwLock::new(None)), - } - } - pub fn start( - &self, - ctx: &RequestContext, + timeline: Arc, + conf: WalReceiverConf, mut broker_client: BrokerClientChannel, - ) -> anyhow::Result<()> { - if self.started.load(atomic::Ordering::Acquire) { - anyhow::bail!("Wal receiver is already started"); - } - - let timeline = self.timeline_ref.upgrade().with_context(|| { - format!("walreceiver start on a dropped timeline {}", self.timeline) - })?; - + ctx: &RequestContext, + ) -> Self { let tenant_id = timeline.tenant_id; let timeline_id = timeline.timeline_id; let walreceiver_ctx = ctx.detached_child(TaskKind::WalReceiverManager, DownloadBehavior::Error); - let wal_receiver_conf = self.conf.clone(); - let loop_status = Arc::clone(&self.manager_status); + + let loop_status = Arc::new(std::sync::RwLock::new(None)); + let manager_status = Arc::clone(&loop_status); task_mgr::spawn( WALRECEIVER_RUNTIME.handle(), TaskKind::WalReceiverManager, @@ -110,15 +86,16 @@ impl WalReceiver { &format!("walreceiver for timeline {tenant_id}/{timeline_id}"), false, async move { - info!("WAL receiver manager started, connecting to broker"); + debug_assert_current_span_has_tenant_and_timeline_id(); + debug!("WAL receiver manager started, connecting to broker"); let mut connection_manager_state = ConnectionManagerState::new( timeline, - wal_receiver_conf, + conf, ); loop { select! 
{ _ = task_mgr::shutdown_watcher() => { - info!("WAL receiver shutdown requested, shutting down"); + trace!("WAL receiver shutdown requested, shutting down"); break; }, loop_step_result = connection_manager_loop_step( @@ -129,7 +106,7 @@ impl WalReceiver { ) => match loop_step_result { ControlFlow::Continue(()) => continue, ControlFlow::Break(()) => { - info!("Connection manager loop ended, shutting down"); + trace!("Connection manager loop ended, shutting down"); break; } }, @@ -137,29 +114,29 @@ impl WalReceiver { } connection_manager_state.shutdown().await; - *loop_status.write().await = None; + *loop_status.write().unwrap() = None; Ok(()) } - .instrument(info_span!(parent: None, "wal_connection_manager", tenant = %tenant_id, timeline = %timeline_id)) + .instrument(info_span!(parent: None, "wal_connection_manager", tenant_id = %tenant_id, timeline_id = %timeline_id)) ); - self.started.store(true, atomic::Ordering::Release); - - Ok(()) + Self { + timeline: TenantTimelineId::new(tenant_id, timeline_id), + manager_status, + } } - pub async fn stop(&self) { + pub async fn stop(self) { task_mgr::shutdown_tasks( Some(TaskKind::WalReceiverManager), Some(self.timeline.tenant_id), Some(self.timeline.timeline_id), ) .await; - self.started.store(false, atomic::Ordering::Release); } - pub(super) async fn status(&self) -> Option { - self.manager_status.read().await.clone() + pub(super) fn status(&self) -> Option { + self.manager_status.read().unwrap().clone() } } @@ -223,29 +200,19 @@ impl TaskHandle { TaskEvent::End(match self.join_handle.as_mut() { Some(jh) => { if !jh.is_finished() { - // Barring any implementation errors in this module, we can - // only arrive here while the task that executes the future - // passed to `Self::spawn()` is still execution. Cf the comment - // in Self::spawn(). - // - // This was logging at warning level in earlier versions, presumably - // to leave some breadcrumbs in case we had an implementation - // error that would would make us get stuck in `jh.await`. - // - // There hasn't been such a bug so far. - // But in a busy system, e.g., during pageserver restart, - // we arrive here often enough that the warning-level logs - // became a distraction. - // So, tone them down to info-level. - // - // XXX: rewrite this module to eliminate the race condition. - info!("sender is dropped while join handle is still alive"); + // See: https://github.com/neondatabase/neon/issues/2885 + trace!("sender is dropped while join handle is still alive"); } - let res = jh - .await - .map_err(|e| anyhow::anyhow!("Failed to join task: {e}")) - .and_then(|x| x); + let res = match jh.await { + Ok(res) => res, + Err(je) if je.is_cancelled() => unreachable!("not used"), + Err(je) if je.is_panic() => { + // already logged + Ok(()) + } + Err(je) => Err(anyhow::Error::new(je).context("join walreceiver task")), + }; // For cancellation-safety, drop join_handle only after successful .await. 
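// Editorial sketch: the JoinError classification above, in isolation. Assumes
// the tokio and anyhow crates; treating cancellation as unreachable is this
// module's invariant (its tasks are never aborted), not a tokio guarantee.
//
//     async fn join_and_classify(jh: tokio::task::JoinHandle<anyhow::Result<()>>) -> anyhow::Result<()> {
//         match jh.await {
//             Ok(task_result) => task_result,
//             Err(je) if je.is_cancelled() => unreachable!("tasks are not aborted here"),
//             // The panic has already been logged by the runtime's panic handling.
//             Err(je) if je.is_panic() => Ok(()),
//             Err(je) => Err(anyhow::Error::new(je).context("join task")),
//         }
//     }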
self.join_handle = None; @@ -268,12 +235,12 @@ impl TaskHandle { match jh.await { Ok(Ok(())) => debug!("Shutdown success"), Ok(Err(e)) => error!("Shutdown task error: {e:?}"), - Err(join_error) => { - if join_error.is_cancelled() { - error!("Shutdown task was cancelled"); - } else { - error!("Shutdown task join error: {join_error}") - } + Err(je) if je.is_cancelled() => unreachable!("not used"), + Err(je) if je.is_panic() => { + // already logged + } + Err(je) => { + error!("Shutdown task join error: {je}") } } } diff --git a/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs b/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs index 2305844d75..a5d0af32fe 100644 --- a/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs +++ b/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs @@ -18,7 +18,7 @@ use crate::metrics::{ WALRECEIVER_CANDIDATES_REMOVED, WALRECEIVER_SWITCHES, }; use crate::task_mgr::TaskKind; -use crate::tenant::Timeline; +use crate::tenant::{debug_assert_current_span_has_tenant_and_timeline_id, Timeline}; use anyhow::Context; use chrono::{NaiveDateTime, Utc}; use pageserver_api::models::TimelineState; @@ -29,7 +29,6 @@ use storage_broker::proto::TenantTimelineId as ProtoTenantTimelineId; use storage_broker::BrokerClientChannel; use storage_broker::Streaming; use tokio::select; -use tokio::sync::RwLock; use tracing::*; use crate::{exponential_backoff, DEFAULT_BASE_BACKOFF_SECONDS, DEFAULT_MAX_BACKOFF_SECONDS}; @@ -48,7 +47,7 @@ pub(super) async fn connection_manager_loop_step( broker_client: &mut BrokerClientChannel, connection_manager_state: &mut ConnectionManagerState, ctx: &RequestContext, - manager_status: &RwLock>, + manager_status: &std::sync::RwLock>, ) -> ControlFlow<(), ()> { match connection_manager_state .timeline @@ -56,8 +55,11 @@ pub(super) async fn connection_manager_loop_step( .await { Ok(()) => {} - Err(_) => { - info!("Timeline dropped state updates sender before becoming active, stopping wal connection manager loop"); + Err(new_state) => { + debug!( + ?new_state, + "state changed, stopping wal connection manager loop" + ); return ControlFlow::Break(()); } } @@ -80,7 +82,7 @@ pub(super) async fn connection_manager_loop_step( // with other streams on this client (other connection managers). When // object goes out of scope, stream finishes in drop() automatically. let mut broker_subscription = subscribe_for_timeline_updates(broker_client, id).await; - info!("Subscribed for broker timeline updates"); + debug!("Subscribed for broker timeline updates"); loop { let time_until_next_retry = connection_manager_state.time_until_next_retry(); @@ -151,13 +153,13 @@ pub(super) async fn connection_manager_loop_step( match new_state { // we're already active as walreceiver, no need to reactivate TimelineState::Active => continue, - TimelineState::Broken | TimelineState::Stopping => { - info!("timeline entered terminal state {new_state:?}, stopping wal connection manager loop"); + TimelineState::Broken { .. 
} | TimelineState::Stopping => { + debug!("timeline entered terminal state {new_state:?}, stopping wal connection manager loop"); return ControlFlow::Break(()); } TimelineState::Loading => { warn!("timeline transitioned back to Loading state, that should not happen"); - return ControlFlow::Continue(new_state); + return ControlFlow::Continue(()); } } } @@ -165,12 +167,11 @@ pub(super) async fn connection_manager_loop_step( } } } => match new_event { - ControlFlow::Continue(new_state) => { - info!("observed timeline state change, new state is {new_state:?}"); + ControlFlow::Continue(()) => { return ControlFlow::Continue(()); } ControlFlow::Break(()) => { - info!("Timeline dropped state updates sender, stopping wal connection manager loop"); + debug!("Timeline is no longer active, stopping wal connection manager loop"); return ControlFlow::Break(()); } }, @@ -195,7 +196,7 @@ pub(super) async fn connection_manager_loop_step( .change_connection(new_candidate, ctx) .await } - *manager_status.write().await = Some(connection_manager_state.manager_status()); + *manager_status.write().unwrap() = Some(connection_manager_state.manager_status()); } } @@ -391,7 +392,6 @@ impl ConnectionManagerState { self.drop_old_connection(true).await; - let id = self.id; let node_id = new_sk.safekeeper_id; let connect_timeout = self.conf.wal_connect_timeout; let timeline = Arc::clone(&self.timeline); @@ -399,9 +399,13 @@ impl ConnectionManagerState { TaskKind::WalReceiverConnectionHandler, DownloadBehavior::Download, ); + + let span = info_span!("connection", %node_id); let connection_handle = TaskHandle::spawn(move |events_sender, cancellation| { async move { - super::walreceiver_connection::handle_walreceiver_connection( + debug_assert_current_span_has_tenant_and_timeline_id(); + + let res = super::walreceiver_connection::handle_walreceiver_connection( timeline, new_sk.wal_source_connconf, events_sender, @@ -410,12 +414,23 @@ impl ConnectionManagerState { ctx, node_id, ) - .await - .context("walreceiver connection handling failure") + .await; + + match res { + Ok(()) => Ok(()), + Err(e) => { + use super::walreceiver_connection::ExpectedError; + if e.is_expected() { + info!("walreceiver connection handling ended: {e:#}"); + Ok(()) + } else { + // give out an error to have task_mgr give it a really verbose logging + Err(e).context("walreceiver connection handling failure") + } + } + } } - .instrument( - info_span!("walreceiver_connection", tenant_id = %id.tenant_id, timeline_id = %id.timeline_id, %node_id), - ) + .instrument(span) }); let now = Utc::now().naive_utc(); @@ -1309,9 +1324,8 @@ mod tests { async fn dummy_state(harness: &TenantHarness<'_>) -> ConnectionManagerState { let (tenant, ctx) = harness.load().await; let timeline = tenant - .create_empty_timeline(TIMELINE_ID, Lsn(0), crate::DEFAULT_PG_VERSION, &ctx) + .create_test_timeline(TIMELINE_ID, Lsn(0), crate::DEFAULT_PG_VERSION, &ctx) .expect("Failed to create an empty timeline for dummy wal connection manager"); - let timeline = timeline.initialize(&ctx).unwrap(); ConnectionManagerState { id: TenantTimelineId { diff --git a/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs b/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs index 1cbed3416c..a16afe2b3c 100644 --- a/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs +++ b/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs @@ -21,16 +21,16 @@ use postgres_types::PgLsn; use tokio::{select, sync::watch, time}; use 
tokio_postgres::{replication::ReplicationStream, Client}; use tokio_util::sync::CancellationToken; -use tracing::{debug, error, info, trace, warn}; +use tracing::{debug, error, info, trace, warn, Instrument}; use super::TaskStateUpdate; -use crate::metrics::LIVE_CONNECTIONS_COUNT; -use crate::{context::RequestContext, metrics::WALRECEIVER_STARTED_CONNECTIONS}; use crate::{ + context::RequestContext, + metrics::{LIVE_CONNECTIONS_COUNT, WALRECEIVER_STARTED_CONNECTIONS}, task_mgr, task_mgr::TaskKind, task_mgr::WALRECEIVER_RUNTIME, - tenant::{Timeline, WalReceiverInfo}, + tenant::{debug_assert_current_span_has_tenant_and_timeline_id, Timeline, WalReceiverInfo}, walingest::WalIngest, walrecord::DecodedWALRecord, }; @@ -81,13 +81,8 @@ pub(super) async fn handle_walreceiver_connection( config.application_name("pageserver"); config.replication_mode(tokio_postgres::config::ReplicationMode::Physical); match time::timeout(connect_timeout, config.connect(postgres::NoTls)).await { - Ok(Ok(client_and_conn)) => client_and_conn, - Ok(Err(conn_err)) => { - let expected_error = ignore_expected_errors(conn_err)?; - info!("DB connection stream finished: {expected_error}"); - return Ok(()); - } - Err(_) => { + Ok(client_and_conn) => client_and_conn?, + Err(_elapsed) => { // Timing out to connect to a safekeeper node could happen long time, due to // many reasons that pageserver cannot control. // Do not produce an error, but make it visible, that timeouts happen by logging the `event. @@ -97,7 +92,7 @@ pub(super) async fn handle_walreceiver_connection( } }; - info!("connected!"); + debug!("connected!"); let mut connection_status = WalConnectionStatus { is_connected: true, has_processed_wal: false, @@ -127,20 +122,25 @@ pub(super) async fn handle_walreceiver_connection( "walreceiver connection", false, async move { + debug_assert_current_span_has_tenant_and_timeline_id(); + select! { connection_result = connection => match connection_result { - Ok(()) => info!("Walreceiver db connection closed"), + Ok(()) => debug!("Walreceiver db connection closed"), Err(connection_error) => { - if let Err(e) = ignore_expected_errors(connection_error) { - warn!("Connection aborted: {e:#}") + if connection_error.is_expected() { + // silence, because most likely we've already exited the outer call + // with a similar error. + } else { + warn!("Connection aborted: {connection_error:#}") } } }, - // Future: replace connection_cancellation with connection_ctx cancellation - _ = connection_cancellation.cancelled() => info!("Connection cancelled"), + _ = connection_cancellation.cancelled() => debug!("Connection cancelled"), } Ok(()) - }, + } + .instrument(tracing::info_span!("poller")), ); // Immediately increment the gauge, then create a job to decrement it on task exit. @@ -203,20 +203,13 @@ pub(super) async fn handle_walreceiver_connection( while let Some(replication_message) = { select! 
{ _ = cancellation.cancelled() => { - info!("walreceiver interrupted"); + debug!("walreceiver interrupted"); None } replication_message = physical_stream.next() => replication_message, } } { - let replication_message = match replication_message { - Ok(message) => message, - Err(replication_error) => { - let expected_error = ignore_expected_errors(replication_error)?; - info!("Replication stream finished: {expected_error}"); - return Ok(()); - } - }; + let replication_message = replication_message?; let now = Utc::now().naive_utc(); let last_rec_lsn_before_msg = last_rec_lsn; @@ -261,8 +254,6 @@ pub(super) async fn handle_walreceiver_connection( let mut decoded = DecodedWALRecord::default(); let mut modification = timeline.begin_modification(endlsn); while let Some((lsn, recdata)) = waldecoder.poll_decode()? { - // let _enter = info_span!("processing record", lsn = %lsn).entered(); - // It is important to deal with the aligned records as lsn in getPage@LSN is // aligned and can be several bytes bigger. Without this alignment we are // at risk of hitting a deadlock. @@ -421,31 +412,50 @@ async fn identify_system(client: &mut Client) -> anyhow::Result } } -/// We don't want to report connectivity problems as real errors towards connection manager because -/// 1. they happen frequently enough to make server logs hard to read and -/// 2. the connection manager can retry other safekeeper. -/// -/// If this function returns `Ok(pg_error)`, it's such an error. -/// The caller should log it at info level and then report to connection manager that we're done handling this connection. -/// Connection manager will then handle reconnections. -/// -/// If this function returns an `Err()`, the caller can bubble it up using `?`. -/// The connection manager will log the error at ERROR level. -fn ignore_expected_errors(pg_error: postgres::Error) -> anyhow::Result { - if pg_error.is_closed() - || pg_error - .source() - .and_then(|source| source.downcast_ref::()) - .map(is_expected_io_error) - .unwrap_or(false) - { - return Ok(pg_error); - } else if let Some(db_error) = pg_error.as_db_error() { - if db_error.code() == &SqlState::SUCCESSFUL_COMPLETION - && db_error.message().contains("ending streaming") - { - return Ok(pg_error); - } - } - Err(pg_error).context("connection error") +/// Trait for avoid reporting walreceiver specific expected or "normal" or "ok" errors. +pub(super) trait ExpectedError { + /// Test if this error is an ok error. + /// + /// We don't want to report connectivity problems as real errors towards connection manager because + /// 1. they happen frequently enough to make server logs hard to read and + /// 2. the connection manager can retry other safekeeper. + /// + /// If this function returns `true`, it's such an error. + /// The caller should log it at info level and then report to connection manager that we're done handling this connection. + /// Connection manager will then handle reconnections. + /// + /// If this function returns an `false` the error should be propagated and the connection manager + /// will log the error at ERROR level. 
+ fn is_expected(&self) -> bool; +} + +impl ExpectedError for postgres::Error { + fn is_expected(&self) -> bool { + self.is_closed() + || self + .source() + .and_then(|source| source.downcast_ref::()) + .map(is_expected_io_error) + .unwrap_or(false) + || self + .as_db_error() + .filter(|db_error| { + db_error.code() == &SqlState::SUCCESSFUL_COMPLETION + && db_error.message().contains("ending streaming") + }) + .is_some() + } +} + +impl ExpectedError for anyhow::Error { + fn is_expected(&self) -> bool { + let head = self.downcast_ref::(); + + let tail = self + .chain() + .filter_map(|e| e.downcast_ref::()); + + // check if self or any of the chained/sourced errors are expected + head.into_iter().chain(tail).any(|e| e.is_expected()) + } } diff --git a/pageserver/src/tenant/upload_queue.rs b/pageserver/src/tenant/upload_queue.rs index 8f5faff627..a62cc99adf 100644 --- a/pageserver/src/tenant/upload_queue.rs +++ b/pageserver/src/tenant/upload_queue.rs @@ -76,6 +76,12 @@ pub(crate) struct UploadQueueInitialized { pub(crate) queued_operations: VecDeque, } +impl UploadQueueInitialized { + pub(super) fn no_pending_work(&self) -> bool { + self.inprogress_tasks.is_empty() && self.queued_operations.is_empty() + } +} + #[derive(Clone, Copy)] pub(super) enum SetDeletedFlagProgress { NotRunning, @@ -84,9 +90,7 @@ pub(super) enum SetDeletedFlagProgress { } pub(super) struct UploadQueueStopped { - pub(super) latest_files: HashMap, - pub(super) last_uploaded_consistent_lsn: Lsn, - pub(super) latest_metadata: TimelineMetadata, + pub(super) upload_queue_for_deletion: UploadQueueInitialized, pub(super) deleted_at: SetDeletedFlagProgress, } @@ -187,6 +191,15 @@ impl UploadQueue { UploadQueue::Initialized(x) => Ok(x), } } + + pub(crate) fn stopped_mut(&mut self) -> anyhow::Result<&mut UploadQueueStopped> { + match self { + UploadQueue::Initialized(_) | UploadQueue::Uninitialized => { + anyhow::bail!("queue is in state {}", self.as_str()) + } + UploadQueue::Stopped(stopped) => Ok(stopped), + } + } } /// An in-progress upload or delete task. @@ -199,6 +212,13 @@ pub(crate) struct UploadTask { pub(crate) op: UploadOp, } +#[derive(Debug)] +pub(crate) struct Delete { + pub(crate) file_kind: RemoteOpFileKind, + pub(crate) layer_file_name: LayerFileName, + pub(crate) scheduled_from_timeline_delete: bool, +} + #[derive(Debug)] pub(crate) enum UploadOp { /// Upload a layer file @@ -207,8 +227,8 @@ pub(crate) enum UploadOp { /// Upload the metadata file UploadMetadata(IndexPart, Lsn), - /// Delete a file. - Delete(RemoteOpFileKind, LayerFileName), + /// Delete a layer file + Delete(Delete), /// Barrier. 
When the barrier operation is reached, Barrier(tokio::sync::watch::Sender<()>), @@ -226,7 +246,12 @@ impl std::fmt::Display for UploadOp { ) } UploadOp::UploadMetadata(_, lsn) => write!(f, "UploadMetadata(lsn: {})", lsn), - UploadOp::Delete(_, path) => write!(f, "Delete({})", path.file_name()), + UploadOp::Delete(delete) => write!( + f, + "Delete(path: {}, scheduled_from_timeline_delete: {})", + delete.layer_file_name.file_name(), + delete.scheduled_from_timeline_delete + ), UploadOp::Barrier(_) => write!(f, "Barrier"), } } diff --git a/pgxn/hnsw/Makefile b/pgxn/hnsw/Makefile new file mode 100644 index 0000000000..66436b5920 --- /dev/null +++ b/pgxn/hnsw/Makefile @@ -0,0 +1,26 @@ +EXTENSION = hnsw +EXTVERSION = 0.1.0 + +MODULE_big = hnsw +DATA = $(wildcard *--*.sql) +OBJS = hnsw.o hnswalg.o + +TESTS = $(wildcard test/sql/*.sql) +REGRESS = $(patsubst test/sql/%.sql,%,$(TESTS)) +REGRESS_OPTS = --inputdir=test --load-extension=hnsw + +# For auto-vectorization: +# - GCC (needs -ftree-vectorize OR -O3) - https://gcc.gnu.org/projects/tree-ssa/vectorization.html +PG_CFLAGS += -O3 +PG_CXXFLAGS += -O3 -std=c++11 +PG_LDFLAGS += -lstdc++ + +all: $(EXTENSION)--$(EXTVERSION).sql + +PG_CONFIG ?= pg_config +PGXS := $(shell $(PG_CONFIG) --pgxs) +include $(PGXS) + +dist: + mkdir -p dist + git archive --format zip --prefix=$(EXTENSION)-$(EXTVERSION)/ --output dist/$(EXTENSION)-$(EXTVERSION).zip master diff --git a/pgxn/hnsw/README.md b/pgxn/hnsw/README.md new file mode 100644 index 0000000000..bc9c8d571c --- /dev/null +++ b/pgxn/hnsw/README.md @@ -0,0 +1,25 @@ +# Revisiting the Inverted Indices for Billion-Scale Approximate Nearest Neighbors + +This ANN extension of Postgres is based +on [ivf-hnsw](https://github.com/dbaranchuk/ivf-hnsw.git) implementation of [HNSW](https://www.pinecone.io/learn/hnsw), +the code for the current state-of-the-art billion-scale nearest neighbor search system presented in the paper: + +[Revisiting the Inverted Indices for Billion-Scale Approximate Nearest Neighbors](http://openaccess.thecvf.com/content_ECCV_2018/html/Dmitry_Baranchuk_Revisiting_the_Inverted_ECCV_2018_paper.html), +
+Dmitry Baranchuk, Artem Babenko, Yury Malkov + +# Postgres extension + +The HNSW index is held in memory (built on demand) and its maximal size is limited +by the `maxelements` index parameter. Another required parameter is the number of dimensions (if it is not specified in the column type). +The optional parameter `ef` specifies the number of neighbors considered during index construction and search (corresponding to the `efConstruction` and `efSearch` parameters +described in the article). + +# Example of usage: + +``` +create extension hnsw; +create table embeddings(id integer primary key, payload real[]); +create index on embeddings using hnsw(payload) with (maxelements=1000000, dims=100, m=32); +select id from embeddings order by payload <-> array[1.0, 2.0,...] limit 100; +``` \ No newline at end of file diff --git a/pgxn/hnsw/hnsw--0.1.0.sql b/pgxn/hnsw/hnsw--0.1.0.sql new file mode 100644 index 0000000000..ebf424326d --- /dev/null +++ b/pgxn/hnsw/hnsw--0.1.0.sql @@ -0,0 +1,29 @@ +-- complain if script is sourced in psql, rather than via CREATE EXTENSION +\echo Use "CREATE EXTENSION hnsw" to load this file. \quit + +-- functions + +CREATE FUNCTION l2_distance(real[], real[]) RETURNS real + AS 'MODULE_PATHNAME' LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE; + +-- operators + +CREATE OPERATOR <-> ( + LEFTARG = real[], RIGHTARG = real[], PROCEDURE = l2_distance, + COMMUTATOR = '<->' +); + +-- access method + +CREATE FUNCTION hnsw_handler(internal) RETURNS index_am_handler + AS 'MODULE_PATHNAME' LANGUAGE C; + +CREATE ACCESS METHOD hnsw TYPE INDEX HANDLER hnsw_handler; + +COMMENT ON ACCESS METHOD hnsw IS 'hnsw index access method'; + +-- opclasses + +CREATE OPERATOR CLASS knn_ops + DEFAULT FOR TYPE real[] USING hnsw AS + OPERATOR 1 <-> (real[], real[]) FOR ORDER BY float_ops; diff --git a/pgxn/hnsw/hnsw.c b/pgxn/hnsw/hnsw.c new file mode 100644 index 0000000000..434f4986f8 --- /dev/null +++ b/pgxn/hnsw/hnsw.c @@ -0,0 +1,551 @@ +#include "postgres.h" + +#include "access/amapi.h" +#include "access/generic_xlog.h" +#include "access/relation.h" +#include "access/reloptions.h" +#include "access/tableam.h" +#include "catalog/index.h" +#include "commands/vacuum.h" +#include "nodes/execnodes.h" +#include "storage/bufmgr.h" +#include "utils/guc.h" +#include "utils/selfuncs.h" + +#include +#include + +#include "hnsw.h" + +PG_MODULE_MAGIC; + +typedef struct { + int32 vl_len_; /* varlena header (do not touch directly!)
*/ + int dims; + int maxelements; + int efConstruction; + int efSearch; + int M; +} HnswOptions; + +static relopt_kind hnsw_relopt_kind; + +typedef struct { + HierarchicalNSW* hnsw; + size_t curr; + size_t n_results; + ItemPointer results; +} HnswScanOpaqueData; + +typedef HnswScanOpaqueData* HnswScanOpaque; + +typedef struct { + Oid relid; + uint32 status; + HierarchicalNSW* hnsw; +} HnswHashEntry; + + +#define SH_PREFIX hnsw_index +#define SH_ELEMENT_TYPE HnswHashEntry +#define SH_KEY_TYPE Oid +#define SH_KEY relid +#define SH_STORE_HASH +#define SH_GET_HASH(tb, a) ((a)->relid) +#define SH_HASH_KEY(tb, key) (key) +#define SH_EQUAL(tb, a, b) ((a) == (b)) +#define SH_SCOPE static inline +#define SH_DEFINE +#define SH_DECLARE +#include "lib/simplehash.h" + +#define INDEX_HASH_SIZE 11 + +#define DEFAULT_EF_SEARCH 64 + +PGDLLEXPORT void _PG_init(void); + +static hnsw_index_hash *hnsw_indexes; + +/* + * Initialize index options and variables + */ +void +_PG_init(void) +{ + hnsw_relopt_kind = add_reloption_kind(); + add_int_reloption(hnsw_relopt_kind, "dims", "Number of dimensions", + 0, 0, INT_MAX, AccessExclusiveLock); + add_int_reloption(hnsw_relopt_kind, "maxelements", "Maximal number of elements", + 0, 0, INT_MAX, AccessExclusiveLock); + add_int_reloption(hnsw_relopt_kind, "m", "Number of neighbors of each vertex", + 100, 0, INT_MAX, AccessExclusiveLock); + add_int_reloption(hnsw_relopt_kind, "efconstruction", "Number of inspected neighbors during index construction", + 16, 1, INT_MAX, AccessExclusiveLock); + add_int_reloption(hnsw_relopt_kind, "efsearch", "Number of inspected neighbors during index search", + 64, 1, INT_MAX, AccessExclusiveLock); + hnsw_indexes = hnsw_index_create(TopMemoryContext, INDEX_HASH_SIZE, NULL); +} + + +static void +hnsw_build_callback(Relation index, ItemPointer tid, Datum *values, + bool *isnull, bool tupleIsAlive, void *state) +{ + HierarchicalNSW* hnsw = (HierarchicalNSW*) state; + ArrayType* array; + int n_items; + label_t label = 0; + + /* Skip nulls */ + if (isnull[0]) + return; + + array = DatumGetArrayTypeP(values[0]); + n_items = ArrayGetNItems(ARR_NDIM(array), ARR_DIMS(array)); + if (n_items != hnsw_dimensions(hnsw)) + { + elog(ERROR, "Wrong number of dimensions: %d instead of %d expected", + n_items, hnsw_dimensions(hnsw)); + } + + memcpy(&label, tid, sizeof(*tid)); + hnsw_add_point(hnsw, (coord_t*)ARR_DATA_PTR(array), label); +} + +static void +hnsw_populate(HierarchicalNSW* hnsw, Relation indexRel, Relation heapRel) +{ + IndexInfo* indexInfo = BuildIndexInfo(indexRel); + Assert(indexInfo->ii_NumIndexAttrs == 1); + table_index_build_scan(heapRel, indexRel, indexInfo, + true, true, hnsw_build_callback, (void *) hnsw, NULL); +} + +static HierarchicalNSW* +hnsw_get_index(Relation indexRel, Relation heapRel) +{ + HierarchicalNSW* hnsw; + Oid indexoid = RelationGetRelid(indexRel); + HnswHashEntry* entry = hnsw_index_lookup(hnsw_indexes, indexoid); + if (entry == NULL) + { + size_t dims, maxelements; + size_t M; + size_t maxM; + size_t size_links_level0; + size_t size_data_per_element; + size_t data_size; + dsm_handle handle = indexoid << 1; /* make it even */ + void* impl_private = NULL; + void* mapped_address = NULL; + Size mapped_size = 0; + Size shmem_size; + bool exists = true; + bool found; + HnswOptions *opts = (HnswOptions *) indexRel->rd_options; + if (opts == NULL || opts->maxelements == 0 || opts->dims == 0) { + elog(ERROR, "HNSW index requires 'maxelements' and 'dims' to be specified"); + } + dims = opts->dims; + maxelements = 
opts->maxelements; + M = opts->M; + maxM = M * 2; + data_size = dims * sizeof(coord_t); + size_links_level0 = (maxM + 1) * sizeof(idx_t); + size_data_per_element = size_links_level0 + data_size + sizeof(label_t); + shmem_size = hnsw_sizeof() + maxelements * size_data_per_element; + + /* first try to attach to existed index */ + if (!dsm_impl_op(DSM_OP_ATTACH, handle, 0, &impl_private, + &mapped_address, &mapped_size, DEBUG1)) + { + /* index doesn't exists: try to create it */ + if (!dsm_impl_op(DSM_OP_CREATE, handle, shmem_size, &impl_private, + &mapped_address, &mapped_size, DEBUG1)) + { + /* We can do it under shared lock, so some other backend may + * try to initialize index. If create is failed because index already + * created by somebody else, then try to attach to it once again + */ + if (!dsm_impl_op(DSM_OP_ATTACH, handle, 0, &impl_private, + &mapped_address, &mapped_size, ERROR)) + { + return NULL; + } + } + else + { + exists = false; + } + } + Assert(mapped_size == shmem_size); + hnsw = (HierarchicalNSW*)mapped_address; + + if (!exists) + { + hnsw_init(hnsw, dims, maxelements, M, maxM, opts->efConstruction); + hnsw_populate(hnsw, indexRel, heapRel); + } + entry = hnsw_index_insert(hnsw_indexes, indexoid, &found); + Assert(!found); + entry->hnsw = hnsw; + } + else + { + hnsw = entry->hnsw; + } + return hnsw; +} + +/* + * Start or restart an index scan + */ +static IndexScanDesc +hnsw_beginscan(Relation index, int nkeys, int norderbys) +{ + IndexScanDesc scan = RelationGetIndexScan(index, nkeys, norderbys); + HnswScanOpaque so = (HnswScanOpaque) palloc(sizeof(HnswScanOpaqueData)); + Relation heap = relation_open(index->rd_index->indrelid, NoLock); + so->hnsw = hnsw_get_index(index, heap); + relation_close(heap, NoLock); + so->curr = 0; + so->n_results = 0; + so->results = NULL; + scan->opaque = so; + return scan; +} + +/* + * Start or restart an index scan + */ +static void +hnsw_rescan(IndexScanDesc scan, ScanKey keys, int nkeys, ScanKey orderbys, int norderbys) +{ + HnswScanOpaque so = (HnswScanOpaque) scan->opaque; + if (so->results) + { + pfree(so->results); + so->results = NULL; + } + so->curr = 0; + if (orderbys && scan->numberOfOrderBys > 0) + memmove(scan->orderByData, orderbys, scan->numberOfOrderBys * sizeof(ScanKeyData)); +} + +/* + * Fetch the next tuple in the given scan + */ +static bool +hnsw_gettuple(IndexScanDesc scan, ScanDirection dir) +{ + HnswScanOpaque so = (HnswScanOpaque) scan->opaque; + + /* + * Index can be used to scan backward, but Postgres doesn't support + * backward scan on operators + */ + Assert(ScanDirectionIsForward(dir)); + + if (so->curr == 0) + { + Datum value; + ArrayType* array; + int n_items; + size_t n_results; + label_t* results; + HnswOptions *opts = (HnswOptions *) scan->indexRelation->rd_options; + size_t efSearch = opts ? 
opts->efSearch : DEFAULT_EF_SEARCH; + + /* Safety check */ + if (scan->orderByData == NULL) + elog(ERROR, "cannot scan HNSW index without order"); + + /* No items will match if null */ + if (scan->orderByData->sk_flags & SK_ISNULL) + return false; + + value = scan->orderByData->sk_argument; + array = DatumGetArrayTypeP(value); + n_items = ArrayGetNItems(ARR_NDIM(array), ARR_DIMS(array)); + if (n_items != hnsw_dimensions(so->hnsw)) + { + elog(ERROR, "Wrong number of dimensions: %d instead of %d expected", + n_items, hnsw_dimensions(so->hnsw)); + } + + if (!hnsw_search(so->hnsw, (coord_t*)ARR_DATA_PTR(array), efSearch, &n_results, &results)) + elog(ERROR, "HNSW index search failed"); + so->results = (ItemPointer)palloc(n_results*sizeof(ItemPointerData)); + so->n_results = n_results; + for (size_t i = 0; i < n_results; i++) + { + memcpy(&so->results[i], &results[i], sizeof(so->results[i])); + } + free(results); + } + if (so->curr >= so->n_results) + { + return false; + } + else + { + scan->xs_heaptid = so->results[so->curr++]; + scan->xs_recheckorderby = false; + return true; + } +} + +/* + * End a scan and release resources + */ +static void +hnsw_endscan(IndexScanDesc scan) +{ + HnswScanOpaque so = (HnswScanOpaque) scan->opaque; + if (so->results) + pfree(so->results); + pfree(so); + scan->opaque = NULL; +} + + +/* + * Estimate the cost of an index scan + */ +static void +hnsw_costestimate(PlannerInfo *root, IndexPath *path, double loop_count, + Cost *indexStartupCost, Cost *indexTotalCost, + Selectivity *indexSelectivity, double *indexCorrelation + ,double *indexPages +) +{ + GenericCosts costs; + + /* Never use index without order */ + if (path->indexorderbys == NULL) + { + *indexStartupCost = DBL_MAX; + *indexTotalCost = DBL_MAX; + *indexSelectivity = 0; + *indexCorrelation = 0; + *indexPages = 0; + return; + } + + MemSet(&costs, 0, sizeof(costs)); + + genericcostestimate(root, path, loop_count, &costs); + + /* Startup cost and total cost are same */ + *indexStartupCost = costs.indexTotalCost; + *indexTotalCost = costs.indexTotalCost; + *indexSelectivity = costs.indexSelectivity; + *indexCorrelation = costs.indexCorrelation; + *indexPages = costs.numIndexPages; +} + +/* + * Parse and validate the reloptions + */ +static bytea * +hnsw_options(Datum reloptions, bool validate) +{ + static const relopt_parse_elt tab[] = { + {"dims", RELOPT_TYPE_INT, offsetof(HnswOptions, dims)}, + {"maxelements", RELOPT_TYPE_INT, offsetof(HnswOptions, maxelements)}, + {"efconstruction", RELOPT_TYPE_INT, offsetof(HnswOptions, efConstruction)}, + {"efsearch", RELOPT_TYPE_INT, offsetof(HnswOptions, efSearch)}, + {"m", RELOPT_TYPE_INT, offsetof(HnswOptions, M)} + }; + + return (bytea *) build_reloptions(reloptions, validate, + hnsw_relopt_kind, + sizeof(HnswOptions), + tab, lengthof(tab)); +} + +/* + * Validate catalog entries for the specified operator class + */ +static bool +hnsw_validate(Oid opclassoid) +{ + return true; +} + +/* + * Build the index for a logged table + */ +static IndexBuildResult * +hnsw_build(Relation heap, Relation index, IndexInfo *indexInfo) +{ + HierarchicalNSW* hnsw = hnsw_get_index(index, heap); + IndexBuildResult* result = (IndexBuildResult *) palloc(sizeof(IndexBuildResult)); + result->heap_tuples = result->index_tuples = hnsw_count(hnsw); + + return result; +} + +/* + * Insert a tuple into the index + */ +static bool +hnsw_insert(Relation index, Datum *values, bool *isnull, ItemPointer heap_tid, + Relation heap, IndexUniqueCheck checkUnique, + bool indexUnchanged, + IndexInfo 
*indexInfo) +{ + HierarchicalNSW* hnsw = hnsw_get_index(index, heap); + Datum value; + ArrayType* array; + int n_items; + label_t label = 0; + + /* Skip nulls */ + if (isnull[0]) + return false; + + /* Detoast value */ + value = PointerGetDatum(PG_DETOAST_DATUM(values[0])); + array = DatumGetArrayTypeP(value); + n_items = ArrayGetNItems(ARR_NDIM(array), ARR_DIMS(array)); + if (n_items != hnsw_dimensions(hnsw)) + { + elog(ERROR, "Wrong number of dimensions: %d instead of %d expected", + n_items, hnsw_dimensions(hnsw)); + } + memcpy(&label, heap_tid, sizeof(*heap_tid)); + if (!hnsw_add_point(hnsw, (coord_t*)ARR_DATA_PTR(array), label)) + elog(ERROR, "HNSW index insert failed"); + return true; +} + +/* + * Build the index for an unlogged table + */ +static void +hnsw_buildempty(Relation index) +{ + /* index will be constructed on dema nd when accessed */ +} + +/* + * Clean up after a VACUUM operation + */ +static IndexBulkDeleteResult * +hnsw_vacuumcleanup(IndexVacuumInfo *info, IndexBulkDeleteResult *stats) +{ + Relation rel = info->index; + + if (stats == NULL) + return NULL; + + stats->num_pages = RelationGetNumberOfBlocks(rel); + + return stats; +} + +/* + * Bulk delete tuples from the index + */ +static IndexBulkDeleteResult * +hnsw_bulkdelete(IndexVacuumInfo *info, IndexBulkDeleteResult *stats, + IndexBulkDeleteCallback callback, void *callback_state) +{ + if (stats == NULL) + stats = (IndexBulkDeleteResult *) palloc0(sizeof(IndexBulkDeleteResult)); + return stats; +} + +/* + * Define index handler + * + * See https://www.postgresql.org/docs/current/index-api.html + */ +PGDLLEXPORT PG_FUNCTION_INFO_V1(hnsw_handler); +Datum +hnsw_handler(PG_FUNCTION_ARGS) +{ + IndexAmRoutine *amroutine = makeNode(IndexAmRoutine); + + amroutine->amstrategies = 0; + amroutine->amsupport = 0; + amroutine->amoptsprocnum = 0; + amroutine->amcanorder = false; + amroutine->amcanorderbyop = true; + amroutine->amcanbackward = false; /* can change direction mid-scan */ + amroutine->amcanunique = false; + amroutine->amcanmulticol = false; + amroutine->amoptionalkey = true; + amroutine->amsearcharray = false; + amroutine->amsearchnulls = false; + amroutine->amstorage = false; + amroutine->amclusterable = false; + amroutine->ampredlocks = false; + amroutine->amcanparallel = false; + amroutine->amcaninclude = false; + amroutine->amusemaintenanceworkmem = false; /* not used during VACUUM */ + amroutine->amparallelvacuumoptions = VACUUM_OPTION_PARALLEL_BULKDEL; + amroutine->amkeytype = InvalidOid; + + /* Interface functions */ + amroutine->ambuild = hnsw_build; + amroutine->ambuildempty = hnsw_buildempty; + amroutine->aminsert = hnsw_insert; + amroutine->ambulkdelete = hnsw_bulkdelete; + amroutine->amvacuumcleanup = hnsw_vacuumcleanup; + amroutine->amcanreturn = NULL; /* tuple not included in heapsort */ + amroutine->amcostestimate = hnsw_costestimate; + amroutine->amoptions = hnsw_options; + amroutine->amproperty = NULL; /* TODO AMPROP_DISTANCE_ORDERABLE */ + amroutine->ambuildphasename = NULL; + amroutine->amvalidate = hnsw_validate; + amroutine->amadjustmembers = NULL; + amroutine->ambeginscan = hnsw_beginscan; + amroutine->amrescan = hnsw_rescan; + amroutine->amgettuple = hnsw_gettuple; + amroutine->amgetbitmap = NULL; + amroutine->amendscan = hnsw_endscan; + amroutine->ammarkpos = NULL; + amroutine->amrestrpos = NULL; + + /* Interface functions to support parallel index scans */ + amroutine->amestimateparallelscan = NULL; + amroutine->aminitparallelscan = NULL; + amroutine->amparallelrescan = NULL; + + 
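The insert and scan paths above round-trip the 6-byte heap TID through the 8-byte HNSW `label_t` with plain `memcpy` calls (packed in `hnsw_build_callback()`/`hnsw_insert()`, copied back into `scan->xs_heaptid` in `hnsw_gettuple()`). A minimal sketch of that round-trip, with an invented 6-byte stand-in struct in place of Postgres's `ItemPointerData`:

```
/* Toy model of the label <-> TID round-trip: hnsw_insert()/hnsw_build_callback()
 * memcpy the raw TID bytes into a zeroed 8-byte label, and hnsw_gettuple()
 * memcpys them back. TidLike is an invented stand-in for ItemPointerData. */
#include <stdint.h>
#include <stdio.h>
#include <string.h>

typedef struct { uint16_t blk_hi, blk_lo, offset; } TidLike;   /* 6 bytes, like ItemPointerData */
typedef uint64_t label_t;

int main(void)
{
    TidLike tid = { .blk_hi = 0, .blk_lo = 42, .offset = 7 };
    label_t label = 0;
    memcpy(&label, &tid, sizeof tid);    /* pack: mirrors hnsw_insert() */

    TidLike back;
    memcpy(&back, &label, sizeof back);  /* unpack: mirrors hnsw_gettuple() */
    printf("block=%u offset=%u\n",
           (unsigned) (((unsigned) back.blk_hi << 16) | back.blk_lo),
           (unsigned) back.offset);
    return 0;
}
```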
PG_RETURN_POINTER(amroutine); +} + +/* + * Get the L2 distance between vectors + */ +PGDLLEXPORT PG_FUNCTION_INFO_V1(l2_distance); +Datum +l2_distance(PG_FUNCTION_ARGS) +{ + ArrayType *a = PG_GETARG_ARRAYTYPE_P(0); + ArrayType *b = PG_GETARG_ARRAYTYPE_P(1); + int a_dim = ArrayGetNItems(ARR_NDIM(a), ARR_DIMS(a)); + int b_dim = ArrayGetNItems(ARR_NDIM(b), ARR_DIMS(b)); + dist_t distance = 0.0; + dist_t diff; + coord_t *ax = (coord_t*)ARR_DATA_PTR(a); + coord_t *bx = (coord_t*)ARR_DATA_PTR(b); + + if (a_dim != b_dim) + { + ereport(ERROR, + (errcode(ERRCODE_DATA_EXCEPTION), + errmsg("different array dimensions %d and %d", a_dim, b_dim))); + } + + for (int i = 0; i < a_dim; i++) + { + diff = ax[i] - bx[i]; + distance += diff * diff; + } + + PG_RETURN_FLOAT4((dist_t)sqrt(distance)); +} diff --git a/pgxn/hnsw/hnsw.control b/pgxn/hnsw/hnsw.control new file mode 100644 index 0000000000..b292b96026 --- /dev/null +++ b/pgxn/hnsw/hnsw.control @@ -0,0 +1,5 @@ +comment = 'hNsw index' +default_version = '0.1.0' +module_pathname = '$libdir/hnsw' +relocatable = true +trusted = true diff --git a/pgxn/hnsw/hnsw.h b/pgxn/hnsw/hnsw.h new file mode 100644 index 0000000000..d4065ab8fe --- /dev/null +++ b/pgxn/hnsw/hnsw.h @@ -0,0 +1,15 @@ +#pragma once + +typedef float coord_t; +typedef float dist_t; +typedef uint32_t idx_t; +typedef uint64_t label_t; + +typedef struct HierarchicalNSW HierarchicalNSW; + +bool hnsw_search(HierarchicalNSW* hnsw, const coord_t *point, size_t efSearch, size_t* n_results, label_t** results); +bool hnsw_add_point(HierarchicalNSW* hnsw, const coord_t *point, label_t label); +void hnsw_init(HierarchicalNSW* hnsw, size_t dim, size_t maxelements, size_t M, size_t maxM, size_t efConstruction); +int hnsw_dimensions(HierarchicalNSW* hnsw); +size_t hnsw_count(HierarchicalNSW* hnsw); +size_t hnsw_sizeof(void); diff --git a/pgxn/hnsw/hnswalg.cpp b/pgxn/hnsw/hnswalg.cpp new file mode 100644 index 0000000000..f6de3b8314 --- /dev/null +++ b/pgxn/hnsw/hnswalg.cpp @@ -0,0 +1,379 @@ +#include "hnswalg.h" + +#if defined(__GNUC__) +#define PORTABLE_ALIGN32 __attribute__((aligned(32))) +#define PREFETCH(addr,hint) __builtin_prefetch(addr, 0, hint) +#else +#define PORTABLE_ALIGN32 __declspec(align(32)) +#define PREFETCH(addr,hint) +#endif + +HierarchicalNSW::HierarchicalNSW(size_t dim_, size_t maxelements_, size_t M_, size_t maxM_, size_t efConstruction_) +{ + dim = dim_; + data_size = dim * sizeof(coord_t); + + efConstruction = efConstruction_; + + maxelements = maxelements_; + M = M_; + maxM = maxM_; + size_links_level0 = (maxM + 1) * sizeof(idx_t); + size_data_per_element = size_links_level0 + data_size + sizeof(label_t); + offset_data = size_links_level0; + offset_label = offset_data + data_size; + + enterpoint_node = 0; + cur_element_count = 0; +#ifdef __x86_64__ + use_avx2 = __builtin_cpu_supports("avx2"); +#endif +} + +std::priority_queue> HierarchicalNSW::searchBaseLayer(const coord_t *point, size_t ef) +{ + std::vector visited; + visited.resize((cur_element_count + 31) >> 5); + + std::priority_queue> topResults; + std::priority_queue> candidateSet; + + dist_t dist = fstdistfunc(point, getDataByInternalId(enterpoint_node)); + + topResults.emplace(dist, enterpoint_node); + candidateSet.emplace(-dist, enterpoint_node); + visited[enterpoint_node >> 5] = 1 << (enterpoint_node & 31); + dist_t lowerBound = dist; + + while (!candidateSet.empty()) + { + std::pair curr_el_pair = candidateSet.top(); + if (-curr_el_pair.first > lowerBound) + break; + + candidateSet.pop(); + idx_t curNodeNum = 
curr_el_pair.second; + + idx_t* data = get_linklist0(curNodeNum); + size_t size = *data++; + + PREFETCH(getDataByInternalId(*data), 0); + + for (size_t j = 0; j < size; ++j) { + size_t tnum = *(data + j); + + PREFETCH(getDataByInternalId(*(data + j + 1)), 0); + + if (!(visited[tnum >> 5] & (1 << (tnum & 31)))) { + visited[tnum >> 5] |= 1 << (tnum & 31); + + dist = fstdistfunc(point, getDataByInternalId(tnum)); + + if (topResults.top().first > dist || topResults.size() < ef) { + candidateSet.emplace(-dist, tnum); + + PREFETCH(get_linklist0(candidateSet.top().second), 0); + topResults.emplace(dist, tnum); + + if (topResults.size() > ef) + topResults.pop(); + + lowerBound = topResults.top().first; + } + } + } + } + return topResults; +} + + +void HierarchicalNSW::getNeighborsByHeuristic(std::priority_queue> &topResults, size_t NN) +{ + if (topResults.size() < NN) + return; + + std::priority_queue> resultSet; + std::vector> returnlist; + + while (topResults.size() > 0) { + resultSet.emplace(-topResults.top().first, topResults.top().second); + topResults.pop(); + } + + while (resultSet.size()) { + if (returnlist.size() >= NN) + break; + std::pair curen = resultSet.top(); + dist_t dist_to_query = -curen.first; + resultSet.pop(); + bool good = true; + for (std::pair curen2 : returnlist) { + dist_t curdist = fstdistfunc(getDataByInternalId(curen2.second), + getDataByInternalId(curen.second)); + if (curdist < dist_to_query) { + good = false; + break; + } + } + if (good) returnlist.push_back(curen); + } + for (std::pair elem : returnlist) + topResults.emplace(-elem.first, elem.second); +} + +void HierarchicalNSW::mutuallyConnectNewElement(const coord_t *point, idx_t cur_c, + std::priority_queue> topResults) +{ + getNeighborsByHeuristic(topResults, M); + + std::vector res; + res.reserve(M); + while (topResults.size() > 0) { + res.push_back(topResults.top().second); + topResults.pop(); + } + { + idx_t* data = get_linklist0(cur_c); + if (*data) + throw std::runtime_error("Should be blank"); + + *data++ = res.size(); + + for (size_t idx = 0; idx < res.size(); idx++) { + if (data[idx]) + throw std::runtime_error("Should be blank"); + data[idx] = res[idx]; + } + } + for (size_t idx = 0; idx < res.size(); idx++) { + if (res[idx] == cur_c) + throw std::runtime_error("Connection to the same element"); + + size_t resMmax = maxM; + idx_t *ll_other = get_linklist0(res[idx]); + idx_t sz_link_list_other = *ll_other; + + if (sz_link_list_other > resMmax || sz_link_list_other < 0) + throw std::runtime_error("Bad sz_link_list_other"); + + if (sz_link_list_other < resMmax) { + idx_t *data = ll_other + 1; + data[sz_link_list_other] = cur_c; + *ll_other = sz_link_list_other + 1; + } else { + // finding the "weakest" element to replace it with the new one + idx_t *data = ll_other + 1; + dist_t d_max = fstdistfunc(getDataByInternalId(cur_c), getDataByInternalId(res[idx])); + // Heuristic: + std::priority_queue> candidates; + candidates.emplace(d_max, cur_c); + + for (size_t j = 0; j < sz_link_list_other; j++) + candidates.emplace(fstdistfunc(getDataByInternalId(data[j]), getDataByInternalId(res[idx])), data[j]); + + getNeighborsByHeuristic(candidates, resMmax); + + size_t indx = 0; + while (!candidates.empty()) { + data[indx] = candidates.top().second; + candidates.pop(); + indx++; + } + *ll_other = indx; + } + } +} + +void HierarchicalNSW::addPoint(const coord_t *point, label_t label) +{ + if (cur_element_count >= maxelements) { + throw std::runtime_error("The number of elements exceeds the specified limit"); + } + 
idx_t cur_c = cur_element_count++; + memset((char *) get_linklist0(cur_c), 0, size_data_per_element); + memcpy(getDataByInternalId(cur_c), point, data_size); + memcpy(getExternalLabel(cur_c), &label, sizeof label); + + // Do nothing for the first element + if (cur_c != 0) { + std::priority_queue > topResults = searchBaseLayer(point, efConstruction); + mutuallyConnectNewElement(point, cur_c, topResults); + } +}; + +std::priority_queue> HierarchicalNSW::searchKnn(const coord_t *query, size_t k) +{ + std::priority_queue> topResults; + auto topCandidates = searchBaseLayer(query, k); + while (topCandidates.size() > k) { + topCandidates.pop(); + } + while (!topCandidates.empty()) { + std::pair rez = topCandidates.top(); + label_t label; + memcpy(&label, getExternalLabel(rez.second), sizeof(label)); + topResults.push(std::pair(rez.first, label)); + topCandidates.pop(); + } + + return topResults; +}; + +dist_t fstdistfunc_scalar(const coord_t *x, const coord_t *y, size_t n) +{ + dist_t distance = 0.0; + + for (size_t i = 0; i < n; i++) + { + dist_t diff = x[i] - y[i]; + distance += diff * diff; + } + return distance; + +} + +#ifdef __x86_64__ +#include + +__attribute__((target("avx2"))) +dist_t fstdistfunc_avx2(const coord_t *x, const coord_t *y, size_t n) +{ + const size_t TmpResSz = sizeof(__m256) / sizeof(float); + float PORTABLE_ALIGN32 TmpRes[TmpResSz]; + size_t qty16 = n / 16; + const float *pEnd1 = x + (qty16 * 16); + __m256 diff, v1, v2; + __m256 sum = _mm256_set1_ps(0); + + while (x < pEnd1) { + v1 = _mm256_loadu_ps(x); + x += 8; + v2 = _mm256_loadu_ps(y); + y += 8; + diff = _mm256_sub_ps(v1, v2); + sum = _mm256_add_ps(sum, _mm256_mul_ps(diff, diff)); + + v1 = _mm256_loadu_ps(x); + x += 8; + v2 = _mm256_loadu_ps(y); + y += 8; + diff = _mm256_sub_ps(v1, v2); + sum = _mm256_add_ps(sum, _mm256_mul_ps(diff, diff)); + } + _mm256_store_ps(TmpRes, sum); + float res = TmpRes[0] + TmpRes[1] + TmpRes[2] + TmpRes[3] + TmpRes[4] + TmpRes[5] + TmpRes[6] + TmpRes[7]; + return (res); +} + +dist_t fstdistfunc_sse(const coord_t *x, const coord_t *y, size_t n) +{ + const size_t TmpResSz = sizeof(__m128) / sizeof(float); + float PORTABLE_ALIGN32 TmpRes[TmpResSz]; + size_t qty16 = n / 16; + const float *pEnd1 = x + (qty16 * 16); + + __m128 diff, v1, v2; + __m128 sum = _mm_set1_ps(0); + + while (x < pEnd1) { + v1 = _mm_loadu_ps(x); + x += 4; + v2 = _mm_loadu_ps(y); + y += 4; + diff = _mm_sub_ps(v1, v2); + sum = _mm_add_ps(sum, _mm_mul_ps(diff, diff)); + + v1 = _mm_loadu_ps(x); + x += 4; + v2 = _mm_loadu_ps(y); + y += 4; + diff = _mm_sub_ps(v1, v2); + sum = _mm_add_ps(sum, _mm_mul_ps(diff, diff)); + + v1 = _mm_loadu_ps(x); + x += 4; + v2 = _mm_loadu_ps(y); + y += 4; + diff = _mm_sub_ps(v1, v2); + sum = _mm_add_ps(sum, _mm_mul_ps(diff, diff)); + + v1 = _mm_loadu_ps(x); + x += 4; + v2 = _mm_loadu_ps(y); + y += 4; + diff = _mm_sub_ps(v1, v2); + sum = _mm_add_ps(sum, _mm_mul_ps(diff, diff)); + } + _mm_store_ps(TmpRes, sum); + float res = TmpRes[0] + TmpRes[1] + TmpRes[2] + TmpRes[3]; + return res; +} +#endif + +dist_t HierarchicalNSW::fstdistfunc(const coord_t *x, const coord_t *y) +{ +#ifndef __x86_64__ + return fstdistfunc_scalar(x, y, dim); +#else + if(use_avx2) + return fstdistfunc_avx2(x, y, dim); + + return fstdistfunc_sse(x, y, dim); +#endif +} + +bool hnsw_search(HierarchicalNSW* hnsw, const coord_t *point, size_t efSearch, size_t* n_results, label_t** results) +{ + try + { + auto result = hnsw->searchKnn(point, efSearch); + size_t nResults = result.size(); + *results = 
(label_t*)malloc(nResults*sizeof(label_t)); + for (size_t i = nResults; i-- != 0;) + { + (*results)[i] = result.top().second; + result.pop(); + } + *n_results = nResults; + return true; + } + catch (std::exception& x) + { + return false; + } +} + +bool hnsw_add_point(HierarchicalNSW* hnsw, const coord_t *point, label_t label) +{ + try + { + hnsw->addPoint(point, label); + return true; + } + catch (std::exception& x) + { + fprintf(stderr, "Catch %s\n", x.what()); + return false; + } +} + +void hnsw_init(HierarchicalNSW* hnsw, size_t dims, size_t maxelements, size_t M, size_t maxM, size_t efConstruction) +{ + new ((void*)hnsw) HierarchicalNSW(dims, maxelements, M, maxM, efConstruction); +} + + +int hnsw_dimensions(HierarchicalNSW* hnsw) +{ + return (int)hnsw->dim; +} + +size_t hnsw_count(HierarchicalNSW* hnsw) +{ + return hnsw->cur_element_count; +} + +size_t hnsw_sizeof(void) +{ + return sizeof(HierarchicalNSW); +} diff --git a/pgxn/hnsw/hnswalg.h b/pgxn/hnsw/hnswalg.h new file mode 100644 index 0000000000..f38aeac362 --- /dev/null +++ b/pgxn/hnsw/hnswalg.h @@ -0,0 +1,69 @@ +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +extern "C" { +#include "hnsw.h" +} + +struct HierarchicalNSW +{ + size_t maxelements; + size_t cur_element_count; + + idx_t enterpoint_node; + + size_t dim; + size_t data_size; + size_t offset_data; + size_t offset_label; + size_t size_data_per_element; + size_t M; + size_t maxM; + size_t size_links_level0; + size_t efConstruction; + +#ifdef __x86_64__ + bool use_avx2; +#endif + + char data_level0_memory[0]; // varying size + + public: + HierarchicalNSW(size_t dim, size_t maxelements, size_t M, size_t maxM, size_t efConstruction); + ~HierarchicalNSW(); + + + inline coord_t *getDataByInternalId(idx_t internal_id) const { + return (coord_t *)&data_level0_memory[internal_id * size_data_per_element + offset_data]; + } + + inline idx_t *get_linklist0(idx_t internal_id) const { + return (idx_t*)&data_level0_memory[internal_id * size_data_per_element]; + } + + inline label_t *getExternalLabel(idx_t internal_id) const { + return (label_t *)&data_level0_memory[internal_id * size_data_per_element + offset_label]; + } + + std::priority_queue> searchBaseLayer(const coord_t *x, size_t ef); + + void getNeighborsByHeuristic(std::priority_queue> &topResults, size_t NN); + + void mutuallyConnectNewElement(const coord_t *x, idx_t id, std::priority_queue> topResults); + + void addPoint(const coord_t *point, label_t label); + + std::priority_queue> searchKnn(const coord_t *query_data, size_t k); + + dist_t fstdistfunc(const coord_t *x, const coord_t *y); +}; diff --git a/pgxn/hnsw/test/expected/knn.out b/pgxn/hnsw/test/expected/knn.out new file mode 100644 index 0000000000..a1cee4525e --- /dev/null +++ b/pgxn/hnsw/test/expected/knn.out @@ -0,0 +1,28 @@ +SET enable_seqscan = off; +CREATE TABLE t (val real[]); +INSERT INTO t (val) VALUES ('{0,0,0}'), ('{1,2,3}'), ('{1,1,1}'), (NULL); +CREATE INDEX ON t USING hnsw (val) WITH (maxelements = 10, dims=3, m=3); +INSERT INTO t (val) VALUES (array[1,2,4]); +explain SELECT * FROM t ORDER BY val <-> array[3,3,3]; + QUERY PLAN +-------------------------------------------------------------------- + Index Scan using t_val_idx on t (cost=4.02..8.06 rows=3 width=36) + Order By: (val <-> '{3,3,3}'::real[]) +(2 rows) + +SELECT * FROM t ORDER BY val <-> array[3,3,3]; + val +--------- + {1,2,3} + {1,2,4} + {1,1,1} + {0,0,0} +(4 rows) + +SELECT COUNT(*) FROM t; + count +------- + 5 +(1 row) + 
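Since the regression test above exercises the index only through SQL, a rough standalone sketch of the plain-C wrapper API declared in `hnsw.h` may be useful; the buffer arithmetic below mirrors `hnsw_get_index()` (per element: level-0 links, coordinates, label), the parameter values are arbitrary, and the program would have to be linked against `hnswalg.cpp` and libstdc++.

```
/* Illustrative sketch of the hnsw.h wrapper API used outside Postgres.
 * Sizing mirrors hnsw_get_index(): header + maxelements * per-element bytes.
 * stdbool/stdint/stddef are included first because hnsw.h relies on them. */
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include "hnsw.h"

int main(void)
{
    size_t dims = 3, maxelements = 10, M = 3, maxM = 2 * M, efConstruction = 16;
    size_t size_links_level0 = (maxM + 1) * sizeof(idx_t);
    size_t size_data_per_element = size_links_level0 + dims * sizeof(coord_t) + sizeof(label_t);
    size_t total = hnsw_sizeof() + maxelements * size_data_per_element;

    HierarchicalNSW *hnsw = malloc(total);   /* the extension uses a DSM segment instead */
    if (hnsw == NULL)
        return 1;
    hnsw_init(hnsw, dims, maxelements, M, maxM, efConstruction);

    coord_t points[3][3] = {{0, 0, 0}, {1, 2, 3}, {1, 1, 1}};
    for (label_t i = 0; i < 3; i++)
        hnsw_add_point(hnsw, points[i], i);  /* label: the extension stores the heap TID here */

    coord_t query[3] = {3, 3, 3};
    size_t n_results;
    label_t *results;
    if (hnsw_search(hnsw, query, 4 /* efSearch */, &n_results, &results))
    {
        for (size_t i = 0; i < n_results; i++)
            printf("rank %zu -> label %llu\n", i, (unsigned long long) results[i]);
        free(results);                       /* hnsw_search() mallocs the result array */
    }
    free(hnsw);
    return 0;
}
```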
+DROP TABLE t; diff --git a/pgxn/hnsw/test/sql/knn.sql b/pgxn/hnsw/test/sql/knn.sql new file mode 100644 index 0000000000..0635bda4a2 --- /dev/null +++ b/pgxn/hnsw/test/sql/knn.sql @@ -0,0 +1,13 @@ +SET enable_seqscan = off; + +CREATE TABLE t (val real[]); +INSERT INTO t (val) VALUES ('{0,0,0}'), ('{1,2,3}'), ('{1,1,1}'), (NULL); +CREATE INDEX ON t USING hnsw (val) WITH (maxelements = 10, dims=3, m=3); + +INSERT INTO t (val) VALUES (array[1,2,4]); + +explain SELECT * FROM t ORDER BY val <-> array[3,3,3]; +SELECT * FROM t ORDER BY val <-> array[3,3,3]; +SELECT COUNT(*) FROM t; + +DROP TABLE t; diff --git a/pgxn/neon/Makefile b/pgxn/neon/Makefile index ec377dbb1e..1948023472 100644 --- a/pgxn/neon/Makefile +++ b/pgxn/neon/Makefile @@ -11,10 +11,12 @@ OBJS = \ pagestore_smgr.o \ relsize_cache.o \ walproposer.o \ - walproposer_utils.o + walproposer_utils.o \ + control_plane_connector.o PG_CPPFLAGS = -I$(libpq_srcdir) SHLIB_LINK_INTERNAL = $(libpq) +SHLIB_LINK = -lcurl EXTENSION = neon DATA = neon--1.0.sql diff --git a/pgxn/neon/control_plane_connector.c b/pgxn/neon/control_plane_connector.c new file mode 100644 index 0000000000..82e4af4b4a --- /dev/null +++ b/pgxn/neon/control_plane_connector.c @@ -0,0 +1,830 @@ +/*------------------------------------------------------------------------- + * + * control_plane_connector.c + * Captures updates to roles/databases using ProcessUtility_hook and + * sends them to the control plane. The changes are sent + * via HTTP to the URL specified by the GUC neon.console_url when the + * transaction commits. Forwarding may be disabled temporarily by + * setting neon.forward_ddl to false. + * + * Currently, the transaction may abort AFTER + * changes have already been forwarded, and that case is not handled. + * Subtransactions are handled using a stack of hash tables, which + * accumulate changes. On subtransaction commit, the top of the stack + * is merged with the table below it. + * + * IDENTIFICATION + * pgxn/neon/control_plane_connector.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" +#include "tcop/pquery.h" +#include "tcop/utility.h" +#include "access/xact.h" +#include "utils/hsearch.h" +#include "utils/memutils.h" +#include "commands/defrem.h" +#include "miscadmin.h" +#include "utils/acl.h" +#include "fmgr.h" +#include "utils/guc.h" +#include "port.h" +#include +#include "utils/jsonb.h" + +static ProcessUtility_hook_type PreviousProcessUtilityHook = NULL; + +/* GUCs */ +static char *ConsoleURL = NULL; +static bool ForwardDDL = true; + +/* Curl structures for sending the HTTP requests */ +static CURL * CurlHandle; +static struct curl_slist *ContentHeader = NULL; + +/* + * CURL docs say that this buffer must exist until we call curl_easy_cleanup + * (which we never do), so we make this a static + */ +static char CurlErrorBuf[CURL_ERROR_SIZE]; + +typedef enum +{ + Op_Set, /* An upsert: Either a creation or an alter */ + Op_Delete, +} OpType; + +typedef struct +{ + char name[NAMEDATALEN]; + Oid owner; + char old_name[NAMEDATALEN]; + OpType type; +} DbEntry; + +typedef struct +{ + char name[NAMEDATALEN]; + char old_name[NAMEDATALEN]; + const char *password; + OpType type; +} RoleEntry; + +/* + * We keep one of these for each subtransaction in a stack. When a subtransaction + * commits, we merge the top of the stack into the table below it. It is allocated in the + * subtransaction's context.
+ */ +typedef struct DdlHashTable +{ + struct DdlHashTable *prev_table; + HTAB *db_table; + HTAB *role_table; +} DdlHashTable; + +static DdlHashTable RootTable; +static DdlHashTable * CurrentDdlTable = &RootTable; + +static void +PushKeyValue(JsonbParseState **state, char *key, char *value) +{ + JsonbValue k, + v; + + k.type = jbvString; + k.val.string.len = strlen(key); + k.val.string.val = key; + v.type = jbvString; + v.val.string.len = strlen(value); + v.val.string.val = value; + pushJsonbValue(state, WJB_KEY, &k); + pushJsonbValue(state, WJB_VALUE, &v); +} + +static char * +ConstructDeltaMessage() +{ + JsonbParseState *state = NULL; + + pushJsonbValue(&state, WJB_BEGIN_OBJECT, NULL); + if (RootTable.db_table) + { + JsonbValue dbs; + + dbs.type = jbvString; + dbs.val.string.val = "dbs"; + dbs.val.string.len = strlen(dbs.val.string.val); + pushJsonbValue(&state, WJB_KEY, &dbs); + pushJsonbValue(&state, WJB_BEGIN_ARRAY, NULL); + + HASH_SEQ_STATUS status; + DbEntry *entry; + + hash_seq_init(&status, RootTable.db_table); + while ((entry = hash_seq_search(&status)) != NULL) + { + pushJsonbValue(&state, WJB_BEGIN_OBJECT, NULL); + PushKeyValue(&state, "op", entry->type == Op_Set ? "set" : "del"); + PushKeyValue(&state, "name", entry->name); + if (entry->owner != InvalidOid) + { + PushKeyValue(&state, "owner", GetUserNameFromId(entry->owner, false)); + } + if (entry->old_name[0] != '\0') + { + PushKeyValue(&state, "old_name", entry->old_name); + } + pushJsonbValue(&state, WJB_END_OBJECT, NULL); + } + pushJsonbValue(&state, WJB_END_ARRAY, NULL); + } + + if (RootTable.role_table) + { + JsonbValue roles; + + roles.type = jbvString; + roles.val.string.val = "roles"; + roles.val.string.len = strlen(roles.val.string.val); + pushJsonbValue(&state, WJB_KEY, &roles); + pushJsonbValue(&state, WJB_BEGIN_ARRAY, NULL); + + HASH_SEQ_STATUS status; + RoleEntry *entry; + + hash_seq_init(&status, RootTable.role_table); + while ((entry = hash_seq_search(&status)) != NULL) + { + pushJsonbValue(&state, WJB_BEGIN_OBJECT, NULL); + PushKeyValue(&state, "op", entry->type == Op_Set ? 
"set" : "del"); + PushKeyValue(&state, "name", entry->name); + if (entry->password) + { + PushKeyValue(&state, "password", (char *) entry->password); + } + if (entry->old_name[0] != '\0') + { + PushKeyValue(&state, "old_name", entry->old_name); + } + pushJsonbValue(&state, WJB_END_OBJECT, NULL); + } + pushJsonbValue(&state, WJB_END_ARRAY, NULL); + } + JsonbValue *result = pushJsonbValue(&state, WJB_END_OBJECT, NULL); + Jsonb *jsonb = JsonbValueToJsonb(result); + + return JsonbToCString(NULL, &jsonb->root, 0 /* estimated_len */ ); +} + +#define ERROR_SIZE 1024 + +typedef struct +{ + char str[ERROR_SIZE]; + size_t size; +} ErrorString; + +static size_t +ErrorWriteCallback(char *ptr, size_t size, size_t nmemb, void *userdata) +{ + /* Docs say size is always 1 */ + ErrorString *str = userdata; + + size_t to_write = nmemb; + + /* +1 for null terminator */ + if (str->size + nmemb + 1 >= ERROR_SIZE) + to_write = ERROR_SIZE - str->size - 1; + + /* Ignore everyrthing past the first ERROR_SIZE bytes */ + if (to_write == 0) + return nmemb; + memcpy(str->str + str->size, ptr, to_write); + str->size += to_write; + str->str[str->size] = '\0'; + return nmemb; +} + +static void +SendDeltasToControlPlane() +{ + if (!RootTable.db_table && !RootTable.role_table) + return; + if (!ConsoleURL) + { + elog(LOG, "ConsoleURL not set, skipping forwarding"); + return; + } + if (!ForwardDDL) + return; + + char *message = ConstructDeltaMessage(); + ErrorString str = {}; + + curl_easy_setopt(CurlHandle, CURLOPT_CUSTOMREQUEST, "PATCH"); + curl_easy_setopt(CurlHandle, CURLOPT_HTTPHEADER, ContentHeader); + curl_easy_setopt(CurlHandle, CURLOPT_POSTFIELDS, message); + curl_easy_setopt(CurlHandle, CURLOPT_URL, ConsoleURL); + curl_easy_setopt(CurlHandle, CURLOPT_ERRORBUFFER, CurlErrorBuf); + curl_easy_setopt(CurlHandle, CURLOPT_TIMEOUT, 3L /* seconds */ ); + curl_easy_setopt(CurlHandle, CURLOPT_WRITEDATA, &str); + curl_easy_setopt(CurlHandle, CURLOPT_WRITEFUNCTION, ErrorWriteCallback); + + const int num_retries = 5; + int curl_status; + + for (int i = 0; i < num_retries; i++) + { + if ((curl_status = curl_easy_perform(CurlHandle)) == 0) + break; + elog(LOG, "Curl request failed on attempt %d: %s", i, CurlErrorBuf); + pg_usleep(1000 * 1000); + } + if (curl_status != 0) + { + elog(ERROR, "Failed to perform curl request: %s", CurlErrorBuf); + } + else + { + long response_code; + + if (curl_easy_getinfo(CurlHandle, CURLINFO_RESPONSE_CODE, &response_code) != CURLE_UNKNOWN_OPTION) + { + bool error_exists = str.size != 0; + + if (response_code != 200) + { + if (error_exists) + { + elog(ERROR, + "Received HTTP code %ld from control plane: %s", + response_code, + str.str); + } + else + { + elog(ERROR, + "Received HTTP code %ld from control plane", + response_code); + } + } + } + } +} + +static void +InitDbTableIfNeeded() +{ + if (!CurrentDdlTable->db_table) + { + HASHCTL db_ctl = {}; + + db_ctl.keysize = NAMEDATALEN; + db_ctl.entrysize = sizeof(DbEntry); + db_ctl.hcxt = CurTransactionContext; + CurrentDdlTable->db_table = hash_create( + "Dbs Created", + 4, + &db_ctl, + HASH_ELEM | HASH_STRINGS | HASH_CONTEXT); + } +} + +static void +InitRoleTableIfNeeded() +{ + if (!CurrentDdlTable->role_table) + { + HASHCTL role_ctl = {}; + + role_ctl.keysize = NAMEDATALEN; + role_ctl.entrysize = sizeof(RoleEntry); + role_ctl.hcxt = CurTransactionContext; + CurrentDdlTable->role_table = hash_create( + "Roles Created", + 4, + &role_ctl, + HASH_ELEM | HASH_STRINGS | HASH_CONTEXT); + } +} + +static void +PushTable() +{ + DdlHashTable *new_table = 
MemoryContextAlloc(CurTransactionContext, sizeof(DdlHashTable)); + + new_table->prev_table = CurrentDdlTable; + new_table->role_table = NULL; + new_table->db_table = NULL; + CurrentDdlTable = new_table; +} + +static void +MergeTable() +{ + DdlHashTable *old_table = CurrentDdlTable; + + CurrentDdlTable = old_table->prev_table; + + if (old_table->db_table) + { + InitDbTableIfNeeded(); + DbEntry *entry; + HASH_SEQ_STATUS status; + + hash_seq_init(&status, old_table->db_table); + while ((entry = hash_seq_search(&status)) != NULL) + { + DbEntry *to_write = hash_search( + CurrentDdlTable->db_table, + entry->name, + HASH_ENTER, + NULL); + + to_write->type = entry->type; + if (entry->owner != InvalidOid) + to_write->owner = entry->owner; + strlcpy(to_write->old_name, entry->old_name, NAMEDATALEN); + if (entry->old_name[0] != '\0') + { + bool found_old = false; + DbEntry *old = hash_search( + CurrentDdlTable->db_table, + entry->old_name, + HASH_FIND, + &found_old); + + if (found_old) + { + if (old->old_name[0] != '\0') + strlcpy(to_write->old_name, old->old_name, NAMEDATALEN); + else + strlcpy(to_write->old_name, entry->old_name, NAMEDATALEN); + hash_search( + CurrentDdlTable->db_table, + entry->old_name, + HASH_REMOVE, + NULL); + } + } + } + hash_destroy(old_table->db_table); + } + + if (old_table->role_table) + { + InitRoleTableIfNeeded(); + RoleEntry *entry; + HASH_SEQ_STATUS status; + + hash_seq_init(&status, old_table->role_table); + while ((entry = hash_seq_search(&status)) != NULL) + { + RoleEntry *to_write = hash_search( + CurrentDdlTable->role_table, + entry->name, + HASH_ENTER, + NULL); + + to_write->type = entry->type; + if (entry->password) + to_write->password = entry->password; + strlcpy(to_write->old_name, entry->old_name, NAMEDATALEN); + if (entry->old_name[0] != '\0') + { + bool found_old = false; + RoleEntry *old = hash_search( + CurrentDdlTable->role_table, + entry->old_name, + HASH_FIND, + &found_old); + + if (found_old) + { + if (old->old_name[0] != '\0') + strlcpy(to_write->old_name, old->old_name, NAMEDATALEN); + else + strlcpy(to_write->old_name, entry->old_name, NAMEDATALEN); + hash_search(CurrentDdlTable->role_table, + entry->old_name, + HASH_REMOVE, + NULL); + } + } + } + hash_destroy(old_table->role_table); + } +} + +static void +PopTable() +{ + /* + * Current table gets freed because it is allocated in aborted + * subtransaction's memory context. 
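`PushTable`/`MergeTable`/`PopTable` above implement the per-subtransaction stack described in the file header. The following toy model (invented names, a single flat role table per level; the real merge is field-wise and also collapses rename chains, both omitted here) sketches the push-on-start, merge-on-commit, drop-on-abort discipline:

```
/* Toy model of the subtransaction stack: level 0 plays the role of RootTable,
 * push_table/merge_table/pop_table correspond to the SUBXACT_EVENT_* callbacks. */
#include <stdio.h>
#include <string.h>

typedef struct { char name[64]; char password[64]; } RoleDelta;
typedef struct { RoleDelta roles[8]; int nroles; } Level;

static Level stack[8];
static int top;                                   /* level 0 == root transaction */

static RoleDelta *upsert(Level *lvl, const char *name)
{
    for (int i = 0; i < lvl->nroles; i++)
        if (strcmp(lvl->roles[i].name, name) == 0)
            return &lvl->roles[i];
    RoleDelta *r = &lvl->roles[lvl->nroles++];
    memset(r, 0, sizeof *r);
    snprintf(r->name, sizeof r->name, "%s", name);
    return r;
}

static void push_table(void) { stack[++top].nroles = 0; }     /* subxact started   */
static void pop_table(void)  { top--; }                       /* subxact aborted   */
static void merge_table(void)                                 /* subxact committed */
{
    Level *child = &stack[top--];
    for (int i = 0; i < child->nroles; i++)
        *upsert(&stack[top], child->roles[i].name) = child->roles[i];
}

int main(void)
{
    upsert(&stack[0], "app_user");                          /* CREATE ROLE app_user     */
    push_table();                                           /* SAVEPOINT s1             */
    snprintf(upsert(&stack[top], "app_user")->password, 64, "%s", "secret");
    merge_table();                                          /* RELEASE s1: change kept  */
    push_table();                                           /* SAVEPOINT s2             */
    upsert(&stack[top], "temp_role");
    pop_table();                                            /* ROLLBACK TO s2: dropped  */

    for (int i = 0; i < stack[0].nroles; i++)               /* what commit would send   */
        printf("role %s password=%s\n", stack[0].roles[i].name, stack[0].roles[i].password);
    return 0;
}
```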
+ */ + CurrentDdlTable = CurrentDdlTable->prev_table; +} + +static void +NeonSubXactCallback( + SubXactEvent event, + SubTransactionId mySubid, + SubTransactionId parentSubid, + void *arg) +{ + switch (event) + { + case SUBXACT_EVENT_START_SUB: + return PushTable(); + case SUBXACT_EVENT_COMMIT_SUB: + return MergeTable(); + case SUBXACT_EVENT_ABORT_SUB: + return PopTable(); + default: + return; + } +} + +static void +NeonXactCallback(XactEvent event, void *arg) +{ + if (event == XACT_EVENT_PRE_COMMIT || event == XACT_EVENT_PARALLEL_PRE_COMMIT) + { + SendDeltasToControlPlane(); + } + RootTable.role_table = NULL; + RootTable.db_table = NULL; + Assert(CurrentDdlTable == &RootTable); +} + +static void +HandleCreateDb(CreatedbStmt *stmt) +{ + InitDbTableIfNeeded(); + DefElem *downer = NULL; + ListCell *option; + + foreach(option, stmt->options) + { + DefElem *defel = lfirst(option); + + if (strcmp(defel->defname, "owner") == 0) + downer = defel; + } + bool found = false; + DbEntry *entry = hash_search( + CurrentDdlTable->db_table, + stmt->dbname, + HASH_ENTER, + &found); + + if (!found) + memset(entry->old_name, 0, sizeof(entry->old_name)); + + entry->type = Op_Set; + if (downer && downer->arg) + entry->owner = get_role_oid(defGetString(downer), false); + else + entry->owner = GetUserId(); +} + +static void +HandleAlterOwner(AlterOwnerStmt *stmt) +{ + if (stmt->objectType != OBJECT_DATABASE) + return; + InitDbTableIfNeeded(); + const char *name = strVal(stmt->object); + bool found = false; + DbEntry *entry = hash_search( + CurrentDdlTable->db_table, + name, + HASH_ENTER, + &found); + + if (!found) + memset(entry->old_name, 0, sizeof(entry->old_name)); + + entry->owner = get_role_oid(get_rolespec_name(stmt->newowner), false); + entry->type = Op_Set; +} + +static void +HandleDbRename(RenameStmt *stmt) +{ + Assert(stmt->renameType == OBJECT_DATABASE); + InitDbTableIfNeeded(); + bool found = false; + DbEntry *entry = hash_search( + CurrentDdlTable->db_table, + stmt->subname, + HASH_FIND, + &found); + DbEntry *entry_for_new_name = hash_search( + CurrentDdlTable->db_table, + stmt->newname, + HASH_ENTER, + NULL); + + entry_for_new_name->type = Op_Set; + if (found) + { + if (entry->old_name[0] != '\0') + strlcpy(entry_for_new_name->old_name, entry->old_name, NAMEDATALEN); + else + strlcpy(entry_for_new_name->old_name, entry->name, NAMEDATALEN); + entry_for_new_name->owner = entry->owner; + hash_search( + CurrentDdlTable->db_table, + stmt->subname, + HASH_REMOVE, + NULL); + } + else + { + strlcpy(entry_for_new_name->old_name, stmt->subname, NAMEDATALEN); + entry_for_new_name->owner = InvalidOid; + } +} + +static void +HandleDropDb(DropdbStmt *stmt) +{ + InitDbTableIfNeeded(); + bool found = false; + DbEntry *entry = hash_search( + CurrentDdlTable->db_table, + stmt->dbname, + HASH_ENTER, + &found); + + entry->type = Op_Delete; + entry->owner = InvalidOid; + if (!found) + memset(entry->old_name, 0, sizeof(entry->old_name)); +} + +static void +HandleCreateRole(CreateRoleStmt *stmt) +{ + InitRoleTableIfNeeded(); + bool found = false; + RoleEntry *entry = hash_search( + CurrentDdlTable->role_table, + stmt->role, + HASH_ENTER, + &found); + DefElem *dpass = NULL; + ListCell *option; + + foreach(option, stmt->options) + { + DefElem *defel = lfirst(option); + + if (strcmp(defel->defname, "password") == 0) + dpass = defel; + } + if (!found) + memset(entry->old_name, 0, sizeof(entry->old_name)); + if (dpass && dpass->arg) + entry->password = MemoryContextStrdup(CurTransactionContext, strVal(dpass->arg)); + else + 
entry->password = NULL; + entry->type = Op_Set; +} + +static void +HandleAlterRole(AlterRoleStmt *stmt) +{ + InitRoleTableIfNeeded(); + DefElem *dpass = NULL; + ListCell *option; + + foreach(option, stmt->options) + { + DefElem *defel = lfirst(option); + + if (strcmp(defel->defname, "password") == 0) + dpass = defel; + } + /* We only care about updates to the password */ + if (!dpass) + return; + bool found = false; + RoleEntry *entry = hash_search( + CurrentDdlTable->role_table, + stmt->role->rolename, + HASH_ENTER, + &found); + + if (!found) + memset(entry->old_name, 0, sizeof(entry->old_name)); + if (dpass->arg) + entry->password = MemoryContextStrdup(CurTransactionContext, strVal(dpass->arg)); + else + entry->password = NULL; + entry->type = Op_Set; +} + +static void +HandleRoleRename(RenameStmt *stmt) +{ + InitRoleTableIfNeeded(); + Assert(stmt->renameType == OBJECT_ROLE); + bool found = false; + RoleEntry *entry = hash_search( + CurrentDdlTable->role_table, + stmt->subname, + HASH_FIND, + &found); + + RoleEntry *entry_for_new_name = hash_search( + CurrentDdlTable->role_table, + stmt->newname, + HASH_ENTER, + NULL); + + entry_for_new_name->type = Op_Set; + if (found) + { + if (entry->old_name[0] != '\0') + strlcpy(entry_for_new_name->old_name, entry->old_name, NAMEDATALEN); + else + strlcpy(entry_for_new_name->old_name, entry->name, NAMEDATALEN); + entry_for_new_name->password = entry->password; + hash_search( + CurrentDdlTable->role_table, + entry->name, + HASH_REMOVE, + NULL); + } + else + { + strlcpy(entry_for_new_name->old_name, stmt->subname, NAMEDATALEN); + entry_for_new_name->password = NULL; + } +} + +static void +HandleDropRole(DropRoleStmt *stmt) +{ + InitRoleTableIfNeeded(); + ListCell *item; + + foreach(item, stmt->roles) + { + RoleSpec *spec = lfirst(item); + bool found = false; + RoleEntry *entry = hash_search( + CurrentDdlTable->role_table, + spec->rolename, + HASH_ENTER, + &found); + + entry->type = Op_Delete; + entry->password = NULL; + if (!found) + memset(entry->old_name, 0, sizeof(entry)); + } +} + +static void +HandleRename(RenameStmt *stmt) +{ + if (stmt->renameType == OBJECT_DATABASE) + return HandleDbRename(stmt); + else if (stmt->renameType == OBJECT_ROLE) + return HandleRoleRename(stmt); +} + +static void +NeonProcessUtility( + PlannedStmt *pstmt, + const char *queryString, + bool readOnlyTree, + ProcessUtilityContext context, + ParamListInfo params, + QueryEnvironment *queryEnv, + DestReceiver *dest, + QueryCompletion *qc) +{ + Node *parseTree = pstmt->utilityStmt; + + switch (nodeTag(parseTree)) + { + case T_CreatedbStmt: + HandleCreateDb(castNode(CreatedbStmt, parseTree)); + break; + case T_AlterOwnerStmt: + HandleAlterOwner(castNode(AlterOwnerStmt, parseTree)); + break; + case T_RenameStmt: + HandleRename(castNode(RenameStmt, parseTree)); + break; + case T_DropdbStmt: + HandleDropDb(castNode(DropdbStmt, parseTree)); + break; + case T_CreateRoleStmt: + HandleCreateRole(castNode(CreateRoleStmt, parseTree)); + break; + case T_AlterRoleStmt: + HandleAlterRole(castNode(AlterRoleStmt, parseTree)); + break; + case T_DropRoleStmt: + HandleDropRole(castNode(DropRoleStmt, parseTree)); + break; + default: + break; + } + + if (PreviousProcessUtilityHook) + { + PreviousProcessUtilityHook( + pstmt, + queryString, + readOnlyTree, + context, + params, + queryEnv, + dest, + qc); + } + else + { + standard_ProcessUtility( + pstmt, + queryString, + readOnlyTree, + context, + params, + queryEnv, + dest, + qc); + } +} + +extern void +InitControlPlaneConnector() +{ + 
PreviousProcessUtilityHook = ProcessUtility_hook; + ProcessUtility_hook = NeonProcessUtility; + RegisterXactCallback(NeonXactCallback, NULL); + RegisterSubXactCallback(NeonSubXactCallback, NULL); + + DefineCustomStringVariable( + "neon.console_url", + "URL of the Neon Console, which will be forwarded changes to dbs and roles", + NULL, + &ConsoleURL, + NULL, + PGC_POSTMASTER, + 0, + NULL, + NULL, + NULL); + + DefineCustomBoolVariable( + "neon.forward_ddl", + "Controls whether to forward DDL to the control plane", + NULL, + &ForwardDDL, + true, + PGC_SUSET, + 0, + NULL, + NULL, + NULL); + + const char *jwt_token = getenv("NEON_CONTROL_PLANE_TOKEN"); + + if (!jwt_token) + { + elog(LOG, "Missing NEON_CONTROL_PLANE_TOKEN environment variable, forwarding will not be authenticated"); + } + + if (curl_global_init(CURL_GLOBAL_DEFAULT)) + { + elog(ERROR, "Failed to initialize curl"); + } + if ((CurlHandle = curl_easy_init()) == NULL) + { + elog(ERROR, "Failed to initialize curl handle"); + } + if ((ContentHeader = curl_slist_append(ContentHeader, "Content-Type: application/json")) == NULL) + { + elog(ERROR, "Failed to initialize content header"); + } + + if (jwt_token) + { + char auth_header[8192]; + + snprintf(auth_header, sizeof(auth_header), "Authorization: Bearer %s", jwt_token); + if ((ContentHeader = curl_slist_append(ContentHeader, auth_header)) == NULL) + { + elog(ERROR, "Failed to initialize authorization header"); + } + } +} diff --git a/pgxn/neon/control_plane_connector.h b/pgxn/neon/control_plane_connector.h new file mode 100644 index 0000000000..12d6a97562 --- /dev/null +++ b/pgxn/neon/control_plane_connector.h @@ -0,0 +1,6 @@ +#ifndef CONTROL_PLANE_CONNECTOR_H +#define CONTROL_PLANE_CONNECTOR_H + +void InitControlPlaneConnector(); + +#endif diff --git a/pgxn/neon/neon.c b/pgxn/neon/neon.c index 217c1974a0..b45d7cfc32 100644 --- a/pgxn/neon/neon.c +++ b/pgxn/neon/neon.c @@ -25,6 +25,7 @@ #include "neon.h" #include "walproposer.h" #include "pagestore_client.h" +#include "control_plane_connector.h" PG_MODULE_MAGIC; void _PG_init(void); @@ -34,7 +35,11 @@ _PG_init(void) { pg_init_libpagestore(); pg_init_walproposer(); + InitControlPlaneConnector(); + // Important: This must happen after other parts of the extension + // are loaded, otherwise any settings to GUCs that were set before + // the extension was loaded will be removed. 
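The control-plane connector above keeps one DDL hash table per (sub)transaction level and folds a committed subtransaction's table into its parent in `MergeTable`, collapsing rename chains through `old_name` so that only the name an object had before the transaction started is reported. A minimal sketch of that merge rule, written in Rust with a plain `HashMap` standing in for the dynahash tables (the struct and the `merge` helper are illustrative, not part of the patch):

```rust
use std::collections::HashMap;

struct DbEntry {
    /// Name the database had before the transaction, if it was renamed.
    old_name: Option<String>,
}

/// Fold a committed subtransaction's table into its parent, mirroring MergeTable():
/// the child entry overwrites the parent's, and if the child points at a name the
/// parent already tracks as renamed, keep the oldest name and drop the stale key.
fn merge(parent: &mut HashMap<String, DbEntry>, child: HashMap<String, DbEntry>) {
    for (name, mut entry) in child {
        if let Some(old) = entry.old_name.clone() {
            if let Some(prev) = parent.remove(&old) {
                entry.old_name = prev.old_name.or(Some(old));
            }
        }
        parent.insert(name, entry);
    }
}

fn main() {
    // Outer transaction renamed a -> b, a subtransaction then renamed b -> c.
    let mut parent = HashMap::from([("b".to_string(), DbEntry { old_name: Some("a".to_string()) })]);
    let child = HashMap::from([("c".to_string(), DbEntry { old_name: Some("b".to_string()) })]);
    merge(&mut parent, child);
    // Only "c" survives, still pointing at the pre-transaction name "a".
    assert_eq!(parent["c"].old_name.as_deref(), Some("a"));
    assert!(!parent.contains_key("b"));
}
```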
EmitWarningsOnPlaceholders("neon"); } diff --git a/pgxn/neon/walproposer.c b/pgxn/neon/walproposer.c index a99be40955..64d980d2e4 100644 --- a/pgxn/neon/walproposer.c +++ b/pgxn/neon/walproposer.c @@ -254,20 +254,20 @@ nwp_register_gucs(void) DefineCustomIntVariable( "neon.safekeeper_reconnect_timeout", - "Timeout for reconnecting to offline wal acceptor.", + "Walproposer reconnects to offline safekeepers once in this interval.", NULL, &wal_acceptor_reconnect_timeout, - 1000, 0, INT_MAX, /* default, min, max */ + 5000, 0, INT_MAX, /* default, min, max */ PGC_SIGHUP, /* context */ GUC_UNIT_MS, /* flags */ NULL, NULL, NULL); DefineCustomIntVariable( "neon.safekeeper_connect_timeout", - "Timeout for connection establishement and it's maintenance against safekeeper", + "Connection or connection attempt to safekeeper is terminated if no message is received (or connection attempt doesn't finish) within this period.", NULL, &wal_acceptor_connection_timeout, - 5000, 0, INT_MAX, + 10000, 0, INT_MAX, PGC_SIGHUP, GUC_UNIT_MS, NULL, NULL, NULL); @@ -441,7 +441,7 @@ WalProposerPoll(void) if (TimestampDifferenceExceeds(sk->latestMsgReceivedAt, now, wal_acceptor_connection_timeout)) { - elog(WARNING, "failed to connect to node '%s:%s' in '%s' state: exceeded connection timeout %dms", + elog(WARNING, "terminating connection to safekeeper '%s:%s' in '%s' state: no messages received during the last %dms or connection attempt took longer than that", sk->host, sk->port, FormatSafekeeperState(sk->state), wal_acceptor_connection_timeout); ShutdownConnection(sk); } @@ -1035,9 +1035,16 @@ RecvAcceptorGreeting(Safekeeper *sk) if (!AsyncReadMessage(sk, (AcceptorProposerMessage *) & sk->greetResponse)) return; + elog(LOG, "received AcceptorGreeting from safekeeper %s:%s", sk->host, sk->port); + /* Protocol is all good, move to voting. */ sk->state = SS_VOTING; + /* + * Note: it would be better to track the counter on per safekeeper basis, + * but at worst walproposer would restart with 'term rejected', so leave as + * is for now. 
+ */ ++n_connected; if (n_connected <= quorum) { diff --git a/poetry.lock b/poetry.lock index 23884f6252..8dc45f68b8 100644 --- a/poetry.lock +++ b/poetry.lock @@ -79,30 +79,30 @@ sa = ["sqlalchemy[postgresql-psycopg2binary] (>=1.3,<1.5)"] [[package]] name = "allure-pytest" -version = "2.13.1" +version = "2.13.2" description = "Allure pytest integration" category = "main" optional = false python-versions = "*" files = [ - {file = "allure-pytest-2.13.1.tar.gz", hash = "sha256:68d69456eeb65af4061ec06a80bc941163b0616e8216554d36b070a6bf070e08"}, - {file = "allure_pytest-2.13.1-py3-none-any.whl", hash = "sha256:a8de2fc3b3effe2d8f98801646920de3f055b779710f4c806dbee7c613c24633"}, + {file = "allure-pytest-2.13.2.tar.gz", hash = "sha256:22243159e8ec81ce2b5254b4013802198821b1b42f118f69d4a289396607c7b3"}, + {file = "allure_pytest-2.13.2-py3-none-any.whl", hash = "sha256:17de9dbee7f61c8e66a5b5e818b00e419dbcea44cb55c24319401ba813220690"}, ] [package.dependencies] -allure-python-commons = "2.13.1" +allure-python-commons = "2.13.2" pytest = ">=4.5.0" [[package]] name = "allure-python-commons" -version = "2.13.1" +version = "2.13.2" description = "Common module for integrate allure with python-based frameworks" category = "main" optional = false python-versions = ">=3.6" files = [ - {file = "allure-python-commons-2.13.1.tar.gz", hash = "sha256:3fc13e1da8ebb23f9ab5c9c72ad04595023cdd5078dbb8604939997faebed5cb"}, - {file = "allure_python_commons-2.13.1-py3-none-any.whl", hash = "sha256:d08e04867bddf44fef55def3d67f4bc25af58a1bf9fcffcf4ec3331f7f2ef0d0"}, + {file = "allure-python-commons-2.13.2.tar.gz", hash = "sha256:8a03681330231b1deadd86b97ff68841c6591320114ae638570f1ed60d7a2033"}, + {file = "allure_python_commons-2.13.2-py3-none-any.whl", hash = "sha256:2bb3646ec3fbf5b36d178a5e735002bc130ae9f9ba80f080af97d368ba375051"}, ] [package.dependencies] @@ -172,17 +172,6 @@ dev = ["Cython (>=0.29.24,<0.30.0)", "Sphinx (>=4.1.2,<4.2.0)", "flake8 (>=5.0.4 docs = ["Sphinx (>=4.1.2,<4.2.0)", "sphinx-rtd-theme (>=0.5.2,<0.6.0)", "sphinxcontrib-asyncio (>=0.3.0,<0.4.0)"] test = ["flake8 (>=5.0.4,<5.1.0)", "uvloop (>=0.15.3)"] -[[package]] -name = "atomicwrites" -version = "1.4.1" -description = "Atomic file writes." -category = "main" -optional = false -python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" -files = [ - {file = "atomicwrites-1.4.1.tar.gz", hash = "sha256:81b2c9071a49367a7f770170e5eec8cb66567cfbbc8c73d20ce5ca4a8d71cf11"}, -] - [[package]] name = "attrs" version = "21.4.0" @@ -239,49 +228,49 @@ wrapt = "*" [[package]] name = "backoff" -version = "1.11.1" +version = "2.2.1" description = "Function decoration for backoff and retry" category = "main" optional = false -python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*" +python-versions = ">=3.7,<4.0" files = [ - {file = "backoff-1.11.1-py2.py3-none-any.whl", hash = "sha256:61928f8fa48d52e4faa81875eecf308eccfb1016b018bb6bd21e05b5d90a96c5"}, - {file = "backoff-1.11.1.tar.gz", hash = "sha256:ccb962a2378418c667b3c979b504fdeb7d9e0d29c0579e3b13b86467177728cb"}, + {file = "backoff-2.2.1-py3-none-any.whl", hash = "sha256:63579f9a0628e06278f7e47b7d7d5b6ce20dc65c5e96a6f3ca99a6adca0396e8"}, + {file = "backoff-2.2.1.tar.gz", hash = "sha256:03f829f5bb1923180821643f8753b0502c3b682293992485b0eef2807afa5cba"}, ] [[package]] name = "black" -version = "23.1.0" +version = "23.3.0" description = "The uncompromising code formatter." 
category = "dev" optional = false python-versions = ">=3.7" files = [ - {file = "black-23.1.0-cp310-cp310-macosx_10_16_arm64.whl", hash = "sha256:b6a92a41ee34b883b359998f0c8e6eb8e99803aa8bf3123bf2b2e6fec505a221"}, - {file = "black-23.1.0-cp310-cp310-macosx_10_16_universal2.whl", hash = "sha256:57c18c5165c1dbe291d5306e53fb3988122890e57bd9b3dcb75f967f13411a26"}, - {file = "black-23.1.0-cp310-cp310-macosx_10_16_x86_64.whl", hash = "sha256:9880d7d419bb7e709b37e28deb5e68a49227713b623c72b2b931028ea65f619b"}, - {file = "black-23.1.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e6663f91b6feca5d06f2ccd49a10f254f9298cc1f7f49c46e498a0771b507104"}, - {file = "black-23.1.0-cp310-cp310-win_amd64.whl", hash = "sha256:9afd3f493666a0cd8f8df9a0200c6359ac53940cbde049dcb1a7eb6ee2dd7074"}, - {file = "black-23.1.0-cp311-cp311-macosx_10_16_arm64.whl", hash = "sha256:bfffba28dc52a58f04492181392ee380e95262af14ee01d4bc7bb1b1c6ca8d27"}, - {file = "black-23.1.0-cp311-cp311-macosx_10_16_universal2.whl", hash = "sha256:c1c476bc7b7d021321e7d93dc2cbd78ce103b84d5a4cf97ed535fbc0d6660648"}, - {file = "black-23.1.0-cp311-cp311-macosx_10_16_x86_64.whl", hash = "sha256:382998821f58e5c8238d3166c492139573325287820963d2f7de4d518bd76958"}, - {file = "black-23.1.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2bf649fda611c8550ca9d7592b69f0637218c2369b7744694c5e4902873b2f3a"}, - {file = "black-23.1.0-cp311-cp311-win_amd64.whl", hash = "sha256:121ca7f10b4a01fd99951234abdbd97728e1240be89fde18480ffac16503d481"}, - {file = "black-23.1.0-cp37-cp37m-macosx_10_16_x86_64.whl", hash = "sha256:a8471939da5e824b891b25751955be52ee7f8a30a916d570a5ba8e0f2eb2ecad"}, - {file = "black-23.1.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8178318cb74f98bc571eef19068f6ab5613b3e59d4f47771582f04e175570ed8"}, - {file = "black-23.1.0-cp37-cp37m-win_amd64.whl", hash = "sha256:a436e7881d33acaf2536c46a454bb964a50eff59b21b51c6ccf5a40601fbef24"}, - {file = "black-23.1.0-cp38-cp38-macosx_10_16_arm64.whl", hash = "sha256:a59db0a2094d2259c554676403fa2fac3473ccf1354c1c63eccf7ae65aac8ab6"}, - {file = "black-23.1.0-cp38-cp38-macosx_10_16_universal2.whl", hash = "sha256:0052dba51dec07ed029ed61b18183942043e00008ec65d5028814afaab9a22fd"}, - {file = "black-23.1.0-cp38-cp38-macosx_10_16_x86_64.whl", hash = "sha256:49f7b39e30f326a34b5c9a4213213a6b221d7ae9d58ec70df1c4a307cf2a1580"}, - {file = "black-23.1.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:162e37d49e93bd6eb6f1afc3e17a3d23a823042530c37c3c42eeeaf026f38468"}, - {file = "black-23.1.0-cp38-cp38-win_amd64.whl", hash = "sha256:8b70eb40a78dfac24842458476135f9b99ab952dd3f2dab738c1881a9b38b753"}, - {file = "black-23.1.0-cp39-cp39-macosx_10_16_arm64.whl", hash = "sha256:a29650759a6a0944e7cca036674655c2f0f63806ddecc45ed40b7b8aa314b651"}, - {file = "black-23.1.0-cp39-cp39-macosx_10_16_universal2.whl", hash = "sha256:bb460c8561c8c1bec7824ecbc3ce085eb50005883a6203dcfb0122e95797ee06"}, - {file = "black-23.1.0-cp39-cp39-macosx_10_16_x86_64.whl", hash = "sha256:c91dfc2c2a4e50df0026f88d2215e166616e0c80e86004d0003ece0488db2739"}, - {file = "black-23.1.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2a951cc83ab535d248c89f300eccbd625e80ab880fbcfb5ac8afb5f01a258ac9"}, - {file = "black-23.1.0-cp39-cp39-win_amd64.whl", hash = "sha256:0680d4380db3719ebcfb2613f34e86c8e6d15ffeabcf8ec59355c5e7b85bb555"}, - {file = "black-23.1.0-py3-none-any.whl", hash = 
"sha256:7a0f701d314cfa0896b9001df70a530eb2472babb76086344e688829efd97d32"}, - {file = "black-23.1.0.tar.gz", hash = "sha256:b0bd97bea8903f5a2ba7219257a44e3f1f9d00073d6cc1add68f0beec69692ac"}, + {file = "black-23.3.0-cp310-cp310-macosx_10_16_arm64.whl", hash = "sha256:0945e13506be58bf7db93ee5853243eb368ace1c08a24c65ce108986eac65915"}, + {file = "black-23.3.0-cp310-cp310-macosx_10_16_universal2.whl", hash = "sha256:67de8d0c209eb5b330cce2469503de11bca4085880d62f1628bd9972cc3366b9"}, + {file = "black-23.3.0-cp310-cp310-macosx_10_16_x86_64.whl", hash = "sha256:7c3eb7cea23904399866c55826b31c1f55bbcd3890ce22ff70466b907b6775c2"}, + {file = "black-23.3.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:32daa9783106c28815d05b724238e30718f34155653d4d6e125dc7daec8e260c"}, + {file = "black-23.3.0-cp310-cp310-win_amd64.whl", hash = "sha256:35d1381d7a22cc5b2be2f72c7dfdae4072a3336060635718cc7e1ede24221d6c"}, + {file = "black-23.3.0-cp311-cp311-macosx_10_16_arm64.whl", hash = "sha256:a8a968125d0a6a404842fa1bf0b349a568634f856aa08ffaff40ae0dfa52e7c6"}, + {file = "black-23.3.0-cp311-cp311-macosx_10_16_universal2.whl", hash = "sha256:c7ab5790333c448903c4b721b59c0d80b11fe5e9803d8703e84dcb8da56fec1b"}, + {file = "black-23.3.0-cp311-cp311-macosx_10_16_x86_64.whl", hash = "sha256:a6f6886c9869d4daae2d1715ce34a19bbc4b95006d20ed785ca00fa03cba312d"}, + {file = "black-23.3.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6f3c333ea1dd6771b2d3777482429864f8e258899f6ff05826c3a4fcc5ce3f70"}, + {file = "black-23.3.0-cp311-cp311-win_amd64.whl", hash = "sha256:11c410f71b876f961d1de77b9699ad19f939094c3a677323f43d7a29855fe326"}, + {file = "black-23.3.0-cp37-cp37m-macosx_10_16_x86_64.whl", hash = "sha256:1d06691f1eb8de91cd1b322f21e3bfc9efe0c7ca1f0e1eb1db44ea367dff656b"}, + {file = "black-23.3.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:50cb33cac881766a5cd9913e10ff75b1e8eb71babf4c7104f2e9c52da1fb7de2"}, + {file = "black-23.3.0-cp37-cp37m-win_amd64.whl", hash = "sha256:e114420bf26b90d4b9daa597351337762b63039752bdf72bf361364c1aa05925"}, + {file = "black-23.3.0-cp38-cp38-macosx_10_16_arm64.whl", hash = "sha256:48f9d345675bb7fbc3dd85821b12487e1b9a75242028adad0333ce36ed2a6d27"}, + {file = "black-23.3.0-cp38-cp38-macosx_10_16_universal2.whl", hash = "sha256:714290490c18fb0126baa0fca0a54ee795f7502b44177e1ce7624ba1c00f2331"}, + {file = "black-23.3.0-cp38-cp38-macosx_10_16_x86_64.whl", hash = "sha256:064101748afa12ad2291c2b91c960be28b817c0c7eaa35bec09cc63aa56493c5"}, + {file = "black-23.3.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:562bd3a70495facf56814293149e51aa1be9931567474993c7942ff7d3533961"}, + {file = "black-23.3.0-cp38-cp38-win_amd64.whl", hash = "sha256:e198cf27888ad6f4ff331ca1c48ffc038848ea9f031a3b40ba36aced7e22f2c8"}, + {file = "black-23.3.0-cp39-cp39-macosx_10_16_arm64.whl", hash = "sha256:3238f2aacf827d18d26db07524e44741233ae09a584273aa059066d644ca7b30"}, + {file = "black-23.3.0-cp39-cp39-macosx_10_16_universal2.whl", hash = "sha256:f0bd2f4a58d6666500542b26354978218a9babcdc972722f4bf90779524515f3"}, + {file = "black-23.3.0-cp39-cp39-macosx_10_16_x86_64.whl", hash = "sha256:92c543f6854c28a3c7f39f4d9b7694f9a6eb9d3c5e2ece488c327b6e7ea9b266"}, + {file = "black-23.3.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3a150542a204124ed00683f0db1f5cf1c2aaaa9cc3495b7a3b5976fb136090ab"}, + {file = "black-23.3.0-cp39-cp39-win_amd64.whl", hash = 
"sha256:6b39abdfb402002b8a7d030ccc85cf5afff64ee90fa4c5aebc531e3ad0175ddb"}, + {file = "black-23.3.0-py3-none-any.whl", hash = "sha256:ec751418022185b0c1bb7d7736e6933d40bbb14c14a0abcf9123d1b159f98dd4"}, + {file = "black-23.3.0.tar.gz", hash = "sha256:1c7b8d606e728a41ea1ccbd7264677e494e87cf630e399262ced92d4a8dac940"}, ] [package.dependencies] @@ -866,35 +855,31 @@ files = [ [[package]] name = "cryptography" -version = "39.0.1" +version = "41.0.0" description = "cryptography is a package which provides cryptographic recipes and primitives to Python developers." category = "main" optional = false -python-versions = ">=3.6" +python-versions = ">=3.7" files = [ - {file = "cryptography-39.0.1-cp36-abi3-macosx_10_12_universal2.whl", hash = "sha256:6687ef6d0a6497e2b58e7c5b852b53f62142cfa7cd1555795758934da363a965"}, - {file = "cryptography-39.0.1-cp36-abi3-macosx_10_12_x86_64.whl", hash = "sha256:706843b48f9a3f9b9911979761c91541e3d90db1ca905fd63fee540a217698bc"}, - {file = "cryptography-39.0.1-cp36-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.manylinux_2_24_aarch64.whl", hash = "sha256:5d2d8b87a490bfcd407ed9d49093793d0f75198a35e6eb1a923ce1ee86c62b41"}, - {file = "cryptography-39.0.1-cp36-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:83e17b26de248c33f3acffb922748151d71827d6021d98c70e6c1a25ddd78505"}, - {file = "cryptography-39.0.1-cp36-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e124352fd3db36a9d4a21c1aa27fd5d051e621845cb87fb851c08f4f75ce8be6"}, - {file = "cryptography-39.0.1-cp36-abi3-manylinux_2_24_x86_64.whl", hash = "sha256:5aa67414fcdfa22cf052e640cb5ddc461924a045cacf325cd164e65312d99502"}, - {file = "cryptography-39.0.1-cp36-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:35f7c7d015d474f4011e859e93e789c87d21f6f4880ebdc29896a60403328f1f"}, - {file = "cryptography-39.0.1-cp36-abi3-manylinux_2_28_x86_64.whl", hash = "sha256:f24077a3b5298a5a06a8e0536e3ea9ec60e4c7ac486755e5fb6e6ea9b3500106"}, - {file = "cryptography-39.0.1-cp36-abi3-musllinux_1_1_aarch64.whl", hash = "sha256:f0c64d1bd842ca2633e74a1a28033d139368ad959872533b1bab8c80e8240a0c"}, - {file = "cryptography-39.0.1-cp36-abi3-musllinux_1_1_x86_64.whl", hash = "sha256:0f8da300b5c8af9f98111ffd512910bc792b4c77392a9523624680f7956a99d4"}, - {file = "cryptography-39.0.1-cp36-abi3-win32.whl", hash = "sha256:fe913f20024eb2cb2f323e42a64bdf2911bb9738a15dba7d3cce48151034e3a8"}, - {file = "cryptography-39.0.1-cp36-abi3-win_amd64.whl", hash = "sha256:ced4e447ae29ca194449a3f1ce132ded8fcab06971ef5f618605aacaa612beac"}, - {file = "cryptography-39.0.1-pp38-pypy38_pp73-macosx_10_12_x86_64.whl", hash = "sha256:807ce09d4434881ca3a7594733669bd834f5b2c6d5c7e36f8c00f691887042ad"}, - {file = "cryptography-39.0.1-pp38-pypy38_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:c5caeb8188c24888c90b5108a441c106f7faa4c4c075a2bcae438c6e8ca73cef"}, - {file = "cryptography-39.0.1-pp38-pypy38_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:4789d1e3e257965e960232345002262ede4d094d1a19f4d3b52e48d4d8f3b885"}, - {file = "cryptography-39.0.1-pp38-pypy38_pp73-win_amd64.whl", hash = "sha256:96f1157a7c08b5b189b16b47bc9db2332269d6680a196341bf30046330d15388"}, - {file = "cryptography-39.0.1-pp39-pypy39_pp73-macosx_10_12_x86_64.whl", hash = "sha256:e422abdec8b5fa8462aa016786680720d78bdce7a30c652b7fadf83a4ba35336"}, - {file = "cryptography-39.0.1-pp39-pypy39_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.manylinux_2_24_aarch64.whl", hash = "sha256:b0afd054cd42f3d213bf82c629efb1ee5f22eba35bf0eec88ea9ea7304f511a2"}, - {file = 
"cryptography-39.0.1-pp39-pypy39_pp73-manylinux_2_24_x86_64.whl", hash = "sha256:6f8ba7f0328b79f08bdacc3e4e66fb4d7aab0c3584e0bd41328dce5262e26b2e"}, - {file = "cryptography-39.0.1-pp39-pypy39_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:ef8b72fa70b348724ff1218267e7f7375b8de4e8194d1636ee60510aae104cd0"}, - {file = "cryptography-39.0.1-pp39-pypy39_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:aec5a6c9864be7df2240c382740fcf3b96928c46604eaa7f3091f58b878c0bb6"}, - {file = "cryptography-39.0.1-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:fdd188c8a6ef8769f148f88f859884507b954cc64db6b52f66ef199bb9ad660a"}, - {file = "cryptography-39.0.1.tar.gz", hash = "sha256:d1f6198ee6d9148405e49887803907fe8962a23e6c6f83ea7d98f1c0de375695"}, + {file = "cryptography-41.0.0-cp37-abi3-macosx_10_12_universal2.whl", hash = "sha256:3c5ef25d060c80d6d9f7f9892e1d41bb1c79b78ce74805b8cb4aa373cb7d5ec8"}, + {file = "cryptography-41.0.0-cp37-abi3-macosx_10_12_x86_64.whl", hash = "sha256:8362565b3835ceacf4dc8f3b56471a2289cf51ac80946f9087e66dc283a810e0"}, + {file = "cryptography-41.0.0-cp37-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3680248309d340fda9611498a5319b0193a8dbdb73586a1acf8109d06f25b92d"}, + {file = "cryptography-41.0.0-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:84a165379cb9d411d58ed739e4af3396e544eac190805a54ba2e0322feb55c46"}, + {file = "cryptography-41.0.0-cp37-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:4ab14d567f7bbe7f1cdff1c53d5324ed4d3fc8bd17c481b395db224fb405c237"}, + {file = "cryptography-41.0.0-cp37-abi3-manylinux_2_28_x86_64.whl", hash = "sha256:9f65e842cb02550fac96536edb1d17f24c0a338fd84eaf582be25926e993dde4"}, + {file = "cryptography-41.0.0-cp37-abi3-musllinux_1_1_aarch64.whl", hash = "sha256:b7f2f5c525a642cecad24ee8670443ba27ac1fab81bba4cc24c7b6b41f2d0c75"}, + {file = "cryptography-41.0.0-cp37-abi3-musllinux_1_1_x86_64.whl", hash = "sha256:7d92f0248d38faa411d17f4107fc0bce0c42cae0b0ba5415505df72d751bf62d"}, + {file = "cryptography-41.0.0-cp37-abi3-win32.whl", hash = "sha256:34d405ea69a8b34566ba3dfb0521379b210ea5d560fafedf9f800a9a94a41928"}, + {file = "cryptography-41.0.0-cp37-abi3-win_amd64.whl", hash = "sha256:344c6de9f8bda3c425b3a41b319522ba3208551b70c2ae00099c205f0d9fd3be"}, + {file = "cryptography-41.0.0-pp38-pypy38_pp73-macosx_10_12_x86_64.whl", hash = "sha256:88ff107f211ea696455ea8d911389f6d2b276aabf3231bf72c8853d22db755c5"}, + {file = "cryptography-41.0.0-pp38-pypy38_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:b846d59a8d5a9ba87e2c3d757ca019fa576793e8758174d3868aecb88d6fc8eb"}, + {file = "cryptography-41.0.0-pp38-pypy38_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:f5d0bf9b252f30a31664b6f64432b4730bb7038339bd18b1fafe129cfc2be9be"}, + {file = "cryptography-41.0.0-pp38-pypy38_pp73-win_amd64.whl", hash = "sha256:5c1f7293c31ebc72163a9a0df246f890d65f66b4a40d9ec80081969ba8c78cc9"}, + {file = "cryptography-41.0.0-pp39-pypy39_pp73-macosx_10_12_x86_64.whl", hash = "sha256:bf8fc66012ca857d62f6a347007e166ed59c0bc150cefa49f28376ebe7d992a2"}, + {file = "cryptography-41.0.0-pp39-pypy39_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:a4fc68d1c5b951cfb72dfd54702afdbbf0fb7acdc9b7dc4301bbf2225a27714d"}, + {file = "cryptography-41.0.0-pp39-pypy39_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:14754bcdae909d66ff24b7b5f166d69340ccc6cb15731670435efd5719294895"}, + {file = "cryptography-41.0.0-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:0ddaee209d1cf1f180f1efa338a68c4621154de0afaef92b89486f5f96047c55"}, + {file = 
"cryptography-41.0.0.tar.gz", hash = "sha256:6b71f64beeea341c9b4f963b48ee3b62d62d57ba93eb120e1196b31dc1025e78"}, ] [package.dependencies] @@ -903,12 +888,12 @@ cffi = ">=1.12" [package.extras] docs = ["sphinx (>=5.3.0)", "sphinx-rtd-theme (>=1.1.1)"] docstest = ["pyenchant (>=1.6.11)", "sphinxcontrib-spelling (>=4.0.1)", "twine (>=1.12.0)"] -pep8test = ["black", "check-manifest", "mypy", "ruff", "types-pytz", "types-requests"] -sdist = ["setuptools-rust (>=0.11.4)"] +nox = ["nox"] +pep8test = ["black", "check-sdist", "mypy", "ruff"] +sdist = ["build"] ssh = ["bcrypt (>=3.1.5)"] -test = ["hypothesis (>=1.11.4,!=3.79.2)", "iso8601", "pretend", "pytest (>=6.2.0)", "pytest-benchmark", "pytest-cov", "pytest-shard (>=0.1.2)", "pytest-subtests", "pytest-xdist", "pytz"] +test = ["pretend", "pytest (>=6.2.0)", "pytest-benchmark", "pytest-cov", "pytest-xdist"] test-randomorder = ["pytest-randomly"] -tox = ["tox"] [[package]] name = "docker" @@ -951,6 +936,21 @@ six = ">=1.9.0" gmpy = ["gmpy"] gmpy2 = ["gmpy2"] +[[package]] +name = "exceptiongroup" +version = "1.1.1" +description = "Backport of PEP 654 (exception groups)" +category = "main" +optional = false +python-versions = ">=3.7" +files = [ + {file = "exceptiongroup-1.1.1-py3-none-any.whl", hash = "sha256:232c37c63e4f682982c8b6459f33a8981039e5fb8756b2074364e5055c498c9e"}, + {file = "exceptiongroup-1.1.1.tar.gz", hash = "sha256:d484c3090ba2889ae2928419117447a14daf3c1231d5e30d0aae34f354f01785"}, +] + +[package.extras] +test = ["pytest (>=6)"] + [[package]] name = "execnet" version = "1.9.0" @@ -1410,38 +1410,38 @@ files = [ [[package]] name = "mypy" -version = "1.1.1" +version = "1.3.0" description = "Optional static typing for Python" category = "dev" optional = false python-versions = ">=3.7" files = [ - {file = "mypy-1.1.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:39c7119335be05630611ee798cc982623b9e8f0cff04a0b48dfc26100e0b97af"}, - {file = "mypy-1.1.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:61bf08362e93b6b12fad3eab68c4ea903a077b87c90ac06c11e3d7a09b56b9c1"}, - {file = "mypy-1.1.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:dbb19c9f662e41e474e0cff502b7064a7edc6764f5262b6cd91d698163196799"}, - {file = "mypy-1.1.1-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:315ac73cc1cce4771c27d426b7ea558fb4e2836f89cb0296cbe056894e3a1f78"}, - {file = "mypy-1.1.1-cp310-cp310-win_amd64.whl", hash = "sha256:5cb14ff9919b7df3538590fc4d4c49a0f84392237cbf5f7a816b4161c061829e"}, - {file = "mypy-1.1.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:26cdd6a22b9b40b2fd71881a8a4f34b4d7914c679f154f43385ca878a8297389"}, - {file = "mypy-1.1.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:5b5f81b40d94c785f288948c16e1f2da37203c6006546c5d947aab6f90aefef2"}, - {file = "mypy-1.1.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:21b437be1c02712a605591e1ed1d858aba681757a1e55fe678a15c2244cd68a5"}, - {file = "mypy-1.1.1-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:d809f88734f44a0d44959d795b1e6f64b2bbe0ea4d9cc4776aa588bb4229fc1c"}, - {file = "mypy-1.1.1-cp311-cp311-win_amd64.whl", hash = "sha256:a380c041db500e1410bb5b16b3c1c35e61e773a5c3517926b81dfdab7582be54"}, - {file = "mypy-1.1.1-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:b7c7b708fe9a871a96626d61912e3f4ddd365bf7f39128362bc50cbd74a634d5"}, - {file = "mypy-1.1.1-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c1c10fa12df1232c936830839e2e935d090fc9ee315744ac33b8a32216b93707"}, - {file = 
"mypy-1.1.1-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:0a28a76785bf57655a8ea5eb0540a15b0e781c807b5aa798bd463779988fa1d5"}, - {file = "mypy-1.1.1-cp37-cp37m-win_amd64.whl", hash = "sha256:ef6a01e563ec6a4940784c574d33f6ac1943864634517984471642908b30b6f7"}, - {file = "mypy-1.1.1-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:d64c28e03ce40d5303450f547e07418c64c241669ab20610f273c9e6290b4b0b"}, - {file = "mypy-1.1.1-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:64cc3afb3e9e71a79d06e3ed24bb508a6d66f782aff7e56f628bf35ba2e0ba51"}, - {file = "mypy-1.1.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ce61663faf7a8e5ec6f456857bfbcec2901fbdb3ad958b778403f63b9e606a1b"}, - {file = "mypy-1.1.1-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:2b0c373d071593deefbcdd87ec8db91ea13bd8f1328d44947e88beae21e8d5e9"}, - {file = "mypy-1.1.1-cp38-cp38-win_amd64.whl", hash = "sha256:2888ce4fe5aae5a673386fa232473014056967f3904f5abfcf6367b5af1f612a"}, - {file = "mypy-1.1.1-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:19ba15f9627a5723e522d007fe708007bae52b93faab00f95d72f03e1afa9598"}, - {file = "mypy-1.1.1-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:59bbd71e5c58eed2e992ce6523180e03c221dcd92b52f0e792f291d67b15a71c"}, - {file = "mypy-1.1.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9401e33814cec6aec8c03a9548e9385e0e228fc1b8b0a37b9ea21038e64cdd8a"}, - {file = "mypy-1.1.1-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:4b398d8b1f4fba0e3c6463e02f8ad3346f71956b92287af22c9b12c3ec965a9f"}, - {file = "mypy-1.1.1-cp39-cp39-win_amd64.whl", hash = "sha256:69b35d1dcb5707382810765ed34da9db47e7f95b3528334a3c999b0c90fe523f"}, - {file = "mypy-1.1.1-py3-none-any.whl", hash = "sha256:4e4e8b362cdf99ba00c2b218036002bdcdf1e0de085cdb296a49df03fb31dfc4"}, - {file = "mypy-1.1.1.tar.gz", hash = "sha256:ae9ceae0f5b9059f33dbc62dea087e942c0ccab4b7a003719cb70f9b8abfa32f"}, + {file = "mypy-1.3.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:c1eb485cea53f4f5284e5baf92902cd0088b24984f4209e25981cc359d64448d"}, + {file = "mypy-1.3.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:4c99c3ecf223cf2952638da9cd82793d8f3c0c5fa8b6ae2b2d9ed1e1ff51ba85"}, + {file = "mypy-1.3.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:550a8b3a19bb6589679a7c3c31f64312e7ff482a816c96e0cecec9ad3a7564dd"}, + {file = "mypy-1.3.0-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:cbc07246253b9e3d7d74c9ff948cd0fd7a71afcc2b77c7f0a59c26e9395cb152"}, + {file = "mypy-1.3.0-cp310-cp310-win_amd64.whl", hash = "sha256:a22435632710a4fcf8acf86cbd0d69f68ac389a3892cb23fbad176d1cddaf228"}, + {file = "mypy-1.3.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:6e33bb8b2613614a33dff70565f4c803f889ebd2f859466e42b46e1df76018dd"}, + {file = "mypy-1.3.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:7d23370d2a6b7a71dc65d1266f9a34e4cde9e8e21511322415db4b26f46f6b8c"}, + {file = "mypy-1.3.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:658fe7b674769a0770d4b26cb4d6f005e88a442fe82446f020be8e5f5efb2fae"}, + {file = "mypy-1.3.0-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:6e42d29e324cdda61daaec2336c42512e59c7c375340bd202efa1fe0f7b8f8ca"}, + {file = "mypy-1.3.0-cp311-cp311-win_amd64.whl", hash = "sha256:d0b6c62206e04061e27009481cb0ec966f7d6172b5b936f3ead3d74f29fe3dcf"}, + {file = "mypy-1.3.0-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:76ec771e2342f1b558c36d49900dfe81d140361dd0d2df6cd71b3db1be155409"}, + {file = 
"mypy-1.3.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ebc95f8386314272bbc817026f8ce8f4f0d2ef7ae44f947c4664efac9adec929"}, + {file = "mypy-1.3.0-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:faff86aa10c1aa4a10e1a301de160f3d8fc8703b88c7e98de46b531ff1276a9a"}, + {file = "mypy-1.3.0-cp37-cp37m-win_amd64.whl", hash = "sha256:8c5979d0deb27e0f4479bee18ea0f83732a893e81b78e62e2dda3e7e518c92ee"}, + {file = "mypy-1.3.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:c5d2cc54175bab47011b09688b418db71403aefad07cbcd62d44010543fc143f"}, + {file = "mypy-1.3.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:87df44954c31d86df96c8bd6e80dfcd773473e877ac6176a8e29898bfb3501cb"}, + {file = "mypy-1.3.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:473117e310febe632ddf10e745a355714e771ffe534f06db40702775056614c4"}, + {file = "mypy-1.3.0-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:74bc9b6e0e79808bf8678d7678b2ae3736ea72d56eede3820bd3849823e7f305"}, + {file = "mypy-1.3.0-cp38-cp38-win_amd64.whl", hash = "sha256:44797d031a41516fcf5cbfa652265bb994e53e51994c1bd649ffcd0c3a7eccbf"}, + {file = "mypy-1.3.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:ddae0f39ca146972ff6bb4399f3b2943884a774b8771ea0a8f50e971f5ea5ba8"}, + {file = "mypy-1.3.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:1c4c42c60a8103ead4c1c060ac3cdd3ff01e18fddce6f1016e08939647a0e703"}, + {file = "mypy-1.3.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e86c2c6852f62f8f2b24cb7a613ebe8e0c7dc1402c61d36a609174f63e0ff017"}, + {file = "mypy-1.3.0-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:f9dca1e257d4cc129517779226753dbefb4f2266c4eaad610fc15c6a7e14283e"}, + {file = "mypy-1.3.0-cp39-cp39-win_amd64.whl", hash = "sha256:95d8d31a7713510685b05fbb18d6ac287a56c8f6554d88c19e73f724a445448a"}, + {file = "mypy-1.3.0-py3-none-any.whl", hash = "sha256:a8763e72d5d9574d45ce5881962bc8e9046bf7b375b0abf031f3e6811732a897"}, + {file = "mypy-1.3.0.tar.gz", hash = "sha256:e1f4d16e296f5135624b34e8fb741eb0eadedca90862405b1f1fde2040b9bd11"}, ] [package.dependencies] @@ -1721,18 +1721,6 @@ files = [ {file = "psycopg2_binary-2.9.3-cp39-cp39-win_amd64.whl", hash = "sha256:accfe7e982411da3178ec690baaceaad3c278652998b2c45828aaac66cd8285f"}, ] -[[package]] -name = "py" -version = "1.11.0" -description = "library with cross-python path, ini-parsing, io, code, log facilities" -category = "main" -optional = false -python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*" -files = [ - {file = "py-1.11.0-py2.py3-none-any.whl", hash = "sha256:607c53218732647dff4acdfcd50cb62615cedf612e72d1724fb1a0cc6405b378"}, - {file = "py-1.11.0.tar.gz", hash = "sha256:51c75c4126074b472f746a24399ad32f6053d1b34b68d2fa41e558e6f4a98719"}, -] - [[package]] name = "pyasn1" version = "0.4.8" @@ -1841,57 +1829,56 @@ files = [ [[package]] name = "pytest" -version = "6.2.5" +version = "7.3.1" description = "pytest: simple powerful testing with Python" category = "main" optional = false -python-versions = ">=3.6" +python-versions = ">=3.7" files = [ - {file = "pytest-6.2.5-py3-none-any.whl", hash = "sha256:7310f8d27bc79ced999e760ca304d69f6ba6c6649c0b60fb0e04a4a77cacc134"}, - {file = "pytest-6.2.5.tar.gz", hash = "sha256:131b36680866a76e6781d13f101efb86cf674ebb9762eb70d3082b6f29889e89"}, + {file = "pytest-7.3.1-py3-none-any.whl", hash = "sha256:3799fa815351fea3a5e96ac7e503a96fa51cc9942c3753cda7651b93c1cfa362"}, + {file = "pytest-7.3.1.tar.gz", hash = 
"sha256:434afafd78b1d78ed0addf160ad2b77a30d35d4bdf8af234fe621919d9ed15e3"}, ] [package.dependencies] -atomicwrites = {version = ">=1.0", markers = "sys_platform == \"win32\""} -attrs = ">=19.2.0" colorama = {version = "*", markers = "sys_platform == \"win32\""} +exceptiongroup = {version = ">=1.0.0rc8", markers = "python_version < \"3.11\""} iniconfig = "*" packaging = "*" pluggy = ">=0.12,<2.0" -py = ">=1.8.2" -toml = "*" +tomli = {version = ">=1.0.0", markers = "python_version < \"3.11\""} [package.extras] -testing = ["argcomplete", "hypothesis (>=3.56)", "mock", "nose", "requests", "xmlschema"] +testing = ["argcomplete", "attrs (>=19.2.0)", "hypothesis (>=3.56)", "mock", "nose", "pygments (>=2.7.2)", "requests", "xmlschema"] [[package]] name = "pytest-asyncio" -version = "0.19.0" +version = "0.21.0" description = "Pytest support for asyncio" category = "main" optional = false python-versions = ">=3.7" files = [ - {file = "pytest-asyncio-0.19.0.tar.gz", hash = "sha256:ac4ebf3b6207259750bc32f4c1d8fcd7e79739edbc67ad0c58dd150b1d072fed"}, - {file = "pytest_asyncio-0.19.0-py3-none-any.whl", hash = "sha256:7a97e37cfe1ed296e2e84941384bdd37c376453912d397ed39293e0916f521fa"}, + {file = "pytest-asyncio-0.21.0.tar.gz", hash = "sha256:2b38a496aef56f56b0e87557ec313e11e1ab9276fc3863f6a7be0f1d0e415e1b"}, + {file = "pytest_asyncio-0.21.0-py3-none-any.whl", hash = "sha256:f2b3366b7cd501a4056858bd39349d5af19742aed2d81660b7998b6341c7eb9c"}, ] [package.dependencies] -pytest = ">=6.1.0" +pytest = ">=7.0.0" [package.extras] +docs = ["sphinx (>=5.3)", "sphinx-rtd-theme (>=1.0)"] testing = ["coverage (>=6.2)", "flaky (>=3.5.0)", "hypothesis (>=5.7.1)", "mypy (>=0.931)", "pytest-trio (>=0.7.0)"] [[package]] name = "pytest-httpserver" -version = "1.0.6" +version = "1.0.8" description = "pytest-httpserver is a httpserver for pytest" category = "main" optional = false -python-versions = ">=3.7,<4.0" +python-versions = ">=3.8,<4.0" files = [ - {file = "pytest_httpserver-1.0.6-py3-none-any.whl", hash = "sha256:ac2379acc91fe8bdbe2911c93af8dd130e33b5899fb9934d15669480739c6d32"}, - {file = "pytest_httpserver-1.0.6.tar.gz", hash = "sha256:9040d07bf59ac45d8de3db1d4468fd2d1d607975e4da4c872ecc0402cdbf7b3e"}, + {file = "pytest_httpserver-1.0.8-py3-none-any.whl", hash = "sha256:24cd3d9f6a0b927c7bfc400d0b3fda7442721b8267ce29942bf307b190f0bb09"}, + {file = "pytest_httpserver-1.0.8.tar.gz", hash = "sha256:e052f69bc8a9073db02484681e8e47004dd1fb3763b0ae833bd899e5895c559a"}, ] [package.dependencies] @@ -1914,14 +1901,14 @@ pytest = ">=3.2.5" [[package]] name = "pytest-order" -version = "1.0.1" +version = "1.1.0" description = "pytest plugin to run your tests in a specific order" category = "main" optional = false python-versions = ">=3.6" files = [ - {file = "pytest-order-1.0.1.tar.gz", hash = "sha256:5dd6b929fbd7eaa6d0ee07586f65c623babb0afe72b4843c5f15055d6b3b1b1f"}, - {file = "pytest_order-1.0.1-py3-none-any.whl", hash = "sha256:bbe6e63a8e23741ab3e810d458d1ea7317e797b70f9550512d77d6e9e8fd1bbb"}, + {file = "pytest-order-1.1.0.tar.gz", hash = "sha256:139d25b30826b78eebb42722f747eab14c44b88059d7a71d4f79d14a057269a5"}, + {file = "pytest_order-1.1.0-py3-none-any.whl", hash = "sha256:3b3730969c97900fa5cd31ecff80847680ed56b2490954565c14949ba60d9371"}, ] [package.dependencies] @@ -1963,14 +1950,14 @@ pytest = ">=5.0.0" [[package]] name = "pytest-xdist" -version = "3.0.2" -description = "pytest xdist plugin for distributed testing and loop-on-failing modes" +version = "3.3.1" +description = "pytest xdist plugin for distributed testing, 
most importantly across multiple CPUs" category = "main" optional = false -python-versions = ">=3.6" +python-versions = ">=3.7" files = [ - {file = "pytest-xdist-3.0.2.tar.gz", hash = "sha256:688da9b814370e891ba5de650c9327d1a9d861721a524eb917e620eec3e90291"}, - {file = "pytest_xdist-3.0.2-py3-none-any.whl", hash = "sha256:9feb9a18e1790696ea23e1434fa73b325ed4998b0e9fcb221f16fd1945e6df1b"}, + {file = "pytest-xdist-3.3.1.tar.gz", hash = "sha256:d5ee0520eb1b7bcca50a60a518ab7a7707992812c578198f8b44fdfac78e8c93"}, + {file = "pytest_xdist-3.3.1-py3-none-any.whl", hash = "sha256:ff9daa7793569e6a68544850fd3927cd257cc03a7ef76c95e86915355e82b5f2"}, ] [package.dependencies] @@ -2148,29 +2135,29 @@ pyasn1 = ">=0.1.3" [[package]] name = "ruff" -version = "0.0.255" +version = "0.0.269" description = "An extremely fast Python linter, written in Rust." category = "dev" optional = false python-versions = ">=3.7" files = [ - {file = "ruff-0.0.255-py3-none-macosx_10_7_x86_64.whl", hash = "sha256:b2d71fb6a7e50501a2473864acffc85dee6b750c25db198f7e71fe1dbbff1aad"}, - {file = "ruff-0.0.255-py3-none-macosx_10_9_x86_64.macosx_11_0_arm64.macosx_10_9_universal2.whl", hash = "sha256:6c97d746861a6010f941179e84bba9feb8a871815667471d9ed6beb98d45c252"}, - {file = "ruff-0.0.255-py3-none-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:9a7fa60085079b91a298b963361be9b1b1c724582af6c84be954cbabdbd9309a"}, - {file = "ruff-0.0.255-py3-none-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:c089f7141496334ab5a127b54ce55e41f0d6714e68a4453a1e09d2204cdea8c3"}, - {file = "ruff-0.0.255-py3-none-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:0423908caa7d437a416b853214565b9c33bbd1106c4f88147982216dddcbbd96"}, - {file = "ruff-0.0.255-py3-none-manylinux_2_17_ppc64.manylinux2014_ppc64.whl", hash = "sha256:981493e92547cacbb8e0874904ec049fe744507ee890dc8736caf89a8864f9a7"}, - {file = "ruff-0.0.255-py3-none-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:59d5193d2aedb35db180824462b374dbcfc306b2e76076245088afa6e5837df2"}, - {file = "ruff-0.0.255-py3-none-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:dd5e00733c9d160c8a34a22e62b390da9d1e9f326676402421cb8c1236beefc3"}, - {file = "ruff-0.0.255-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:694418cf41838bd19c6229e4e1b2d04505b1e6b86fe3ab81165484fc96d36f01"}, - {file = "ruff-0.0.255-py3-none-musllinux_1_2_aarch64.whl", hash = "sha256:5d0408985c9777369daebb5d3340a99e9f7294bdd7120642239261508185cf89"}, - {file = "ruff-0.0.255-py3-none-musllinux_1_2_armv7l.whl", hash = "sha256:abd6376ef9d12f370d95a8c7c98682fbb9bfedfba59f40e84a816fef8ddcb8de"}, - {file = "ruff-0.0.255-py3-none-musllinux_1_2_i686.whl", hash = "sha256:f9b1a5df0bc09193cbef58a6f78e4a9a0b058a4f9733c0442866d078006d1bb9"}, - {file = "ruff-0.0.255-py3-none-musllinux_1_2_x86_64.whl", hash = "sha256:6a25c5f4ff087445b2e1bbcb9963f2ae7c868d65e4a8d5f84c36c12f71571179"}, - {file = "ruff-0.0.255-py3-none-win32.whl", hash = "sha256:1ff87a8310354f9f1a099625e54a27fdd6756d9cd2a40b45922f2e943daf982d"}, - {file = "ruff-0.0.255-py3-none-win_amd64.whl", hash = "sha256:f3d8416be618f023f93ec4fd6ee3048585ef85dba9563b2a7e38fc7e5131d5b1"}, - {file = "ruff-0.0.255-py3-none-win_arm64.whl", hash = "sha256:8ba124819624145d7b6b53add40c367c44318893215ffc1bfe3d72e0225a1c9c"}, - {file = "ruff-0.0.255.tar.gz", hash = "sha256:f9eb1d3b2eecbeedae419fa494c4e2a5e4484baf93a1ce0f81eddb005e1919c5"}, + {file = "ruff-0.0.269-py3-none-macosx_10_7_x86_64.whl", hash = 
"sha256:3569bcdee679045c09c0161fabc057599759c49219a08d9a4aad2cc3982ccba3"}, + {file = "ruff-0.0.269-py3-none-macosx_10_9_x86_64.macosx_11_0_arm64.macosx_10_9_universal2.whl", hash = "sha256:56347da63757a56cbce7d4b3d6044ca4f1941cd1bbff3714f7554360c3361f83"}, + {file = "ruff-0.0.269-py3-none-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6da8ee25ef2f0cc6cc8e6e20942c1d44d25a36dce35070d7184655bc14f63f63"}, + {file = "ruff-0.0.269-py3-none-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:bd81b8e681b9eaa6cf15484f3985bd8bd97c3d114e95bff3e8ea283bf8865062"}, + {file = "ruff-0.0.269-py3-none-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:1f19f59ca3c28742955241fb452f3346241ddbd34e72ac5cb3d84fadebcf6bc8"}, + {file = "ruff-0.0.269-py3-none-manylinux_2_17_ppc64.manylinux2014_ppc64.whl", hash = "sha256:f062059b8289a4fab7f6064601b811d447c2f9d3d432a17f689efe4d68988450"}, + {file = "ruff-0.0.269-py3-none-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:3f5dc7aac52c58e82510217e3c7efd80765c134c097c2815d59e40face0d1fe6"}, + {file = "ruff-0.0.269-py3-none-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:e131b4dbe798c391090c6407641d6ab12c0fa1bb952379dde45e5000e208dabb"}, + {file = "ruff-0.0.269-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a374434e588e06550df0f8dcb74777290f285678de991fda4e1063c367ab2eb2"}, + {file = "ruff-0.0.269-py3-none-musllinux_1_2_aarch64.whl", hash = "sha256:cec2f4b84a14b87f1b121488649eb5b4eaa06467a2387373f750da74bdcb5679"}, + {file = "ruff-0.0.269-py3-none-musllinux_1_2_armv7l.whl", hash = "sha256:374b161753a247904aec7a32d45e165302b76b6e83d22d099bf3ff7c232c888f"}, + {file = "ruff-0.0.269-py3-none-musllinux_1_2_i686.whl", hash = "sha256:9ca0a1ddb1d835b5f742db9711c6cf59f213a1ad0088cb1e924a005fd399e7d8"}, + {file = "ruff-0.0.269-py3-none-musllinux_1_2_x86_64.whl", hash = "sha256:5a20658f0b97d207c7841c13d528f36d666bf445b00b01139f28a8ccb80093bb"}, + {file = "ruff-0.0.269-py3-none-win32.whl", hash = "sha256:03ff42bc91ceca58e0f0f072cb3f9286a9208f609812753474e799a997cdad1a"}, + {file = "ruff-0.0.269-py3-none-win_amd64.whl", hash = "sha256:f3b59ccff57b21ef0967ea8021fd187ec14c528ec65507d8bcbe035912050776"}, + {file = "ruff-0.0.269-py3-none-win_arm64.whl", hash = "sha256:bbeb857b1e508a4487bdb02ca1e6d41dd8d5ac5335a5246e25de8a3dff38c1ff"}, + {file = "ruff-0.0.269.tar.gz", hash = "sha256:11ddcfbab32cf5c420ea9dd5531170ace5a3e59c16d9251c7bd2581f7b16f602"}, ] [[package]] @@ -2271,7 +2258,7 @@ files = [ name = "tomli" version = "2.0.1" description = "A lil' TOML parser" -category = "dev" +category = "main" optional = false python-versions = ">=3.7" files = [ @@ -2281,42 +2268,54 @@ files = [ [[package]] name = "types-psutil" -version = "5.9.5.4" +version = "5.9.5.12" description = "Typing stubs for psutil" category = "main" optional = false python-versions = "*" files = [ - {file = "types-psutil-5.9.5.4.tar.gz", hash = "sha256:aa09102b80c65a3b4573216614372398dab78972d650488eaff1ff05482cc18f"}, - {file = "types_psutil-5.9.5.4-py3-none-any.whl", hash = "sha256:28e59764630187e462d43788efa16d59d5e77b510115f9e25901b2d4007fca62"}, + {file = "types-psutil-5.9.5.12.tar.gz", hash = "sha256:61a91679d3fe737250013b624dca09375e7cc3ad77dcc734553746c429c02aca"}, + {file = "types_psutil-5.9.5.12-py3-none-any.whl", hash = "sha256:e9a147b8561235c6afcce5aa1adb973fad9ab2c50cf89820697687f53510358f"}, ] [[package]] name = "types-psycopg2" -version = "2.9.18" +version = "2.9.21.10" description = "Typing stubs for psycopg2" 
category = "main" optional = false python-versions = "*" files = [ - {file = "types-psycopg2-2.9.18.tar.gz", hash = "sha256:9b0e9e1f097b15cd9fa8aad2596a9e3082fd72f8d9cfe52b190cfa709105b6c0"}, - {file = "types_psycopg2-2.9.18-py3-none-any.whl", hash = "sha256:14c779dcab18c31453fa1cad3cf4b1601d33540a344adead3c47a6b8091cd2fa"}, + {file = "types-psycopg2-2.9.21.10.tar.gz", hash = "sha256:c2600892312ae1c34e12f145749795d93dc4eac3ef7dbf8a9c1bfd45385e80d7"}, + {file = "types_psycopg2-2.9.21.10-py3-none-any.whl", hash = "sha256:918224a0731a3650832e46633e720703b5beef7693a064e777d9748654fcf5e5"}, +] + +[[package]] +name = "types-pytest-lazy-fixture" +version = "0.6.3.3" +description = "Typing stubs for pytest-lazy-fixture" +category = "main" +optional = false +python-versions = "*" +files = [ + {file = "types-pytest-lazy-fixture-0.6.3.3.tar.gz", hash = "sha256:2ef79d66bcde0e50acdac8dc55074b9ae0d4cfaeabdd638f5522f4cac7c8a2c7"}, + {file = "types_pytest_lazy_fixture-0.6.3.3-py3-none-any.whl", hash = "sha256:a56a55649147ff960ff79d4b2c781a4f769351abc1876873f3116d0bd0c96353"}, ] [[package]] name = "types-requests" -version = "2.28.5" +version = "2.31.0.0" description = "Typing stubs for requests" category = "main" optional = false python-versions = "*" files = [ - {file = "types-requests-2.28.5.tar.gz", hash = "sha256:ac618bfefcb3742eaf97c961e13e9e5a226e545eda4a3dbe293b898d40933ad1"}, - {file = "types_requests-2.28.5-py3-none-any.whl", hash = "sha256:98ab647ae88b5e2c41d6d20cfcb5117da1bea561110000b6fdeeea07b3e89877"}, + {file = "types-requests-2.31.0.0.tar.gz", hash = "sha256:c1c29d20ab8d84dff468d7febfe8e0cb0b4664543221b386605e14672b44ea25"}, + {file = "types_requests-2.31.0.0-py3-none-any.whl", hash = "sha256:7c5cea7940f8e92ec560bbc468f65bf684aa3dcf0554a6f8c4710f5f708dc598"}, ] [package.dependencies] -types-urllib3 = "<1.27" +types-urllib3 = "*" [[package]] name = "types-s3transfer" @@ -2332,14 +2331,14 @@ files = [ [[package]] name = "types-toml" -version = "0.10.8" +version = "0.10.8.6" description = "Typing stubs for toml" category = "main" optional = false python-versions = "*" files = [ - {file = "types-toml-0.10.8.tar.gz", hash = "sha256:b7e7ea572308b1030dc86c3ba825c5210814c2825612ec679eb7814f8dd9295a"}, - {file = "types_toml-0.10.8-py3-none-any.whl", hash = "sha256:8300fd093e5829eb9c1fba69cee38130347d4b74ddf32d0a7df650ae55c2b599"}, + {file = "types-toml-0.10.8.6.tar.gz", hash = "sha256:6d3ac79e36c9ee593c5d4fb33a50cca0e3adceb6ef5cff8b8e5aef67b4c4aaf2"}, + {file = "types_toml-0.10.8.6-py3-none-any.whl", hash = "sha256:de7b2bb1831d6f7a4b554671ffe5875e729753496961b3e9b202745e4955dafa"}, ] [[package]] @@ -2356,14 +2355,14 @@ files = [ [[package]] name = "typing-extensions" -version = "4.3.0" +version = "4.6.1" description = "Backported and Experimental Type Hints for Python 3.7+" category = "main" optional = false python-versions = ">=3.7" files = [ - {file = "typing_extensions-4.3.0-py3-none-any.whl", hash = "sha256:25642c956049920a5aa49edcdd6ab1e06d7e5d467fc00e0506c44ac86fbfca02"}, - {file = "typing_extensions-4.3.0.tar.gz", hash = "sha256:e6d2677a32f47fc7eb2795db1dd15c1f34eff616bcaf2cfb5e997f854fa1c4a6"}, + {file = "typing_extensions-4.6.1-py3-none-any.whl", hash = "sha256:6bac751f4789b135c43228e72de18637e9a6c29d12777023a703fd1a6858469f"}, + {file = "typing_extensions-4.6.1.tar.gz", hash = "sha256:558bc0c4145f01e6405f4a5fdbd82050bd221b119f4bf72a961a1cfd471349d6"}, ] [[package]] @@ -2611,4 +2610,4 @@ testing = ["func-timeout", "jaraco.itertools", "pytest (>=6)", "pytest-black (>= [metadata] 
lock-version = "2.0" python-versions = "^3.9" -content-hash = "a0bd73376a3e9479f2379265ccec8dd6ac9df2e525909d12b77d918d590fba55" +content-hash = "c6c217033f50430c31b0979b74db222e6bab2301abd8b9f0cce5a9d5bccc578f" diff --git a/proxy/README.md b/proxy/README.md index cd76a2443f..d1f2e3f27b 100644 --- a/proxy/README.md +++ b/proxy/README.md @@ -93,6 +93,15 @@ With the current approach we made the following design decisions: and column oids. Command tag capturing was added to the rust-postgres functionality as part of this change. +### Output options + +User can pass several optional headers that will affect resulting json. + +1. `Neon-Raw-Text-Output: true`. Return postgres values as text, without parsing them. So numbers, objects, booleans, nulls and arrays will be returned as text. That can be useful in cases when client code wants to implement it's own parsing or reuse parsing libraries from e.g. node-postgres. +2. `Neon-Array-Mode: true`. Return postgres rows as arrays instead of objects. That is more compact representation and also helps in some edge +cases where it is hard to use rows represented as objects (e.g. when several fields have the same name). + + ## Using SNI-based routing on localhost Now proxy determines project name from the subdomain, request to the `round-rice-566201.somedomain.tld` will be routed to the project named `round-rice-566201`. Unfortunately, `/etc/hosts` does not support domain wildcards, so I usually use `*.localtest.me` which resolves to `127.0.0.1`. Now we can create self-signed certificate and play with proxy: diff --git a/proxy/src/auth/backend.rs b/proxy/src/auth/backend.rs index 18bc80d523..9322e4f9ff 100644 --- a/proxy/src/auth/backend.rs +++ b/proxy/src/auth/backend.rs @@ -139,6 +139,16 @@ async fn auth_quirks( } impl BackendType<'_, ClientCredentials<'_>> { + /// Get compute endpoint name from the credentials. + pub fn get_endpoint(&self) -> Option { + use BackendType::*; + + match self { + Console(_, creds) => creds.project.clone(), + Postgres(_, creds) => creds.project.clone(), + Link(_) => Some("link".to_owned()), + } + } /// Authenticate the client via the requested backend, possibly using credentials. #[tracing::instrument(fields(allow_cleartext = allow_cleartext), skip_all)] pub async fn authenticate( diff --git a/proxy/src/bin/pg_sni_router.rs b/proxy/src/bin/pg_sni_router.rs index bba2d51caf..a5f50cc7c1 100644 --- a/proxy/src/bin/pg_sni_router.rs +++ b/proxy/src/bin/pg_sni_router.rs @@ -17,7 +17,7 @@ use tokio::io::{AsyncRead, AsyncWrite}; use tokio_util::sync::CancellationToken; use utils::{project_git_version, sentry_init::init_sentry}; -use tracing::{error, info, warn}; +use tracing::{error, info, warn, Instrument}; project_git_version!(GIT_VERSION); @@ -141,7 +141,6 @@ async fn task_main( tokio::select! { accept_result = listener.accept() => { let (socket, peer_addr) = accept_result?; - info!("accepted postgres client connection from {peer_addr}"); let session_id = uuid::Uuid::new_v4(); let tls_config = Arc::clone(&tls_config); @@ -149,18 +148,18 @@ async fn task_main( connections.spawn( async move { - info!("spawned a task for {peer_addr}"); - socket .set_nodelay(true) .context("failed to set socket option")?; - handle_client(dest_suffix, tls_config, session_id, socket).await + info!(%peer_addr, "serving"); + handle_client(dest_suffix, tls_config, socket).await } .unwrap_or_else(|e| { // Acknowledge that the task has finished with an error. 
error!("per-client task finished with an error: {e:#}"); - }), + }) + .instrument(tracing::info_span!("handle_client", ?session_id)) ); } _ = cancellation_token.cancelled() => { @@ -192,7 +191,6 @@ async fn ssl_handshake( let mut stream = PqStream::new(Stream::from_raw(raw_stream)); let msg = stream.read_startup_packet().await?; - info!("received {msg:?}"); use pq_proto::FeStartupPacket::*; match msg { @@ -215,15 +213,19 @@ async fn ssl_handshake( } Ok(raw.upgrade(tls_config).await?) } - _ => stream.throw_error_str(ERR_INSECURE_CONNECTION).await?, + unexpected => { + info!( + ?unexpected, + "unexpected startup packet, rejecting connection" + ); + stream.throw_error_str(ERR_INSECURE_CONNECTION).await? + } } } -#[tracing::instrument(fields(session_id = ?session_id), skip_all)] async fn handle_client( dest_suffix: Arc, tls_config: Arc, - session_id: uuid::Uuid, stream: impl AsyncRead + AsyncWrite + Unpin, ) -> anyhow::Result<()> { let tls_stream = ssl_handshake(stream, tls_config).await?; diff --git a/proxy/src/console.rs b/proxy/src/console.rs index 1f3ef99555..0e5eaaf845 100644 --- a/proxy/src/console.rs +++ b/proxy/src/console.rs @@ -1,5 +1,5 @@ -///! Various stuff for dealing with the Neon Console. -///! Later we might move some API wrappers here. +//! Various stuff for dealing with the Neon Console. +//! Later we might move some API wrappers here. /// Payloads used in the console's APIs. pub mod messages; diff --git a/proxy/src/http/sql_over_http.rs b/proxy/src/http/sql_over_http.rs index 0438a82c12..050f00dd7d 100644 --- a/proxy/src/http/sql_over_http.rs +++ b/proxy/src/http/sql_over_http.rs @@ -1,6 +1,8 @@ use futures::pin_mut; use futures::StreamExt; use hyper::body::HttpBody; +use hyper::http::HeaderName; +use hyper::http::HeaderValue; use hyper::{Body, HeaderMap, Request}; use pq_proto::StartupMessageParams; use serde_json::json; @@ -23,21 +25,28 @@ const APP_NAME: &str = "sql_over_http"; const MAX_RESPONSE_SIZE: usize = 1024 * 1024; // 1 MB const MAX_REQUEST_SIZE: u64 = 1024 * 1024; // 1 MB +static RAW_TEXT_OUTPUT: HeaderName = HeaderName::from_static("neon-raw-text-output"); +static ARRAY_MODE: HeaderName = HeaderName::from_static("neon-array-mode"); +static HEADER_VALUE_TRUE: HeaderValue = HeaderValue::from_static("true"); + // // Convert json non-string types to strings, so that they can be passed to Postgres // as parameters. 
// -fn json_to_pg_text(json: Vec) -> Result, serde_json::Error> { +fn json_to_pg_text(json: Vec) -> Result>, serde_json::Error> { json.iter() .map(|value| { match value { - Value::Null => serde_json::to_string(value), - Value::Bool(_) => serde_json::to_string(value), - Value::Number(_) => serde_json::to_string(value), - Value::Object(_) => serde_json::to_string(value), + // special care for nulls + Value::Null => Ok(None), - // no need to escape - Value::String(s) => Ok(s.to_string()), + // convert to text with escaping + Value::Bool(_) => serde_json::to_string(value).map(Some), + Value::Number(_) => serde_json::to_string(value).map(Some), + Value::Object(_) => serde_json::to_string(value).map(Some), + + // avoid escaping here, as we pass this as a parameter + Value::String(s) => Ok(Some(s.to_string())), // special care for arrays Value::Array(_) => json_array_to_pg_array(value), @@ -54,25 +63,29 @@ fn json_to_pg_text(json: Vec) -> Result, serde_json::Error> { // // Example of the same escaping in node-postgres: packages/pg/lib/utils.js // -fn json_array_to_pg_array(value: &Value) -> Result { +fn json_array_to_pg_array(value: &Value) -> Result, serde_json::Error> { match value { - // same - Value::Null => serde_json::to_string(value), - Value::Bool(_) => serde_json::to_string(value), - Value::Number(_) => serde_json::to_string(value), - Value::Object(_) => serde_json::to_string(value), + // special care for nulls + Value::Null => Ok(None), - // now needs to be escaped, as it is part of the array - Value::String(_) => serde_json::to_string(value), + // convert to text with escaping + Value::Bool(_) => serde_json::to_string(value).map(Some), + Value::Number(_) => serde_json::to_string(value).map(Some), + Value::Object(_) => serde_json::to_string(value).map(Some), + + // here string needs to be escaped, as it is part of the array + Value::String(_) => serde_json::to_string(value).map(Some), // recurse into array Value::Array(arr) => { let vals = arr .iter() .map(json_array_to_pg_array) + .map(|r| r.map(|v| v.unwrap_or_else(|| "NULL".to_string()))) .collect::, _>>()? .join(","); - Ok(format!("{{{}}}", vals)) + + Ok(Some(format!("{{{}}}", vals))) } } } @@ -158,6 +171,11 @@ pub async fn handle( ("application_name", APP_NAME), ]); + // Determine the output options. Default behaviour is 'false'. Anything that is not + // strictly 'true' assumed to be false. + let raw_output = headers.get(&RAW_TEXT_OUTPUT) == Some(&HEADER_VALUE_TRUE); + let array_mode = headers.get(&ARRAY_MODE) == Some(&HEADER_VALUE_TRUE); + // // Wake up the destination if needed. 
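For readers tracing these two flags through the handler, their effect on the response payload is easiest to see side by side. A small illustration using `serde_json` (response fields other than `rows`, `fields` and `rowAsArray` are elided; the column names and values are made up, only the shapes follow the hunks above):

```rust
use serde_json::json;

fn main() {
    // Default: values parsed by pg_text_to_json, each row an object keyed by column name.
    let default_mode = json!({ "rows": [{ "id": 1, "active": true }], "rowAsArray": false });

    // Neon-Raw-Text-Output: true — values are passed through as Postgres text, unparsed.
    let raw_text = json!({ "rows": [{ "id": "1", "active": "t" }], "rowAsArray": false });

    // Neon-Array-Mode: true — rows become positional arrays; column names live in `fields`.
    let array_mode = json!({ "rows": [[1, true]], "rowAsArray": true });

    for v in [default_mode, raw_text, array_mode] {
        println!("{v}");
    }
}
```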
Code here is a bit involved because // we reuse the code from the usual proxy and we need to prepare few structures @@ -272,7 +290,7 @@ pub async fn handle( // convert rows to JSON let rows = rows .iter() - .map(pg_text_row_to_json) + .map(|row| pg_text_row_to_json(row, raw_output, array_mode)) .collect::, _>>()?; // resulting JSON format is based on the format of node-postgres result @@ -281,26 +299,42 @@ pub async fn handle( "rowCount": command_tag_count, "rows": rows, "fields": fields, + "rowAsArray": array_mode, })) } // // Convert postgres row with text-encoded values to JSON object // -pub fn pg_text_row_to_json(row: &Row) -> Result { - let res = row - .columns() - .iter() - .enumerate() - .map(|(i, column)| { - let name = column.name(); - let pg_value = row.as_text(i)?; - let json_value = pg_text_to_json(pg_value, column.type_())?; - Ok((name.to_string(), json_value)) - }) - .collect::, anyhow::Error>>()?; +pub fn pg_text_row_to_json( + row: &Row, + raw_output: bool, + array_mode: bool, +) -> Result { + let iter = row.columns().iter().enumerate().map(|(i, column)| { + let name = column.name(); + let pg_value = row.as_text(i)?; + let json_value = if raw_output { + match pg_value { + Some(v) => Value::String(v.to_string()), + None => Value::Null, + } + } else { + pg_text_to_json(pg_value, column.type_())? + }; + Ok((name.to_string(), json_value)) + }); - Ok(Value::Object(res)) + if array_mode { + // drop keys and aggregate into array + let arr = iter + .map(|r| r.map(|(_key, val)| val)) + .collect::, anyhow::Error>>()?; + Ok(Value::Array(arr)) + } else { + let obj = iter.collect::, anyhow::Error>>()?; + Ok(Value::Object(obj)) + } } // @@ -308,10 +342,6 @@ pub fn pg_text_row_to_json(row: &Row) -> Result { // pub fn pg_text_to_json(pg_value: Option<&str>, pg_type: &Type) -> Result { if let Some(val) = pg_value { - if val == "NULL" { - return Ok(Value::Null); - } - if let Kind::Array(elem_type) = pg_type.kind() { return pg_array_parse(val, elem_type); } @@ -373,6 +403,27 @@ fn _pg_array_parse( } } + fn push_checked( + entry: &mut String, + entries: &mut Vec, + elem_type: &Type, + ) -> Result<(), anyhow::Error> { + if !entry.is_empty() { + // While in usual postgres response we get nulls as None and everything else + // as Some(&str), in arrays we get NULL as unquoted 'NULL' string (while + // string with value 'NULL' will be represented by '"NULL"'). So catch NULLs + // here while we have quotation info and convert them to None. 
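Concretely, the rule described above means a bare NULL element and a quoted "NULL" string in a Postgres array must come out differently; a hypothetical check for a text[] value (pg_array_parse and Type are the names already used in this file):

    // {NULL,"NULL",foo} -> [null, "NULL", "foo"]: only the unquoted entry becomes
    // JSON null; the quoted one is pushed on the closing '"' and never reaches the check.
    let parsed = pg_array_parse(r#"{NULL,"NULL",foo}"#, &Type::TEXT).unwrap();
    assert_eq!(parsed, serde_json::json!([null, "NULL", "foo"]));
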
+ if entry == "NULL" { + entries.push(pg_text_to_json(None, elem_type)?); + } else { + entries.push(pg_text_to_json(Some(entry), elem_type)?); + } + entry.clear(); + } + + Ok(()) + } + while let Some((mut i, mut c)) = pg_array_chr.next() { let mut escaped = false; @@ -395,9 +446,7 @@ fn _pg_array_parse( '}' => { level -= 1; if level == 0 { - if !entry.is_empty() { - entries.push(pg_text_to_json(Some(&entry), elem_type)?); - } + push_checked(&mut entry, &mut entries, elem_type)?; if nested { return Ok((Value::Array(entries), i)); } @@ -405,17 +454,15 @@ fn _pg_array_parse( } '"' if !escaped => { if quote { - // push even if empty + // end of quoted string, so push it manually without any checks + // for emptiness or nulls entries.push(pg_text_to_json(Some(&entry), elem_type)?); - entry = String::new(); + entry.clear(); } quote = !quote; } ',' if !quote => { - if !entry.is_empty() { - entries.push(pg_text_to_json(Some(&entry), elem_type)?); - entry = String::new(); - } + push_checked(&mut entry, &mut entries, elem_type)?; } _ => { entry.push(c); @@ -439,30 +486,35 @@ mod tests { fn test_atomic_types_to_pg_params() { let json = vec![Value::Bool(true), Value::Bool(false)]; let pg_params = json_to_pg_text(json).unwrap(); - assert_eq!(pg_params, vec!["true", "false"]); + assert_eq!( + pg_params, + vec![Some("true".to_owned()), Some("false".to_owned())] + ); let json = vec![Value::Number(serde_json::Number::from(42))]; let pg_params = json_to_pg_text(json).unwrap(); - assert_eq!(pg_params, vec!["42"]); + assert_eq!(pg_params, vec![Some("42".to_owned())]); let json = vec![Value::String("foo\"".to_string())]; let pg_params = json_to_pg_text(json).unwrap(); - assert_eq!(pg_params, vec!["foo\""]); + assert_eq!(pg_params, vec![Some("foo\"".to_owned())]); let json = vec![Value::Null]; let pg_params = json_to_pg_text(json).unwrap(); - assert_eq!(pg_params, vec!["null"]); + assert_eq!(pg_params, vec![None]); } #[test] fn test_json_array_to_pg_array() { // atoms and escaping - let json = "[true, false, null, 42, \"foo\", \"bar\\\"-\\\\\"]"; + let json = "[true, false, null, \"NULL\", 42, \"foo\", \"bar\\\"-\\\\\"]"; let json: Value = serde_json::from_str(json).unwrap(); let pg_params = json_to_pg_text(vec![json]).unwrap(); assert_eq!( pg_params, - vec!["{true,false,null,42,\"foo\",\"bar\\\"-\\\\\"}"] + vec![Some( + "{true,false,NULL,\"NULL\",42,\"foo\",\"bar\\\"-\\\\\"}".to_owned() + )] ); // nested arrays @@ -471,7 +523,9 @@ mod tests { let pg_params = json_to_pg_text(vec![json]).unwrap(); assert_eq!( pg_params, - vec!["{{true,false},{null,42},{\"foo\",\"bar\\\"-\\\\\"}}"] + vec![Some( + "{{true,false},{NULL,42},{\"foo\",\"bar\\\"-\\\\\"}}".to_owned() + )] ); } diff --git a/proxy/src/proxy.rs b/proxy/src/proxy.rs index f3d3524d30..cf2dd000db 100644 --- a/proxy/src/proxy.rs +++ b/proxy/src/proxy.rs @@ -455,6 +455,9 @@ impl<'a, S> Client<'a, S> { impl Client<'_, S> { /// Let the client authenticate and connect to the designated compute node. + // Instrumentation logs endpoint name everywhere. Doesn't work for link + // auth; strictly speaking we don't know endpoint name in its case. + #[tracing::instrument(name = "", fields(ep = self.creds.get_endpoint().unwrap_or("".to_owned())), skip_all)] async fn connect_to_db( self, session: cancellation::Session<'_>, diff --git a/proxy/src/proxy/tests.rs b/proxy/src/proxy/tests.rs index 60acb588dc..3373c49676 100644 --- a/proxy/src/proxy/tests.rs +++ b/proxy/src/proxy/tests.rs @@ -1,4 +1,4 @@ -///! 
A group of high-level tests for connection establishing logic and auth. +//! A group of high-level tests for connection establishing logic and auth. use super::*; use crate::{auth, sasl, scram}; use async_trait::async_trait; diff --git a/pyproject.toml b/pyproject.toml index 574d247bf0..2c21af6982 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -6,40 +6,41 @@ authors = [] [tool.poetry.dependencies] python = "^3.9" -pytest = "^6.2.5" +pytest = "^7.3.1" psycopg2-binary = "^2.9.1" -typing-extensions = "^4.1.0" +typing-extensions = "^4.6.1" PyJWT = {version = "^2.1.0", extras = ["crypto"]} requests = "^2.31.0" -pytest-xdist = "^3.0.2" +pytest-xdist = "^3.3.1" asyncpg = "^0.27.0" aiopg = "^1.3.1" Jinja2 = "^3.0.2" -types-requests = "^2.28.5" -types-psycopg2 = "^2.9.18" +types-requests = "^2.31.0.0" +types-psycopg2 = "^2.9.21.10" boto3 = "^1.26.16" boto3-stubs = {extras = ["s3"], version = "^1.26.16"} moto = {extras = ["server"], version = "^4.1.2"} -backoff = "^1.11.1" +backoff = "^2.2.1" pytest-lazy-fixture = "^0.6.3" prometheus-client = "^0.14.1" pytest-timeout = "^2.1.0" Werkzeug = "^2.2.3" -pytest-order = "^1.0.1" -allure-pytest = "^2.13.1" -pytest-asyncio = "^0.19.0" +pytest-order = "^1.1.0" +allure-pytest = "^2.13.2" +pytest-asyncio = "^0.21.0" toml = "^0.10.2" psutil = "^5.9.4" -types-psutil = "^5.9.5.4" -types-toml = "^0.10.8" -pytest-httpserver = "^1.0.6" +types-psutil = "^5.9.5.12" +types-toml = "^0.10.8.6" +pytest-httpserver = "^1.0.8" aiohttp = "3.7.4" pytest-rerunfailures = "^11.1.2" +types-pytest-lazy-fixture = "^0.6.3.3" [tool.poetry.group.dev.dependencies] -black = "^23.1.0" -mypy = "==1.1.1" -ruff = "^0.0.255" +black = "^23.3.0" +mypy = "==1.3.0" +ruff = "^0.0.269" [build-system] requires = ["poetry-core>=1.0.0"] diff --git a/safekeeper/src/bin/safekeeper.rs b/safekeeper/src/bin/safekeeper.rs index fecbb8bd41..0625538bf3 100644 --- a/safekeeper/src/bin/safekeeper.rs +++ b/safekeeper/src/bin/safekeeper.rs @@ -3,15 +3,19 @@ // use anyhow::{bail, Context, Result}; use clap::Parser; +use futures::future::BoxFuture; +use futures::stream::FuturesUnordered; +use futures::{FutureExt, StreamExt}; use remote_storage::RemoteStorageConfig; +use tokio::runtime::Handle; +use tokio::signal::unix::{signal, SignalKind}; +use tokio::task::JoinError; use toml_edit::Document; -use utils::signals::ShutdownSignals; use std::fs::{self, File}; use std::io::{ErrorKind, Write}; use std::path::{Path, PathBuf}; use std::sync::Arc; -use std::thread; use std::time::Duration; use storage_broker::Uri; use tokio::sync::mpsc; @@ -20,22 +24,21 @@ use tracing::*; use utils::pid_file; use metrics::set_build_info_metric; -use safekeeper::broker; -use safekeeper::control_file; use safekeeper::defaults::{ DEFAULT_HEARTBEAT_TIMEOUT, DEFAULT_HTTP_LISTEN_ADDR, DEFAULT_MAX_OFFLOADER_LAG_BYTES, DEFAULT_PG_LISTEN_ADDR, }; -use safekeeper::http; -use safekeeper::remove_wal; -use safekeeper::wal_backup; use safekeeper::wal_service; use safekeeper::GlobalTimelines; use safekeeper::SafeKeeperConf; +use safekeeper::{broker, WAL_SERVICE_RUNTIME}; +use safekeeper::{control_file, BROKER_RUNTIME}; +use safekeeper::{http, WAL_REMOVER_RUNTIME}; +use safekeeper::{remove_wal, WAL_BACKUP_RUNTIME}; +use safekeeper::{wal_backup, HTTP_RUNTIME}; use storage_broker::DEFAULT_ENDPOINT; use utils::auth::JwtAuth; use utils::{ - http::endpoint, id::NodeId, logging::{self, LogFormat}, project_git_version, @@ -104,10 +107,6 @@ struct Args { /// Safekeeper won't be elected for WAL offloading if it is lagging for more than this value in bytes 
#[arg(long, default_value_t = DEFAULT_MAX_OFFLOADER_LAG_BYTES)] max_offloader_lag: u64, - /// Number of threads for wal backup runtime, by default number of cores - /// available to the system. - #[arg(long)] - wal_backup_threads: Option, /// Number of max parallel WAL segments to be offloaded to remote storage. #[arg(long, default_value = "5")] wal_backup_parallel_jobs: usize, @@ -121,9 +120,14 @@ struct Args { /// Format for logging, either 'plain' or 'json'. #[arg(long, default_value = "plain")] log_format: String, + /// Run everything in single threaded current thread runtime, might be + /// useful for debugging. + #[arg(long)] + current_thread_runtime: bool, } -fn main() -> anyhow::Result<()> { +#[tokio::main(flavor = "current_thread")] +async fn main() -> anyhow::Result<()> { let args = Args::parse(); if let Some(addr) = args.dump_control_file { @@ -183,10 +187,10 @@ fn main() -> anyhow::Result<()> { heartbeat_timeout: args.heartbeat_timeout, remote_storage: args.remote_storage, max_offloader_lag_bytes: args.max_offloader_lag, - backup_runtime_threads: args.wal_backup_threads, wal_backup_enabled: !args.disable_wal_backup, backup_parallel_jobs: args.wal_backup_parallel_jobs, auth, + current_thread_runtime: args.current_thread_runtime, }; // initialize sentry if SENTRY_DSN is provided @@ -194,10 +198,14 @@ fn main() -> anyhow::Result<()> { Some(GIT_VERSION.into()), &[("node_id", &conf.my_id.to_string())], ); - start_safekeeper(conf) + start_safekeeper(conf).await } -fn start_safekeeper(conf: SafeKeeperConf) -> Result<()> { +/// Result of joining any of main tasks: upper error means task failed to +/// complete, e.g. panicked, inner is error produced by task itself. +type JoinTaskRes = Result, JoinError>; + +async fn start_safekeeper(conf: SafeKeeperConf) -> Result<()> { // Prevent running multiple safekeepers on the same directory let lock_file_path = conf.workdir.join(PID_FILE_NAME); let lock_file = @@ -208,14 +216,18 @@ fn start_safekeeper(conf: SafeKeeperConf) -> Result<()> { // we need to release the lock file only when the current process is gone std::mem::forget(lock_file); - let http_listener = tcp_listener::bind(conf.listen_http_addr.clone()).map_err(|e| { - error!("failed to bind to address {}: {}", conf.listen_http_addr, e); + info!("starting safekeeper WAL service on {}", conf.listen_pg_addr); + let pg_listener = tcp_listener::bind(conf.listen_pg_addr.clone()).map_err(|e| { + error!("failed to bind to address {}: {}", conf.listen_pg_addr, e); e })?; - info!("starting safekeeper on {}", conf.listen_pg_addr); - let pg_listener = tcp_listener::bind(conf.listen_pg_addr.clone()).map_err(|e| { - error!("failed to bind to address {}: {}", conf.listen_pg_addr, e); + info!( + "starting safekeeper HTTP service on {}", + conf.listen_http_addr + ); + let http_listener = tcp_listener::bind(conf.listen_http_addr.clone()).map_err(|e| { + error!("failed to bind to address {}: {}", conf.listen_http_addr, e); e })?; @@ -224,71 +236,88 @@ fn start_safekeeper(conf: SafeKeeperConf) -> Result<()> { let timeline_collector = safekeeper::metrics::TimelineCollector::new(); metrics::register_internal(Box::new(timeline_collector))?; - let mut threads = vec![]; let (wal_backup_launcher_tx, wal_backup_launcher_rx) = mpsc::channel(100); // Load all timelines from disk to memory. 
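The JoinTaskRes alias above carries two error layers; a sketch of how a caller could tell them apart (res here stands for a JoinTaskRes; the actual code below just logs the whole result and exits):

    match res {
        // Outer layer: the task never produced a value, i.e. it panicked or was aborted.
        Err(join_err) => error!("task panicked or was aborted: {join_err}"),
        // Inner layer: the task ran to completion but returned its own error.
        Ok(Err(task_err)) => error!("task failed: {task_err:#}"),
        // Clean exit; not expected for long-running service tasks.
        Ok(Ok(())) => info!("task finished"),
    }
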
GlobalTimelines::init(conf.clone(), wal_backup_launcher_tx)?; - let conf_ = conf.clone(); - threads.push( - thread::Builder::new() - .name("http_endpoint_thread".into()) - .spawn(|| { - let router = http::make_router(conf_); - endpoint::serve_thread_main( - router, - http_listener, - std::future::pending(), // never shut down - ) - .unwrap(); - })?, - ); - - let conf_cloned = conf.clone(); - let safekeeper_thread = thread::Builder::new() - .name("WAL service thread".into()) - .spawn(|| wal_service::thread_main(conf_cloned, pg_listener)) - .unwrap(); - - threads.push(safekeeper_thread); + // Keep handles to main tasks to die if any of them disappears. + let mut tasks_handles: FuturesUnordered> = + FuturesUnordered::new(); let conf_ = conf.clone(); - threads.push( - thread::Builder::new() - .name("broker thread".into()) - .spawn(|| { - broker::thread_main(conf_); - })?, - ); + // Run everything in current thread rt, if asked. + if conf.current_thread_runtime { + info!("running in current thread runtime"); + } + let current_thread_rt = conf + .current_thread_runtime + .then(|| Handle::try_current().expect("no runtime in main")); + let wal_service_handle = current_thread_rt + .as_ref() + .unwrap_or_else(|| WAL_SERVICE_RUNTIME.handle()) + .spawn(wal_service::task_main(conf_, pg_listener)) + // wrap with task name for error reporting + .map(|res| ("WAL service main".to_owned(), res)); + tasks_handles.push(Box::pin(wal_service_handle)); let conf_ = conf.clone(); - threads.push( - thread::Builder::new() - .name("WAL removal thread".into()) - .spawn(|| { - remove_wal::thread_main(conf_); - })?, - ); + let http_handle = current_thread_rt + .as_ref() + .unwrap_or_else(|| HTTP_RUNTIME.handle()) + .spawn(http::task_main(conf_, http_listener)) + .map(|res| ("HTTP service main".to_owned(), res)); + tasks_handles.push(Box::pin(http_handle)); - threads.push( - thread::Builder::new() - .name("WAL backup launcher thread".into()) - .spawn(move || { - wal_backup::wal_backup_launcher_thread_main(conf, wal_backup_launcher_rx); - })?, - ); + let conf_ = conf.clone(); + let broker_task_handle = current_thread_rt + .as_ref() + .unwrap_or_else(|| BROKER_RUNTIME.handle()) + .spawn(broker::task_main(conf_).instrument(info_span!("broker"))) + .map(|res| ("broker main".to_owned(), res)); + tasks_handles.push(Box::pin(broker_task_handle)); + + let conf_ = conf.clone(); + let wal_remover_handle = current_thread_rt + .as_ref() + .unwrap_or_else(|| WAL_REMOVER_RUNTIME.handle()) + .spawn(remove_wal::task_main(conf_)) + .map(|res| ("WAL remover".to_owned(), res)); + tasks_handles.push(Box::pin(wal_remover_handle)); + + let conf_ = conf.clone(); + let wal_backup_handle = current_thread_rt + .as_ref() + .unwrap_or_else(|| WAL_BACKUP_RUNTIME.handle()) + .spawn(wal_backup::wal_backup_launcher_task_main( + conf_, + wal_backup_launcher_rx, + )) + .map(|res| ("WAL backup launcher".to_owned(), res)); + tasks_handles.push(Box::pin(wal_backup_handle)); set_build_info_metric(GIT_VERSION); - // TODO: put more thoughts into handling of failed threads - // We should catch & die if they are in trouble. - // On any shutdown signal, log receival and exit. Additionally, handling - // SIGQUIT prevents coredump. - ShutdownSignals::handle(|signal| { - info!("received {}, terminating", signal.name()); - std::process::exit(0); - }) + // TODO: update tokio-stream, convert to real async Stream with + // SignalStream, map it to obtain missing signal name, combine streams into + // single stream we can easily sit on. 
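Each service above is spawned with the same pattern; a condensed sketch using names from this diff (FutureExt::map tags the JoinHandle with a human-readable task name so the failure report can say which task died):

    let handle = current_thread_rt
        .as_ref()
        // --current-thread-runtime keeps everything on main's runtime (debugging);
        // otherwise the task goes to its dedicated multi-threaded runtime.
        .unwrap_or_else(|| WAL_REMOVER_RUNTIME.handle())
        .spawn(remove_wal::task_main(conf.clone()))
        .map(|res| ("WAL remover".to_owned(), res));
    tasks_handles.push(Box::pin(handle));
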
+ let mut sigquit_stream = signal(SignalKind::quit())?; + let mut sigint_stream = signal(SignalKind::interrupt())?; + let mut sigterm_stream = signal(SignalKind::terminate())?; + + tokio::select! { + Some((task_name, res)) = tasks_handles.next()=> { + error!("{} task failed: {:?}, exiting", task_name, res); + std::process::exit(1); + } + // On any shutdown signal, log receival and exit. Additionally, handling + // SIGQUIT prevents coredump. + _ = sigquit_stream.recv() => info!("received SIGQUIT, terminating"), + _ = sigint_stream.recv() => info!("received SIGINT, terminating"), + _ = sigterm_stream.recv() => info!("received SIGTERM, terminating") + + }; + std::process::exit(0); } /// Determine safekeeper id. diff --git a/safekeeper/src/broker.rs b/safekeeper/src/broker.rs index 5e25d22ec1..2b1db2714b 100644 --- a/safekeeper/src/broker.rs +++ b/safekeeper/src/broker.rs @@ -8,7 +8,7 @@ use anyhow::Error; use anyhow::Result; use storage_broker::parse_proto_ttid; -use storage_broker::proto::broker_service_client::BrokerServiceClient; + use storage_broker::proto::subscribe_safekeeper_info_request::SubscriptionKey as ProtoSubscriptionKey; use storage_broker::proto::SubscribeSafekeeperInfoRequest; use storage_broker::Request; @@ -16,34 +16,23 @@ use storage_broker::Request; use std::time::Duration; use std::time::Instant; use tokio::task::JoinHandle; -use tokio::{runtime, time::sleep}; +use tokio::time::sleep; use tracing::*; +use crate::metrics::BROKER_ITERATION_TIMELINES; use crate::metrics::BROKER_PULLED_UPDATES; use crate::metrics::BROKER_PUSHED_UPDATES; +use crate::metrics::BROKER_PUSH_ALL_UPDATES_SECONDS; use crate::GlobalTimelines; use crate::SafeKeeperConf; const RETRY_INTERVAL_MSEC: u64 = 1000; const PUSH_INTERVAL_MSEC: u64 = 1000; -pub fn thread_main(conf: SafeKeeperConf) { - let runtime = runtime::Builder::new_current_thread() - .enable_all() - .build() - .unwrap(); - - let _enter = info_span!("broker").entered(); - info!("started, broker endpoint {:?}", conf.broker_endpoint); - - runtime.block_on(async { - main_loop(conf).await; - }); -} - /// Push once in a while data about all active timelines to the broker. async fn push_loop(conf: SafeKeeperConf) -> anyhow::Result<()> { - let mut client = BrokerServiceClient::connect(conf.broker_endpoint.clone()).await?; + let mut client = + storage_broker::connect(conf.broker_endpoint.clone(), conf.broker_keepalive_interval)?; let push_interval = Duration::from_millis(PUSH_INTERVAL_MSEC); let outbound = async_stream::stream! { @@ -53,16 +42,29 @@ async fn push_loop(conf: SafeKeeperConf) -> anyhow::Result<()> { // sensitive and there is no risk of deadlock as we don't await while // lock is held. let now = Instant::now(); - let mut active_tlis = GlobalTimelines::get_all(); - active_tlis.retain(|tli| tli.is_active()); - for tli in &active_tlis { - let sk_info = tli.get_safekeeper_info(&conf); + let all_tlis = GlobalTimelines::get_all(); + let mut n_pushed_tlis = 0; + for tli in &all_tlis { + // filtering alternative futures::stream::iter(all_tlis) + // .filter(|tli| {let tli = tli.clone(); async move { tli.is_active().await}}).collect::>().await; + // doesn't look better, and I'm not sure how to do that without collect. + if !tli.is_active().await { + continue; + } + let sk_info = tli.get_safekeeper_info(&conf).await; yield sk_info; BROKER_PUSHED_UPDATES.inc(); + n_pushed_tlis += 1; } let elapsed = now.elapsed(); - // Log duration every second. Should be about 10MB of logs per day. 
- info!("pushed {} timeline updates to broker in {:?}", active_tlis.len(), elapsed); + + BROKER_PUSH_ALL_UPDATES_SECONDS.observe(elapsed.as_secs_f64()); + BROKER_ITERATION_TIMELINES.observe(n_pushed_tlis as f64); + + if elapsed > push_interval / 2 { + info!("broker push is too long, pushed {} timeline updates to broker in {:?}", n_pushed_tlis, elapsed); + } + sleep(push_interval).await; } }; @@ -117,10 +119,13 @@ async fn pull_loop(conf: SafeKeeperConf) -> Result<()> { bail!("end of stream"); } -async fn main_loop(conf: SafeKeeperConf) { +pub async fn task_main(conf: SafeKeeperConf) -> anyhow::Result<()> { + info!("started, broker endpoint {:?}", conf.broker_endpoint); + let mut ticker = tokio::time::interval(Duration::from_millis(RETRY_INTERVAL_MSEC)); let mut push_handle: Option>> = None; let mut pull_handle: Option>> = None; + // Selecting on JoinHandles requires some squats; is there a better way to // reap tasks individually? diff --git a/safekeeper/src/control_file.rs b/safekeeper/src/control_file.rs index ba5e453e41..6c4ad24323 100644 --- a/safekeeper/src/control_file.rs +++ b/safekeeper/src/control_file.rs @@ -2,11 +2,13 @@ use anyhow::{bail, ensure, Context, Result}; use byteorder::{LittleEndian, ReadBytesExt, WriteBytesExt}; +use tokio::fs::{self, File}; +use tokio::io::AsyncWriteExt; -use std::fs::{self, File, OpenOptions}; -use std::io::{Read, Write}; +use std::io::Read; use std::ops::Deref; use std::path::{Path, PathBuf}; +use std::time::Instant; use crate::control_file_upgrade::upgrade_control_file; use crate::metrics::PERSIST_CONTROL_FILE_SECONDS; @@ -25,9 +27,13 @@ pub const CHECKSUM_SIZE: usize = std::mem::size_of::(); /// Storage should keep actual state inside of it. It should implement Deref /// trait to access state fields and have persist method for updating that state. +#[async_trait::async_trait] pub trait Storage: Deref { /// Persist safekeeper state on disk and update internal state. - fn persist(&mut self, s: &SafeKeeperState) -> Result<()>; + async fn persist(&mut self, s: &SafeKeeperState) -> Result<()>; + + /// Timestamp of last persist. + fn last_persist_at(&self) -> Instant; } #[derive(Debug)] @@ -38,6 +44,8 @@ pub struct FileStorage { /// Last state persisted to disk. state: SafeKeeperState, + /// Not preserved across restarts. + last_persist_at: Instant, } impl FileStorage { @@ -51,6 +59,7 @@ impl FileStorage { timeline_dir, conf: conf.clone(), state, + last_persist_at: Instant::now(), }) } @@ -66,6 +75,7 @@ impl FileStorage { timeline_dir, conf: conf.clone(), state, + last_persist_at: Instant::now(), }; Ok(store) @@ -74,7 +84,7 @@ impl FileStorage { /// Check the magic/version in the on-disk data and deserialize it, if possible. fn deser_sk_state(buf: &mut &[u8]) -> Result { // Read the version independent part - let magic = buf.read_u32::()?; + let magic = ReadBytesExt::read_u32::(buf)?; if magic != SK_MAGIC { bail!( "bad control file magic: {:X}, expected {:X}", @@ -82,7 +92,7 @@ impl FileStorage { SK_MAGIC ); } - let version = buf.read_u32::()?; + let version = ReadBytesExt::read_u32::(buf)?; if version == SK_FORMAT_VERSION { let res = SafeKeeperState::des(buf)?; return Ok(res); @@ -102,7 +112,7 @@ impl FileStorage { /// Read in the control file. 
pub fn load_control_file>(control_file_path: P) -> Result { - let mut control_file = OpenOptions::new() + let mut control_file = std::fs::OpenOptions::new() .read(true) .write(true) .open(&control_file_path) @@ -151,30 +161,31 @@ impl Deref for FileStorage { } } +#[async_trait::async_trait] impl Storage for FileStorage { /// persists state durably to underlying storage /// for description see https://lwn.net/Articles/457667/ - fn persist(&mut self, s: &SafeKeeperState) -> Result<()> { + async fn persist(&mut self, s: &SafeKeeperState) -> Result<()> { let _timer = PERSIST_CONTROL_FILE_SECONDS.start_timer(); // write data to safekeeper.control.partial let control_partial_path = self.timeline_dir.join(CONTROL_FILE_NAME_PARTIAL); - let mut control_partial = File::create(&control_partial_path).with_context(|| { + let mut control_partial = File::create(&control_partial_path).await.with_context(|| { format!( "failed to create partial control file at: {}", &control_partial_path.display() ) })?; let mut buf: Vec = Vec::new(); - buf.write_u32::(SK_MAGIC)?; - buf.write_u32::(SK_FORMAT_VERSION)?; + WriteBytesExt::write_u32::(&mut buf, SK_MAGIC)?; + WriteBytesExt::write_u32::(&mut buf, SK_FORMAT_VERSION)?; s.ser_into(&mut buf)?; // calculate checksum before resize let checksum = crc32c::crc32c(&buf); buf.extend_from_slice(&checksum.to_le_bytes()); - control_partial.write_all(&buf).with_context(|| { + control_partial.write_all(&buf).await.with_context(|| { format!( "failed to write safekeeper state into control file at: {}", control_partial_path.display() @@ -183,7 +194,7 @@ impl Storage for FileStorage { // fsync the file if !self.conf.no_sync { - control_partial.sync_all().with_context(|| { + control_partial.sync_all().await.with_context(|| { format!( "failed to sync partial control file at {}", control_partial_path.display() @@ -194,21 +205,22 @@ impl Storage for FileStorage { let control_path = self.timeline_dir.join(CONTROL_FILE_NAME); // rename should be atomic - fs::rename(&control_partial_path, &control_path)?; + fs::rename(&control_partial_path, &control_path).await?; // this sync is not required by any standard but postgres does this (see durable_rename) if !self.conf.no_sync { - File::open(&control_path) - .and_then(|f| f.sync_all()) - .with_context(|| { - format!( - "failed to sync control file at: {}", - &control_path.display() - ) - })?; + let new_f = File::open(&control_path).await?; + new_f.sync_all().await.with_context(|| { + format!( + "failed to sync control file at: {}", + &control_path.display() + ) + })?; // fsync the directory (linux specific) - File::open(&self.timeline_dir) - .and_then(|f| f.sync_all()) + let tli_dir = File::open(&self.timeline_dir).await?; + tli_dir + .sync_all() + .await .context("failed to sync control file directory")?; } @@ -216,6 +228,10 @@ impl Storage for FileStorage { self.state = s.clone(); Ok(()) } + + fn last_persist_at(&self) -> Instant { + self.last_persist_at + } } #[cfg(test)] @@ -224,7 +240,6 @@ mod test { use super::*; use crate::{safekeeper::SafeKeeperState, SafeKeeperConf}; use anyhow::Result; - use std::fs; use utils::{id::TenantTimelineId, lsn::Lsn}; fn stub_conf() -> SafeKeeperConf { @@ -235,59 +250,75 @@ mod test { } } - fn load_from_control_file( + async fn load_from_control_file( conf: &SafeKeeperConf, ttid: &TenantTimelineId, ) -> Result<(FileStorage, SafeKeeperState)> { - fs::create_dir_all(conf.timeline_dir(ttid)).expect("failed to create timeline dir"); + fs::create_dir_all(conf.timeline_dir(ttid)) + .await + .expect("failed to 
create timeline dir"); Ok(( FileStorage::restore_new(ttid, conf)?, FileStorage::load_control_file_conf(conf, ttid)?, )) } - fn create( + async fn create( conf: &SafeKeeperConf, ttid: &TenantTimelineId, ) -> Result<(FileStorage, SafeKeeperState)> { - fs::create_dir_all(conf.timeline_dir(ttid)).expect("failed to create timeline dir"); + fs::create_dir_all(conf.timeline_dir(ttid)) + .await + .expect("failed to create timeline dir"); let state = SafeKeeperState::empty(); let storage = FileStorage::create_new(ttid, conf, state.clone())?; Ok((storage, state)) } - #[test] - fn test_read_write_safekeeper_state() { + #[tokio::test] + async fn test_read_write_safekeeper_state() { let conf = stub_conf(); let ttid = TenantTimelineId::generate(); { - let (mut storage, mut state) = create(&conf, &ttid).expect("failed to create state"); + let (mut storage, mut state) = + create(&conf, &ttid).await.expect("failed to create state"); // change something state.commit_lsn = Lsn(42); - storage.persist(&state).expect("failed to persist state"); + storage + .persist(&state) + .await + .expect("failed to persist state"); } - let (_, state) = load_from_control_file(&conf, &ttid).expect("failed to read state"); + let (_, state) = load_from_control_file(&conf, &ttid) + .await + .expect("failed to read state"); assert_eq!(state.commit_lsn, Lsn(42)); } - #[test] - fn test_safekeeper_state_checksum_mismatch() { + #[tokio::test] + async fn test_safekeeper_state_checksum_mismatch() { let conf = stub_conf(); let ttid = TenantTimelineId::generate(); { - let (mut storage, mut state) = create(&conf, &ttid).expect("failed to read state"); + let (mut storage, mut state) = + create(&conf, &ttid).await.expect("failed to read state"); // change something state.commit_lsn = Lsn(42); - storage.persist(&state).expect("failed to persist state"); + storage + .persist(&state) + .await + .expect("failed to persist state"); } let control_path = conf.timeline_dir(&ttid).join(CONTROL_FILE_NAME); - let mut data = fs::read(&control_path).unwrap(); + let mut data = fs::read(&control_path).await.unwrap(); data[0] += 1; // change the first byte of the file to fail checksum validation - fs::write(&control_path, &data).expect("failed to write control file"); + fs::write(&control_path, &data) + .await + .expect("failed to write control file"); - match load_from_control_file(&conf, &ttid) { + match load_from_control_file(&conf, &ttid).await { Err(err) => assert!(err .to_string() .contains("safekeeper control file checksum mismatch")), diff --git a/safekeeper/src/debug_dump.rs b/safekeeper/src/debug_dump.rs index f711c4429d..387b577a13 100644 --- a/safekeeper/src/debug_dump.rs +++ b/safekeeper/src/debug_dump.rs @@ -121,7 +121,7 @@ pub struct FileInfo { } /// Build debug dump response, using the provided [`Args`] filters. 
-pub fn build(args: Args) -> Result { +pub async fn build(args: Args) -> Result { let start_time = Utc::now(); let timelines_count = GlobalTimelines::timelines_count(); @@ -155,7 +155,7 @@ pub fn build(args: Args) -> Result { } let control_file = if args.dump_control_file { - let mut state = tli.get_state().1; + let mut state = tli.get_state().await.1; if !args.dump_term_history { state.acceptor_state.term_history = TermHistory(vec![]); } @@ -165,7 +165,7 @@ pub fn build(args: Args) -> Result { }; let memory = if args.dump_memory { - Some(tli.memory_dump()) + Some(tli.memory_dump().await) } else { None }; diff --git a/safekeeper/src/handler.rs b/safekeeper/src/handler.rs index 7d25ced449..1367d5eebb 100644 --- a/safekeeper/src/handler.rs +++ b/safekeeper/src/handler.rs @@ -256,14 +256,14 @@ impl SafekeeperPostgresHandler { let lsn = if self.is_walproposer_recovery() { // walproposer should get all local WAL until flush_lsn - tli.get_flush_lsn() + tli.get_flush_lsn().await } else { // other clients shouldn't get any uncommitted WAL - tli.get_state().0.commit_lsn + tli.get_state().await.0.commit_lsn } .to_string(); - let sysid = tli.get_state().1.server.system_id.to_string(); + let sysid = tli.get_state().await.1.server.system_id.to_string(); let lsn_bytes = lsn.as_bytes(); let tli = PG_TLI.to_string(); let tli_bytes = tli.as_bytes(); diff --git a/safekeeper/src/http/mod.rs b/safekeeper/src/http/mod.rs index 1831470007..2a9570595f 100644 --- a/safekeeper/src/http/mod.rs +++ b/safekeeper/src/http/mod.rs @@ -2,3 +2,18 @@ pub mod routes; pub use routes::make_router; pub use safekeeper_api::models; + +use crate::SafeKeeperConf; + +pub async fn task_main( + conf: SafeKeeperConf, + http_listener: std::net::TcpListener, +) -> anyhow::Result<()> { + let router = make_router(conf) + .build() + .map_err(|err| anyhow::anyhow!(err))?; + let service = utils::http::RouterService::new(router).unwrap(); + let server = hyper::Server::from_tcp(http_listener)?; + server.serve(service).await?; + Ok(()) // unreachable +} diff --git a/safekeeper/src/http/routes.rs b/safekeeper/src/http/routes.rs index a498d868af..5cd0973ad6 100644 --- a/safekeeper/src/http/routes.rs +++ b/safekeeper/src/http/routes.rs @@ -13,7 +13,7 @@ use storage_broker::proto::SafekeeperTimelineInfo; use storage_broker::proto::TenantTimelineId as ProtoTenantTimelineId; use tokio::fs::File; use tokio::io::AsyncReadExt; -use tokio::task::JoinError; +use utils::http::endpoint::request_span; use crate::safekeeper::ServerInfo; use crate::safekeeper::Term; @@ -116,8 +116,8 @@ async fn timeline_status_handler(request: Request) -> Result) -> Result timeline_id, }; - let resp = tokio::task::spawn_blocking(move || { - debug_dump::build(args).map_err(ApiError::InternalServerError) - }) - .await - .map_err(|e: JoinError| ApiError::InternalServerError(e.into()))??; + let resp = debug_dump::build(args) + .await + .map_err(ApiError::InternalServerError)?; // TODO: use streaming response json_response(StatusCode::OK, resp) @@ -386,29 +379,32 @@ pub fn make_router(conf: SafeKeeperConf) -> RouterBuilder router .data(Arc::new(conf)) .data(auth) - .get("/v1/status", status_handler) + .get("/v1/status", |r| request_span(r, status_handler)) // Will be used in the future instead of implicit timeline creation - .post("/v1/tenant/timeline", timeline_create_handler) - .get( - "/v1/tenant/:tenant_id/timeline/:timeline_id", - timeline_status_handler, - ) - .delete( - "/v1/tenant/:tenant_id/timeline/:timeline_id", - timeline_delete_force_handler, - ) - 
.delete("/v1/tenant/:tenant_id", tenant_delete_force_handler) - .post("/v1/pull_timeline", timeline_pull_handler) + .post("/v1/tenant/timeline", |r| { + request_span(r, timeline_create_handler) + }) + .get("/v1/tenant/:tenant_id/timeline/:timeline_id", |r| { + request_span(r, timeline_status_handler) + }) + .delete("/v1/tenant/:tenant_id/timeline/:timeline_id", |r| { + request_span(r, timeline_delete_force_handler) + }) + .delete("/v1/tenant/:tenant_id", |r| { + request_span(r, tenant_delete_force_handler) + }) + .post("/v1/pull_timeline", |r| { + request_span(r, timeline_pull_handler) + }) .get( "/v1/tenant/:tenant_id/timeline/:timeline_id/file/:filename", - timeline_files_handler, + |r| request_span(r, timeline_files_handler), ) // for tests - .post( - "/v1/record_safekeeper_info/:tenant_id/:timeline_id", - record_safekeeper_info, - ) - .get("/v1/debug_dump", dump_debug_handler) + .post("/v1/record_safekeeper_info/:tenant_id/:timeline_id", |r| { + request_span(r, record_safekeeper_info) + }) + .get("/v1/debug_dump", |r| request_span(r, dump_debug_handler)) } #[cfg(test)] diff --git a/safekeeper/src/json_ctrl.rs b/safekeeper/src/json_ctrl.rs index dc9188723e..14d0cc3653 100644 --- a/safekeeper/src/json_ctrl.rs +++ b/safekeeper/src/json_ctrl.rs @@ -73,12 +73,12 @@ pub async fn handle_json_ctrl( // if send_proposer_elected is true, we need to update local history if append_request.send_proposer_elected { - send_proposer_elected(&tli, append_request.term, append_request.epoch_start_lsn)?; + send_proposer_elected(&tli, append_request.term, append_request.epoch_start_lsn).await?; } - let inserted_wal = append_logical_message(&tli, append_request)?; + let inserted_wal = append_logical_message(&tli, append_request).await?; let response = AppendResult { - state: tli.get_state().1, + state: tli.get_state().await.1, inserted_wal, }; let response_data = serde_json::to_vec(&response) @@ -114,9 +114,9 @@ async fn prepare_safekeeper( .await } -fn send_proposer_elected(tli: &Arc, term: Term, lsn: Lsn) -> anyhow::Result<()> { +async fn send_proposer_elected(tli: &Arc, term: Term, lsn: Lsn) -> anyhow::Result<()> { // add new term to existing history - let history = tli.get_state().1.acceptor_state.term_history; + let history = tli.get_state().await.1.acceptor_state.term_history; let history = history.up_to(lsn.checked_sub(1u64).unwrap()); let mut history_entries = history.0; history_entries.push(TermSwitchEntry { term, lsn }); @@ -129,7 +129,7 @@ fn send_proposer_elected(tli: &Arc, term: Term, lsn: Lsn) -> anyhow::R timeline_start_lsn: lsn, }); - tli.process_msg(&proposer_elected_request)?; + tli.process_msg(&proposer_elected_request).await?; Ok(()) } @@ -142,12 +142,12 @@ pub struct InsertedWAL { /// Extend local WAL with new LogicalMessage record. To do that, /// create AppendRequest with new WAL and pass it to safekeeper. 
-pub fn append_logical_message( +pub async fn append_logical_message( tli: &Arc, msg: &AppendLogicalMessage, ) -> anyhow::Result { let wal_data = encode_logical_message(&msg.lm_prefix, &msg.lm_message); - let sk_state = tli.get_state().1; + let sk_state = tli.get_state().await.1; let begin_lsn = msg.begin_lsn; let end_lsn = begin_lsn + wal_data.len() as u64; @@ -171,7 +171,7 @@ pub fn append_logical_message( wal_data: Bytes::from(wal_data), }); - let response = tli.process_msg(&append_request)?; + let response = tli.process_msg(&append_request).await?; let append_response = match response { Some(AcceptorProposerMessage::AppendResponse(resp)) => resp, diff --git a/safekeeper/src/lib.rs b/safekeeper/src/lib.rs index 22d6d57e19..b8e1101369 100644 --- a/safekeeper/src/lib.rs +++ b/safekeeper/src/lib.rs @@ -1,4 +1,6 @@ +use once_cell::sync::Lazy; use remote_storage::RemoteStorageConfig; +use tokio::runtime::Runtime; use std::path::PathBuf; use std::time::Duration; @@ -36,7 +38,6 @@ pub mod defaults { DEFAULT_PG_LISTEN_PORT, }; - pub const DEFAULT_WAL_BACKUP_RUNTIME_THREADS: usize = 8; pub const DEFAULT_HEARTBEAT_TIMEOUT: &str = "5000ms"; pub const DEFAULT_MAX_OFFLOADER_LAG_BYTES: u64 = 128 * (1 << 20); } @@ -60,10 +61,10 @@ pub struct SafeKeeperConf { pub heartbeat_timeout: Duration, pub remote_storage: Option, pub max_offloader_lag_bytes: u64, - pub backup_runtime_threads: Option, pub backup_parallel_jobs: usize, pub wal_backup_enabled: bool, pub auth: Option>, + pub current_thread_runtime: bool, } impl SafeKeeperConf { @@ -92,12 +93,64 @@ impl SafeKeeperConf { .parse() .expect("failed to parse default broker endpoint"), broker_keepalive_interval: Duration::from_secs(5), - backup_runtime_threads: None, wal_backup_enabled: true, backup_parallel_jobs: 1, auth: None, heartbeat_timeout: Duration::new(5, 0), max_offloader_lag_bytes: defaults::DEFAULT_MAX_OFFLOADER_LAG_BYTES, + current_thread_runtime: false, } } } + +// Tokio runtimes. 
+pub static WAL_SERVICE_RUNTIME: Lazy = Lazy::new(|| { + tokio::runtime::Builder::new_multi_thread() + .thread_name("WAL service worker") + .enable_all() + .build() + .expect("Failed to create WAL service runtime") +}); + +pub static HTTP_RUNTIME: Lazy = Lazy::new(|| { + tokio::runtime::Builder::new_multi_thread() + .thread_name("HTTP worker") + .enable_all() + .build() + .expect("Failed to create WAL service runtime") +}); + +pub static BROKER_RUNTIME: Lazy = Lazy::new(|| { + tokio::runtime::Builder::new_multi_thread() + .thread_name("broker worker") + .worker_threads(2) // there are only 2 tasks, having more threads doesn't make sense + .enable_all() + .build() + .expect("Failed to create broker runtime") +}); + +pub static WAL_REMOVER_RUNTIME: Lazy = Lazy::new(|| { + tokio::runtime::Builder::new_multi_thread() + .thread_name("WAL remover") + .worker_threads(1) + .enable_all() + .build() + .expect("Failed to create broker runtime") +}); + +pub static WAL_BACKUP_RUNTIME: Lazy = Lazy::new(|| { + tokio::runtime::Builder::new_multi_thread() + .thread_name("WAL backup worker") + .enable_all() + .build() + .expect("Failed to create WAL backup runtime") +}); + +pub static METRICS_SHIFTER_RUNTIME: Lazy = Lazy::new(|| { + tokio::runtime::Builder::new_multi_thread() + .thread_name("metric shifter") + .worker_threads(1) + .enable_all() + .build() + .expect("Failed to create broker runtime") +}); diff --git a/safekeeper/src/metrics.rs b/safekeeper/src/metrics.rs index 189af2b044..0711beb290 100644 --- a/safekeeper/src/metrics.rs +++ b/safekeeper/src/metrics.rs @@ -7,6 +7,7 @@ use std::{ use ::metrics::{register_histogram, GaugeVec, Histogram, IntGauge, DISK_WRITE_SECONDS_BUCKETS}; use anyhow::Result; +use futures::Future; use metrics::{ core::{AtomicU64, Collector, Desc, GenericCounter, GenericGaugeVec, Opts}, proto::MetricFamily, @@ -125,6 +126,25 @@ pub static BACKUP_ERRORS: Lazy = Lazy::new(|| { ) .expect("Failed to register safekeeper_backup_errors_total counter") }); +pub static BROKER_PUSH_ALL_UPDATES_SECONDS: Lazy = Lazy::new(|| { + register_histogram!( + "safekeeper_broker_push_update_seconds", + "Seconds to push all timeline updates to the broker", + DISK_WRITE_SECONDS_BUCKETS.to_vec() + ) + .expect("Failed to register safekeeper_broker_push_update_seconds histogram vec") +}); +pub const TIMELINES_COUNT_BUCKETS: &[f64] = &[ + 1.0, 10.0, 50.0, 100.0, 200.0, 500.0, 1000.0, 2000.0, 5000.0, 10000.0, 20000.0, 50000.0, +]; +pub static BROKER_ITERATION_TIMELINES: Lazy = Lazy::new(|| { + register_histogram!( + "safekeeper_broker_iteration_timelines", + "Count of timelines pushed to the broker in a single iteration", + TIMELINES_COUNT_BUCKETS.to_vec() + ) + .expect("Failed to register safekeeper_broker_iteration_timelines histogram vec") +}); pub const LABEL_UNKNOWN: &str = "unknown"; @@ -273,14 +293,17 @@ impl WalStorageMetrics { } } -/// Accepts a closure that returns a result, and returns the duration of the closure. -pub fn time_io_closure(closure: impl FnOnce() -> Result<()>) -> Result { +/// Accepts async function that returns empty anyhow result, and returns the duration of its execution. +pub async fn time_io_closure>( + closure: impl Future>, +) -> Result { let start = std::time::Instant::now(); - closure()?; + closure.await.map_err(|e| e.into())?; Ok(start.elapsed().as_secs_f64()) } /// Metrics for a single timeline. 
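time_io_closure above now takes a future instead of a closure; a hypothetical usage (the path is made up, and the returned seconds would be fed into one of the histograms defined in this file):

    let seconds = time_io_closure(async {
        // Any async I/O whose duration we want to record.
        let f = tokio::fs::File::open("path/to/wal/segment").await?;
        f.sync_all().await?;
        Ok::<(), std::io::Error>(())
    })
    .await?;
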
+#[derive(Clone)] pub struct FullTimelineInfo { pub ttid: TenantTimelineId, pub ps_feedback: PageserverFeedback, @@ -556,13 +579,19 @@ impl Collector for TimelineCollector { let timelines = GlobalTimelines::get_all(); let timelines_count = timelines.len(); - for arc_tli in timelines { - let tli = arc_tli.info_for_metrics(); - if tli.is_none() { - continue; - } - let tli = tli.unwrap(); + // Prometheus Collector is sync, and data is stored under async lock. To + // bridge the gap with a crutch, collect data in spawned thread with + // local tokio runtime. + let infos = std::thread::spawn(|| { + let rt = tokio::runtime::Builder::new_current_thread() + .build() + .expect("failed to create rt"); + rt.block_on(collect_timeline_metrics()) + }) + .join() + .expect("collect_timeline_metrics thread panicked"); + for tli in &infos { let tenant_id = tli.ttid.tenant_id.to_string(); let timeline_id = tli.ttid.timeline_id.to_string(); let labels = &[tenant_id.as_str(), timeline_id.as_str()]; @@ -663,3 +692,15 @@ impl Collector for TimelineCollector { mfs } } + +async fn collect_timeline_metrics() -> Vec { + let mut res = vec![]; + let timelines = GlobalTimelines::get_all(); + + for tli in timelines { + if let Some(info) = tli.info_for_metrics().await { + res.push(info); + } + } + res +} diff --git a/safekeeper/src/pull_timeline.rs b/safekeeper/src/pull_timeline.rs index 344b760fd3..61ba37efaa 100644 --- a/safekeeper/src/pull_timeline.rs +++ b/safekeeper/src/pull_timeline.rs @@ -231,7 +231,7 @@ async fn pull_timeline(status: TimelineStatus, host: String) -> Result info!( "Loaded timeline {}, flush_lsn={}", ttid, - tli.get_flush_lsn() + tli.get_flush_lsn().await ); Ok(Response { diff --git a/safekeeper/src/receive_wal.rs b/safekeeper/src/receive_wal.rs index 195470e3ca..a5e99c5f0a 100644 --- a/safekeeper/src/receive_wal.rs +++ b/safekeeper/src/receive_wal.rs @@ -18,15 +18,14 @@ use postgres_backend::QueryError; use pq_proto::BeMessage; use std::net::SocketAddr; use std::sync::Arc; -use std::thread; -use std::thread::JoinHandle; use tokio::io::AsyncRead; use tokio::io::AsyncWrite; use tokio::sync::mpsc::channel; use tokio::sync::mpsc::error::TryRecvError; use tokio::sync::mpsc::Receiver; use tokio::sync::mpsc::Sender; -use tokio::task::spawn_blocking; +use tokio::task; +use tokio::task::JoinHandle; use tokio::time::Duration; use tokio::time::Instant; use tracing::*; @@ -97,7 +96,7 @@ impl SafekeeperPostgresHandler { Err(res.expect_err("no error with WalAcceptor not spawn")) } Some(handle) => { - let wal_acceptor_res = handle.join(); + let wal_acceptor_res = handle.await; // If there was any network error, return it. 
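On the metrics collector earlier in this hunk: Collector::collect is synchronous, while timeline state now sits behind async locks, and calling block_on from inside a tokio worker thread would panic. The bridge is a throwaway thread with its own tiny runtime; condensed:

    let infos = std::thread::spawn(|| {
        // A fresh current-thread runtime, safely outside any tokio worker thread.
        let rt = tokio::runtime::Builder::new_current_thread()
            .build()
            .expect("failed to create rt");
        rt.block_on(collect_timeline_metrics())
    })
    .join()
    .expect("collect_timeline_metrics thread panicked");
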
res?; @@ -107,7 +106,7 @@ impl SafekeeperPostgresHandler { Ok(Ok(_)) => Ok(()), // can't happen currently; would be if we add graceful termination Ok(Err(e)) => Err(CopyStreamHandlerEnd::Other(e.context("WAL acceptor"))), Err(_) => Err(CopyStreamHandlerEnd::Other(anyhow!( - "WalAcceptor thread panicked", + "WalAcceptor task panicked", ))), } } @@ -154,10 +153,12 @@ impl<'a, IO: AsyncRead + AsyncWrite + Unpin> NetworkReader<'a, IO> { } }; - *self.acceptor_handle = Some( - WalAcceptor::spawn(tli.clone(), msg_rx, reply_tx, self.conn_id) - .context("spawn WalAcceptor thread")?, - ); + *self.acceptor_handle = Some(WalAcceptor::spawn( + tli.clone(), + msg_rx, + reply_tx, + self.conn_id, + )); // Forward all messages to WalAcceptor read_network_loop(self.pgb_reader, msg_tx, next_msg).await @@ -226,28 +227,19 @@ impl WalAcceptor { msg_rx: Receiver, reply_tx: Sender, conn_id: ConnectionId, - ) -> anyhow::Result>> { - let thread_name = format!("WAL acceptor {}", tli.ttid); - thread::Builder::new() - .name(thread_name) - .spawn(move || -> anyhow::Result<()> { - let mut wa = WalAcceptor { - tli, - msg_rx, - reply_tx, - }; + ) -> JoinHandle> { + task::spawn(async move { + let mut wa = WalAcceptor { + tli, + msg_rx, + reply_tx, + }; - let runtime = tokio::runtime::Builder::new_current_thread() - .enable_all() - .build()?; - - let span_ttid = wa.tli.ttid; // satisfy borrow checker - runtime.block_on( - wa.run() - .instrument(info_span!("WAL acceptor", cid = %conn_id, ttid = %span_ttid)), - ) - }) - .map_err(anyhow::Error::from) + let span_ttid = wa.tli.ttid; // satisfy borrow checker + wa.run() + .instrument(info_span!("WAL acceptor", cid = %conn_id, ttid = %span_ttid)) + .await + }) } /// The main loop. Returns Ok(()) if either msg_rx or reply_tx got closed; @@ -281,7 +273,7 @@ impl WalAcceptor { while let ProposerAcceptorMessage::AppendRequest(append_request) = next_msg { let noflush_msg = ProposerAcceptorMessage::NoFlushAppendRequest(append_request); - if let Some(reply) = self.tli.process_msg(&noflush_msg)? { + if let Some(reply) = self.tli.process_msg(&noflush_msg).await? { if self.reply_tx.send(reply).await.is_err() { return Ok(()); // chan closed, streaming terminated } @@ -300,10 +292,12 @@ impl WalAcceptor { } // flush all written WAL to the disk - self.tli.process_msg(&ProposerAcceptorMessage::FlushWAL)? + self.tli + .process_msg(&ProposerAcceptorMessage::FlushWAL) + .await? } else { // process message other than AppendRequest - self.tli.process_msg(&next_msg)? + self.tli.process_msg(&next_msg).await? }; if let Some(reply) = reply_msg { @@ -326,8 +320,8 @@ impl Drop for ComputeConnectionGuard { let tli = self.timeline.clone(); // tokio forbids to call blocking_send inside the runtime, and see // comments in on_compute_disconnect why we call blocking_send. - spawn_blocking(move || { - if let Err(e) = tli.on_compute_disconnect() { + tokio::spawn(async move { + if let Err(e) = tli.on_compute_disconnect().await { error!("failed to unregister compute connection: {}", e); } }); diff --git a/safekeeper/src/remove_wal.rs b/safekeeper/src/remove_wal.rs index b6d497f34e..3306f0b63a 100644 --- a/safekeeper/src/remove_wal.rs +++ b/safekeeper/src/remove_wal.rs @@ -1,26 +1,36 @@ //! Thread removing old WAL. 
-use std::{thread, time::Duration}; +use std::time::Duration; +use tokio::time::sleep; use tracing::*; use crate::{GlobalTimelines, SafeKeeperConf}; -pub fn thread_main(conf: SafeKeeperConf) { +pub async fn task_main(conf: SafeKeeperConf) -> anyhow::Result<()> { let wal_removal_interval = Duration::from_millis(5000); loop { let tlis = GlobalTimelines::get_all(); for tli in &tlis { - if !tli.is_active() { + if !tli.is_active().await { continue; } let ttid = tli.ttid; - let _enter = - info_span!("", tenant = %ttid.tenant_id, timeline = %ttid.timeline_id).entered(); - if let Err(e) = tli.remove_old_wal(conf.wal_backup_enabled) { - warn!("failed to remove WAL: {}", e); + if let Err(e) = tli + .maybe_persist_control_file() + .instrument(info_span!("", tenant = %ttid.tenant_id, timeline = %ttid.timeline_id)) + .await + { + warn!("failed to persist control file: {e}"); + } + if let Err(e) = tli + .remove_old_wal(conf.wal_backup_enabled) + .instrument(info_span!("", tenant = %ttid.tenant_id, timeline = %ttid.timeline_id)) + .await + { + error!("failed to remove WAL: {}", e); } } - thread::sleep(wal_removal_interval) + sleep(wal_removal_interval).await; } } diff --git a/safekeeper/src/safekeeper.rs b/safekeeper/src/safekeeper.rs index 33da0c8e5a..d0b14a1282 100644 --- a/safekeeper/src/safekeeper.rs +++ b/safekeeper/src/safekeeper.rs @@ -10,6 +10,7 @@ use std::cmp::max; use std::cmp::min; use std::fmt; use std::io::Read; +use std::time::Duration; use storage_broker::proto::SafekeeperTimelineInfo; use tracing::*; @@ -567,25 +568,27 @@ where /// Process message from proposer and possibly form reply. Concurrent /// callers must exclude each other. - pub fn process_msg( + pub async fn process_msg( &mut self, msg: &ProposerAcceptorMessage, ) -> Result> { match msg { - ProposerAcceptorMessage::Greeting(msg) => self.handle_greeting(msg), - ProposerAcceptorMessage::VoteRequest(msg) => self.handle_vote_request(msg), - ProposerAcceptorMessage::Elected(msg) => self.handle_elected(msg), - ProposerAcceptorMessage::AppendRequest(msg) => self.handle_append_request(msg, true), - ProposerAcceptorMessage::NoFlushAppendRequest(msg) => { - self.handle_append_request(msg, false) + ProposerAcceptorMessage::Greeting(msg) => self.handle_greeting(msg).await, + ProposerAcceptorMessage::VoteRequest(msg) => self.handle_vote_request(msg).await, + ProposerAcceptorMessage::Elected(msg) => self.handle_elected(msg).await, + ProposerAcceptorMessage::AppendRequest(msg) => { + self.handle_append_request(msg, true).await } - ProposerAcceptorMessage::FlushWAL => self.handle_flush(), + ProposerAcceptorMessage::NoFlushAppendRequest(msg) => { + self.handle_append_request(msg, false).await + } + ProposerAcceptorMessage::FlushWAL => self.handle_flush().await, } } /// Handle initial message from proposer: check its sanity and send my /// current term. 
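On the WAL-removal loop earlier in this hunk: the span is now attached with .instrument(...) per future instead of an .entered() guard, since holding such a guard across an .await can leak the span onto whatever else the worker thread runs next. A sketch of the per-call form (error handling trimmed):

    tli.remove_old_wal(conf.wal_backup_enabled)
        .instrument(info_span!("", tenant = %ttid.tenant_id, timeline = %ttid.timeline_id))
        .await?;
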
- fn handle_greeting( + async fn handle_greeting( &mut self, msg: &ProposerGreeting, ) -> Result> { @@ -634,7 +637,8 @@ where } // system_id will be updated on mismatch - if self.state.server.system_id != msg.system_id { + // sync-safekeepers doesn't know sysid and sends 0, ignore it + if self.state.server.system_id != msg.system_id && msg.system_id != 0 { if self.state.server.system_id != 0 { warn!( "unexpected system ID arrived, got {}, expected {}", @@ -647,7 +651,7 @@ where if msg.pg_version != UNKNOWN_SERVER_VERSION { state.server.pg_version = msg.pg_version; } - self.state.persist(&state)?; + self.state.persist(&state).await?; } info!( @@ -662,7 +666,7 @@ where } /// Give vote for the given term, if we haven't done that previously. - fn handle_vote_request( + async fn handle_vote_request( &mut self, msg: &VoteRequest, ) -> Result> { @@ -676,7 +680,7 @@ where // handle_elected instead. Currently not a big deal, as proposer is the // only source of WAL; with peer2peer recovery it would be more // important. - self.wal_store.flush_wal()?; + self.wal_store.flush_wal().await?; // initialize with refusal let mut resp = VoteResponse { term: self.state.acceptor_state.term, @@ -690,7 +694,7 @@ where let mut state = self.state.clone(); state.acceptor_state.term = msg.term; // persist vote before sending it out - self.state.persist(&state)?; + self.state.persist(&state).await?; resp.term = self.state.acceptor_state.term; resp.vote_given = true as u64; @@ -713,12 +717,15 @@ where ar } - fn handle_elected(&mut self, msg: &ProposerElected) -> Result> { + async fn handle_elected( + &mut self, + msg: &ProposerElected, + ) -> Result> { info!("received ProposerElected {:?}", msg); if self.state.acceptor_state.term < msg.term { let mut state = self.state.clone(); state.acceptor_state.term = msg.term; - self.state.persist(&state)?; + self.state.persist(&state).await?; } // If our term is higher, ignore the message (next feedback will inform the compute) @@ -748,7 +755,7 @@ where // intersection of our history and history from msg // truncate wal, update the LSNs - self.wal_store.truncate_wal(msg.start_streaming_at)?; + self.wal_store.truncate_wal(msg.start_streaming_at).await?; // and now adopt term history from proposer { @@ -782,7 +789,7 @@ where self.inmem.backup_lsn = max(self.inmem.backup_lsn, state.timeline_start_lsn); state.acceptor_state.term_history = msg.term_history.clone(); - self.persist_control_file(state)?; + self.persist_control_file(state).await?; } info!("start receiving WAL since {:?}", msg.start_streaming_at); @@ -794,7 +801,7 @@ where /// /// Note: it is assumed that 'WAL we have is from the right term' check has /// already been done outside. - fn update_commit_lsn(&mut self, mut candidate: Lsn) -> Result<()> { + async fn update_commit_lsn(&mut self, mut candidate: Lsn) -> Result<()> { // Both peers and walproposer communicate this value, we might already // have a fresher (higher) version. candidate = max(candidate, self.inmem.commit_lsn); @@ -816,29 +823,52 @@ where // that we receive new epoch_start_lsn, and we still need to sync // control file in this case. if commit_lsn == self.epoch_start_lsn && self.state.commit_lsn != commit_lsn { - self.persist_control_file(self.state.clone())?; + self.persist_control_file(self.state.clone()).await?; } Ok(()) } /// Persist control file to disk, called only after timeline creation (bootstrap). 
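The "persist vote before sending it out" comment in handle_vote_request above is the usual consensus safety argument: the vote must be durable before any acknowledgement can leave the node, otherwise a crash and restart could grant a conflicting vote for the same term. The ordering, in short:

    let mut state = self.state.clone();
    state.acceptor_state.term = msg.term;
    self.state.persist(&state).await?; // hit disk first
    resp.term = self.state.acceptor_state.term;
    resp.vote_given = true as u64;     // only then promise the vote
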
- pub fn persist(&mut self) -> Result<()> { - self.persist_control_file(self.state.clone()) + pub async fn persist(&mut self) -> Result<()> { + self.persist_control_file(self.state.clone()).await } /// Persist in-memory state to the disk, taking other data from state. - fn persist_control_file(&mut self, mut state: SafeKeeperState) -> Result<()> { + async fn persist_control_file(&mut self, mut state: SafeKeeperState) -> Result<()> { state.commit_lsn = self.inmem.commit_lsn; state.backup_lsn = self.inmem.backup_lsn; state.peer_horizon_lsn = self.inmem.peer_horizon_lsn; state.proposer_uuid = self.inmem.proposer_uuid; - self.state.persist(&state) + self.state.persist(&state).await + } + + /// Persist control file if there is something to save and enough time + /// passed after the last save. + pub async fn maybe_persist_control_file( + &mut self, + inmem_remote_consistent_lsn: Lsn, + ) -> Result<()> { + const CF_SAVE_INTERVAL: Duration = Duration::from_secs(300); + if self.state.last_persist_at().elapsed() < CF_SAVE_INTERVAL { + return Ok(()); + } + let need_persist = self.inmem.commit_lsn > self.state.commit_lsn + || self.inmem.backup_lsn > self.state.backup_lsn + || self.inmem.peer_horizon_lsn > self.state.peer_horizon_lsn + || inmem_remote_consistent_lsn > self.state.remote_consistent_lsn; + if need_persist { + let mut state = self.state.clone(); + state.remote_consistent_lsn = inmem_remote_consistent_lsn; + self.persist_control_file(state).await?; + trace!("saved control file: {CF_SAVE_INTERVAL:?} passed"); + } + Ok(()) } /// Handle request to append WAL. #[allow(clippy::comparison_chain)] - fn handle_append_request( + async fn handle_append_request( &mut self, msg: &AppendRequest, require_flush: bool, @@ -861,17 +891,19 @@ where // do the job if !msg.wal_data.is_empty() { - self.wal_store.write_wal(msg.h.begin_lsn, &msg.wal_data)?; + self.wal_store + .write_wal(msg.h.begin_lsn, &msg.wal_data) + .await?; } // flush wal to the disk, if required if require_flush { - self.wal_store.flush_wal()?; + self.wal_store.flush_wal().await?; } // Update commit_lsn. if msg.h.commit_lsn != Lsn(0) { - self.update_commit_lsn(msg.h.commit_lsn)?; + self.update_commit_lsn(msg.h.commit_lsn).await?; } // Value calculated by walproposer can always lag: // - safekeepers can forget inmem value and send to proposer lower @@ -887,7 +919,7 @@ where if self.state.peer_horizon_lsn + (self.state.server.wal_seg_size as u64) < self.inmem.peer_horizon_lsn { - self.persist_control_file(self.state.clone())?; + self.persist_control_file(self.state.clone()).await?; } trace!( @@ -909,15 +941,15 @@ where } /// Flush WAL to disk. Return AppendResponse with latest LSNs. - fn handle_flush(&mut self) -> Result> { - self.wal_store.flush_wal()?; + async fn handle_flush(&mut self) -> Result> { + self.wal_store.flush_wal().await?; Ok(Some(AcceptorProposerMessage::AppendResponse( self.append_response(), ))) } /// Update timeline state with peer safekeeper data. - pub fn record_safekeeper_info(&mut self, sk_info: &SafekeeperTimelineInfo) -> Result<()> { + pub async fn record_safekeeper_info(&mut self, sk_info: &SafekeeperTimelineInfo) -> Result<()> { let mut sync_control_file = false; if (Lsn(sk_info.commit_lsn) != Lsn::INVALID) && (sk_info.last_log_term != INVALID_TERM) { @@ -925,7 +957,7 @@ where // commit_lsn if our history matches (is part of) history of advanced // commit_lsn provider. 
if sk_info.last_log_term == self.get_epoch() { - self.update_commit_lsn(Lsn(sk_info.commit_lsn))?; + self.update_commit_lsn(Lsn(sk_info.commit_lsn)).await?; } } @@ -948,11 +980,10 @@ where if sync_control_file { let mut state = self.state.clone(); - // Note: we do not persist remote_consistent_lsn in other paths of - // persisting cf -- that is not much needed currently. We could do - // that by storing Arc to walsenders in Safekeeper. + // Note: we could make remote_consistent_lsn update in cf common by + // storing Arc to walsenders in Safekeeper. state.remote_consistent_lsn = new_remote_consistent_lsn; - self.persist_control_file(state)?; + self.persist_control_file(state).await?; } Ok(()) } @@ -976,22 +1007,28 @@ where #[cfg(test)] mod tests { + use futures::future::BoxFuture; use postgres_ffi::WAL_SEGMENT_SIZE; use super::*; use crate::wal_storage::Storage; - use std::ops::Deref; + use std::{ops::Deref, time::Instant}; // fake storage for tests struct InMemoryState { persisted_state: SafeKeeperState, } + #[async_trait::async_trait] impl control_file::Storage for InMemoryState { - fn persist(&mut self, s: &SafeKeeperState) -> Result<()> { + async fn persist(&mut self, s: &SafeKeeperState) -> Result<()> { self.persisted_state = s.clone(); Ok(()) } + + fn last_persist_at(&self) -> Instant { + Instant::now() + } } impl Deref for InMemoryState { @@ -1014,27 +1051,28 @@ mod tests { lsn: Lsn, } + #[async_trait::async_trait] impl wal_storage::Storage for DummyWalStore { fn flush_lsn(&self) -> Lsn { self.lsn } - fn write_wal(&mut self, startpos: Lsn, buf: &[u8]) -> Result<()> { + async fn write_wal(&mut self, startpos: Lsn, buf: &[u8]) -> Result<()> { self.lsn = startpos + buf.len() as u64; Ok(()) } - fn truncate_wal(&mut self, end_pos: Lsn) -> Result<()> { + async fn truncate_wal(&mut self, end_pos: Lsn) -> Result<()> { self.lsn = end_pos; Ok(()) } - fn flush_wal(&mut self) -> Result<()> { + async fn flush_wal(&mut self) -> Result<()> { Ok(()) } - fn remove_up_to(&self) -> Box Result<()>> { - Box::new(move |_segno_up_to: XLogSegNo| Ok(())) + fn remove_up_to(&self, _segno_up_to: XLogSegNo) -> BoxFuture<'static, anyhow::Result<()>> { + Box::pin(async { Ok(()) }) } fn get_metrics(&self) -> crate::metrics::WalStorageMetrics { @@ -1042,8 +1080,8 @@ mod tests { } } - #[test] - fn test_voting() { + #[tokio::test] + async fn test_voting() { let storage = InMemoryState { persisted_state: test_sk_state(), }; @@ -1052,7 +1090,7 @@ mod tests { // check voting for 1 is ok let vote_request = ProposerAcceptorMessage::VoteRequest(VoteRequest { term: 1 }); - let mut vote_resp = sk.process_msg(&vote_request); + let mut vote_resp = sk.process_msg(&vote_request).await; match vote_resp.unwrap() { Some(AcceptorProposerMessage::VoteResponse(resp)) => assert!(resp.vote_given != 0), r => panic!("unexpected response: {:?}", r), @@ -1067,15 +1105,15 @@ mod tests { sk = SafeKeeper::new(storage, sk.wal_store, NodeId(0)).unwrap(); // and ensure voting second time for 1 is not ok - vote_resp = sk.process_msg(&vote_request); + vote_resp = sk.process_msg(&vote_request).await; match vote_resp.unwrap() { Some(AcceptorProposerMessage::VoteResponse(resp)) => assert!(resp.vote_given == 0), r => panic!("unexpected response: {:?}", r), } } - #[test] - fn test_epoch_switch() { + #[tokio::test] + async fn test_epoch_switch() { let storage = InMemoryState { persisted_state: test_sk_state(), }; @@ -1107,10 +1145,13 @@ mod tests { timeline_start_lsn: Lsn(0), }; sk.process_msg(&ProposerAcceptorMessage::Elected(pem)) + .await .unwrap(); 
// check that AppendRequest before epochStartLsn doesn't switch epoch - let resp = sk.process_msg(&ProposerAcceptorMessage::AppendRequest(append_request)); + let resp = sk + .process_msg(&ProposerAcceptorMessage::AppendRequest(append_request)) + .await; assert!(resp.is_ok()); assert_eq!(sk.get_epoch(), 0); @@ -1121,9 +1162,11 @@ mod tests { h: ar_hdr, wal_data: Bytes::from_static(b"b"), }; - let resp = sk.process_msg(&ProposerAcceptorMessage::AppendRequest(append_request)); + let resp = sk + .process_msg(&ProposerAcceptorMessage::AppendRequest(append_request)) + .await; assert!(resp.is_ok()); - sk.wal_store.truncate_wal(Lsn(3)).unwrap(); // imitate the complete record at 3 %) + sk.wal_store.truncate_wal(Lsn(3)).await.unwrap(); // imitate the complete record at 3 %) assert_eq!(sk.get_epoch(), 1); } } diff --git a/safekeeper/src/send_wal.rs b/safekeeper/src/send_wal.rs index fb420cba64..abca0a86b1 100644 --- a/safekeeper/src/send_wal.rs +++ b/safekeeper/src/send_wal.rs @@ -396,7 +396,7 @@ impl SafekeeperPostgresHandler { // on this safekeeper itself. That's ok as (old) proposer will never be // able to commit such WAL. let stop_pos: Option = if self.is_walproposer_recovery() { - let wal_end = tli.get_flush_lsn(); + let wal_end = tli.get_flush_lsn().await; Some(wal_end) } else { None @@ -418,7 +418,7 @@ impl SafekeeperPostgresHandler { // switch to copy pgb.write_message(&BeMessage::CopyBothResponse).await?; - let (_, persisted_state) = tli.get_state(); + let (_, persisted_state) = tli.get_state().await; let wal_reader = WalReader::new( self.conf.workdir.clone(), self.conf.timeline_dir(&tli.ttid), @@ -562,7 +562,7 @@ impl WalSender<'_, IO> { .walsenders .get_ws_remote_consistent_lsn(self.ws_guard.id) { - if self.tli.should_walsender_stop(remote_consistent_lsn) { + if self.tli.should_walsender_stop(remote_consistent_lsn).await { // Terminate if there is nothing more to send. return Err(CopyStreamHandlerEnd::ServerInitiated(format!( "ending streaming to {:?} at {}, receiver is caughtup and there is no computes", diff --git a/safekeeper/src/timeline.rs b/safekeeper/src/timeline.rs index 2dbf215998..52c3e8d4be 100644 --- a/safekeeper/src/timeline.rs +++ b/safekeeper/src/timeline.rs @@ -2,12 +2,13 @@ //! to glue together SafeKeeper and all other background services. use anyhow::{anyhow, bail, Result}; -use parking_lot::{Mutex, MutexGuard}; use postgres_ffi::XLogSegNo; +use tokio::fs; use std::cmp::max; use std::path::PathBuf; use std::sync::Arc; +use tokio::sync::{Mutex, MutexGuard}; use tokio::{ sync::{mpsc::Sender, watch}, time::Instant, @@ -234,7 +235,6 @@ impl SharedState { flush_lsn: self.sk.wal_store.flush_lsn().0, // note: this value is not flushed to control file yet and can be lost commit_lsn: self.sk.inmem.commit_lsn.0, - // TODO: rework feedbacks to avoid max here remote_consistent_lsn: remote_consistent_lsn.0, peer_horizon_lsn: self.sk.inmem.peer_horizon_lsn.0, safekeeper_connstr: conf.listen_pg_addr.clone(), @@ -287,8 +287,9 @@ pub struct Timeline { commit_lsn_watch_tx: watch::Sender, commit_lsn_watch_rx: watch::Receiver, - /// Safekeeper and other state, that should remain consistent and synchronized - /// with the disk. + /// Safekeeper and other state, that should remain consistent and + /// synchronized with the disk. This is tokio mutex as we write WAL to disk + /// while holding it, ensuring that consensus checks are in order. 
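To make the doc comment above concrete, here is a minimal hedged sketch (illustrative names, not code from this patch) of why an async mutex is needed: the guard is held across .await points such as the WAL write, which a std::sync or parking_lot guard is not designed for.

use std::sync::Arc;
use tokio::sync::Mutex;

// Illustrative only: hold the async guard across an .await, the way the
// safekeeper writes WAL while holding `write_shared_state()`.
async fn append_under_lock(shared: Arc<Mutex<Vec<u8>>>, buf: &[u8]) -> anyhow::Result<()> {
    let mut wal = shared.lock().await; // asynchronous acquisition, does not block the executor thread
    wal.extend_from_slice(buf);        // in-memory bookkeeping under the lock
    tokio::fs::write("example.wal", &*wal).await?; // an .await while the guard is still held
    Ok(())
}

For context, a parking_lot guard held across an .await makes the future non-Send, so it could not be spawned onto a multi-threaded runtime; the tokio guard has no such restriction.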
mutex: Mutex, walsenders: Arc, @@ -362,8 +363,8 @@ impl Timeline { /// /// Bootstrap is transactional, so if it fails, created files will be deleted, /// and state on disk should remain unchanged. - pub fn bootstrap(&self, shared_state: &mut MutexGuard) -> Result<()> { - match std::fs::metadata(&self.timeline_dir) { + pub async fn bootstrap(&self, shared_state: &mut MutexGuard<'_, SharedState>) -> Result<()> { + match fs::metadata(&self.timeline_dir).await { Ok(_) => { // Timeline directory exists on disk, we should leave state unchanged // and return error. @@ -376,53 +377,51 @@ impl Timeline { } // Create timeline directory. - std::fs::create_dir_all(&self.timeline_dir)?; + fs::create_dir_all(&self.timeline_dir).await?; // Write timeline to disk and TODO: start background tasks. - match || -> Result<()> { - shared_state.sk.persist()?; - // TODO: add more initialization steps here - self.update_status(shared_state); - Ok(()) - }() { - Ok(_) => Ok(()), - Err(e) => { - // Bootstrap failed, cancel timeline and remove timeline directory. - self.cancel(shared_state); + if let Err(e) = shared_state.sk.persist().await { + // Bootstrap failed, cancel timeline and remove timeline directory. + self.cancel(shared_state); - if let Err(fs_err) = std::fs::remove_dir_all(&self.timeline_dir) { - warn!( - "failed to remove timeline {} directory after bootstrap failure: {}", - self.ttid, fs_err - ); - } - - Err(e) + if let Err(fs_err) = fs::remove_dir_all(&self.timeline_dir).await { + warn!( + "failed to remove timeline {} directory after bootstrap failure: {}", + self.ttid, fs_err + ); } + + return Err(e); } + + // TODO: add more initialization steps here + self.update_status(shared_state); + Ok(()) } /// Delete timeline from disk completely, by removing timeline directory. Background /// timeline activities will stop eventually. - pub fn delete_from_disk( + pub async fn delete_from_disk( &self, - shared_state: &mut MutexGuard, + shared_state: &mut MutexGuard<'_, SharedState>, ) -> Result<(bool, bool)> { let was_active = shared_state.active; self.cancel(shared_state); - let dir_existed = delete_dir(&self.timeline_dir)?; + let dir_existed = delete_dir(&self.timeline_dir).await?; Ok((dir_existed, was_active)) } /// Cancel timeline to prevent further usage. Background tasks will stop /// eventually after receiving cancellation signal. - fn cancel(&self, shared_state: &mut MutexGuard) { + /// + /// Note that we can't notify backup launcher here while holding + /// shared_state lock, as this is a potential deadlock: caller is + /// responsible for that. Generally we should probably make WAL backup tasks + /// to shut down on their own, checking once in a while whether it is the + /// time. + fn cancel(&self, shared_state: &mut MutexGuard<'_, SharedState>) { info!("timeline {} is cancelled", self.ttid); let _ = self.cancellation_tx.send(true); - let res = self.wal_backup_launcher_tx.blocking_send(self.ttid); - if let Err(e) = res { - error!("Failed to send stop signal to wal_backup_launcher: {}", e); - } // Close associated FDs. Nobody will be able to touch timeline data once // it is cancelled, so WAL storage won't be opened again. shared_state.sk.wal_store.close(); @@ -434,8 +433,8 @@ impl Timeline { } /// Take a writing mutual exclusive lock on timeline shared_state. 
- pub fn write_shared_state(&self) -> MutexGuard { - self.mutex.lock() + pub async fn write_shared_state(&self) -> MutexGuard { + self.mutex.lock().await } fn update_status(&self, shared_state: &mut SharedState) -> bool { @@ -451,7 +450,7 @@ impl Timeline { let is_wal_backup_action_pending: bool; { - let mut shared_state = self.write_shared_state(); + let mut shared_state = self.write_shared_state().await; shared_state.num_computes += 1; is_wal_backup_action_pending = self.update_status(&mut shared_state); } @@ -465,22 +464,17 @@ impl Timeline { /// De-register compute connection, shutting down timeline activity if /// pageserver doesn't need catchup. - pub fn on_compute_disconnect(&self) -> Result<()> { + pub async fn on_compute_disconnect(&self) -> Result<()> { let is_wal_backup_action_pending: bool; { - let mut shared_state = self.write_shared_state(); + let mut shared_state = self.write_shared_state().await; shared_state.num_computes -= 1; is_wal_backup_action_pending = self.update_status(&mut shared_state); } // Wake up wal backup launcher, if it is time to stop the offloading. if is_wal_backup_action_pending { // Can fail only if channel to a static thread got closed, which is not normal at all. - // - // Note: this is blocking_send because on_compute_disconnect is called in Drop, there is - // no async Drop and we use current thread runtimes. With current thread rt spawning - // task in drop impl is racy, as thread along with runtime might finish before the task. - // This should be switched send.await when/if we go to full async. - self.wal_backup_launcher_tx.blocking_send(self.ttid)?; + self.wal_backup_launcher_tx.send(self.ttid).await?; } Ok(()) } @@ -490,11 +484,11 @@ impl Timeline { /// computes. While there might be nothing to stream already, we learn about /// remote_consistent_lsn update through replication feedback, and we want /// to stop pushing to the broker if pageserver is fully caughtup. - pub fn should_walsender_stop(&self, reported_remote_consistent_lsn: Lsn) -> bool { + pub async fn should_walsender_stop(&self, reported_remote_consistent_lsn: Lsn) -> bool { if self.is_cancelled() { return true; } - let shared_state = self.write_shared_state(); + let shared_state = self.write_shared_state().await; if shared_state.num_computes == 0 { return shared_state.sk.inmem.commit_lsn == Lsn(0) || // no data at all yet reported_remote_consistent_lsn >= shared_state.sk.inmem.commit_lsn; @@ -504,12 +498,12 @@ impl Timeline { /// Returns whether s3 offloading is required and sets current status as /// matching it. - pub fn wal_backup_attend(&self) -> bool { + pub async fn wal_backup_attend(&self) -> bool { if self.is_cancelled() { return false; } - self.write_shared_state().wal_backup_attend() + self.write_shared_state().await.wal_backup_attend() } /// Returns commit_lsn watch channel. @@ -518,7 +512,7 @@ impl Timeline { } /// Pass arrived message to the safekeeper. - pub fn process_msg( + pub async fn process_msg( &self, msg: &ProposerAcceptorMessage, ) -> Result> { @@ -529,8 +523,8 @@ impl Timeline { let mut rmsg: Option; let commit_lsn: Lsn; { - let mut shared_state = self.write_shared_state(); - rmsg = shared_state.sk.process_msg(msg)?; + let mut shared_state = self.write_shared_state().await; + rmsg = shared_state.sk.process_msg(msg).await?; // if this is AppendResponse, fill in proper pageserver and hot // standby feedback. @@ -547,37 +541,37 @@ impl Timeline { } /// Returns wal_seg_size. 
- pub fn get_wal_seg_size(&self) -> usize { - self.write_shared_state().get_wal_seg_size() + pub async fn get_wal_seg_size(&self) -> usize { + self.write_shared_state().await.get_wal_seg_size() } /// Returns true only if the timeline is loaded and active. - pub fn is_active(&self) -> bool { + pub async fn is_active(&self) -> bool { if self.is_cancelled() { return false; } - self.write_shared_state().active + self.write_shared_state().await.active } /// Returns state of the timeline. - pub fn get_state(&self) -> (SafekeeperMemState, SafeKeeperState) { - let state = self.write_shared_state(); + pub async fn get_state(&self) -> (SafekeeperMemState, SafeKeeperState) { + let state = self.write_shared_state().await; (state.sk.inmem.clone(), state.sk.state.clone()) } /// Returns latest backup_lsn. - pub fn get_wal_backup_lsn(&self) -> Lsn { - self.write_shared_state().sk.inmem.backup_lsn + pub async fn get_wal_backup_lsn(&self) -> Lsn { + self.write_shared_state().await.sk.inmem.backup_lsn } /// Sets backup_lsn to the given value. - pub fn set_wal_backup_lsn(&self, backup_lsn: Lsn) -> Result<()> { + pub async fn set_wal_backup_lsn(&self, backup_lsn: Lsn) -> Result<()> { if self.is_cancelled() { bail!(TimelineError::Cancelled(self.ttid)); } - let mut state = self.write_shared_state(); + let mut state = self.write_shared_state().await; state.sk.inmem.backup_lsn = max(state.sk.inmem.backup_lsn, backup_lsn); // we should check whether to shut down offloader, but this will be done // soon by peer communication anyway. @@ -585,8 +579,8 @@ impl Timeline { } /// Get safekeeper info for broadcasting to broker and other peers. - pub fn get_safekeeper_info(&self, conf: &SafeKeeperConf) -> SafekeeperTimelineInfo { - let shared_state = self.write_shared_state(); + pub async fn get_safekeeper_info(&self, conf: &SafeKeeperConf) -> SafekeeperTimelineInfo { + let shared_state = self.write_shared_state().await; shared_state.get_safekeeper_info( &self.ttid, conf, @@ -605,8 +599,8 @@ impl Timeline { let is_wal_backup_action_pending: bool; let commit_lsn: Lsn; { - let mut shared_state = self.write_shared_state(); - shared_state.sk.record_safekeeper_info(&sk_info)?; + let mut shared_state = self.write_shared_state().await; + shared_state.sk.record_safekeeper_info(&sk_info).await?; let peer_info = PeerInfo::from_sk_info(&sk_info, Instant::now()); shared_state.peers_info.upsert(&peer_info); is_wal_backup_action_pending = self.update_status(&mut shared_state); @@ -623,8 +617,8 @@ impl Timeline { /// Get our latest view of alive peers status on the timeline. /// We pass our own info through the broker as well, so when we don't have connection /// to the broker returned vec is empty. - pub fn get_peers(&self, conf: &SafeKeeperConf) -> Vec { - let shared_state = self.write_shared_state(); + pub async fn get_peers(&self, conf: &SafeKeeperConf) -> Vec { + let shared_state = self.write_shared_state().await; let now = Instant::now(); shared_state .peers_info @@ -641,47 +635,60 @@ impl Timeline { } /// Returns flush_lsn. - pub fn get_flush_lsn(&self) -> Lsn { - self.write_shared_state().sk.wal_store.flush_lsn() + pub async fn get_flush_lsn(&self) -> Lsn { + self.write_shared_state().await.sk.wal_store.flush_lsn() } /// Delete WAL segments from disk that are no longer needed. This is determined /// based on pageserver's remote_consistent_lsn and local backup_lsn/peer_lsn. 
- pub fn remove_old_wal(&self, wal_backup_enabled: bool) -> Result<()> { + pub async fn remove_old_wal(&self, wal_backup_enabled: bool) -> Result<()> { if self.is_cancelled() { bail!(TimelineError::Cancelled(self.ttid)); } let horizon_segno: XLogSegNo; - let remover: Box Result<(), anyhow::Error>>; - { - let shared_state = self.write_shared_state(); + let remover = { + let shared_state = self.write_shared_state().await; horizon_segno = shared_state.sk.get_horizon_segno(wal_backup_enabled); - remover = shared_state.sk.wal_store.remove_up_to(); if horizon_segno <= 1 || horizon_segno <= shared_state.last_removed_segno { - return Ok(()); + return Ok(()); // nothing to do } + let remover = shared_state.sk.wal_store.remove_up_to(horizon_segno - 1); // release the lock before removing - } + remover + }; // delete old WAL files - remover(horizon_segno - 1)?; + remover.await?; // update last_removed_segno - let mut shared_state = self.write_shared_state(); + let mut shared_state = self.write_shared_state().await; shared_state.last_removed_segno = horizon_segno; Ok(()) } - /// Returns full timeline info, required for the metrics. If the timeline is - /// not active, returns None instead. - pub fn info_for_metrics(&self) -> Option { + /// Persist control file if there is something to save and enough time + /// passed after the last save. This helps to keep remote_consistent_lsn up + /// to date so that storage nodes restart doesn't cause many pageserver -> + /// safekeeper reconnections. + pub async fn maybe_persist_control_file(&self) -> Result<()> { + let remote_consistent_lsn = self.walsenders.get_remote_consistent_lsn(); + self.write_shared_state() + .await + .sk + .maybe_persist_control_file(remote_consistent_lsn) + .await + } + + /// Gather timeline data for metrics. If the timeline is not active, returns + /// None, we do not collect these. + pub async fn info_for_metrics(&self) -> Option { if self.is_cancelled() { return None; } let ps_feedback = self.walsenders.get_ps_feedback(); - let state = self.write_shared_state(); + let state = self.write_shared_state().await; if state.active { Some(FullTimelineInfo { ttid: self.ttid, @@ -703,8 +710,8 @@ impl Timeline { } /// Returns in-memory timeline state to build a full debug dump. - pub fn memory_dump(&self) -> debug_dump::Memory { - let state = self.write_shared_state(); + pub async fn memory_dump(&self) -> debug_dump::Memory { + let state = self.write_shared_state().await; let (write_lsn, write_record_lsn, flush_lsn, file_open) = state.sk.wal_store.internal_state(); @@ -728,8 +735,8 @@ impl Timeline { } /// Deletes directory and it's contents. Returns false if directory does not exist. -fn delete_dir(path: &PathBuf) -> Result { - match std::fs::remove_dir_all(path) { +async fn delete_dir(path: &PathBuf) -> Result { + match fs::remove_dir_all(path).await { Ok(_) => Ok(true), Err(e) if e.kind() == std::io::ErrorKind::NotFound => Ok(false), Err(e) => Err(e.into()), diff --git a/safekeeper/src/timelines_global_map.rs b/safekeeper/src/timelines_global_map.rs index 41809794dc..f2d5df8744 100644 --- a/safekeeper/src/timelines_global_map.rs +++ b/safekeeper/src/timelines_global_map.rs @@ -113,9 +113,17 @@ impl GlobalTimelines { Ok(()) } - /// Loads all timelines for the given tenant to memory. Returns fs::read_dir errors if any. + /// Loads all timelines for the given tenant to memory. Returns fs::read_dir + /// errors if any. 
+ /// + /// Note: This function (and all reading/loading below) is sync because + /// timelines are loaded while holding GlobalTimelinesState lock. Which is + /// fine as this is called only from single threaded main runtime on boot, + /// but clippy complains anyway, and suppressing that isn't trivial as async + /// is the keyword, ha. That only other user is pull_timeline.rs for which + /// being blocked is not that bad, and we can do spawn_blocking. fn load_tenant_timelines( - state: &mut MutexGuard, + state: &mut MutexGuard<'_, GlobalTimelinesState>, tenant_id: TenantId, ) -> Result<()> { let timelines_dir = state.get_conf().tenant_dir(&tenant_id); @@ -220,7 +228,7 @@ impl GlobalTimelines { // Take a lock and finish the initialization holding this mutex. No other threads // can interfere with creation after we will insert timeline into the map. { - let mut shared_state = timeline.write_shared_state(); + let mut shared_state = timeline.write_shared_state().await; // We can get a race condition here in case of concurrent create calls, but only // in theory. create() will return valid timeline on the next try. @@ -232,7 +240,7 @@ impl GlobalTimelines { // Write the new timeline to the disk and start background workers. // Bootstrap is transactional, so if it fails, the timeline will be deleted, // and the state on disk should remain unchanged. - if let Err(e) = timeline.bootstrap(&mut shared_state) { + if let Err(e) = timeline.bootstrap(&mut shared_state).await { // Note: the most likely reason for bootstrap failure is that the timeline // directory already exists on disk. This happens when timeline is corrupted // and wasn't loaded from disk on startup because of that. We want to preserve @@ -294,15 +302,16 @@ impl GlobalTimelines { } /// Cancels timeline, then deletes the corresponding data directory. - pub fn delete_force(ttid: &TenantTimelineId) -> Result { + pub async fn delete_force(ttid: &TenantTimelineId) -> Result { let tli_res = TIMELINES_STATE.lock().unwrap().get(ttid); match tli_res { Ok(timeline) => { // Take a lock and finish the deletion holding this mutex. - let mut shared_state = timeline.write_shared_state(); + let mut shared_state = timeline.write_shared_state().await; info!("deleting timeline {}", ttid); - let (dir_existed, was_active) = timeline.delete_from_disk(&mut shared_state)?; + let (dir_existed, was_active) = + timeline.delete_from_disk(&mut shared_state).await?; // Remove timeline from the map. // FIXME: re-enable it once we fix the issue with recreation of deleted timelines @@ -335,7 +344,7 @@ impl GlobalTimelines { /// the tenant had, `true` if a timeline was active. There may be a race if new timelines are /// created simultaneously. In that case the function will return error and the caller should /// retry tenant deletion again later. 
- pub fn delete_force_all_for_tenant( + pub async fn delete_force_all_for_tenant( tenant_id: &TenantId, ) -> Result> { info!("deleting all timelines for tenant {}", tenant_id); @@ -345,7 +354,7 @@ impl GlobalTimelines { let mut deleted = HashMap::new(); for tli in &to_delete { - match Self::delete_force(&tli.ttid) { + match Self::delete_force(&tli.ttid).await { Ok(result) => { deleted.insert(tli.ttid, result); } diff --git a/safekeeper/src/wal_backup.rs b/safekeeper/src/wal_backup.rs index 4d341a7ef8..eae3f3fe86 100644 --- a/safekeeper/src/wal_backup.rs +++ b/safekeeper/src/wal_backup.rs @@ -17,7 +17,6 @@ use postgres_ffi::XLogFileName; use postgres_ffi::{XLogSegNo, PG_TLI}; use remote_storage::{GenericRemoteStorage, RemotePath}; use tokio::fs::File; -use tokio::runtime::Builder; use tokio::select; use tokio::sync::mpsc::{self, Receiver, Sender}; @@ -36,30 +35,16 @@ use once_cell::sync::OnceCell; const UPLOAD_FAILURE_RETRY_MIN_MS: u64 = 10; const UPLOAD_FAILURE_RETRY_MAX_MS: u64 = 5000; -pub fn wal_backup_launcher_thread_main( - conf: SafeKeeperConf, - wal_backup_launcher_rx: Receiver, -) { - let mut builder = Builder::new_multi_thread(); - if let Some(num_threads) = conf.backup_runtime_threads { - builder.worker_threads(num_threads); - } - let rt = builder - .enable_all() - .build() - .expect("failed to create wal backup runtime"); - - rt.block_on(async { - wal_backup_launcher_main_loop(conf, wal_backup_launcher_rx).await; - }); -} - /// Check whether wal backup is required for timeline. If yes, mark that launcher is /// aware of current status and return the timeline. -fn is_wal_backup_required(ttid: TenantTimelineId) -> Option> { - GlobalTimelines::get(ttid) - .ok() - .filter(|tli| tli.wal_backup_attend()) +async fn is_wal_backup_required(ttid: TenantTimelineId) -> Option> { + match GlobalTimelines::get(ttid).ok() { + Some(tli) => { + tli.wal_backup_attend().await; + Some(tli) + } + None => None, + } } struct WalBackupTaskHandle { @@ -143,8 +128,8 @@ async fn update_task( ttid: TenantTimelineId, entry: &mut WalBackupTimelineEntry, ) { - let alive_peers = entry.timeline.get_peers(conf); - let wal_backup_lsn = entry.timeline.get_wal_backup_lsn(); + let alive_peers = entry.timeline.get_peers(conf).await; + let wal_backup_lsn = entry.timeline.get_wal_backup_lsn().await; let (offloader, election_dbg_str) = determine_offloader(&alive_peers, wal_backup_lsn, ttid, conf); let elected_me = Some(conf.my_id) == offloader; @@ -183,10 +168,10 @@ const CHECK_TASKS_INTERVAL_MSEC: u64 = 1000; /// Sits on wal_backup_launcher_rx and starts/stops per timeline wal backup /// tasks. Having this in separate task simplifies locking, allows to reap /// panics and separate elections from offloading itself. -async fn wal_backup_launcher_main_loop( +pub async fn wal_backup_launcher_task_main( conf: SafeKeeperConf, mut wal_backup_launcher_rx: Receiver, -) { +) -> anyhow::Result<()> { info!( "WAL backup launcher started, remote config {:?}", conf.remote_storage @@ -214,7 +199,7 @@ async fn wal_backup_launcher_main_loop( if conf.remote_storage.is_none() || !conf.wal_backup_enabled { continue; /* just drain the channel and do nothing */ } - let timeline = is_wal_backup_required(ttid); + let timeline = is_wal_backup_required(ttid).await; // do we need to do anything at all? 
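// Note: the new async `is_wal_backup_required` above appears to drop the
// boolean returned by `wal_backup_attend()` -- the removed sync version used
// `.filter(|tli| tli.wal_backup_attend())` -- so `timeline` ends up `Some`
// whenever the timeline exists at all. If the original filtering semantics
// are intended, the `Some` arm could read:
//     Some(tli) => tli.wal_backup_attend().await.then_some(tli),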
if timeline.is_some() != tasks.contains_key(&ttid) { if let Some(timeline) = timeline { @@ -269,7 +254,7 @@ async fn backup_task_main( let tli = res.unwrap(); let mut wb = WalBackupTask { - wal_seg_size: tli.get_wal_seg_size(), + wal_seg_size: tli.get_wal_seg_size().await, commit_lsn_watch_rx: tli.get_commit_lsn_watch_rx(), timeline: tli, timeline_dir, @@ -326,7 +311,7 @@ impl WalBackupTask { continue; /* nothing to do, common case as we wake up on every commit_lsn bump */ } // Perhaps peers advanced the position, check shmem value. - backup_lsn = self.timeline.get_wal_backup_lsn(); + backup_lsn = self.timeline.get_wal_backup_lsn().await; if backup_lsn.segment_number(self.wal_seg_size) >= commit_lsn.segment_number(self.wal_seg_size) { @@ -402,6 +387,7 @@ pub async fn backup_lsn_range( let new_backup_lsn = segment.end_lsn; timeline .set_wal_backup_lsn(new_backup_lsn) + .await .context("setting wal_backup_lsn")?; *backup_lsn = new_backup_lsn; } else { diff --git a/safekeeper/src/wal_service.rs b/safekeeper/src/wal_service.rs index fb0d77a9f2..406132b2b0 100644 --- a/safekeeper/src/wal_service.rs +++ b/safekeeper/src/wal_service.rs @@ -4,7 +4,7 @@ //! use anyhow::{Context, Result}; use postgres_backend::QueryError; -use std::{future, thread, time::Duration}; +use std::{future, time::Duration}; use tokio::net::TcpStream; use tokio_io_timeout::TimeoutReader; use tracing::*; @@ -16,104 +16,82 @@ use crate::SafeKeeperConf; use postgres_backend::{AuthType, PostgresBackend}; /// Accept incoming TCP connections and spawn them into a background thread. -pub fn thread_main(conf: SafeKeeperConf, pg_listener: std::net::TcpListener) { - let runtime = tokio::runtime::Builder::new_current_thread() - .enable_all() - .build() - .context("create runtime") - // todo catch error in main thread - .expect("failed to create runtime"); +pub async fn task_main( + conf: SafeKeeperConf, + pg_listener: std::net::TcpListener, +) -> anyhow::Result<()> { + // Tokio's from_std won't do this for us, per its comment. + pg_listener.set_nonblocking(true)?; - runtime - .block_on(async move { - // Tokio's from_std won't do this for us, per its comment. - pg_listener.set_nonblocking(true)?; - let listener = tokio::net::TcpListener::from_std(pg_listener)?; - let mut connection_count: ConnectionCount = 0; + let listener = tokio::net::TcpListener::from_std(pg_listener)?; + let mut connection_count: ConnectionCount = 0; - loop { - match listener.accept().await { - Ok((socket, peer_addr)) => { - debug!("accepted connection from {}", peer_addr); - let conf = conf.clone(); - let conn_id = issue_connection_id(&mut connection_count); + loop { + let (socket, peer_addr) = listener.accept().await.context("accept")?; + debug!("accepted connection from {}", peer_addr); + let conf = conf.clone(); + let conn_id = issue_connection_id(&mut connection_count); - let _ = thread::Builder::new() - .name("WAL service thread".into()) - .spawn(move || { - if let Err(err) = handle_socket(socket, conf, conn_id) { - error!("connection handler exited: {}", err); - } - }) - .unwrap(); - } - Err(e) => error!("Failed to accept connection: {}", e), - } + tokio::spawn(async move { + if let Err(err) = handle_socket(socket, conf, conn_id) + .instrument(info_span!("", cid = %conn_id)) + .await + { + error!("connection handler exited: {}", err); } - #[allow(unreachable_code)] // hint compiler the closure return type - Ok::<(), anyhow::Error>(()) - }) - .expect("listener failed") + }); + } } -/// This is run by `thread_main` above, inside a background thread. 
+/// This is run by `task_main` above, inside a background thread. /// -fn handle_socket( +async fn handle_socket( socket: TcpStream, conf: SafeKeeperConf, conn_id: ConnectionId, ) -> Result<(), QueryError> { - let _enter = info_span!("", cid = %conn_id).entered(); - - let runtime = tokio::runtime::Builder::new_current_thread() - .enable_all() - .build()?; - socket.set_nodelay(true)?; let peer_addr = socket.peer_addr()?; - // TimeoutReader wants async runtime during creation. - runtime.block_on(async move { - // Set timeout on reading from the socket. It prevents hanged up connection - // if client suddenly disappears. Note that TCP_KEEPALIVE is not enabled by - // default, and tokio doesn't provide ability to set it out of the box. - let mut socket = TimeoutReader::new(socket); - let wal_service_timeout = Duration::from_secs(60 * 10); - socket.set_timeout(Some(wal_service_timeout)); - // pin! is here because TimeoutReader (due to storing sleep future inside) - // is not Unpin, and all pgbackend/framed/tokio dependencies require stream - // to be Unpin. Which is reasonable, as indeed something like TimeoutReader - // shouldn't be moved. - tokio::pin!(socket); + // Set timeout on reading from the socket. It prevents hanged up connection + // if client suddenly disappears. Note that TCP_KEEPALIVE is not enabled by + // default, and tokio doesn't provide ability to set it out of the box. + let mut socket = TimeoutReader::new(socket); + let wal_service_timeout = Duration::from_secs(60 * 10); + socket.set_timeout(Some(wal_service_timeout)); + // pin! is here because TimeoutReader (due to storing sleep future inside) + // is not Unpin, and all pgbackend/framed/tokio dependencies require stream + // to be Unpin. Which is reasonable, as indeed something like TimeoutReader + // shouldn't be moved. + tokio::pin!(socket); - let traffic_metrics = TrafficMetrics::new(); - if let Some(current_az) = conf.availability_zone.as_deref() { - traffic_metrics.set_sk_az(current_az); - } + let traffic_metrics = TrafficMetrics::new(); + if let Some(current_az) = conf.availability_zone.as_deref() { + traffic_metrics.set_sk_az(current_az); + } - let socket = MeasuredStream::new( - socket, - |cnt| { - traffic_metrics.observe_read(cnt); - }, - |cnt| { - traffic_metrics.observe_write(cnt); - }, - ); + let socket = MeasuredStream::new( + socket, + |cnt| { + traffic_metrics.observe_read(cnt); + }, + |cnt| { + traffic_metrics.observe_write(cnt); + }, + ); - let auth_type = match conf.auth { - None => AuthType::Trust, - Some(_) => AuthType::NeonJWT, - }; - let mut conn_handler = - SafekeeperPostgresHandler::new(conf, conn_id, Some(traffic_metrics.clone())); - let pgbackend = PostgresBackend::new_from_io(socket, peer_addr, auth_type, None)?; - // libpq protocol between safekeeper and walproposer / pageserver - // We don't use shutdown. - pgbackend - .run(&mut conn_handler, future::pending::<()>) - .await - }) + let auth_type = match conf.auth { + None => AuthType::Trust, + Some(_) => AuthType::NeonJWT, + }; + let mut conn_handler = + SafekeeperPostgresHandler::new(conf, conn_id, Some(traffic_metrics.clone())); + let pgbackend = PostgresBackend::new_from_io(socket, peer_addr, auth_type, None)?; + // libpq protocol between safekeeper and walproposer / pageserver + // We don't use shutdown. + pgbackend + .run(&mut conn_handler, future::pending::<()>) + .await } /// Unique WAL service connection ids are logged in spans for observability. 
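For reference, a minimal self-contained sketch of the accept-loop shape that task_main now uses (names are illustrative, not taken from this patch): one tokio task per connection, each wrapped in its own tracing span so log lines carry the connection id. The real handler additionally wraps the socket in a read timeout (TimeoutReader) and byte-counting metrics (MeasuredStream), as the hunk above shows.

use anyhow::Context;
use tracing::{debug, error, info_span, Instrument};

async fn accept_loop(listener: tokio::net::TcpListener) -> anyhow::Result<()> {
    let mut connection_count: u64 = 0;
    loop {
        let (socket, peer_addr) = listener.accept().await.context("accept")?;
        connection_count += 1;
        let conn_id = connection_count;
        // Each connection runs concurrently on the runtime; an error in one
        // handler is logged and does not take down the accept loop.
        tokio::spawn(
            async move {
                if let Err(err) = handle_conn(socket, peer_addr).await {
                    error!("connection handler exited: {}", err);
                }
            }
            .instrument(info_span!("", cid = %conn_id)),
        );
    }
}

async fn handle_conn(
    socket: tokio::net::TcpStream,
    peer_addr: std::net::SocketAddr,
) -> anyhow::Result<()> {
    debug!("accepted connection from {}", peer_addr);
    socket.set_nodelay(true)?;
    // ... speak the wire protocol with the peer here ...
    Ok(())
}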
diff --git a/safekeeper/src/wal_storage.rs b/safekeeper/src/wal_storage.rs index 1b82bd754e..e97b212093 100644 --- a/safekeeper/src/wal_storage.rs +++ b/safekeeper/src/wal_storage.rs @@ -8,54 +8,47 @@ //! Note that last file has `.partial` suffix, that's different from postgres. use anyhow::{bail, Context, Result}; -use remote_storage::RemotePath; - -use std::io::{self, Seek, SeekFrom}; -use std::pin::Pin; -use tokio::io::AsyncRead; - +use bytes::Bytes; +use futures::future::BoxFuture; use postgres_ffi::v14::xlog_utils::{IsPartialXLogFileName, IsXLogFileName, XLogFromFileName}; use postgres_ffi::{XLogSegNo, PG_TLI}; +use remote_storage::RemotePath; use std::cmp::{max, min}; - -use bytes::Bytes; -use std::fs::{self, remove_file, File, OpenOptions}; -use std::io::Write; +use std::io::{self, SeekFrom}; use std::path::{Path, PathBuf}; - +use std::pin::Pin; +use tokio::fs::{self, remove_file, File, OpenOptions}; +use tokio::io::{AsyncRead, AsyncWriteExt}; +use tokio::io::{AsyncReadExt, AsyncSeekExt}; use tracing::*; -use utils::{id::TenantTimelineId, lsn::Lsn}; - use crate::metrics::{time_io_closure, WalStorageMetrics, REMOVED_WAL_SEGMENTS}; use crate::safekeeper::SafeKeeperState; - use crate::wal_backup::read_object; use crate::SafeKeeperConf; +use postgres_ffi::waldecoder::WalStreamDecoder; use postgres_ffi::XLogFileName; use postgres_ffi::XLOG_BLCKSZ; - -use postgres_ffi::waldecoder::WalStreamDecoder; - use pq_proto::SystemId; -use tokio::io::{AsyncReadExt, AsyncSeekExt}; +use utils::{id::TenantTimelineId, lsn::Lsn}; +#[async_trait::async_trait] pub trait Storage { /// LSN of last durably stored WAL record. fn flush_lsn(&self) -> Lsn; /// Write piece of WAL from buf to disk, but not necessarily sync it. - fn write_wal(&mut self, startpos: Lsn, buf: &[u8]) -> Result<()>; + async fn write_wal(&mut self, startpos: Lsn, buf: &[u8]) -> Result<()>; /// Truncate WAL at specified LSN, which must be the end of WAL record. - fn truncate_wal(&mut self, end_pos: Lsn) -> Result<()>; + async fn truncate_wal(&mut self, end_pos: Lsn) -> Result<()>; /// Durably store WAL on disk, up to the last written WAL record. - fn flush_wal(&mut self) -> Result<()>; + async fn flush_wal(&mut self) -> Result<()>; - /// Remove all segments <= given segno. Returns closure as we want to do - /// that without timeline lock. - fn remove_up_to(&self) -> Box Result<()>>; + /// Remove all segments <= given segno. Returns function doing that as we + /// want to perform it without timeline lock. + fn remove_up_to(&self, segno_up_to: XLogSegNo) -> BoxFuture<'static, anyhow::Result<()>>; /// Release resources associated with the storage -- technically, close FDs. /// Currently we don't remove timelines until restart (#3146), so need to @@ -178,33 +171,37 @@ impl PhysicalStorage { } /// Call fdatasync if config requires so. - fn fdatasync_file(&mut self, file: &mut File) -> Result<()> { + async fn fdatasync_file(&mut self, file: &mut File) -> Result<()> { if !self.conf.no_sync { self.metrics - .observe_flush_seconds(time_io_closure(|| Ok(file.sync_data()?))?); + .observe_flush_seconds(time_io_closure(file.sync_data()).await?); } Ok(()) } /// Call fsync if config requires so. - fn fsync_file(&mut self, file: &mut File) -> Result<()> { + async fn fsync_file(&mut self, file: &mut File) -> Result<()> { if !self.conf.no_sync { self.metrics - .observe_flush_seconds(time_io_closure(|| Ok(file.sync_all()?))?); + .observe_flush_seconds(time_io_closure(file.sync_all()).await?); } Ok(()) } /// Open or create WAL segment file. 
Caller must call seek to the wanted position. /// Returns `file` and `is_partial`. - fn open_or_create(&mut self, segno: XLogSegNo) -> Result<(File, bool)> { + async fn open_or_create(&mut self, segno: XLogSegNo) -> Result<(File, bool)> { let (wal_file_path, wal_file_partial_path) = wal_file_paths(&self.timeline_dir, segno, self.wal_seg_size)?; // Try to open already completed segment - if let Ok(file) = OpenOptions::new().write(true).open(&wal_file_path) { + if let Ok(file) = OpenOptions::new().write(true).open(&wal_file_path).await { Ok((file, false)) - } else if let Ok(file) = OpenOptions::new().write(true).open(&wal_file_partial_path) { + } else if let Ok(file) = OpenOptions::new() + .write(true) + .open(&wal_file_partial_path) + .await + { // Try to open existing partial file Ok((file, true)) } else { @@ -213,35 +210,36 @@ impl PhysicalStorage { .create(true) .write(true) .open(&wal_file_partial_path) + .await .with_context(|| format!("Failed to open log file {:?}", &wal_file_path))?; - write_zeroes(&mut file, self.wal_seg_size)?; - self.fsync_file(&mut file)?; + write_zeroes(&mut file, self.wal_seg_size).await?; + self.fsync_file(&mut file).await?; Ok((file, true)) } } /// Write WAL bytes, which are known to be located in a single WAL segment. - fn write_in_segment(&mut self, segno: u64, xlogoff: usize, buf: &[u8]) -> Result<()> { + async fn write_in_segment(&mut self, segno: u64, xlogoff: usize, buf: &[u8]) -> Result<()> { let mut file = if let Some(file) = self.file.take() { file } else { - let (mut file, is_partial) = self.open_or_create(segno)?; + let (mut file, is_partial) = self.open_or_create(segno).await?; assert!(is_partial, "unexpected write into non-partial segment file"); - file.seek(SeekFrom::Start(xlogoff as u64))?; + file.seek(SeekFrom::Start(xlogoff as u64)).await?; file }; - file.write_all(buf)?; + file.write_all(buf).await?; if xlogoff + buf.len() == self.wal_seg_size { // If we reached the end of a WAL segment, flush and close it. - self.fdatasync_file(&mut file)?; + self.fdatasync_file(&mut file).await?; // Rename partial file to completed file let (wal_file_path, wal_file_partial_path) = wal_file_paths(&self.timeline_dir, segno, self.wal_seg_size)?; - fs::rename(wal_file_partial_path, wal_file_path)?; + fs::rename(wal_file_partial_path, wal_file_path).await?; } else { // otherwise, file can be reused later self.file = Some(file); @@ -255,11 +253,11 @@ impl PhysicalStorage { /// be flushed separately later. /// /// Updates `write_lsn`. - fn write_exact(&mut self, pos: Lsn, mut buf: &[u8]) -> Result<()> { + async fn write_exact(&mut self, pos: Lsn, mut buf: &[u8]) -> Result<()> { if self.write_lsn != pos { // need to flush the file before discarding it if let Some(mut file) = self.file.take() { - self.fdatasync_file(&mut file)?; + self.fdatasync_file(&mut file).await?; } self.write_lsn = pos; @@ -277,7 +275,8 @@ impl PhysicalStorage { buf.len() }; - self.write_in_segment(segno, xlogoff, &buf[..bytes_write])?; + self.write_in_segment(segno, xlogoff, &buf[..bytes_write]) + .await?; self.write_lsn += bytes_write as u64; buf = &buf[bytes_write..]; } @@ -286,6 +285,7 @@ impl PhysicalStorage { } } +#[async_trait::async_trait] impl Storage for PhysicalStorage { /// flush_lsn returns LSN of last durably stored WAL record. fn flush_lsn(&self) -> Lsn { @@ -293,7 +293,7 @@ impl Storage for PhysicalStorage { } /// Write WAL to disk. 
- fn write_wal(&mut self, startpos: Lsn, buf: &[u8]) -> Result<()> { + async fn write_wal(&mut self, startpos: Lsn, buf: &[u8]) -> Result<()> { // Disallow any non-sequential writes, which can result in gaps or overwrites. // If we need to move the pointer, use truncate_wal() instead. if self.write_lsn > startpos { @@ -311,7 +311,7 @@ impl Storage for PhysicalStorage { ); } - let write_seconds = time_io_closure(|| self.write_exact(startpos, buf))?; + let write_seconds = time_io_closure(self.write_exact(startpos, buf)).await?; // WAL is written, updating write metrics self.metrics.observe_write_seconds(write_seconds); self.metrics.observe_write_bytes(buf.len()); @@ -340,14 +340,14 @@ impl Storage for PhysicalStorage { Ok(()) } - fn flush_wal(&mut self) -> Result<()> { + async fn flush_wal(&mut self) -> Result<()> { if self.flush_record_lsn == self.write_record_lsn { // no need to do extra flush return Ok(()); } if let Some(mut unflushed_file) = self.file.take() { - self.fdatasync_file(&mut unflushed_file)?; + self.fdatasync_file(&mut unflushed_file).await?; self.file = Some(unflushed_file); } else { // We have unflushed data (write_lsn != flush_lsn), but no file. @@ -369,7 +369,7 @@ impl Storage for PhysicalStorage { /// Truncate written WAL by removing all WAL segments after the given LSN. /// end_pos must point to the end of the WAL record. - fn truncate_wal(&mut self, end_pos: Lsn) -> Result<()> { + async fn truncate_wal(&mut self, end_pos: Lsn) -> Result<()> { // Streaming must not create a hole, so truncate cannot be called on non-written lsn if self.write_lsn != Lsn(0) && end_pos > self.write_lsn { bail!( @@ -379,29 +379,35 @@ impl Storage for PhysicalStorage { ); } + // Quick exit if nothing to do to avoid writing up to 16 MiB of zeros on + // disk (this happens on each connect). + if end_pos == self.write_lsn { + return Ok(()); + } + // Close previously opened file, if any if let Some(mut unflushed_file) = self.file.take() { - self.fdatasync_file(&mut unflushed_file)?; + self.fdatasync_file(&mut unflushed_file).await?; } let xlogoff = end_pos.segment_offset(self.wal_seg_size); let segno = end_pos.segment_number(self.wal_seg_size); // Remove all segments after the given LSN. 
- remove_segments_from_disk(&self.timeline_dir, self.wal_seg_size, |x| x > segno)?; + remove_segments_from_disk(&self.timeline_dir, self.wal_seg_size, |x| x > segno).await?; - let (mut file, is_partial) = self.open_or_create(segno)?; + let (mut file, is_partial) = self.open_or_create(segno).await?; // Fill end with zeroes - file.seek(SeekFrom::Start(xlogoff as u64))?; - write_zeroes(&mut file, self.wal_seg_size - xlogoff)?; - self.fdatasync_file(&mut file)?; + file.seek(SeekFrom::Start(xlogoff as u64)).await?; + write_zeroes(&mut file, self.wal_seg_size - xlogoff).await?; + self.fdatasync_file(&mut file).await?; if !is_partial { // Make segment partial once again let (wal_file_path, wal_file_partial_path) = wal_file_paths(&self.timeline_dir, segno, self.wal_seg_size)?; - fs::rename(wal_file_path, wal_file_partial_path)?; + fs::rename(wal_file_path, wal_file_partial_path).await?; } // Update LSNs @@ -411,11 +417,11 @@ impl Storage for PhysicalStorage { Ok(()) } - fn remove_up_to(&self) -> Box Result<()>> { + fn remove_up_to(&self, segno_up_to: XLogSegNo) -> BoxFuture<'static, anyhow::Result<()>> { let timeline_dir = self.timeline_dir.clone(); let wal_seg_size = self.wal_seg_size; - Box::new(move |segno_up_to: XLogSegNo| { - remove_segments_from_disk(&timeline_dir, wal_seg_size, |x| x <= segno_up_to) + Box::pin(async move { + remove_segments_from_disk(&timeline_dir, wal_seg_size, |x| x <= segno_up_to).await }) } @@ -430,7 +436,7 @@ impl Storage for PhysicalStorage { } /// Remove all WAL segments in timeline_dir that match the given predicate. -fn remove_segments_from_disk( +async fn remove_segments_from_disk( timeline_dir: &Path, wal_seg_size: usize, remove_predicate: impl Fn(XLogSegNo) -> bool, @@ -439,8 +445,8 @@ fn remove_segments_from_disk( let mut min_removed = u64::MAX; let mut max_removed = u64::MIN; - for entry in fs::read_dir(timeline_dir)? { - let entry = entry?; + let mut entries = fs::read_dir(timeline_dir).await?; + while let Some(entry) = entries.next_entry().await? { let entry_path = entry.path(); let fname = entry_path.file_name().unwrap(); @@ -451,7 +457,7 @@ fn remove_segments_from_disk( } let (segno, _) = XLogFromFileName(fname_str, wal_seg_size); if remove_predicate(segno) { - remove_file(entry_path)?; + remove_file(entry_path).await?; n_removed += 1; min_removed = min(min_removed, segno); max_removed = max(max_removed, segno); @@ -683,12 +689,12 @@ impl WalReader { const ZERO_BLOCK: &[u8] = &[0u8; XLOG_BLCKSZ]; /// Helper for filling file with zeroes. -fn write_zeroes(file: &mut File, mut count: usize) -> Result<()> { +async fn write_zeroes(file: &mut File, mut count: usize) -> Result<()> { while count >= XLOG_BLCKSZ { - file.write_all(ZERO_BLOCK)?; + file.write_all(ZERO_BLOCK).await?; count -= XLOG_BLCKSZ; } - file.write_all(&ZERO_BLOCK[0..count])?; + file.write_all(&ZERO_BLOCK[0..count]).await?; Ok(()) } diff --git a/scripts/pr-comment-test-report.js b/scripts/comment-test-report.js similarity index 85% rename from scripts/pr-comment-test-report.js rename to scripts/comment-test-report.js index 3a7bba0daa..a7fd5b0bef 100644 --- a/scripts/pr-comment-test-report.js +++ b/scripts/comment-test-report.js @@ -1,5 +1,5 @@ // -// The script parses Allure reports and posts a comment with a summary of the test results to the PR. +// The script parses Allure reports and posts a comment with a summary of the test results to the PR or to the latest commit in the branch. // // The comment is updated on each run with the latest results. 
// @@ -7,7 +7,7 @@ // - uses: actions/github-script@v6 // with: // script: | -// const script = require("./scripts/pr-comment-test-report.js") +// const script = require("./scripts/comment-test-report.js") // await script({ // github, // context, @@ -35,8 +35,12 @@ class DefaultMap extends Map { module.exports = async ({ github, context, fetch, report }) => { // Marker to find the comment in the subsequent runs const startMarker = `` + // If we run the script in the PR or in the branch (main/release/...) + const isPullRequest = !!context.payload.pull_request + // Latest commit in PR or in the branch + const commitSha = isPullRequest ? context.payload.pull_request.head.sha : context.sha // Let users know that the comment is updated automatically - const autoupdateNotice = `
The comment gets automatically updated with the latest test results
${context.payload.pull_request.head.sha} at ${new Date().toISOString()} :recycle:
` + const autoupdateNotice = `
The comment gets automatically updated with the latest test results
${commitSha} at ${new Date().toISOString()} :recycle:
` // GitHub bot id taken from (https://api.github.com/users/github-actions[bot]) const githubActionsBotId = 41898282 // Commend body itself @@ -166,22 +170,39 @@ module.exports = async ({ github, context, fetch, report }) => { commentBody += autoupdateNotice - const { data: comments } = await github.rest.issues.listComments({ - issue_number: context.payload.number, + let createCommentFn, listCommentsFn, updateCommentFn, issueNumberOrSha + if (isPullRequest) { + createCommentFn = github.rest.issues.createComment + listCommentsFn = github.rest.issues.listComments + updateCommentFn = github.rest.issues.updateComment + issueNumberOrSha = { + issue_number: context.payload.number, + } + } else { + updateCommentFn = github.rest.repos.updateCommitComment + listCommentsFn = github.rest.repos.listCommentsForCommit + createCommentFn = github.rest.repos.createCommitComment + issueNumberOrSha = { + commit_sha: commitSha, + } + } + + const { data: comments } = await listCommentsFn({ + ...issueNumberOrSha, ...ownerRepoParams, }) const comment = comments.find(comment => comment.user.id === githubActionsBotId && comment.body.startsWith(startMarker)) if (comment) { - await github.rest.issues.updateComment({ + await updateCommentFn({ comment_id: comment.id, body: commentBody, ...ownerRepoParams, }) } else { - await github.rest.issues.createComment({ - issue_number: context.payload.number, + await createCommentFn({ body: commentBody, + ...issueNumberOrSha, ...ownerRepoParams, }) } diff --git a/scripts/coverage b/scripts/coverage index 1dc92e57cc..52a69c93b9 100755 --- a/scripts/coverage +++ b/scripts/coverage @@ -156,7 +156,9 @@ class LLVM: profdata: Path, objects: List[str], sources: List[str], - demangler: Optional[Path] = None) -> None: + demangler: Optional[Path] = None, + output_file: Optional[Path] = None, + ) -> None: cwd = self.cargo.cwd objects = list(intersperse('-object', objects)) @@ -180,14 +182,18 @@ class LLVM: *objects, *sources, ] - subprocess.check_call(cmd, cwd=cwd) + if output_file is not None: + with output_file.open('w') as outfile: + subprocess.check_call(cmd, cwd=cwd, stdout=outfile) + else: + subprocess.check_call(cmd, cwd=cwd) def cov_report(self, **kwargs) -> None: self._cov(subcommand='report', **kwargs) - def cov_export(self, *, kind: str, **kwargs) -> None: + def cov_export(self, *, kind: str, output_file: Optional[Path], **kwargs) -> None: extras = (f'-format={kind}', ) - self._cov(subcommand='export', *extras, **kwargs) + self._cov(subcommand='export', *extras, output_file=output_file, **kwargs) def cov_show(self, *, kind: str, output_dir: Optional[Path] = None, **kwargs) -> None: extras = [f'-format={kind}'] @@ -283,9 +289,12 @@ class TextReport(Report): self.llvm.cov_show(kind='text', **self._common_kwargs()) +@dataclass class LcovReport(Report): + output_file: Path + def generate(self) -> None: - self.llvm.cov_export(kind='lcov', **self._common_kwargs()) + self.llvm.cov_export(kind='lcov', output_file=self.output_file, **self._common_kwargs()) @dataclass @@ -475,7 +484,7 @@ class State: 'text': lambda: TextReport(**params), 'lcov': - lambda: LcovReport(**params), + lambda: LcovReport(**params, output_file=self.report_dir / 'lcov.info'), 'summary': lambda: SummaryReport(**params), 'github': diff --git a/scripts/export_import_between_pageservers.py b/scripts/export_import_between_pageservers.py index 4292c981a9..d95878b341 100755 --- a/scripts/export_import_between_pageservers.py +++ b/scripts/export_import_between_pageservers.py @@ -162,7 +162,7 @@ class PgProtocol: Returns 
psycopg2's connection object. This method passes all extra params to connstr. """ - conn = psycopg2.connect(**self.conn_options(**kwargs)) + conn: PgConnection = psycopg2.connect(**self.conn_options(**kwargs)) # WARNING: this setting affects *all* tests! conn.autocommit = autocommit @@ -535,8 +535,8 @@ def export_timeline( def main(args: argparse.Namespace): - # any psql version will do here. use current DEFAULT_PG_VERSION = 14 - psql_path = str(Path(args.pg_distrib_dir) / "v14" / "bin" / "psql") + # any psql version will do here. use current DEFAULT_PG_VERSION = 15 + psql_path = str(Path(args.pg_distrib_dir) / "v15" / "bin" / "psql") old_pageserver_host = args.old_pageserver_host new_pageserver_host = args.new_pageserver_host diff --git a/scripts/ingest_perf_test_result.py b/scripts/ingest_perf_test_result.py index 7f2af290a2..1bfc907def 100644 --- a/scripts/ingest_perf_test_result.py +++ b/scripts/ingest_perf_test_result.py @@ -35,7 +35,7 @@ def get_connection_cursor(): connstr = os.getenv("DATABASE_URL") if not connstr: err("DATABASE_URL environment variable is not set") - with psycopg2.connect(connstr) as conn: + with psycopg2.connect(connstr, connect_timeout=30) as conn: with conn.cursor() as cur: yield cur diff --git a/storage_broker/src/lib.rs b/storage_broker/src/lib.rs index 8441aaf625..3f6fa35cbe 100644 --- a/storage_broker/src/lib.rs +++ b/storage_broker/src/lib.rs @@ -32,6 +32,7 @@ pub const DEFAULT_LISTEN_ADDR: &str = "127.0.0.1:50051"; pub const DEFAULT_ENDPOINT: &str = const_format::formatcp!("http://{DEFAULT_LISTEN_ADDR}"); pub const DEFAULT_KEEPALIVE_INTERVAL: &str = "5000 ms"; +pub const DEFAULT_CONNECT_TIMEOUT: Duration = Duration::from_millis(5000); // BrokerServiceClient charged with tonic provided Channel transport; helps to // avoid depending on tonic directly in user crates. @@ -40,6 +41,9 @@ pub type BrokerClientChannel = BrokerServiceClient; // Create connection object configured to run TLS if schema starts with https:// // and plain text otherwise. Connection is lazy, only endpoint sanity is // validated here. +// +// NB: this function is not async, but still must be run on a tokio runtime thread +// because that's a requirement of tonic_endpoint.connect_lazy()'s Channel::new call. pub fn connect(endpoint: U, keepalive_interval: Duration) -> anyhow::Result where U: std::convert::TryInto, @@ -55,7 +59,8 @@ where } tonic_endpoint = tonic_endpoint .http2_keep_alive_interval(keepalive_interval) - .keep_alive_while_idle(true); + .keep_alive_while_idle(true) + .connect_timeout(DEFAULT_CONNECT_TIMEOUT); // keep_alive_timeout is 20s by default on both client and server side let channel = tonic_endpoint.connect_lazy(); Ok(BrokerClientChannel::new(channel)) diff --git a/test_runner/fixtures/compare_fixtures.py b/test_runner/fixtures/compare_fixtures.py index f0d9ce4af2..a10ef70aa2 100644 --- a/test_runner/fixtures/compare_fixtures.py +++ b/test_runner/fixtures/compare_fixtures.py @@ -312,6 +312,6 @@ def neon_with_baseline(request: FixtureRequest) -> PgCompare: implementation-specific logic is widely useful across multiple tests, it might make sense to add methods to the PgCompare class. 
""" - fixture = request.getfixturevalue(request.param) # type: ignore + fixture = request.getfixturevalue(request.param) assert isinstance(fixture, PgCompare), f"test error: fixture {fixture} is not PgCompare" return fixture diff --git a/test_runner/fixtures/metrics.py b/test_runner/fixtures/metrics.py index 0e958ddd06..b4c237cfa6 100644 --- a/test_runner/fixtures/metrics.py +++ b/test_runner/fixtures/metrics.py @@ -65,12 +65,19 @@ PAGESERVER_PER_TENANT_METRICS: Tuple[str, ...] = ( "pageserver_getpage_reconstruct_seconds_bucket", "pageserver_getpage_reconstruct_seconds_count", "pageserver_getpage_reconstruct_seconds_sum", + "pageserver_getpage_get_reconstruct_data_seconds_bucket", + "pageserver_getpage_get_reconstruct_data_seconds_count", + "pageserver_getpage_get_reconstruct_data_seconds_sum", "pageserver_io_operations_bytes_total", "pageserver_io_operations_seconds_bucket", "pageserver_io_operations_seconds_count", "pageserver_io_operations_seconds_sum", "pageserver_last_record_lsn", "pageserver_materialized_cache_hits_total", + "pageserver_materialized_cache_hits_direct_total", + "pageserver_read_num_fs_layers_bucket", + "pageserver_read_num_fs_layers_count", + "pageserver_read_num_fs_layers_sum", "pageserver_smgr_query_seconds_bucket", "pageserver_smgr_query_seconds_count", "pageserver_smgr_query_seconds_sum", diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index bde91e6783..a8610e24df 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -26,7 +26,7 @@ from typing import Any, Dict, Iterator, List, Optional, Tuple, Type, Union, cast from urllib.parse import urlparse import asyncpg -import backoff # type: ignore +import backoff import boto3 import jwt import psycopg2 @@ -354,7 +354,7 @@ class PgProtocol: Returns psycopg2's connection object. This method passes all extra params to connstr. """ - conn = psycopg2.connect(**self.conn_options(**kwargs)) + conn: PgConnection = psycopg2.connect(**self.conn_options(**kwargs)) # WARNING: this setting affects *all* tests! conn.autocommit = autocommit @@ -629,7 +629,7 @@ class NeonEnvBuilder: assert self.env is not None, "environment is not already initialized, call init() first" self.env.start() - def init_start(self) -> NeonEnv: + def init_start(self, initial_tenant_conf: Optional[Dict[str, str]] = None) -> NeonEnv: env = self.init_configs() self.start() @@ -638,7 +638,9 @@ class NeonEnvBuilder: log.info( f"Services started, creating initial tenant {env.initial_tenant} and its initial timeline" ) - initial_tenant, initial_timeline = env.neon_cli.create_tenant(tenant_id=env.initial_tenant) + initial_tenant, initial_timeline = env.neon_cli.create_tenant( + tenant_id=env.initial_tenant, conf=initial_tenant_conf + ) env.initial_timeline = initial_timeline log.info(f"Initial timeline {initial_tenant}/{initial_timeline} created successfully") @@ -661,6 +663,8 @@ class NeonEnvBuilder: else: raise RuntimeError(f"Unknown storage type: {remote_storage_kind}") + self.remote_storage_kind = remote_storage_kind + def enable_local_fs_remote_storage(self, force_enable: bool = True): """ Sets up the pageserver to use the local fs at the `test_dir/local_fs_remote_storage` path. 
@@ -1444,11 +1448,12 @@ class NeonCli(AbstractNeonCli): def endpoint_create( self, branch_name: str, + pg_port: int, + http_port: int, endpoint_id: Optional[str] = None, tenant_id: Optional[TenantId] = None, hot_standby: bool = False, lsn: Optional[Lsn] = None, - port: Optional[int] = None, ) -> "subprocess.CompletedProcess[str]": args = [ "endpoint", @@ -1462,8 +1467,10 @@ class NeonCli(AbstractNeonCli): ] if lsn is not None: args.extend(["--lsn", str(lsn)]) - if port is not None: - args.extend(["--port", str(port)]) + if pg_port is not None: + args.extend(["--pg-port", str(pg_port)]) + if http_port is not None: + args.extend(["--http-port", str(http_port)]) if endpoint_id is not None: args.append(endpoint_id) if hot_standby: @@ -1476,9 +1483,11 @@ class NeonCli(AbstractNeonCli): def endpoint_start( self, endpoint_id: str, + pg_port: int, + http_port: int, + safekeepers: Optional[List[int]] = None, tenant_id: Optional[TenantId] = None, lsn: Optional[Lsn] = None, - port: Optional[int] = None, ) -> "subprocess.CompletedProcess[str]": args = [ "endpoint", @@ -1490,8 +1499,10 @@ class NeonCli(AbstractNeonCli): ] if lsn is not None: args.append(f"--lsn={lsn}") - if port is not None: - args.append(f"--port={port}") + args.extend(["--pg-port", str(pg_port)]) + args.extend(["--http-port", str(http_port)]) + if safekeepers is not None: + args.extend(["--safekeepers", (",".join(map(str, safekeepers)))]) if endpoint_id is not None: args.append(endpoint_id) @@ -1583,13 +1594,11 @@ class NeonPageserver(PgProtocol): ".*serving compute connection task.*exited with error: Postgres connection error.*", ".*serving compute connection task.*exited with error: Connection reset by peer.*", ".*serving compute connection task.*exited with error: Postgres query error.*", - ".*Connection aborted: connection error: error communicating with the server: Broken pipe.*", - ".*Connection aborted: connection error: error communicating with the server: Transport endpoint is not connected.*", - ".*Connection aborted: connection error: error communicating with the server: Connection reset by peer.*", + ".*Connection aborted: error communicating with the server: Transport endpoint is not connected.*", # FIXME: replication patch for tokio_postgres regards any but CopyDone/CopyData message in CopyBoth stream as unexpected - ".*Connection aborted: connection error: unexpected message from server*", + ".*Connection aborted: unexpected message from server*", ".*kill_and_wait_impl.*: wait successful.*", - ".*Replication stream finished: db error:.*ending streaming to Some*", + ".*: db error:.*ending streaming to Some.*", ".*query handler for 'pagestream.*failed: Broken pipe.*", # pageserver notices compute shut down ".*query handler for 'pagestream.*failed: Connection reset by peer.*", # pageserver notices compute shut down # safekeeper connection can fail with this, in the window between timeline creation @@ -1603,24 +1612,25 @@ class NeonPageserver(PgProtocol): # https://github.com/neondatabase/neon/issues/2442 ".*could not remove ephemeral file.*No such file or directory.*", # FIXME: These need investigation - ".*gc_loop.*Failed to get a tenant .* Tenant .* not found.*", - ".*compaction_loop.*Failed to get a tenant .* Tenant .* not found.*", ".*manual_gc.*is_shutdown_requested\\(\\) called in an unexpected task or thread.*", ".*tenant_list: timeline is not found in remote index while it is present in the tenants registry.*", ".*Removing intermediate uninit mark file.*", - # FIXME: known race condition in TaskHandle: 
https://github.com/neondatabase/neon/issues/2885 - ".*sender is dropped while join handle is still alive.*", # Tenant::delete_timeline() can cause any of the four following errors. # FIXME: we shouldn't be considering it an error: https://github.com/neondatabase/neon/issues/2946 ".*could not flush frozen layer.*queue is in state Stopped", # when schedule layer upload fails because queued got closed before compaction got killed ".*wait for layer upload ops to complete.*", # .*Caused by:.*wait_completion aborted because upload queue was stopped ".*gc_loop.*Gc failed, retrying in.*timeline is Stopping", # When gc checks timeline state after acquiring layer_removal_cs + ".*gc_loop.*Gc failed, retrying in.*: Cannot run GC iteration on inactive tenant", # Tenant::gc precondition ".*compaction_loop.*Compaction failed, retrying in.*timeline is Stopping", # When compaction checks timeline state after acquiring layer_removal_cs ".*query handler for 'pagestream.*failed: Timeline .* was not found", # postgres reconnects while timeline_delete doesn't hold the tenant's timelines.lock() ".*query handler for 'pagestream.*failed: Timeline .* is not active", # timeline delete in progress ".*task iteration took longer than the configured period.*", # this is until #3501 ".*Compaction failed, retrying in [^:]+: Cannot run compaction iteration on inactive tenant", + # these can happen anytime we do compactions from background task and shutdown pageserver + r".*ERROR.*ancestor timeline \S+ is being stopped", + # this is expected given our collaborative shutdown approach for the UploadQueue + ".*Compaction failed, retrying in .*: queue is in state Stopped.*", ] def start( @@ -1688,6 +1698,9 @@ class NeonPageserver(PgProtocol): else: errors.append(line) + for error in errors: + log.info(f"not allowed error: {error.strip()}") + assert not errors def log_contains(self, pattern: str) -> Optional[str]: @@ -2280,17 +2293,24 @@ class Endpoint(PgProtocol): """An object representing a Postgres compute endpoint managed by the control plane.""" def __init__( - self, env: NeonEnv, tenant_id: TenantId, port: int, check_stop_result: bool = True + self, + env: NeonEnv, + tenant_id: TenantId, + pg_port: int, + http_port: int, + check_stop_result: bool = True, ): - super().__init__(host="localhost", port=port, user="cloud_admin", dbname="postgres") + super().__init__(host="localhost", port=pg_port, user="cloud_admin", dbname="postgres") self.env = env self.running = False self.branch_name: Optional[str] = None # dubious self.endpoint_id: Optional[str] = None # dubious, see asserts below self.pgdata_dir: Optional[str] = None # Path to computenode PGDATA self.tenant_id = tenant_id - self.port = port + self.pg_port = pg_port + self.http_port = http_port self.check_stop_result = check_stop_result + self.active_safekeepers: List[int] = list(map(lambda sk: sk.id, env.safekeepers)) # path to conf is /endpoints//pgdata/postgresql.conf def create( @@ -2320,7 +2340,8 @@ class Endpoint(PgProtocol): tenant_id=self.tenant_id, lsn=lsn, hot_standby=hot_standby, - port=self.port, + pg_port=self.pg_port, + http_port=self.http_port, ) path = Path("endpoints") / self.endpoint_id / "pgdata" self.pgdata_dir = os.path.join(self.env.repo_dir, path) @@ -2345,7 +2366,13 @@ class Endpoint(PgProtocol): log.info(f"Starting postgres endpoint {self.endpoint_id}") - self.env.neon_cli.endpoint_start(self.endpoint_id, tenant_id=self.tenant_id, port=self.port) + self.env.neon_cli.endpoint_start( + self.endpoint_id, + pg_port=self.pg_port, + 
http_port=self.http_port, + tenant_id=self.tenant_id, + safekeepers=self.active_safekeepers, + ) self.running = True return self @@ -2369,32 +2396,8 @@ class Endpoint(PgProtocol): return os.path.join(self.pg_data_dir_path(), "pg_twophase") def config_file_path(self) -> str: - """Path to postgresql.conf""" - return os.path.join(self.pg_data_dir_path(), "postgresql.conf") - - def adjust_for_safekeepers(self, safekeepers: str) -> "Endpoint": - """ - Adjust instance config for working with wal acceptors instead of - pageserver (pre-configured by CLI) directly. - """ - - # TODO: reuse config() - with open(self.config_file_path(), "r") as f: - cfg_lines = f.readlines() - with open(self.config_file_path(), "w") as f: - for cfg_line in cfg_lines: - # walproposer uses different application_name - if ( - "synchronous_standby_names" in cfg_line - or - # don't repeat safekeepers/wal_acceptors multiple times - "neon.safekeepers" in cfg_line - ): - continue - f.write(cfg_line) - f.write("synchronous_standby_names = 'walproposer'\n") - f.write("neon.safekeepers = '{}'\n".format(safekeepers)) - return self + """Path to the postgresql.conf in the endpoint directory (not the one in pgdata)""" + return os.path.join(self.endpoint_path(), "postgresql.conf") def config(self, lines: List[str]) -> "Endpoint": """ @@ -2499,7 +2502,8 @@ class EndpointFactory: ep = Endpoint( self.env, tenant_id=tenant_id or self.env.initial_tenant, - port=self.env.port_distributor.get_port(), + pg_port=self.env.port_distributor.get_port(), + http_port=self.env.port_distributor.get_port(), ) self.num_instances += 1 self.endpoints.append(ep) @@ -2524,7 +2528,8 @@ class EndpointFactory: ep = Endpoint( self.env, tenant_id=tenant_id or self.env.initial_tenant, - port=self.env.port_distributor.get_port(), + pg_port=self.env.port_distributor.get_port(), + http_port=self.env.port_distributor.get_port(), ) if endpoint_id is None: @@ -2907,6 +2912,7 @@ SKIP_FILES = frozenset( "pg_internal.init", "pg.log", "zenith.signal", + "pg_hba.conf", "postgresql.conf", "postmaster.opts", "postmaster.pid", diff --git a/test_runner/fixtures/pageserver/http.py b/test_runner/fixtures/pageserver/http.py index 2b8271958f..f258a3a24d 100644 --- a/test_runner/fixtures/pageserver/http.py +++ b/test_runner/fixtures/pageserver/http.py @@ -1,5 +1,6 @@ from __future__ import annotations +import json import time from collections import defaultdict from dataclasses import dataclass @@ -109,6 +110,10 @@ class PageserverHttpClient(requests.Session): if auth_token is not None: self.headers["Authorization"] = f"Bearer {auth_token}" + @property + def base_url(self) -> str: + return f"http://localhost:{self.port}" + def verbose_error(self, res: requests.Response): try: res.raise_for_status() @@ -157,7 +162,7 @@ class PageserverHttpClient(requests.Session): res = self.post( f"http://localhost:{self.port}/v1/tenant", json={ - "new_tenant_id": str(new_tenant_id) if new_tenant_id else None, + "new_tenant_id": str(new_tenant_id), **(conf or {}), }, ) @@ -168,8 +173,22 @@ class PageserverHttpClient(requests.Session): assert isinstance(new_tenant_id, str) return TenantId(new_tenant_id) - def tenant_attach(self, tenant_id: TenantId): - res = self.post(f"http://localhost:{self.port}/v1/tenant/{tenant_id}/attach") + def tenant_attach( + self, tenant_id: TenantId, config: None | Dict[str, Any] = None, config_null: bool = False + ): + if config_null: + assert config is None + body = "null" + else: + # null-config is prohibited by the API + if config is None: + config = {} + body = 
json.dumps({"config": config}) + res = self.post( + f"http://localhost:{self.port}/v1/tenant/{tenant_id}/attach", + data=body, + headers={"Content-Type": "application/json"}, + ) self.verbose_error(res) def tenant_detach(self, tenant_id: TenantId, detach_ignored=False): @@ -274,13 +293,13 @@ class PageserverHttpClient(requests.Session): self, pg_version: PgVersion, tenant_id: TenantId, - new_timeline_id: Optional[TimelineId] = None, + new_timeline_id: TimelineId, ancestor_timeline_id: Optional[TimelineId] = None, ancestor_start_lsn: Optional[Lsn] = None, **kwargs, ) -> Dict[Any, Any]: body: Dict[str, Any] = { - "new_timeline_id": str(new_timeline_id) if new_timeline_id else None, + "new_timeline_id": str(new_timeline_id), "ancestor_start_lsn": str(ancestor_start_lsn) if ancestor_start_lsn else None, "ancestor_timeline_id": str(ancestor_timeline_id) if ancestor_timeline_id else None, } diff --git a/test_runner/fixtures/pageserver/utils.py b/test_runner/fixtures/pageserver/utils.py index c558387413..83880abc77 100644 --- a/test_runner/fixtures/pageserver/utils.py +++ b/test_runner/fixtures/pageserver/utils.py @@ -1,8 +1,8 @@ import time -from typing import Optional +from typing import Any, Dict, Optional from fixtures.log_helper import log -from fixtures.pageserver.http import PageserverHttpClient +from fixtures.pageserver.http import PageserverApiException, PageserverHttpClient from fixtures.types import Lsn, TenantId, TimelineId @@ -72,7 +72,7 @@ def wait_until_tenant_state( expected_state: str, iterations: int, period: float = 1.0, -) -> bool: +) -> Dict[str, Any]: """ Does not use `wait_until` for debugging purposes """ @@ -81,7 +81,7 @@ def wait_until_tenant_state( tenant = pageserver_http.tenant_status(tenant_id=tenant_id) log.debug(f"Tenant {tenant_id} data: {tenant}") if tenant["state"]["slug"] == expected_state: - return True + return tenant except Exception as e: log.debug(f"Tenant {tenant_id} state retrieval failure: {e}") @@ -92,6 +92,41 @@ def wait_until_tenant_state( ) +def wait_until_timeline_state( + pageserver_http: PageserverHttpClient, + tenant_id: TenantId, + timeline_id: TimelineId, + expected_state: str, + iterations: int, + period: float = 1.0, +) -> Dict[str, Any]: + """ + Does not use `wait_until` for debugging purposes + """ + for i in range(iterations): + try: + timeline = pageserver_http.timeline_detail(tenant_id=tenant_id, timeline_id=timeline_id) + log.debug(f"Timeline {tenant_id}/{timeline_id} data: {timeline}") + if isinstance(timeline["state"], str): + if timeline["state"] == expected_state: + return timeline + elif isinstance(timeline, Dict): + if timeline["state"].get(expected_state): + return timeline + + except Exception as e: + log.debug(f"Timeline {tenant_id}/{timeline_id} state retrieval failure: {e}") + + if i == iterations - 1: + # do not sleep last time, we already know that we failed + break + time.sleep(period) + + raise Exception( + f"Timeline {tenant_id}/{timeline_id} did not become {expected_state} within {iterations * period} seconds" + ) + + def wait_until_tenant_active( pageserver_http: PageserverHttpClient, tenant_id: TenantId, @@ -156,3 +191,21 @@ def wait_for_upload_queue_empty( if all(m.value == 0 for m in tl): return time.sleep(0.2) + + +def assert_timeline_detail_404( + pageserver_http: PageserverHttpClient, + tenant_id: TenantId, + timeline_id: TimelineId, +): + """Asserts that timeline_detail returns 404, or dumps the detail.""" + try: + data = pageserver_http.timeline_detail(tenant_id, timeline_id) + log.error(f"detail {data}") + 
except PageserverApiException as e: + log.error(e) + if e.status_code == 404: + return + else: + raise + raise Exception("detail succeeded (it should return 404)") diff --git a/test_runner/fixtures/pg_version.py b/test_runner/fixtures/pg_version.py index d67f088365..14ae88cc2c 100644 --- a/test_runner/fixtures/pg_version.py +++ b/test_runner/fixtures/pg_version.py @@ -27,6 +27,10 @@ class PgVersion(str, enum.Enum): def __repr__(self) -> str: return f"'{self.value}'" + # Make this explicit for Python 3.11 compatibility, which changes the behavior of enums + def __str__(self) -> str: + return self.value + # In GitHub workflows we use Postgres version with v-prefix (e.g. v14 instead of just 14), # sometime we need to do so in tests. @property @@ -78,11 +82,11 @@ def pytest_addoption(parser: Parser): @pytest.fixture(scope="session") def pg_version(request: FixtureRequest) -> Iterator[PgVersion]: if v := request.config.getoption("--pg-version"): - version, source = v, "from --pg-version commad-line argument" + version, source = v, "from --pg-version command-line argument" elif v := os.environ.get("DEFAULT_PG_VERSION"): version, source = PgVersion(v), "from DEFAULT_PG_VERSION environment variable" else: - version, source = DEFAULT_VERSION, "default verson" + version, source = DEFAULT_VERSION, "default version" log.info(f"pg_version is {version} ({source})") yield version diff --git a/test_runner/performance/test_dup_key.py b/test_runner/performance/test_dup_key.py index 81752ae740..60a4d91313 100644 --- a/test_runner/performance/test_dup_key.py +++ b/test_runner/performance/test_dup_key.py @@ -2,7 +2,7 @@ from contextlib import closing import pytest from fixtures.compare_fixtures import PgCompare -from pytest_lazyfixture import lazy_fixture # type: ignore +from pytest_lazyfixture import lazy_fixture @pytest.mark.parametrize( diff --git a/test_runner/performance/test_gc_feedback.py b/test_runner/performance/test_gc_feedback.py new file mode 100644 index 0000000000..f93b560d8e --- /dev/null +++ b/test_runner/performance/test_gc_feedback.py @@ -0,0 +1,76 @@ +import pytest +from fixtures.benchmark_fixture import MetricReport, NeonBenchmarker +from fixtures.log_helper import log +from fixtures.neon_fixtures import NeonEnvBuilder + + +@pytest.mark.timeout(10000) +def test_gc_feedback(neon_env_builder: NeonEnvBuilder, zenbenchmark: NeonBenchmarker): + """ + Test that GC is able to collect all old layers even if they form + "stairs" and there are fewer than three delta layers since the last image layer. + + Information about the image layers needed to collect old layers should + be propagated by GC to the compaction task, which should take it into account + when deciding which new image layers need to be created. 
+ """ + env = neon_env_builder.init_start() + client = env.pageserver.http_client() + + tenant_id, _ = env.neon_cli.create_tenant( + conf={ + # disable default GC and compaction + "gc_period": "1000 m", + "compaction_period": "0 s", + "gc_horizon": f"{1024 ** 2}", + "checkpoint_distance": f"{1024 ** 2}", + "compaction_target_size": f"{1024 ** 2}", + # set PITR interval to be small, so we can do GC + "pitr_interval": "10 s", + # "compaction_threshold": "3", + # "image_creation_threshold": "2", + } + ) + endpoint = env.endpoints.create_start("main", tenant_id=tenant_id) + timeline_id = endpoint.safe_psql("show neon.timeline_id")[0][0] + n_steps = 10 + n_update_iters = 100 + step_size = 10000 + with endpoint.cursor() as cur: + cur.execute("SET statement_timeout='1000s'") + cur.execute( + "CREATE TABLE t(step bigint, count bigint default 0, payload text default repeat(' ', 100)) with (fillfactor=50)" + ) + cur.execute("CREATE INDEX ON t(step)") + # In each step, we insert 'step_size' new rows, and update the newly inserted rows + # 'n_update_iters' times. This creates a lot of churn and generates lots of WAL at the end of the table, + # without modifying the earlier parts of the table. + for step in range(n_steps): + cur.execute(f"INSERT INTO t (step) SELECT {step} FROM generate_series(1, {step_size})") + for i in range(n_update_iters): + cur.execute(f"UPDATE t set count=count+1 where step = {step}") + cur.execute("vacuum t") + + # cur.execute("select pg_table_size('t')") + # logical_size = cur.fetchone()[0] + logical_size = client.timeline_detail(tenant_id, timeline_id)["current_logical_size"] + log.info(f"Logical storage size {logical_size}") + + client.timeline_checkpoint(tenant_id, timeline_id) + + # Do compaction and GC + client.timeline_gc(tenant_id, timeline_id, 0) + client.timeline_compact(tenant_id, timeline_id) + # One more iteration to check that no excessive image layers are generated + client.timeline_gc(tenant_id, timeline_id, 0) + client.timeline_compact(tenant_id, timeline_id) + + physical_size = client.timeline_detail(tenant_id, timeline_id)["current_physical_size"] + log.info(f"Physical storage size {physical_size}") + + MB = 1024 * 1024 + zenbenchmark.record("logical_size", logical_size // MB, "Mb", MetricReport.LOWER_IS_BETTER) + zenbenchmark.record("physical_size", physical_size // MB, "Mb", MetricReport.LOWER_IS_BETTER) + zenbenchmark.record( + "physical/logical ratio", physical_size / logical_size, "", MetricReport.LOWER_IS_BETTER + ) diff --git a/test_runner/performance/test_hot_page.py b/test_runner/performance/test_hot_page.py index aad6ee667a..d9785dd87e 100644 --- a/test_runner/performance/test_hot_page.py +++ b/test_runner/performance/test_hot_page.py @@ -2,7 +2,7 @@ from contextlib import closing import pytest from fixtures.compare_fixtures import PgCompare -from pytest_lazyfixture import lazy_fixture # type: ignore +from pytest_lazyfixture import lazy_fixture @pytest.mark.parametrize( diff --git a/test_runner/performance/test_hot_table.py b/test_runner/performance/test_hot_table.py index 2f519e152c..a133aca8ce 100644 --- a/test_runner/performance/test_hot_table.py +++ b/test_runner/performance/test_hot_table.py @@ -2,7 +2,7 @@ from contextlib import closing import pytest from fixtures.compare_fixtures import PgCompare -from pytest_lazyfixture import lazy_fixture # type: ignore +from pytest_lazyfixture import lazy_fixture @pytest.mark.parametrize( diff --git a/test_runner/performance/test_seqscans.py b/test_runner/performance/test_seqscans.py index 
bd84724405..409b30a909 100644 --- a/test_runner/performance/test_seqscans.py +++ b/test_runner/performance/test_seqscans.py @@ -6,7 +6,7 @@ import pytest from fixtures.benchmark_fixture import MetricReport from fixtures.compare_fixtures import PgCompare from fixtures.log_helper import log -from pytest_lazyfixture import lazy_fixture # type: ignore +from pytest_lazyfixture import lazy_fixture @pytest.mark.parametrize( diff --git a/test_runner/performance/test_startup.py b/test_runner/performance/test_startup.py index fa2e058491..9c45088d62 100644 --- a/test_runner/performance/test_startup.py +++ b/test_runner/performance/test_startup.py @@ -1,10 +1,63 @@ from contextlib import closing import pytest -from fixtures.benchmark_fixture import NeonBenchmarker +import requests +from fixtures.benchmark_fixture import MetricReport, NeonBenchmarker from fixtures.neon_fixtures import NeonEnvBuilder +# Just start and measure duration. +# +# This test runs pretty quickly and can be informative when used in combination +# with emulated network delay. Some useful delay commands: +# +# 1. Add 2msec delay to all localhost traffic +# `sudo tc qdisc add dev lo root handle 1:0 netem delay 2msec` +# +# 2. Test that it works (you should see 4ms ping) +# `ping localhost` +# +# 3. Revert back to normal +# `sudo tc qdisc del dev lo root netem` +# +# NOTE this test might not represent the real startup time because the basebackup +# for a large database might be larger if there's a lot of transaction metadata, +# or safekeepers might need more syncing, or there might be more operations to +# apply during the config step, like more users, databases, or extensions. By default +# we load extensions 'neon,pg_stat_statements,timescaledb,pg_cron', but in this +# test we only load neon. +def test_startup_simple(neon_env_builder: NeonEnvBuilder, zenbenchmark: NeonBenchmarker): + neon_env_builder.num_safekeepers = 3 + env = neon_env_builder.init_start() + + env.neon_cli.create_branch("test_startup") + + # We do two iterations so we can see if the second startup is faster. It should + # be because the compute node should already be configured with roles, databases, + # extensions, etc from the first run. + for i in range(2): + # Start + with zenbenchmark.record_duration(f"{i}_start_and_select"): + endpoint = env.endpoints.create_start("test_startup") + endpoint.safe_psql("select 1;") + + # Get metrics + metrics = requests.get(f"http://localhost:{endpoint.http_port}/metrics.json").json() + durations = { + "wait_for_spec_ms": f"{i}_wait_for_spec", + "sync_safekeepers_ms": f"{i}_sync_safekeepers", + "basebackup_ms": f"{i}_basebackup", + "config_ms": f"{i}_config", + "total_startup_ms": f"{i}_total_startup", + } + for key, name in durations.items(): + value = metrics[key] + zenbenchmark.record(name, value, "ms", report=MetricReport.LOWER_IS_BETTER) + + # Stop so we can restart + endpoint.stop() + + # This test sometimes runs for longer than the global 5 minute timeout. 
@pytest.mark.timeout(600) def test_startup(neon_env_builder: NeonEnvBuilder, zenbenchmark: NeonBenchmarker): diff --git a/test_runner/regress/test_attach_tenant_config.py b/test_runner/regress/test_attach_tenant_config.py new file mode 100644 index 0000000000..4df5ae18d6 --- /dev/null +++ b/test_runner/regress/test_attach_tenant_config.py @@ -0,0 +1,206 @@ +from dataclasses import dataclass +from typing import Generator, Optional + +import pytest +from fixtures.neon_fixtures import ( + LocalFsStorage, + NeonEnv, + NeonEnvBuilder, + RemoteStorageKind, +) +from fixtures.pageserver.http import PageserverApiException, TenantConfig +from fixtures.types import TenantId +from fixtures.utils import wait_until + + +@pytest.fixture +def positive_env(neon_env_builder: NeonEnvBuilder) -> NeonEnv: + neon_env_builder.enable_remote_storage( + remote_storage_kind=RemoteStorageKind.LOCAL_FS, + test_name="test_attach_tenant_config", + ) + env = neon_env_builder.init_start() + + # eviction might be the first one after an attach to access the layers + env.pageserver.allowed_errors.append( + ".*unexpectedly on-demand downloading remote layer remote.* for task kind Eviction" + ) + assert isinstance(env.remote_storage, LocalFsStorage) + return env + + +@dataclass +class NegativeTests: + neon_env: NeonEnv + tenant_id: TenantId + config_pre_detach: TenantConfig + + +@pytest.fixture +def negative_env(neon_env_builder: NeonEnvBuilder) -> Generator[NegativeTests, None, None]: + neon_env_builder.enable_remote_storage( + remote_storage_kind=RemoteStorageKind.LOCAL_FS, + test_name="test_attach_tenant_config", + ) + env = neon_env_builder.init_start() + assert isinstance(env.remote_storage, LocalFsStorage) + + ps_http = env.pageserver.http_client() + (tenant_id, _) = env.neon_cli.create_tenant() + assert ps_http.tenant_config(tenant_id).tenant_specific_overrides == {} + config_pre_detach = ps_http.tenant_config(tenant_id) + assert tenant_id in [TenantId(t["id"]) for t in ps_http.tenant_list()] + ps_http.tenant_detach(tenant_id) + assert tenant_id not in [TenantId(t["id"]) for t in ps_http.tenant_list()] + + yield NegativeTests(env, tenant_id, config_pre_detach) + + assert tenant_id not in [ + TenantId(t["id"]) for t in ps_http.tenant_list() + ], "tenant should not be attached after negative test" + + env.pageserver.allowed_errors.append(".*Error processing HTTP request: Bad request") + + def log_contains_bad_request(): + env.pageserver.log_contains(".*Error processing HTTP request: Bad request") + + wait_until(50, 0.1, log_contains_bad_request) + + +def test_null_body(negative_env: NegativeTests): + """ + If we send `null` in the body, the request should be rejected with status 400. + """ + env = negative_env.neon_env + tenant_id = negative_env.tenant_id + ps_http = env.pageserver.http_client() + + res = ps_http.post( + f"{ps_http.base_url}/v1/tenant/{tenant_id}/attach", + data=b"null", + headers={"Content-Type": "application/json"}, + ) + assert res.status_code == 400 + + +def test_null_config(negative_env: NegativeTests): + """ + If the `config` field is `null`, the request should be rejected with status 400. 
+ """ + + env = negative_env.neon_env + tenant_id = negative_env.tenant_id + ps_http = env.pageserver.http_client() + + res = ps_http.post( + f"{ps_http.base_url}/v1/tenant/{tenant_id}/attach", + data=b'{"config": null}', + headers={"Content-Type": "application/json"}, + ) + assert res.status_code == 400 + + +def test_config_with_unknown_keys_is_bad_request(negative_env: NegativeTests): + """ + If we send a config with unknown keys, the request should be rejected with status 400. + """ + + env = negative_env.neon_env + tenant_id = negative_env.tenant_id + ps_http = env.pageserver.http_client() + + config_with_unknown_keys = { + "compaction_period": "1h", + "this_key_does_not_exist": "some value", + } + + with pytest.raises(PageserverApiException) as e: + ps_http.tenant_attach(tenant_id, config=config_with_unknown_keys) + assert e.type == PageserverApiException + assert e.value.status_code == 400 + + +@pytest.mark.parametrize("content_type", [None, "application/json"]) +def test_empty_body(positive_env: NeonEnv, content_type: Optional[str]): + """ + For backwards-compatiblity: if we send an empty body, + the request should be accepted and the config should be the default config. + """ + env = positive_env + ps_http = env.pageserver.http_client() + (tenant_id, _) = env.neon_cli.create_tenant() + assert ps_http.tenant_config(tenant_id).tenant_specific_overrides == {} + config_pre_detach = ps_http.tenant_config(tenant_id) + assert tenant_id in [TenantId(t["id"]) for t in ps_http.tenant_list()] + ps_http.tenant_detach(tenant_id) + assert tenant_id not in [TenantId(t["id"]) for t in ps_http.tenant_list()] + + ps_http.post( + f"{ps_http.base_url}/v1/tenant/{tenant_id}/attach", + data=b"", + headers=None if content_type else {"Content-Type": "application/json"}, + ).raise_for_status() + + assert ps_http.tenant_config(tenant_id).tenant_specific_overrides == {} + assert ps_http.tenant_config(tenant_id).effective_config == config_pre_detach.effective_config + + +def test_fully_custom_config(positive_env: NeonEnv): + """ + If we send a valid config in the body, the request should be accepted and the config should be applied. 
+ """ + env = positive_env + + fully_custom_config = { + "compaction_period": "1h", + "compaction_threshold": 13, + "compaction_target_size": 1048576, + "checkpoint_distance": 10000, + "checkpoint_timeout": "13m", + "eviction_policy": { + "kind": "LayerAccessThreshold", + "period": "20s", + "threshold": "23h", + }, + "evictions_low_residence_duration_metric_threshold": "2days", + "gc_feedback": True, + "gc_horizon": 23 * (1024 * 1024), + "gc_period": "2h 13m", + "image_creation_threshold": 7, + "pitr_interval": "1m", + "lagging_wal_timeout": "23m", + "max_lsn_wal_lag": 230000, + "min_resident_size_override": 23, + "trace_read_requests": True, + "walreceiver_connect_timeout": "13m", + } + + ps_http = env.pageserver.http_client() + + initial_tenant_config = ps_http.tenant_config(env.initial_tenant) + assert initial_tenant_config.tenant_specific_overrides == {} + assert set(initial_tenant_config.effective_config.keys()) == set( + fully_custom_config.keys() + ), "ensure we cover all config options" + + (tenant_id, _) = env.neon_cli.create_tenant() + ps_http.set_tenant_config(tenant_id, fully_custom_config) + our_tenant_config = ps_http.tenant_config(tenant_id) + assert our_tenant_config.tenant_specific_overrides == fully_custom_config + assert set(our_tenant_config.effective_config.keys()) == set( + fully_custom_config.keys() + ), "ensure we cover all config options" + assert { + k: initial_tenant_config.effective_config[k] != our_tenant_config.effective_config[k] + for k in fully_custom_config.keys() + } == { + k: True for k in fully_custom_config.keys() + }, "ensure our custom config has different values than the default config for all config options, so we know we overrode everything" + + ps_http.tenant_detach(tenant_id) + ps_http.tenant_attach(tenant_id, config=fully_custom_config) + + assert ps_http.tenant_config(tenant_id).tenant_specific_overrides == fully_custom_config + assert set(ps_http.tenant_config(tenant_id).effective_config.keys()) == set( + fully_custom_config.keys() + ), "ensure we cover all config options" diff --git a/test_runner/regress/test_auth.py b/test_runner/regress/test_auth.py index 38a1cedf33..a66b40a8ec 100644 --- a/test_runner/regress/test_auth.py +++ b/test_runner/regress/test_auth.py @@ -3,7 +3,7 @@ from contextlib import closing import pytest from fixtures.neon_fixtures import NeonEnvBuilder, PgProtocol from fixtures.pageserver.http import PageserverApiException -from fixtures.types import TenantId +from fixtures.types import TenantId, TimelineId def test_pageserver_auth(neon_env_builder: NeonEnvBuilder): @@ -25,21 +25,19 @@ def test_pageserver_auth(neon_env_builder: NeonEnvBuilder): ps.safe_psql("set FOO", password=tenant_token) ps.safe_psql("set FOO", password=pageserver_token) - new_timeline_id = env.neon_cli.create_branch( - "test_pageserver_auth", tenant_id=env.initial_tenant - ) - # tenant can create branches tenant_http_client.timeline_create( pg_version=env.pg_version, tenant_id=env.initial_tenant, - ancestor_timeline_id=new_timeline_id, + new_timeline_id=TimelineId.generate(), + ancestor_timeline_id=env.initial_timeline, ) # console can create branches for tenant pageserver_http_client.timeline_create( pg_version=env.pg_version, tenant_id=env.initial_tenant, - ancestor_timeline_id=new_timeline_id, + new_timeline_id=TimelineId.generate(), + ancestor_timeline_id=env.initial_timeline, ) # fail to create branch using token with different tenant_id @@ -49,7 +47,8 @@ def test_pageserver_auth(neon_env_builder: NeonEnvBuilder): 
invalid_tenant_http_client.timeline_create( pg_version=env.pg_version, tenant_id=env.initial_tenant, - ancestor_timeline_id=new_timeline_id, + new_timeline_id=TimelineId.generate(), + ancestor_timeline_id=env.initial_timeline, ) # create tenant using management token diff --git a/test_runner/regress/test_broken_timeline.py b/test_runner/regress/test_broken_timeline.py index fb592bfbc3..0fb3b4f262 100644 --- a/test_runner/regress/test_broken_timeline.py +++ b/test_runner/regress/test_broken_timeline.py @@ -20,7 +20,7 @@ def test_broken_timeline(neon_env_builder: NeonEnvBuilder): ".*is not active. Current state: Broken.*", ".*will not become active. Current state: Broken.*", ".*failed to load metadata.*", - ".*could not load tenant.*load local timeline.*", + ".*load failed.*load local timeline.*", ] ) diff --git a/test_runner/regress/test_compatibility.py b/test_runner/regress/test_compatibility.py index fe8dc293c1..2635dbd93c 100644 --- a/test_runner/regress/test_compatibility.py +++ b/test_runner/regress/test_compatibility.py @@ -383,6 +383,9 @@ def check_neon_works( cli_target = NeonCli(config_target) # And the current binaries to launch computes + snapshot_config["neon_distrib_dir"] = str(neon_current_binpath) + with (snapshot_config_toml).open("w") as f: + toml.dump(snapshot_config, f) config_current = copy.copy(config) config_current.neon_binpath = neon_current_binpath cli_current = NeonCli(config_current) @@ -391,7 +394,8 @@ def check_neon_works( request.addfinalizer(lambda: cli_target.raw_cli(["stop"])) pg_port = port_distributor.get_port() - cli_current.endpoint_start("main", port=pg_port) + http_port = port_distributor.get_port() + cli_current.endpoint_start("main", pg_port=pg_port, http_port=http_port) request.addfinalizer(lambda: cli_current.endpoint_stop("main")) connstr = f"host=127.0.0.1 port={pg_port} user=cloud_admin dbname=postgres" diff --git a/test_runner/regress/test_compute_ctl.py b/test_runner/regress/test_compute_ctl.py deleted file mode 100644 index d72ffe078d..0000000000 --- a/test_runner/regress/test_compute_ctl.py +++ /dev/null @@ -1,253 +0,0 @@ -import os -from pathlib import Path -from subprocess import TimeoutExpired - -from fixtures.log_helper import log -from fixtures.neon_fixtures import ComputeCtl, NeonEnvBuilder, PgBin - - -# Test that compute_ctl works and prints "--sync-safekeepers" logs. -def test_sync_safekeepers_logs(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin): - neon_env_builder.num_safekeepers = 3 - env = neon_env_builder.init_start() - ctl = ComputeCtl(env) - - env.neon_cli.create_branch("test_compute_ctl", "main") - endpoint = env.endpoints.create_start("test_compute_ctl") - endpoint.safe_psql("CREATE TABLE t(key int primary key, value text)") - - with open(endpoint.config_file_path(), "r") as f: - cfg_lines = f.readlines() - cfg_map = {} - for line in cfg_lines: - if "=" in line: - k, v = line.split("=") - cfg_map[k] = v.strip("\n '\"") - log.info(f"postgres config: {cfg_map}") - pgdata = endpoint.pg_data_dir_path() - pg_bin_path = os.path.join(pg_bin.pg_bin_path, "postgres") - - endpoint.stop_and_destroy() - - # stop_and_destroy removes the whole endpoint directory. Recreate it. 
- Path(pgdata).mkdir(parents=True) - - spec = ( - """ -{ - "format_version": 1.0, - - "timestamp": "2021-05-23T18:25:43.511Z", - "operation_uuid": "0f657b36-4b0f-4a2d-9c2e-1dcd615e7d8b", - - "cluster": { - "cluster_id": "test-cluster-42", - "name": "Neon Test", - "state": "restarted", - "roles": [ - ], - "databases": [ - ], - "settings": [ - { - "name": "fsync", - "value": "off", - "vartype": "bool" - }, - { - "name": "wal_level", - "value": "replica", - "vartype": "enum" - }, - { - "name": "neon.safekeepers", - "value": """ - + f'"{cfg_map["neon.safekeepers"]}"' - + """, - "vartype": "string" - }, - { - "name": "wal_log_hints", - "value": "on", - "vartype": "bool" - }, - { - "name": "log_connections", - "value": "on", - "vartype": "bool" - }, - { - "name": "shared_buffers", - "value": "32768", - "vartype": "integer" - }, - { - "name": "port", - "value": """ - + f'"{cfg_map["port"]}"' - + """, - "vartype": "integer" - }, - { - "name": "max_connections", - "value": "100", - "vartype": "integer" - }, - { - "name": "max_wal_senders", - "value": "10", - "vartype": "integer" - }, - { - "name": "listen_addresses", - "value": "0.0.0.0", - "vartype": "string" - }, - { - "name": "wal_sender_timeout", - "value": "0", - "vartype": "integer" - }, - { - "name": "password_encryption", - "value": "md5", - "vartype": "enum" - }, - { - "name": "maintenance_work_mem", - "value": "65536", - "vartype": "integer" - }, - { - "name": "max_parallel_workers", - "value": "8", - "vartype": "integer" - }, - { - "name": "max_worker_processes", - "value": "8", - "vartype": "integer" - }, - { - "name": "neon.tenant_id", - "value": """ - + f'"{cfg_map["neon.tenant_id"]}"' - + """, - "vartype": "string" - }, - { - "name": "max_replication_slots", - "value": "10", - "vartype": "integer" - }, - { - "name": "neon.timeline_id", - "value": """ - + f'"{cfg_map["neon.timeline_id"]}"' - + """, - "vartype": "string" - }, - { - "name": "shared_preload_libraries", - "value": "neon", - "vartype": "string" - }, - { - "name": "synchronous_standby_names", - "value": "walproposer", - "vartype": "string" - }, - { - "name": "neon.pageserver_connstring", - "value": """ - + f'"{cfg_map["neon.pageserver_connstring"]}"' - + """, - "vartype": "string" - } - ] - }, - "delta_operations": [ - ] -} -""" - ) - - ps_connstr = cfg_map["neon.pageserver_connstring"] - log.info(f"ps_connstr: {ps_connstr}, pgdata: {pgdata}") - - # run compute_ctl and wait for 10s - try: - ctl.raw_cli( - [ - "--connstr", - "postgres://invalid/", - "--pgdata", - pgdata, - "--spec", - spec, - "--pgbin", - pg_bin_path, - ], - timeout=10, - ) - except TimeoutExpired as exc: - ctl_logs = (exc.stderr or b"").decode("utf-8") - log.info(f"compute_ctl stderr:\n{ctl_logs}") - - with ExternalProcessManager(Path(pgdata) / "postmaster.pid"): - start = "starting safekeepers syncing" - end = "safekeepers synced at LSN" - start_pos = ctl_logs.index(start) - assert start_pos != -1 - end_pos = ctl_logs.index(end, start_pos) - assert end_pos != -1 - sync_safekeepers_logs = ctl_logs[start_pos : end_pos + len(end)] - log.info("sync_safekeepers_logs:\n" + sync_safekeepers_logs) - - # assert that --sync-safekeepers logs are present in the output - assert "connecting with node" in sync_safekeepers_logs - assert "connected with node" in sync_safekeepers_logs - assert "proposer connected to quorum (2)" in sync_safekeepers_logs - assert "got votes from majority (2)" in sync_safekeepers_logs - assert "sending elected msg to node" in sync_safekeepers_logs - - -class ExternalProcessManager: - """ - 
Context manager that kills a process with a pid file on exit. - """ - - def __init__(self, pid_file: Path): - self.path = pid_file - self.pid_file = open(pid_file, "r") - self.pid = int(self.pid_file.readline().strip()) - - def __enter__(self): - return self - - def leave_alive(self): - self.pid_file.close() - - def __exit__(self, _type, _value, _traceback): - import signal - import time - - if self.pid_file.closed: - return - - with self.pid_file: - try: - os.kill(self.pid, signal.SIGTERM) - except OSError as e: - if not self.path.is_file(): - return - log.info(f"Failed to kill {self.pid}, but the pidfile remains: {e}") - return - - for _ in range(20): - if not self.path.is_file(): - return - time.sleep(0.2) - - log.info("Process failed to stop after SIGTERM: {self.pid}") - os.kill(self.pid, signal.SIGKILL) diff --git a/test_runner/regress/test_ddl_forwarding.py b/test_runner/regress/test_ddl_forwarding.py new file mode 100644 index 0000000000..6bfa8fdbe7 --- /dev/null +++ b/test_runner/regress/test_ddl_forwarding.py @@ -0,0 +1,210 @@ +from types import TracebackType +from typing import Any, Dict, List, Optional, Tuple, Type + +import psycopg2 +import pytest +from fixtures.log_helper import log +from fixtures.neon_fixtures import VanillaPostgres +from pytest_httpserver import HTTPServer +from werkzeug.wrappers.request import Request +from werkzeug.wrappers.response import Response + + +def handle_db(dbs, roles, operation): + if operation["op"] == "set": + if "old_name" in operation and operation["old_name"] in dbs: + dbs[operation["name"]] = dbs[operation["old_name"]] + dbs.pop(operation["old_name"]) + if "owner" in operation: + dbs[operation["name"]] = operation["owner"] + elif operation["op"] == "del": + dbs.pop(operation["name"]) + else: + raise ValueError("Invalid op") + + +def handle_role(dbs, roles, operation): + if operation["op"] == "set": + if "old_name" in operation and operation["old_name"] in roles: + roles[operation["name"]] = roles[operation["old_name"]] + roles.pop(operation["old_name"]) + for db, owner in dbs.items(): + if owner == operation["old_name"]: + dbs[db] = operation["name"] + if "password" in operation: + roles[operation["name"]] = operation["password"] + elif operation["op"] == "del": + if "old_name" in operation: + roles.pop(operation["old_name"]) + roles.pop(operation["name"]) + else: + raise ValueError("Invalid op") + + +fail = False + + +def ddl_forward_handler(request: Request, dbs: Dict[str, str], roles: Dict[str, str]) -> Response: + log.info(f"Received request with data {request.get_data(as_text=True)}") + if fail: + log.info("FAILING") + return Response(status=500, response="Failed just cuz") + if request.json is None: + log.info("Received invalid JSON") + return Response(status=400) + json = request.json + # Handle roles first + if "roles" in json: + for operation in json["roles"]: + handle_role(dbs, roles, operation) + if "dbs" in json: + for operation in json["dbs"]: + handle_db(dbs, roles, operation) + return Response(status=200) + + +class DdlForwardingContext: + def __init__(self, httpserver: HTTPServer, vanilla_pg: VanillaPostgres, host: str, port: int): + self.server = httpserver + self.pg = vanilla_pg + self.host = host + self.port = port + self.dbs: Dict[str, str] = {} + self.roles: Dict[str, str] = {} + endpoint = "/management/api/v2/roles_and_databases" + ddl_url = f"http://{host}:{port}{endpoint}" + self.pg.configure( + [ + f"neon.console_url={ddl_url}", + "shared_preload_libraries = 'neon'", + ] + ) + log.info(f"Listening on {ddl_url}") + 
self.server.expect_request(endpoint, method="PATCH").respond_with_handler( + lambda request: ddl_forward_handler(request, self.dbs, self.roles) + ) + + def __enter__(self): + self.pg.start() + return self + + def __exit__( + self, + exc_type: Optional[Type[BaseException]], + exc: Optional[BaseException], + tb: Optional[TracebackType], + ): + self.pg.stop() + + def send(self, query: str) -> List[Tuple[Any, ...]]: + return self.pg.safe_psql(query) + + def wait(self, timeout=3): + self.server.wait(timeout=timeout) + + def send_and_wait(self, query: str, timeout=3) -> List[Tuple[Any, ...]]: + res = self.send(query) + self.wait(timeout=timeout) + return res + + +@pytest.fixture(scope="function") +def ddl( + httpserver: HTTPServer, vanilla_pg: VanillaPostgres, httpserver_listen_address: tuple[str, int] +): + (host, port) = httpserver_listen_address + with DdlForwardingContext(httpserver, vanilla_pg, host, port) as ddl: + yield ddl + + +def test_ddl_forwarding(ddl: DdlForwardingContext): + curr_user = ddl.send("SELECT current_user")[0][0] + log.info(f"Current user is {curr_user}") + ddl.send_and_wait("CREATE DATABASE bork") + assert ddl.dbs == {"bork": curr_user} + ddl.send_and_wait("CREATE ROLE volk WITH PASSWORD 'nu_zayats'") + ddl.send_and_wait("ALTER DATABASE bork RENAME TO nu_pogodi") + assert ddl.dbs == {"nu_pogodi": curr_user} + ddl.send_and_wait("ALTER DATABASE nu_pogodi OWNER TO volk") + assert ddl.dbs == {"nu_pogodi": "volk"} + ddl.send_and_wait("DROP DATABASE nu_pogodi") + assert ddl.dbs == {} + ddl.send_and_wait("DROP ROLE volk") + assert ddl.roles == {} + + ddl.send_and_wait("CREATE ROLE tarzan WITH PASSWORD 'of_the_apes'") + assert ddl.roles == {"tarzan": "of_the_apes"} + ddl.send_and_wait("DROP ROLE tarzan") + assert ddl.roles == {} + ddl.send_and_wait("CREATE ROLE tarzan WITH PASSWORD 'of_the_apes'") + assert ddl.roles == {"tarzan": "of_the_apes"} + ddl.send_and_wait("ALTER ROLE tarzan WITH PASSWORD 'jungle_man'") + assert ddl.roles == {"tarzan": "jungle_man"} + ddl.send_and_wait("ALTER ROLE tarzan RENAME TO mowgli") + assert ddl.roles == {"mowgli": "jungle_man"} + ddl.send_and_wait("DROP ROLE mowgli") + assert ddl.roles == {} + + conn = ddl.pg.connect() + cur = conn.cursor() + + cur.execute("BEGIN") + cur.execute("CREATE ROLE bork WITH PASSWORD 'cork'") + cur.execute("COMMIT") + ddl.wait() + assert ddl.roles == {"bork": "cork"} + cur.execute("BEGIN") + cur.execute("CREATE ROLE stork WITH PASSWORD 'pork'") + cur.execute("ABORT") + ddl.wait() + assert ("stork", "pork") not in ddl.roles.items() + cur.execute("BEGIN") + cur.execute("ALTER ROLE bork WITH PASSWORD 'pork'") + cur.execute("ALTER ROLE bork RENAME TO stork") + cur.execute("COMMIT") + ddl.wait() + assert ddl.roles == {"stork": "pork"} + cur.execute("BEGIN") + cur.execute("CREATE ROLE dork WITH PASSWORD 'york'") + cur.execute("SAVEPOINT point") + cur.execute("ALTER ROLE dork WITH PASSWORD 'zork'") + cur.execute("ALTER ROLE dork RENAME TO fork") + cur.execute("ROLLBACK TO SAVEPOINT point") + cur.execute("ALTER ROLE dork WITH PASSWORD 'fork'") + cur.execute("ALTER ROLE dork RENAME TO zork") + cur.execute("RELEASE SAVEPOINT point") + cur.execute("COMMIT") + ddl.wait() + assert ddl.roles == {"stork": "pork", "zork": "fork"} + + cur.execute("DROP ROLE stork") + cur.execute("DROP ROLE zork") + ddl.wait() + assert ddl.roles == {} + + cur.execute("CREATE ROLE bork WITH PASSWORD 'dork'") + cur.execute("CREATE ROLE stork WITH PASSWORD 'cork'") + cur.execute("BEGIN") + cur.execute("DROP ROLE bork") + cur.execute("ALTER ROLE stork 
RENAME TO bork") + cur.execute("COMMIT") + ddl.wait() + assert ddl.roles == {"bork": "cork"} + + cur.execute("DROP ROLE bork") + ddl.wait() + assert ddl.roles == {} + + cur.execute("CREATE ROLE bork WITH PASSWORD 'dork'") + cur.execute("CREATE DATABASE stork WITH OWNER=bork") + cur.execute("ALTER ROLE bork RENAME TO cork") + ddl.wait() + assert ddl.dbs == {"stork": "cork"} + + with pytest.raises(psycopg2.InternalError): + global fail + fail = True + cur.execute("CREATE DATABASE failure WITH OWNER=cork") + ddl.wait() + + conn.close() diff --git a/test_runner/regress/test_disk_usage_eviction.py b/test_runner/regress/test_disk_usage_eviction.py index ab67518092..0ec023b9e1 100644 --- a/test_runner/regress/test_disk_usage_eviction.py +++ b/test_runner/regress/test_disk_usage_eviction.py @@ -110,6 +110,12 @@ class EvictionEnv: overrides=( "--pageserver-config-override=disk_usage_based_eviction=" + enc.dump_inline_table(disk_usage_config).replace("\n", " "), + # Disk usage based eviction runs as a background task. + # But pageserver startup delays launch of background tasks for some time, to prioritize initial logical size calculations during startup. + # But, initial logical size calculation may not be triggered if safekeepers don't publish new broker messages. + # But, we only have a 10-second-timeout in this test. + # So, disable the delay for this test. + "--pageserver-config-override=background_task_maximum_delay='0s'", ), ) diff --git a/test_runner/regress/test_import.py b/test_runner/regress/test_import.py index 77030288f0..5c3948b027 100644 --- a/test_runner/regress/test_import.py +++ b/test_runner/regress/test_import.py @@ -79,6 +79,7 @@ def test_import_from_vanilla(test_output_dir, pg_bin, vanilla_pg, neon_env_build # Set up pageserver for import neon_env_builder.enable_local_fs_remote_storage() env = neon_env_builder.init_start() + client = env.pageserver.http_client() client.tenant_create(tenant) @@ -145,6 +146,11 @@ def test_import_from_vanilla(test_output_dir, pg_bin, vanilla_pg, neon_env_build ) # NOTE: delete can easily come before upload operations are completed + # https://github.com/neondatabase/neon/issues/4326 + env.pageserver.allowed_errors.append( + ".*files not bound to index_file.json, proceeding with their deletion.*" + ) + client.timeline_delete(tenant, timeline) # Importing correct backup works diff --git a/test_runner/regress/test_metric_collection.py b/test_runner/regress/test_metric_collection.py index 00ea77f2e7..12e695bcbd 100644 --- a/test_runner/regress/test_metric_collection.py +++ b/test_runner/regress/test_metric_collection.py @@ -228,7 +228,6 @@ def proxy_with_metric_collector( @pytest.mark.asyncio async def test_proxy_metric_collection( httpserver: HTTPServer, - httpserver_listen_address, proxy_with_metric_collector: NeonProxy, vanilla_pg: VanillaPostgres, ): diff --git a/test_runner/regress/test_neon_local_cli.py b/test_runner/regress/test_neon_local_cli.py index f6629c54f9..3314e7fbf6 100644 --- a/test_runner/regress/test_neon_local_cli.py +++ b/test_runner/regress/test_neon_local_cli.py @@ -9,11 +9,18 @@ def test_neon_cli_basics(neon_env_builder: NeonEnvBuilder, port_distributor: Por try: env.neon_cli.start() env.neon_cli.create_tenant(tenant_id=env.initial_tenant, set_default=True) - env.neon_cli.endpoint_start(endpoint_id="ep-main", port=port_distributor.get_port()) + + pg_port = port_distributor.get_port() + http_port = port_distributor.get_port() + env.neon_cli.endpoint_start( + endpoint_id="ep-basic-main", pg_port=pg_port, http_port=http_port + ) 
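+        # Note: every endpoint now needs both a Postgres port and an HTTP port; the
+        # NeonCli wrapper passes them as `--pg-port`/`--http-port` instead of the old
+        # single `--port` (see the endpoint_create/endpoint_start changes above).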
env.neon_cli.create_branch(new_branch_name="migration_check") + pg_port = port_distributor.get_port() + http_port = port_distributor.get_port() env.neon_cli.endpoint_start( - endpoint_id="ep-migration_check", port=port_distributor.get_port() + endpoint_id="ep-migration_check", pg_port=pg_port, http_port=http_port ) finally: env.neon_cli.stop() diff --git a/test_runner/regress/test_ondemand_download.py b/test_runner/regress/test_ondemand_download.py index 1414b4ed8e..c26ec76172 100644 --- a/test_runner/regress/test_ondemand_download.py +++ b/test_runner/regress/test_ondemand_download.py @@ -58,11 +58,8 @@ def test_ondemand_download_large_rel( ) ##### First start, insert secret data and upload it to the remote storage - env = neon_env_builder.init_start() - - # Override defaults, to create more layers - tenant, _ = env.neon_cli.create_tenant( - conf={ + env = neon_env_builder.init_start( + initial_tenant_conf={ # disable background GC "gc_period": "0s", "gc_horizon": f"{10 * 1024 ** 3}", # 10 GB @@ -75,7 +72,6 @@ def test_ondemand_download_large_rel( "compaction_period": "0s", } ) - env.initial_tenant = tenant endpoint = env.endpoints.create_start("main") diff --git a/test_runner/regress/test_pageserver_restarts_under_workload.py b/test_runner/regress/test_pageserver_restarts_under_workload.py index bc3f3f2be4..fc93dcffbb 100644 --- a/test_runner/regress/test_pageserver_restarts_under_workload.py +++ b/test_runner/regress/test_pageserver_restarts_under_workload.py @@ -17,12 +17,6 @@ def test_pageserver_restarts_under_worload(neon_simple_env: NeonEnv, pg_bin: PgB n_restarts = 10 scale = 10 - # the background task may complete the init task delay after finding an - # active tenant, but shutdown starts right before Tenant::gc_iteration - env.pageserver.allowed_errors.append( - r".*Gc failed, retrying in \S+: Cannot run GC iteration on inactive tenant" - ) - def run_pgbench(connstr: str): log.info(f"Start a pgbench workload on pg {connstr}") pg_bin.run_capture(["pgbench", "-i", f"-s{scale}", connstr]) diff --git a/test_runner/regress/test_proxy.py b/test_runner/regress/test_proxy.py index 6be3995714..ca19dc3fd0 100644 --- a/test_runner/regress/test_proxy.py +++ b/test_runner/regress/test_proxy.py @@ -225,3 +225,37 @@ def test_sql_over_http(static_proxy: NeonProxy): res = q("drop table t") assert res["command"] == "DROP" assert res["rowCount"] is None + + +def test_sql_over_http_output_options(static_proxy: NeonProxy): + static_proxy.safe_psql("create role http2 with login password 'http2' superuser") + + def q(sql: str, raw_text: bool, array_mode: bool, params: List[Any] = []) -> Any: + connstr = ( + f"postgresql://http2:http2@{static_proxy.domain}:{static_proxy.proxy_port}/postgres" + ) + response = requests.post( + f"https://{static_proxy.domain}:{static_proxy.external_http_port}/sql", + data=json.dumps({"query": sql, "params": params}), + headers={ + "Content-Type": "application/sql", + "Neon-Connection-String": connstr, + "Neon-Raw-Text-Output": "true" if raw_text else "false", + "Neon-Array-Mode": "true" if array_mode else "false", + }, + verify=str(static_proxy.test_output_dir / "proxy.crt"), + ) + assert response.status_code == 200 + return response.json() + + rows = q("select 1 as n, 'a' as s, '{1,2,3}'::int4[] as arr", False, False)["rows"] + assert rows == [{"arr": [1, 2, 3], "n": 1, "s": "a"}] + + rows = q("select 1 as n, 'a' as s, '{1,2,3}'::int4[] as arr", False, True)["rows"] + assert rows == [[1, "a", [1, 2, 3]]] + + rows = q("select 1 as n, 'a' as s, '{1,2,3}'::int4[] as arr", 
True, False)["rows"] + assert rows == [{"arr": "{1,2,3}", "n": "1", "s": "a"}] + + rows = q("select 1 as n, 'a' as s, '{1,2,3}'::int4[] as arr", True, True)["rows"] + assert rows == [["1", "a", "{1,2,3}"]] diff --git a/test_runner/regress/test_remote_storage.py b/test_runner/regress/test_remote_storage.py index 02f1aac99c..11ac9e2555 100644 --- a/test_runner/regress/test_remote_storage.py +++ b/test_runner/regress/test_remote_storage.py @@ -20,6 +20,7 @@ from fixtures.neon_fixtures import ( ) from fixtures.pageserver.http import PageserverApiException, PageserverHttpClient from fixtures.pageserver.utils import ( + assert_timeline_detail_404, wait_for_last_record_lsn, wait_for_upload, wait_until_tenant_active, @@ -140,14 +141,19 @@ def test_remote_storage_backup_and_restore( # This is before the failures injected by test_remote_failures, so it's a permanent error. pageserver_http.configure_failpoints(("storage-sync-list-remote-timelines", "return")) env.pageserver.allowed_errors.append( - ".*error attaching tenant: storage-sync-list-remote-timelines", + ".*attach failed.*: storage-sync-list-remote-timelines", ) # Attach it. This HTTP request will succeed and launch a # background task to load the tenant. In that background task, # listing the remote timelines will fail because of the failpoint, # and the tenant will be marked as Broken. client.tenant_attach(tenant_id) - wait_until_tenant_state(pageserver_http, tenant_id, "Broken", 15) + + tenant_info = wait_until_tenant_state(pageserver_http, tenant_id, "Broken", 15) + assert tenant_info["attachment_status"] == { + "slug": "failed", + "data": {"reason": "storage-sync-list-remote-timelines"}, + } # Ensure that even though the tenant is broken, we can't attach it again. with pytest.raises(Exception, match=f"tenant {tenant_id} already exists, state: Broken"): @@ -177,7 +183,7 @@ def test_remote_storage_backup_and_restore( wait_until_tenant_active( pageserver_http=client, tenant_id=tenant_id, - iterations=5, + iterations=10, # make it longer for real_s3 tests when unreliable wrapper is involved ) detail = client.timeline_detail(tenant_id, timeline_id) @@ -593,8 +599,23 @@ def test_timeline_deletion_with_files_stuck_in_upload_queue( ) client.timeline_delete(tenant_id, timeline_id) + env.pageserver.allowed_errors.append(f".*Timeline {tenant_id}/{timeline_id} was not found.*") + env.pageserver.allowed_errors.append( + ".*files not bound to index_file.json, proceeding with their deletion.*" + ) + + wait_until(2, 0.5, lambda: assert_timeline_detail_404(client, tenant_id, timeline_id)) + assert not timeline_path.exists() + # to please mypy + assert isinstance(env.remote_storage, LocalFsStorage) + remote_timeline_path = ( + env.remote_storage.root / "tenants" / str(tenant_id) / "timelines" / str(timeline_id) + ) + + assert not list(remote_timeline_path.iterdir()) + # timeline deletion should kill ongoing uploads, so, the metric will be gone assert get_queued_count(file_kind="index", op_kind="upload") is None @@ -693,15 +714,15 @@ def test_empty_branch_remote_storage_upload_on_restart( f".*POST.* path=/v1/tenant/{env.initial_tenant}/timeline.* request was dropped before completing" ) - # index upload is now hitting the failpoint, should not block the shutdown - env.pageserver.stop() + # index upload is now hitting the failpoint, it should block the shutdown + env.pageserver.stop(immediate=True) timeline_path = ( Path("tenants") / str(env.initial_tenant) / "timelines" / str(new_branch_timeline_id) ) local_metadata = env.repo_dir / timeline_path / 
"metadata" - assert local_metadata.is_file(), "timeout cancelled timeline branching, not the upload" + assert local_metadata.is_file() assert isinstance(env.remote_storage, LocalFsStorage) new_branch_on_remote_storage = env.remote_storage.root / timeline_path diff --git a/test_runner/regress/test_sni_router.py b/test_runner/regress/test_sni_router.py index 64cfd017e6..9b78e8287e 100644 --- a/test_runner/regress/test_sni_router.py +++ b/test_runner/regress/test_sni_router.py @@ -4,7 +4,7 @@ from pathlib import Path from types import TracebackType from typing import Optional, Type -import backoff # type: ignore +import backoff from fixtures.log_helper import log from fixtures.neon_fixtures import PgProtocol, PortDistributor, VanillaPostgres @@ -37,6 +37,7 @@ class PgSniRouter(PgProtocol): destination: str, tls_cert: Path, tls_key: Path, + test_output_dir: Path, ): # Must use a hostname rather than IP here, for SNI to work host = "localhost" @@ -49,6 +50,7 @@ class PgSniRouter(PgProtocol): self.tls_cert = tls_cert self.tls_key = tls_key self._popen: Optional[subprocess.Popen[bytes]] = None + self.test_output_dir = test_output_dir def start(self) -> "PgSniRouter": assert self._popen is None @@ -60,8 +62,12 @@ class PgSniRouter(PgProtocol): *["--destination", self.destination], ] - self._popen = subprocess.Popen(args) + router_log_path = self.test_output_dir / "pg_sni_router.log" + router_log = open(router_log_path, "w") + + self._popen = subprocess.Popen(args, stderr=router_log) self._wait_until_ready() + log.info(f"pg_sni_router started, log file: {router_log_path}") return self @backoff.on_exception(backoff.expo, OSError, max_time=10) @@ -121,6 +127,7 @@ def test_pg_sni_router( destination="localtest.me", tls_cert=test_output_dir / "router.crt", tls_key=test_output_dir / "router.key", + test_output_dir=test_output_dir, ) as router: router.start() diff --git a/test_runner/regress/test_tenant_conf.py b/test_runner/regress/test_tenant_conf.py index dc523364dc..7c80d86863 100644 --- a/test_runner/regress/test_tenant_conf.py +++ b/test_runner/regress/test_tenant_conf.py @@ -62,6 +62,7 @@ eviction_policy = { "kind" = "LayerAccessThreshold", period = "20s", threshold = log.info(f"show {env.initial_tenant}") pscur.execute(f"show {env.initial_tenant}") res = pscur.fetchone() + assert res is not None assert all( i in res.items() for i in { @@ -101,6 +102,7 @@ eviction_policy = { "kind" = "LayerAccessThreshold", period = "20s", threshold = pscur.execute(f"show {tenant}") res = pscur.fetchone() log.info(f"res: {res}") + assert res is not None assert all( i in res.items() for i in { @@ -163,6 +165,7 @@ eviction_policy = { "kind" = "LayerAccessThreshold", period = "20s", threshold = pscur.execute(f"show {tenant}") res = pscur.fetchone() log.info(f"after config res: {res}") + assert res is not None assert all( i in res.items() for i in { @@ -218,6 +221,7 @@ eviction_policy = { "kind" = "LayerAccessThreshold", period = "20s", threshold = pscur.execute(f"show {tenant}") res = pscur.fetchone() log.info(f"after restart res: {res}") + assert res is not None assert all( i in res.items() for i in { @@ -278,6 +282,7 @@ eviction_policy = { "kind" = "LayerAccessThreshold", period = "20s", threshold = pscur.execute(f"show {tenant}") res = pscur.fetchone() log.info(f"after restart res: {res}") + assert res is not None assert all( i in res.items() for i in { diff --git a/test_runner/regress/test_tenant_detach.py b/test_runner/regress/test_tenant_detach.py index 82664cff94..2a015d5d17 100644 --- 
a/test_runner/regress/test_tenant_detach.py +++ b/test_runner/regress/test_tenant_detach.py @@ -59,6 +59,13 @@ def test_tenant_reattach( # create new nenant tenant_id, timeline_id = env.neon_cli.create_tenant() + # Attempts to connect from compute to pageserver while the tenant is + # temporarily detached produces these errors in the pageserver log. + env.pageserver.allowed_errors.append(f".*Tenant {tenant_id} not found.*") + env.pageserver.allowed_errors.append( + f".*Tenant {tenant_id} will not become active\\. Current state: Stopping.*" + ) + with env.endpoints.create_start("main", tenant_id=tenant_id) as endpoint: with endpoint.cursor() as cur: cur.execute("CREATE TABLE t(key int primary key, value text)") @@ -223,13 +230,6 @@ def test_tenant_reattach_while_busy( ) env = neon_env_builder.init_start() - # Attempts to connect from compute to pageserver while the tenant is - # temporarily detached produces these errors in the pageserver log. - env.pageserver.allowed_errors.append(".*Tenant .* not found.*") - env.pageserver.allowed_errors.append( - ".*Tenant .* will not become active\\. Current state: Stopping.*" - ) - pageserver_http = env.pageserver.http_client() # create new nenant @@ -238,6 +238,13 @@ def test_tenant_reattach_while_busy( conf={"checkpoint_distance": "100000"} ) + # Attempts to connect from compute to pageserver while the tenant is + # temporarily detached produces these errors in the pageserver log. + env.pageserver.allowed_errors.append(f".*Tenant {tenant_id} not found.*") + env.pageserver.allowed_errors.append( + f".*Tenant {tenant_id} will not become active\\. Current state: Stopping.*" + ) + endpoint = env.endpoints.create_start("main", tenant_id=tenant_id) cur = endpoint.connect().cursor() @@ -275,6 +282,13 @@ def test_tenant_detach_smoke(neon_env_builder: NeonEnvBuilder): # create new nenant tenant_id, timeline_id = env.neon_cli.create_tenant() + # Attempts to connect from compute to pageserver while the tenant is + # temporarily detached produces these errors in the pageserver log. + env.pageserver.allowed_errors.append(f".*Tenant {tenant_id} not found.*") + env.pageserver.allowed_errors.append( + f".*Tenant {tenant_id} will not become active\\. Current state: Stopping.*" + ) + # assert tenant exists on disk assert (env.repo_dir / "tenants" / str(tenant_id)).exists() @@ -336,6 +350,13 @@ def test_tenant_detach_ignored_tenant(neon_simple_env: NeonEnv): # create a new tenant tenant_id, _ = env.neon_cli.create_tenant() + # Attempts to connect from compute to pageserver while the tenant is + # temporarily detached produces these errors in the pageserver log. + env.pageserver.allowed_errors.append(f".*Tenant {tenant_id} not found.*") + env.pageserver.allowed_errors.append( + f".*Tenant {tenant_id} will not become active\\. Current state: Stopping.*" + ) + # assert tenant exists on disk assert (env.repo_dir / "tenants" / str(tenant_id)).exists() @@ -385,6 +406,13 @@ def test_tenant_detach_regular_tenant(neon_simple_env: NeonEnv): # create a new tenant tenant_id, _ = env.neon_cli.create_tenant() + # Attempts to connect from compute to pageserver while the tenant is + # temporarily detached produces these errors in the pageserver log. + env.pageserver.allowed_errors.append(f".*Tenant {tenant_id} not found.*") + env.pageserver.allowed_errors.append( + f".*Tenant {tenant_id} will not become active\\. 
Current state: Stopping.*" + ) + # assert tenant exists on disk assert (env.repo_dir / "tenants" / str(tenant_id)).exists() @@ -399,6 +427,7 @@ def test_tenant_detach_regular_tenant(neon_simple_env: NeonEnv): log.info("detaching regular tenant with detach ignored flag") client.tenant_detach(tenant_id, True) + log.info("regular tenant detached without error") # check that nothing is left on disk for deleted tenant @@ -432,6 +461,13 @@ def test_detach_while_attaching( tenant_id = TenantId(endpoint.safe_psql("show neon.tenant_id")[0][0]) timeline_id = TimelineId(endpoint.safe_psql("show neon.timeline_id")[0][0]) + # Attempts to connect from compute to pageserver while the tenant is + # temporarily detached produces these errors in the pageserver log. + env.pageserver.allowed_errors.append(f".*Tenant {tenant_id} not found.*") + env.pageserver.allowed_errors.append( + f".*Tenant {tenant_id} will not become active\\. Current state: Stopping.*" + ) + # Create table, and insert some rows. Make it big enough that it doesn't fit in # shared_buffers, otherwise the SELECT after restart will just return answer # from shared_buffers without hitting the page server, which defeats the point @@ -496,7 +532,7 @@ def test_ignored_tenant_reattach( ): neon_env_builder.enable_remote_storage( remote_storage_kind=remote_storage_kind, - test_name="test_remote_storage_backup_and_restore", + test_name="test_ignored_tenant_reattach", ) env = neon_env_builder.init_start() pageserver_http = env.pageserver.http_client() @@ -577,6 +613,13 @@ def test_ignored_tenant_download_missing_layers( tenant_id = TenantId(endpoint.safe_psql("show neon.tenant_id")[0][0]) timeline_id = TimelineId(endpoint.safe_psql("show neon.timeline_id")[0][0]) + # Attempts to connect from compute to pageserver while the tenant is + # temporarily detached produces these errors in the pageserver log. + env.pageserver.allowed_errors.append(f".*Tenant {tenant_id} not found.*") + env.pageserver.allowed_errors.append( + f".*Tenant {tenant_id} will not become active\\. Current state: Stopping.*" + ) + data_id = 1 data_secret = "very secret secret" insert_test_data(pageserver_http, tenant_id, timeline_id, data_id, data_secret, endpoint) @@ -636,6 +679,13 @@ def test_ignored_tenant_stays_broken_without_metadata( tenant_id = TenantId(endpoint.safe_psql("show neon.tenant_id")[0][0]) timeline_id = TimelineId(endpoint.safe_psql("show neon.timeline_id")[0][0]) + # Attempts to connect from compute to pageserver while the tenant is + # temporarily detached produces these errors in the pageserver log. + env.pageserver.allowed_errors.append(f".*Tenant {tenant_id} not found.*") + env.pageserver.allowed_errors.append( + f".*Tenant {tenant_id} will not become active\\. 
Current state: Broken.*" + ) + # ignore the tenant and remove its metadata pageserver_http.tenant_ignore(tenant_id) tenant_timeline_dir = env.repo_dir / "tenants" / str(tenant_id) / "timelines" / str(timeline_id) @@ -647,7 +697,9 @@ def test_ignored_tenant_stays_broken_without_metadata( metadata_removed = True assert metadata_removed, f"Failed to find metadata file in {tenant_timeline_dir}" - env.pageserver.allowed_errors.append(".*could not load tenant .*?: failed to load metadata.*") + env.pageserver.allowed_errors.append( + f".*{tenant_id}.*: load failed.*: failed to load metadata.*" + ) # now, load it from the local files and expect it to be broken due to inability to load tenant files into memory pageserver_http.tenant_load(tenant_id=tenant_id) @@ -670,6 +722,13 @@ def test_load_attach_negatives( tenant_id = TenantId(endpoint.safe_psql("show neon.tenant_id")[0][0]) + # Attempts to connect from compute to pageserver while the tenant is + # temporarily detached produces these errors in the pageserver log. + env.pageserver.allowed_errors.append(f".*Tenant {tenant_id} not found.*") + env.pageserver.allowed_errors.append( + f".*Tenant {tenant_id} will not become active\\. Current state: Stopping.*" + ) + env.pageserver.allowed_errors.append(".*tenant .*? already exists, state:.*") with pytest.raises( expected_exception=PageserverApiException, @@ -712,6 +771,13 @@ def test_ignore_while_attaching( tenant_id = TenantId(endpoint.safe_psql("show neon.tenant_id")[0][0]) timeline_id = TimelineId(endpoint.safe_psql("show neon.timeline_id")[0][0]) + # Attempts to connect from compute to pageserver while the tenant is + # temporarily detached produces these errors in the pageserver log. + env.pageserver.allowed_errors.append(f".*Tenant {tenant_id} not found.*") + env.pageserver.allowed_errors.append( + f".*Tenant {tenant_id} will not become active\\. Current state: Stopping.*" + ) + data_id = 1 data_secret = "very secret secret" insert_test_data(pageserver_http, tenant_id, timeline_id, data_id, data_secret, endpoint) diff --git a/test_runner/regress/test_tenant_size.py b/test_runner/regress/test_tenant_size.py index 60ab268882..e9dcd1e5cd 100644 --- a/test_runner/regress/test_tenant_size.py +++ b/test_runner/regress/test_tenant_size.py @@ -318,7 +318,7 @@ def test_only_heads_within_horizon(neon_simple_env: NeonEnv, test_output_dir: Pa def test_single_branch_get_tenant_size_grows( - neon_env_builder: NeonEnvBuilder, test_output_dir: Path + neon_env_builder: NeonEnvBuilder, test_output_dir: Path, pg_version: PgVersion ): """ Operate on single branch reading the tenants size after each transaction. @@ -333,6 +333,13 @@ def test_single_branch_get_tenant_size_grows( # that there next_gc_cutoff could be smaller than initdb_lsn, which will # obviously lead to issues when calculating the size. gc_horizon = 0x38000 + + # it's a bit of a hack, but different versions of postgres have different + # amount of WAL generated for the same amount of data. so we need to + # adjust the gc_horizon accordingly. 
+ if pg_version == PgVersion.V14: + gc_horizon = 0x40000 + neon_env_builder.pageserver_config_override = f"tenant_config={{compaction_period='0s', gc_period='0s', pitr_interval='0sec', gc_horizon={gc_horizon}}}" env = neon_env_builder.init_start() diff --git a/test_runner/regress/test_tenants.py b/test_runner/regress/test_tenants.py index 6599fa7ba5..aef2df4932 100644 --- a/test_runner/regress/test_tenants.py +++ b/test_runner/regress/test_tenants.py @@ -22,6 +22,7 @@ from fixtures.neon_fixtures import ( available_remote_storages, ) from fixtures.types import Lsn, TenantId, TimelineId +from fixtures.utils import wait_until from prometheus_client.samples import Sample @@ -266,6 +267,7 @@ def test_pageserver_metrics_removed_after_detach( cur.execute("INSERT INTO t SELECT generate_series(1,100000), 'payload'") cur.execute("SELECT sum(key) FROM t") assert cur.fetchone() == (5000050000,) + endpoint.stop() def get_ps_metric_samples_for_tenant(tenant_id: TenantId) -> List[Sample]: ps_metrics = env.pageserver.http_client().get_metrics() @@ -308,9 +310,7 @@ def test_pageserver_with_empty_tenants( env.pageserver.allowed_errors.append( ".*marking .* as locally complete, while it doesnt exist in remote index.*" ) - env.pageserver.allowed_errors.append( - ".*could not load tenant.*Failed to list timelines directory.*" - ) + env.pageserver.allowed_errors.append(".*load failed.*list timelines directory.*") client = env.pageserver.http_client() @@ -341,9 +341,15 @@ def test_pageserver_with_empty_tenants( env.pageserver.start() client = env.pageserver.http_client() - tenants = client.tenant_list() - assert len(tenants) == 2 + def not_loading(): + tenants = client.tenant_list() + assert len(tenants) == 2 + assert all(t["state"]["slug"] != "Loading" for t in tenants) + + wait_until(10, 0.2, not_loading) + + tenants = client.tenant_list() [broken_tenant] = [t for t in tenants if t["id"] == str(tenant_without_timelines_dir)] assert ( @@ -355,7 +361,7 @@ def test_pageserver_with_empty_tenants( broken_tenant_status["state"]["slug"] == "Broken" ), f"Tenant {tenant_without_timelines_dir} without timelines dir should be broken" - assert env.pageserver.log_contains(".*Setting tenant as Broken state, reason:.*") + assert env.pageserver.log_contains(".*load failed, setting tenant state to Broken:.*") [loaded_tenant] = [t for t in tenants if t["id"] == str(tenant_with_empty_timelines)] assert ( diff --git a/test_runner/regress/test_timeline_delete.py b/test_runner/regress/test_timeline_delete.py index 7135b621cb..28b15d03ca 100644 --- a/test_runner/regress/test_timeline_delete.py +++ b/test_runner/regress/test_timeline_delete.py @@ -3,6 +3,7 @@ import queue import shutil import threading from pathlib import Path +from typing import Optional import pytest import requests @@ -11,13 +12,16 @@ from fixtures.neon_fixtures import ( NeonEnv, NeonEnvBuilder, RemoteStorageKind, + S3Storage, available_remote_storages, ) from fixtures.pageserver.http import PageserverApiException from fixtures.pageserver.utils import ( + assert_timeline_detail_404, wait_for_last_record_lsn, wait_for_upload, wait_until_tenant_active, + wait_until_timeline_state, ) from fixtures.types import Lsn, TenantId, TimelineId from fixtures.utils import query_scalar, wait_until @@ -68,7 +72,7 @@ def test_timeline_delete(neon_simple_env: NeonEnv): ps_http.timeline_delete(env.initial_tenant, parent_timeline_id) - assert exc.value.status_code == 400 + assert exc.value.status_code == 412 timeline_path = ( env.repo_dir / "tenants" / str(env.initial_tenant) / 
"timelines" / str(leaf_timeline_id) @@ -130,13 +134,25 @@ def test_delete_timeline_post_rm_failure( env = neon_env_builder.init_start() assert env.initial_timeline + env.pageserver.allowed_errors.append(".*Error: failpoint: timeline-delete-after-rm") + env.pageserver.allowed_errors.append(".*Ignoring state update Stopping for broken timeline") + ps_http = env.pageserver.http_client() failpoint_name = "timeline-delete-after-rm" ps_http.configure_failpoints((failpoint_name, "return")) - with pytest.raises(PageserverApiException, match=f"failpoint: {failpoint_name}"): - ps_http.timeline_delete(env.initial_tenant, env.initial_timeline) + ps_http.timeline_delete(env.initial_tenant, env.initial_timeline) + + timeline_info = wait_until_timeline_state( + pageserver_http=ps_http, + tenant_id=env.initial_tenant, + timeline_id=env.initial_timeline, + expected_state="Broken", + iterations=2, # effectively try immediately and retry once in one second + ) + + timeline_info["state"]["Broken"]["reason"] == "failpoint: timeline-delete-after-rm" at_failpoint_log_message = f".*{env.initial_timeline}.*at failpoint {failpoint_name}.*" env.pageserver.allowed_errors.append(at_failpoint_log_message) @@ -148,11 +164,14 @@ def test_delete_timeline_post_rm_failure( ps_http.configure_failpoints((failpoint_name, "off")) # this should succeed + # this also checks that delete can be retried even when timeline is in Broken state ps_http.timeline_delete(env.initial_tenant, env.initial_timeline, timeout=2) - # the second call will try to transition the timeline into Stopping state, but it's already in that state - env.pageserver.allowed_errors.append( - f".*{env.initial_timeline}.*Ignoring new state, equal to the existing one: Stopping" - ) + with pytest.raises(PageserverApiException) as e: + ps_http.timeline_detail(env.initial_tenant, env.initial_timeline) + + assert e.value.status_code == 404 + + env.pageserver.allowed_errors.append(f".*NotFound: Timeline.*{env.initial_timeline}.*") env.pageserver.allowed_errors.append( f".*{env.initial_timeline}.*timeline directory not found, proceeding anyway.*" ) @@ -230,6 +249,12 @@ def test_timeline_resurrection_on_attach( # delete new timeline ps_http.timeline_delete(tenant_id=tenant_id, timeline_id=branch_timeline_id) + env.pageserver.allowed_errors.append( + f".*Timeline {tenant_id}/{branch_timeline_id} was not found.*" + ) + + wait_until(2, 0.5, lambda: assert_timeline_detail_404(ps_http, tenant_id, branch_timeline_id)) + ##### Stop the pageserver instance, erase all its data env.endpoints.stop_all() env.pageserver.stop() @@ -252,12 +277,31 @@ def test_timeline_resurrection_on_attach( assert all([tl["state"] == "Active" for tl in timelines]) +def assert_prefix_empty(neon_env_builder: NeonEnvBuilder, prefix: Optional[str] = None): + # For local_fs we need to properly handle empty directories, which we currently dont, so for simplicity stick to s3 api. + assert neon_env_builder.remote_storage_kind in ( + RemoteStorageKind.MOCK_S3, + RemoteStorageKind.REAL_S3, + ) + # For mypy + assert isinstance(neon_env_builder.remote_storage, S3Storage) + + # Note that this doesnt use pagination, so list is not guaranteed to be exhaustive. 
+ response = neon_env_builder.remote_storage_client.list_objects_v2( + Bucket=neon_env_builder.remote_storage.bucket_name, + Prefix=prefix or neon_env_builder.remote_storage.prefix_in_bucket or "", + ) + objects = response.get("Contents") + assert ( + response["KeyCount"] == 0 + ), f"remote dir with prefix {prefix} is not empty after deletion: {objects}" + + def test_timeline_delete_fail_before_local_delete(neon_env_builder: NeonEnvBuilder): """ When deleting a timeline, if we succeed in setting the deleted flag remotely but fail to delete the local state, restarting the pageserver should resume the deletion of the local state. - (Deletion of the state in S3 is not implemented yet.) """ neon_env_builder.enable_remote_storage( @@ -271,8 +315,9 @@ def test_timeline_delete_fail_before_local_delete(neon_env_builde env.pageserver.allowed_errors.append( ".*Ignoring new state, equal to the existing one: Stopping" ) + # this happens because the stuck timeline is visible to shutdown env.pageserver.allowed_errors.append( - ".*during shutdown: cannot flush frozen layers when flush_loop is not running, state is Exited" + ".*freeze_and_flush_on_shutdown.+: failed to freeze and flush: cannot flush frozen layers when flush_loop is not running, state is Exited" ) ps_http = env.pageserver.http_client() @@ -292,11 +337,17 @@ def test_timeline_delete_fail_before_local_delete(neon_env_builde env.repo_dir / "tenants" / str(env.initial_tenant) / "timelines" / str(leaf_timeline_id) ) - with pytest.raises( - PageserverApiException, - match="failpoint: timeline-delete-before-rm", - ): - ps_http.timeline_delete(env.initial_tenant, leaf_timeline_id) + ps_http.timeline_delete(env.initial_tenant, leaf_timeline_id) + + timeline_info = wait_until_timeline_state( + pageserver_http=ps_http, + tenant_id=env.initial_tenant, + timeline_id=leaf_timeline_id, + expected_state="Broken", + iterations=2, # effectively try immediately and retry once in one second + ) + + assert timeline_info["state"]["Broken"]["reason"] == "failpoint: timeline-delete-before-rm" assert leaf_timeline_path.exists(), "the failpoint didn't work" @@ -304,7 +355,14 @@ def test_timeline_delete_fail_before_local_delete(neon_env_build env.pageserver.start() # Wait for tenant to finish loading.
- wait_until_tenant_active(ps_http, tenant_id=env.initial_tenant, iterations=10, period=0.5) + wait_until_tenant_active(ps_http, tenant_id=env.initial_tenant, iterations=10, period=1) + + env.pageserver.allowed_errors.append( + f".*Timeline {env.initial_tenant}/{leaf_timeline_id} was not found.*" + ) + wait_until( + 2, 0.5, lambda: assert_timeline_detail_404(ps_http, env.initial_tenant, leaf_timeline_id) + ) assert ( not leaf_timeline_path.exists() @@ -316,6 +374,50 @@ def test_timeline_delete_fail_before_local_delete(neon_env_builder: NeonEnvBuild }, "other timelines should not have been affected" assert all([tl["state"] == "Active" for tl in timelines]) + assert_prefix_empty( + neon_env_builder, + prefix="/".join( + ( + "tenants", + str(env.initial_tenant), + "timelines", + str(leaf_timeline_id), + ) + ), + ) + + assert env.initial_timeline is not None + + for timeline_id in (intermediate_timeline_id, env.initial_timeline): + ps_http.timeline_delete(env.initial_tenant, timeline_id) + + env.pageserver.allowed_errors.append( + f".*Timeline {env.initial_tenant}/{timeline_id} was not found.*" + ) + wait_until( + 2, 0.5, lambda: assert_timeline_detail_404(ps_http, env.initial_tenant, timeline_id) + ) + + assert_prefix_empty( + neon_env_builder, + prefix="/".join( + ( + "tenants", + str(env.initial_tenant), + "timelines", + str(timeline_id), + ) + ), + ) + + # for some reason the check above doesnt immediately take effect for the below. + # Assume it is mock server incosistency and check twice. + wait_until( + 2, + 0.5, + lambda: assert_prefix_empty(neon_env_builder), + ) + def test_concurrent_timeline_delete_if_first_stuck_at_index_upload( neon_env_builder: NeonEnvBuilder, @@ -371,7 +473,7 @@ def test_concurrent_timeline_delete_if_first_stuck_at_index_upload( # make the second call and assert behavior log.info("second call start") - error_msg_re = "another task is already setting the deleted_flag, started at" + error_msg_re = "timeline deletion is already in progress" with pytest.raises(PageserverApiException, match=error_msg_re) as second_call_err: ps_http.timeline_delete(env.initial_tenant, child_timeline_id) assert second_call_err.value.status_code == 500 @@ -437,12 +539,106 @@ def test_delete_timeline_client_hangup(neon_env_builder: NeonEnvBuilder): wait_until(50, 0.1, got_hangup_log_message) - # ok, retry without failpoint, it should succeed + # check that the timeline is still present + ps_http.timeline_detail(env.initial_tenant, child_timeline_id) + + # ok, disable the failpoint to let the deletion finish ps_http.configure_failpoints((failpoint_name, "off")) - # this should succeed - ps_http.timeline_delete(env.initial_tenant, child_timeline_id, timeout=2) - # the second call will try to transition the timeline into Stopping state, but it's already in that state - env.pageserver.allowed_errors.append( - f".*{child_timeline_id}.*Ignoring new state, equal to the existing one: Stopping" + def first_request_finished(): + message = f".*DELETE.*{child_timeline_id}.*Cancelled request finished" + assert env.pageserver.log_contains(message) + + wait_until(50, 0.1, first_request_finished) + + # check that the timeline is gone + notfound_message = f"Timeline {env.initial_tenant}/{child_timeline_id} was not found" + env.pageserver.allowed_errors.append(".*" + notfound_message) + with pytest.raises(PageserverApiException, match=notfound_message) as exc: + ps_http.timeline_detail(env.initial_tenant, child_timeline_id) + + assert exc.value.status_code == 404 + + +@pytest.mark.parametrize( + 
"remote_storage_kind", + list( + filter( + lambda s: s in (RemoteStorageKind.MOCK_S3, RemoteStorageKind.REAL_S3), + available_remote_storages(), + ) + ), +) +def test_timeline_delete_works_for_remote_smoke( + neon_env_builder: NeonEnvBuilder, + remote_storage_kind: RemoteStorageKind, +): + neon_env_builder.enable_remote_storage( + remote_storage_kind=remote_storage_kind, + test_name="test_timeline_delete_works_for_remote_smoke", + ) + + env = neon_env_builder.init_start() + + ps_http = env.pageserver.http_client() + pg = env.endpoints.create_start("main") + + tenant_id = TenantId(pg.safe_psql("show neon.tenant_id")[0][0]) + main_timeline_id = TimelineId(pg.safe_psql("show neon.timeline_id")[0][0]) + + assert tenant_id == env.initial_tenant + assert main_timeline_id == env.initial_timeline + + timeline_ids = [env.initial_timeline] + for i in range(2): + branch_timeline_id = env.neon_cli.create_branch(f"new{i}", "main") + pg = env.endpoints.create_start(f"new{i}") + + with pg.cursor() as cur: + cur.execute("CREATE TABLE f (i integer);") + cur.execute("INSERT INTO f VALUES (generate_series(1,1000));") + current_lsn = Lsn(query_scalar(cur, "SELECT pg_current_wal_flush_lsn()")) + + # wait until pageserver receives that data + wait_for_last_record_lsn(ps_http, tenant_id, branch_timeline_id, current_lsn) + + # run checkpoint manually to be sure that data landed in remote storage + ps_http.timeline_checkpoint(tenant_id, branch_timeline_id) + + # wait until pageserver successfully uploaded a checkpoint to remote storage + log.info("waiting for checkpoint upload") + wait_for_upload(ps_http, tenant_id, branch_timeline_id, current_lsn) + log.info("upload of checkpoint is done") + timeline_id = TimelineId(pg.safe_psql("show neon.timeline_id")[0][0]) + + timeline_ids.append(timeline_id) + + for timeline_id in reversed(timeline_ids): + # note that we need to finish previous deletion before scheduling next one + # otherwise we can get an "HasChildren" error if deletion is not fast enough (real_s3) + ps_http.timeline_delete(tenant_id=tenant_id, timeline_id=timeline_id) + + env.pageserver.allowed_errors.append( + f".*Timeline {env.initial_tenant}/{timeline_id} was not found.*" + ) + wait_until(2, 0.5, lambda: assert_timeline_detail_404(ps_http, tenant_id, timeline_id)) + + assert_prefix_empty( + neon_env_builder, + prefix="/".join( + ( + "tenants", + str(env.initial_tenant), + "timelines", + str(timeline_id), + ) + ), + ) + + # for some reason the check above doesnt immediately take effect for the below. + # Assume it is mock server incosistency and check twice. 
+ wait_until( + 2, + 0.5, + lambda: assert_prefix_empty(neon_env_builder), ) diff --git a/test_runner/regress/test_wal_acceptor.py b/test_runner/regress/test_wal_acceptor.py index 2a4141ed30..8b595596cb 100644 --- a/test_runner/regress/test_wal_acceptor.py +++ b/test_runner/regress/test_wal_acceptor.py @@ -1001,9 +1001,6 @@ def test_safekeeper_without_pageserver( def test_replace_safekeeper(neon_env_builder: NeonEnvBuilder): - def safekeepers_guc(env: NeonEnv, sk_names: List[int]) -> str: - return ",".join([f"localhost:{sk.port.pg}" for sk in env.safekeepers if sk.id in sk_names]) - def execute_payload(endpoint: Endpoint): with closing(endpoint.connect()) as conn: with conn.cursor() as cur: @@ -1032,9 +1029,8 @@ def test_replace_safekeeper(neon_env_builder: NeonEnvBuilder): log.info("Use only first 3 safekeepers") env.safekeepers[3].stop() - active_safekeepers = [1, 2, 3] endpoint = env.endpoints.create("test_replace_safekeeper") - endpoint.adjust_for_safekeepers(safekeepers_guc(env, active_safekeepers)) + endpoint.active_safekeepers = [1, 2, 3] endpoint.start() # learn neon timeline from compute @@ -1072,9 +1068,8 @@ def test_replace_safekeeper(neon_env_builder: NeonEnvBuilder): log.info("Recreate postgres to replace failed sk1 with new sk4") endpoint.stop_and_destroy().create("test_replace_safekeeper") - active_safekeepers = [2, 3, 4] env.safekeepers[3].start() - endpoint.adjust_for_safekeepers(safekeepers_guc(env, active_safekeepers)) + endpoint.active_safekeepers = [2, 3, 4] endpoint.start() execute_payload(endpoint) @@ -1293,9 +1288,8 @@ def test_pull_timeline(neon_env_builder: NeonEnvBuilder): log.info("Use only first 3 safekeepers") env.safekeepers[3].stop() - active_safekeepers = [1, 2, 3] endpoint = env.endpoints.create("test_pull_timeline") - endpoint.adjust_for_safekeepers(safekeepers_guc(env, active_safekeepers)) + endpoint.active_safekeepers = [1, 2, 3] endpoint.start() # learn neon timeline from compute @@ -1332,10 +1326,8 @@ def test_pull_timeline(neon_env_builder: NeonEnvBuilder): show_statuses(env.safekeepers, tenant_id, timeline_id) log.info("Restarting compute with new config to verify that it works") - active_safekeepers = [1, 3, 4] - endpoint.stop_and_destroy().create("test_pull_timeline") - endpoint.adjust_for_safekeepers(safekeepers_guc(env, active_safekeepers)) + endpoint.active_safekeepers = [1, 3, 4] endpoint.start() execute_payload(endpoint) diff --git a/test_runner/regress/test_wal_acceptor_async.py b/test_runner/regress/test_wal_acceptor_async.py index 7debeed140..ce33975a0e 100644 --- a/test_runner/regress/test_wal_acceptor_async.py +++ b/test_runner/regress/test_wal_acceptor_async.py @@ -2,9 +2,11 @@ import asyncio import random import time from dataclasses import dataclass +from pathlib import Path from typing import List, Optional import asyncpg +import toml from fixtures.log_helper import getLogger from fixtures.neon_fixtures import Endpoint, NeonEnv, NeonEnvBuilder, Safekeeper from fixtures.types import Lsn, TenantId, TimelineId @@ -251,7 +253,8 @@ def endpoint_create_start(env: NeonEnv, branch: str, pgdir_name: Optional[str]): endpoint = Endpoint( env, tenant_id=env.initial_tenant, - port=env.port_distributor.get_port(), + pg_port=env.port_distributor.get_port(), + http_port=env.port_distributor.get_port(), # In these tests compute has high probability of terminating on its own # before our stop() due to lost consensus leadership. 
check_stop_result=False, @@ -536,15 +539,20 @@ def test_race_conditions(neon_env_builder: NeonEnvBuilder): # Check that pageserver can select safekeeper with largest commit_lsn # and switch if LSN is not updated for some time (NoWalTimeout). -async def run_wal_lagging(env: NeonEnv, endpoint: Endpoint): - def safekeepers_guc(env: NeonEnv, active_sk: List[bool]) -> str: - # use ports 10, 11 and 12 to simulate unavailable safekeepers - return ",".join( - [ - f"localhost:{sk.port.pg if active else 10 + i}" - for i, (sk, active) in enumerate(zip(env.safekeepers, active_sk)) - ] - ) +async def run_wal_lagging(env: NeonEnv, endpoint: Endpoint, test_output_dir: Path): + def adjust_safekeepers(env: NeonEnv, active_sk: List[bool]): + # Change the pg ports of the inactive safekeepers in the config file to be + # invalid, to make them unavailable to the endpoint. We use + # ports 10, 11 and 12 to simulate unavailable safekeepers. + config = toml.load(test_output_dir / "repo" / "config") + for i, (sk, active) in enumerate(zip(env.safekeepers, active_sk)): + if active: + config["safekeepers"][i]["pg_port"] = env.safekeepers[i].port.pg + else: + config["safekeepers"][i]["pg_port"] = 10 + i + + with open(test_output_dir / "repo" / "config", "w") as f: + toml.dump(config, f) conn = await endpoint.connect_async() await conn.execute("CREATE TABLE t(key int primary key, value text)") @@ -565,7 +573,7 @@ async def run_wal_lagging(env: NeonEnv, endpoint: Endpoint): it -= 1 continue - endpoint.adjust_for_safekeepers(safekeepers_guc(env, active_sk)) + adjust_safekeepers(env, active_sk) log.info(f"Iteration {it}: {active_sk}") endpoint.start() @@ -579,7 +587,7 @@ async def run_wal_lagging(env: NeonEnv, endpoint: Endpoint): await conn.close() endpoint.stop() - endpoint.adjust_for_safekeepers(safekeepers_guc(env, [True] * len(env.safekeepers))) + adjust_safekeepers(env, [True] * len(env.safekeepers)) endpoint.start() conn = await endpoint.connect_async() @@ -590,11 +598,11 @@ async def run_wal_lagging(env: NeonEnv, endpoint: Endpoint): # do inserts while restarting postgres and messing with safekeeper addresses -def test_wal_lagging(neon_env_builder: NeonEnvBuilder): +def test_wal_lagging(neon_env_builder: NeonEnvBuilder, test_output_dir: Path): neon_env_builder.num_safekeepers = 3 env = neon_env_builder.init_start() env.neon_cli.create_branch("test_wal_lagging") endpoint = env.endpoints.create_start("test_wal_lagging") - asyncio.run(run_wal_lagging(env, endpoint)) + asyncio.run(run_wal_lagging(env, endpoint, test_output_dir)) diff --git a/test_runner/regress/test_wal_receiver.py b/test_runner/regress/test_wal_receiver.py index 8e4e154be1..515d47c079 100644 --- a/test_runner/regress/test_wal_receiver.py +++ b/test_runner/regress/test_wal_receiver.py @@ -77,7 +77,8 @@ def test_pageserver_lsn_wait_error_safekeeper_stop(neon_env_builder: NeonEnvBuil try: trigger_wait_lsn_timeout(env, tenant_id) except Exception as e: - exception_string = str(e) + # Strip out the part before stdout, as it contains full command with the list of all safekeepers + exception_string = str(e).split("stdout", 1)[-1] assert expected_timeout_error in exception_string, "Should time out during waiting for WAL" for safekeeper in env.safekeepers: diff --git a/test_runner/regress/test_walredo_not_left_behind_on_detach.py b/test_runner/regress/test_walredo_not_left_behind_on_detach.py index 7d944bebb3..4a47898935 100644 --- a/test_runner/regress/test_walredo_not_left_behind_on_detach.py +++ 
b/test_runner/regress/test_walredo_not_left_behind_on_detach.py @@ -83,6 +83,9 @@ def test_walredo_not_left_behind_on_detach(neon_env_builder: NeonEnvBuilder): # XXX this is quite brittle as the lifecycle of the WAL redo process is an implementation detail assert_child_processes(pagserver_pid, wal_redo_present=True, defunct_present=False) + # Stop the compute before detaching, to avoid errors in the log. + endpoint.stop() + last_error = None for i in range(3): try: diff --git a/workspace_hack/Cargo.toml b/workspace_hack/Cargo.toml index 3d40f5dede..677b59f453 100644 --- a/workspace_hack/Cargo.toml +++ b/workspace_hack/Cargo.toml @@ -27,7 +27,6 @@ futures-core = { version = "0.3" } futures-executor = { version = "0.3" } futures-sink = { version = "0.3" } futures-util = { version = "0.3", features = ["channel", "io", "sink"] } -hashbrown = { version = "0.12", features = ["raw"] } itertools = { version = "0.10" } libc = { version = "0.2", features = ["extra_traits"] } log = { version = "0.4", default-features = false, features = ["std"] } @@ -39,7 +38,7 @@ num-traits = { version = "0.2", features = ["i128"] } prost = { version = "0.11" } rand = { version = "0.8", features = ["small_rng"] } regex = { version = "1" } -regex-syntax = { version = "0.6" } +regex-syntax = { version = "0.7" } reqwest = { version = "0.11", default-features = false, features = ["blocking", "json", "multipart", "rustls-tls"] } ring = { version = "0.16", features = ["std"] } rustls = { version = "0.20", features = ["dangerous_configuration"] } @@ -62,7 +61,6 @@ url = { version = "2", features = ["serde"] } anyhow = { version = "1", features = ["backtrace"] } bytes = { version = "1", features = ["serde"] } either = { version = "1" } -hashbrown = { version = "0.12", features = ["raw"] } itertools = { version = "0.10" } libc = { version = "0.2", features = ["extra_traits"] } log = { version = "0.4", default-features = false, features = ["std"] } @@ -70,7 +68,7 @@ memchr = { version = "2" } nom = { version = "7" } prost = { version = "0.11" } regex = { version = "1" } -regex-syntax = { version = "0.6" } +regex-syntax = { version = "0.7" } serde = { version = "1", features = ["alloc", "derive"] } syn-dff4ba8e3ae991db = { package = "syn", version = "1", features = ["extra-traits", "full", "visit", "visit-mut"] } syn-f595c2ba2a3f28df = { package = "syn", version = "2", features = ["extra-traits", "full", "visit-mut"] }
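Since the new assert_prefix_empty helper deliberately skips pagination, an exhaustive variant could look roughly like the sketch below. This is only a sketch, not part of the patch: it assumes the boto3-style S3 client the fixtures already expose as neon_env_builder.remote_storage_client, the imports already present in test_timeline_delete.py, and a hypothetical name assert_prefix_empty_paginated.

def assert_prefix_empty_paginated(
    neon_env_builder: NeonEnvBuilder, prefix: Optional[str] = None
) -> None:
    # Same precondition as assert_prefix_empty: only the S3-backed storage kinds are supported.
    assert isinstance(neon_env_builder.remote_storage, S3Storage)

    # Paginate so prefixes with more than 1000 leftover keys are still reported in full.
    paginator = neon_env_builder.remote_storage_client.get_paginator("list_objects_v2")
    leftover = []
    for page in paginator.paginate(
        Bucket=neon_env_builder.remote_storage.bucket_name,
        Prefix=prefix or neon_env_builder.remote_storage.prefix_in_bucket or "",
    ):
        leftover.extend(obj["Key"] for obj in page.get("Contents", []))

    assert not leftover, f"remote dir with prefix {prefix} is not empty after deletion: {leftover}"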