diff --git a/.github/actions/run-python-test-set/action.yml b/.github/actions/run-python-test-set/action.yml
index 4493985587..dec1f47e47 100644
--- a/.github/actions/run-python-test-set/action.yml
+++ b/.github/actions/run-python-test-set/action.yml
@@ -36,14 +36,6 @@ inputs:
     description: 'Region name for real s3 tests'
     required: false
     default: ''
-  real_s3_access_key_id:
-    description: 'Access key id'
-    required: false
-    default: ''
-  real_s3_secret_access_key:
-    description: 'Secret access key'
-    required: false
-    default: ''
   rerun_flaky:
     description: 'Whether to rerun flaky tests'
     required: false
@@ -104,8 +96,6 @@ runs:
        COMPATIBILITY_POSTGRES_DISTRIB_DIR: /tmp/neon-previous/pg_install
        TEST_OUTPUT: /tmp/test_output
        BUILD_TYPE: ${{ inputs.build_type }}
-       AWS_ACCESS_KEY_ID: ${{ inputs.real_s3_access_key_id }}
-       AWS_SECRET_ACCESS_KEY: ${{ inputs.real_s3_secret_access_key }}
        COMPATIBILITY_SNAPSHOT_DIR: /tmp/compatibility_snapshot_pg${{ inputs.pg_version }}
        ALLOW_BACKWARD_COMPATIBILITY_BREAKAGE: contains(github.event.pull_request.labels.*.name, 'backward compatibility breakage')
        ALLOW_FORWARD_COMPATIBILITY_BREAKAGE: contains(github.event.pull_request.labels.*.name, 'forward compatibility breakage')
diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml
index 845a21ad0e..b732095f8f 100644
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -346,10 +346,8 @@ jobs:
       test_selection: regress
       needs_postgres_source: true
       run_with_real_s3: true
-      real_s3_bucket: ci-tests-s3
-      real_s3_region: us-west-2
-      real_s3_access_key_id: "${{ secrets.AWS_ACCESS_KEY_ID_CI_TESTS_S3 }}"
-      real_s3_secret_access_key: "${{ secrets.AWS_SECRET_ACCESS_KEY_CI_TESTS_S3 }}"
+      real_s3_bucket: neon-github-ci-tests
+      real_s3_region: eu-central-1
       rerun_flaky: true
       pg_version: ${{ matrix.pg_version }}
     env:
@@ -409,9 +407,7 @@ jobs:
         uses: ./.github/actions/allure-report-generate

       - uses: actions/github-script@v6
-        if: >
-          !cancelled() &&
-          github.event_name == 'pull_request'
+        if: ${{ !cancelled() }}
         with:
           # Retry script for 5XX server errors: https://github.com/actions/github-script#retries
           retries: 5
@@ -421,7 +417,7 @@
             reportJsonUrl: "${{ steps.create-allure-report.outputs.report-json-url }}",
           }

-          const script = require("./scripts/pr-comment-test-report.js")
+          const script = require("./scripts/comment-test-report.js")
           await script({
             github,
             context,
@@ -496,19 +492,24 @@ jobs:
         env:
           COMMIT_URL: ${{ github.server_url }}/${{ github.repository }}/commit/${{ github.event.pull_request.head.sha || github.sha }}
         run: |
-          scripts/coverage \
-            --dir=/tmp/coverage report \
+          scripts/coverage --dir=/tmp/coverage \
+            report \
             --input-objects=/tmp/coverage/binaries.list \
             --commit-url=${COMMIT_URL} \
             --format=github
+
+          scripts/coverage --dir=/tmp/coverage \
+            report \
+            --input-objects=/tmp/coverage/binaries.list \
+            --format=lcov
+
       - name: Upload coverage report
         id: upload-coverage-report
         env:
           BUCKET: neon-github-public-dev
           COMMIT_SHA: ${{ github.event.pull_request.head.sha || github.sha }}
         run: |
-          aws s3 cp --only-show-errors --recursive /tmp/coverage/report s3://neon-github-public-dev/code-coverage/${COMMIT_SHA}
+          aws s3 cp --only-show-errors --recursive /tmp/coverage/report s3://${BUCKET}/code-coverage/${COMMIT_SHA}

           REPORT_URL=https://${BUCKET}.s3.amazonaws.com/code-coverage/${COMMIT_SHA}/index.html
           echo "report-url=${REPORT_URL}" >> $GITHUB_OUTPUT
@@ -663,6 +664,9 @@ jobs:
           project: nrdv0s4kcs
           push: true
           tags: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:depot-${{needs.tag.outputs.build-tag}}
+          build-args: |
+            GIT_VERSION=${{ github.sha }}
+            REPOSITORY=369495373322.dkr.ecr.eu-central-1.amazonaws.com

   compute-tools-image:
     runs-on: [ self-hosted, gen3, large ]
@@ -777,7 +781,7 @@ jobs:
       run:
         shell: sh -eu {0}
     env:
-      VM_BUILDER_VERSION: v0.7.3-alpha3
+      VM_BUILDER_VERSION: v0.8.0

     steps:
       - name: Checkout
@@ -798,7 +802,7 @@ jobs:
       - name: Build vm image
         run: |
-          ./vm-builder -src=369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}} -dst=369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}}
+          ./vm-builder -enable-file-cache -src=369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}} -dst=369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}}

       - name: Pushing vm-compute-node image
         run: |
diff --git a/Cargo.lock b/Cargo.lock
index 2223453a08..d390df94e0 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -17,17 +17,6 @@ version = "1.0.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "f26201604c87b1e01bd3d98f8d5d9a8fcbb815e8cedb41ffccbeb4bf593a35fe"

-[[package]]
-name = "ahash"
-version = "0.7.6"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "fcb51a0695d8f838b1ee009b3fbf66bda078cd64590202a864a8f3e8c4315c47"
-dependencies = [
- "getrandom",
- "once_cell",
- "version_check",
-]
-
 [[package]]
 name = "ahash"
 version = "0.8.3"
@@ -41,9 +30,9 @@ dependencies = [

 [[package]]
 name = "aho-corasick"
-version = "0.7.20"
+version = "1.0.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "cc936419f96fa211c1b9166887b38e5e40b19958e5b895be7c1f93adec7071ac"
+checksum = "67fc08ce920c31afb70f013dcce1bfc3a3195de6a228474e45e1f145b36f8d04"
 dependencies = [
  "memchr",
 ]
@@ -65,9 +54,9 @@ checksum = "4b46cbb362ab8752921c97e041f5e366ee6297bd428a31275b9fcf1e380f7299"

 [[package]]
 name = "anstream"
-version = "0.3.0"
+version = "0.3.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "9e579a7752471abc2a8268df8b20005e3eadd975f585398f17efcfd8d4927371"
+checksum = "0ca84f3628370c59db74ee214b3263d58f9aadd9b4fe7e711fd87dc452b7f163"
 dependencies = [
  "anstyle",
  "anstyle-parse",
@@ -104,9 +93,9 @@ dependencies = [

 [[package]]
 name = "anstyle-wincon"
-version = "1.0.0"
+version = "1.0.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "4bcd8291a340dd8ac70e18878bc4501dd7b4ff970cfa21c207d36ece51ea88fd"
+checksum = "180abfa45703aebe0093f79badacc01b8fd4ea2e35118747e5811127f926e188"
 dependencies = [
  "anstyle",
  "windows-sys 0.48.0",
@@ -114,9 +103,9 @@ dependencies = [

 [[package]]
 name = "anyhow"
-version = "1.0.70"
+version = "1.0.71"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "7de8ce5e0f9f8d88245311066a578d72b7af3e7088f32783804676302df237e4"
+checksum = "9c7d0618f0e0b7e8ff11427422b64564d5fb0be1940354bfe2e0529b18a9d9b8"
 dependencies = [
  "backtrace",
 ]
@@ -188,7 +177,7 @@ checksum = "16e62a023e7c117e27523144c5d2459f4397fcc3cab0085af8e2224f643a0193"
 dependencies = [
  "proc-macro2",
  "quote",
- "syn 2.0.15",
+ "syn 2.0.16",
 ]

 [[package]]
@@ -199,7 +188,7 @@ checksum = "b9ccdd8f2a161be9bd5c023df56f1b2a0bd1d83872ae53b71a84a12c9bf6e842"
 dependencies = [
  "proc-macro2",
  "quote",
- "syn 2.0.15",
+ "syn 2.0.16",
 ]

 [[package]]
@@ -230,9 +219,9 @@ checksum = "d468802bab17cbc0cc575e9b053f41e72aa36bfa6b7f55e3529ffa43161b97fa"

 [[package]]
 name = "aws-config"
-version = "0.55.2"
+version = "0.55.3"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "fc00553f5f3c06ffd4510a9d576f92143618706c45ea6ff81e84ad9be9588abd"
+checksum = "bcdcf0d683fe9c23d32cf5b53c9918ea0a500375a9fb20109802552658e576c9"
 dependencies = [
  "aws-credential-types",
  "aws-http",
@@ -256,9 +245,9 @@ dependencies = [

 [[package]]
 name = "aws-credential-types"
-version = "0.55.2"
+version = "0.55.3"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "4cb57ac6088805821f78d282c0ba8aec809f11cbee10dda19a97b03ab040ccc2"
+checksum = "1fcdb2f7acbc076ff5ad05e7864bdb191ca70a6fd07668dc3a1a8bcd051de5ae"
 dependencies = [
  "aws-smithy-async",
  "aws-smithy-types",
@@ -270,9 +259,9 @@ dependencies = [

 [[package]]
 name = "aws-endpoint"
-version = "0.55.2"
+version = "0.55.3"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "9c5f6f84a4f46f95a9bb71d9300b73cd67eb868bc43ae84f66ad34752299f4ac"
+checksum = "8cce1c41a6cfaa726adee9ebb9a56fcd2bbfd8be49fd8a04c5e20fd968330b04"
 dependencies = [
  "aws-smithy-http",
  "aws-smithy-types",
@@ -284,9 +273,9 @@ dependencies = [

 [[package]]
 name = "aws-http"
-version = "0.55.2"
+version = "0.55.3"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "a754683c322f7dc5167484266489fdebdcd04d26e53c162cad1f3f949f2c5671"
+checksum = "aadbc44e7a8f3e71c8b374e03ecd972869eb91dd2bc89ed018954a52ba84bc44"
 dependencies = [
  "aws-credential-types",
  "aws-smithy-http",
@@ -303,9 +292,9 @@ dependencies = [

 [[package]]
 name = "aws-sdk-s3"
-version = "0.25.1"
+version = "0.27.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "392b9811ca489747ac84349790e49deaa1f16631949e7dd4156000251c260eae"
+checksum = "37c77060408d653d3efa6ea7b66c1389bc35a0342352984c8bf8bcb814a8fc27"
 dependencies = [
  "aws-credential-types",
  "aws-endpoint",
@@ -336,9 +325,9 @@ dependencies = [

 [[package]]
 name = "aws-sdk-sts"
-version = "0.27.0"
+version = "0.28.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "2d0fbe3c2c342bc8dfea4bb43937405a8ec06f99140a0dcb9c7b59e54dfa93a1"
+checksum = "265fac131fbfc188e5c3d96652ea90ecc676a934e3174eaaee523c6cec040b3b"
 dependencies = [
  "aws-credential-types",
  "aws-endpoint",
@@ -362,9 +351,9 @@ dependencies = [

 [[package]]
 name = "aws-sig-auth"
-version = "0.55.2"
+version = "0.55.3"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "84dc92a63ede3c2cbe43529cb87ffa58763520c96c6a46ca1ced80417afba845"
+checksum = "3b94acb10af0c879ecd5c7bdf51cda6679a0a4f4643ce630905a77673bfa3c61"
 dependencies = [
  "aws-credential-types",
  "aws-sigv4",
@@ -377,9 +366,9 @@ dependencies = [

 [[package]]
 name = "aws-sigv4"
-version = "0.55.2"
+version = "0.55.3"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "392fefab9d6fcbd76d518eb3b1c040b84728ab50f58df0c3c53ada4bea9d327e"
+checksum = "9d2ce6f507be68e968a33485ced670111d1cbad161ddbbab1e313c03d37d8f4c"
 dependencies = [
  "aws-smithy-eventstream",
  "aws-smithy-http",
@@ -398,9 +387,9 @@ dependencies = [

 [[package]]
 name = "aws-smithy-async"
-version = "0.55.2"
+version = "0.55.3"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "ae23b9fe7a07d0919000116c4c5c0578303fbce6fc8d32efca1f7759d4c20faf"
+checksum = "13bda3996044c202d75b91afeb11a9afae9db9a721c6a7a427410018e286b880"
 dependencies = [
  "futures-util",
"pin-project-lite", @@ -410,9 +399,9 @@ dependencies = [ [[package]] name = "aws-smithy-checksums" -version = "0.55.2" +version = "0.55.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a6367acbd6849b8c7c659e166955531274ae147bf83ab4312885991f6b6706cb" +checksum = "07ed8b96d95402f3f6b8b57eb4e0e45ee365f78b1a924faf20ff6e97abf1eae6" dependencies = [ "aws-smithy-http", "aws-smithy-types", @@ -431,9 +420,9 @@ dependencies = [ [[package]] name = "aws-smithy-client" -version = "0.55.2" +version = "0.55.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5230d25d244a51339273b8870f0f77874cd4449fb4f8f629b21188ae10cfc0ba" +checksum = "0a86aa6e21e86c4252ad6a0e3e74da9617295d8d6e374d552be7d3059c41cedd" dependencies = [ "aws-smithy-async", "aws-smithy-http", @@ -444,7 +433,7 @@ dependencies = [ "http", "http-body", "hyper", - "hyper-rustls", + "hyper-rustls 0.23.2", "lazy_static", "pin-project-lite", "rustls 0.20.8", @@ -455,9 +444,9 @@ dependencies = [ [[package]] name = "aws-smithy-eventstream" -version = "0.55.2" +version = "0.55.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "22d2a2bcc16e5c4d949ffd2b851da852b9bbed4bb364ed4ae371b42137ca06d9" +checksum = "460c8da5110835e3d9a717c61f5556b20d03c32a1dec57f8fc559b360f733bb8" dependencies = [ "aws-smithy-types", "bytes", @@ -466,9 +455,9 @@ dependencies = [ [[package]] name = "aws-smithy-http" -version = "0.55.2" +version = "0.55.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b60e2133beb9fe6ffe0b70deca57aaeff0a35ad24a9c6fab2fd3b4f45b99fdb5" +checksum = "2b3b693869133551f135e1f2c77cb0b8277d9e3e17feaf2213f735857c4f0d28" dependencies = [ "aws-smithy-eventstream", "aws-smithy-types", @@ -489,9 +478,9 @@ dependencies = [ [[package]] name = "aws-smithy-http-tower" -version = "0.55.2" +version = "0.55.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3a4d94f556c86a0dd916a5d7c39747157ea8cb909ca469703e20fee33e448b67" +checksum = "3ae4f6c5798a247fac98a867698197d9ac22643596dc3777f0c76b91917616b9" dependencies = [ "aws-smithy-http", "aws-smithy-types", @@ -505,18 +494,18 @@ dependencies = [ [[package]] name = "aws-smithy-json" -version = "0.55.2" +version = "0.55.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5ce3d6e6ebb00b2cce379f079ad5ec508f9bcc3a9510d9b9c1840ed1d6f8af39" +checksum = "23f9f42fbfa96d095194a632fbac19f60077748eba536eb0b9fecc28659807f8" dependencies = [ "aws-smithy-types", ] [[package]] name = "aws-smithy-query" -version = "0.55.2" +version = "0.55.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d58edfca32ef9bfbc1ca394599e17ea329cb52d6a07359827be74235b64b3298" +checksum = "98819eb0b04020a1c791903533b638534ae6c12e2aceda3e6e6fba015608d51d" dependencies = [ "aws-smithy-types", "urlencoding", @@ -524,9 +513,9 @@ dependencies = [ [[package]] name = "aws-smithy-types" -version = "0.55.2" +version = "0.55.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "58db46fc1f4f26be01ebdb821751b4e2482cd43aa2b64a0348fb89762defaffa" +checksum = "16a3d0bf4f324f4ef9793b86a1701d9700fbcdbd12a846da45eed104c634c6e8" dependencies = [ "base64-simd", "itoa", @@ -537,18 +526,18 @@ dependencies = [ [[package]] name = "aws-smithy-xml" -version = "0.55.2" +version = "0.55.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fb557fe4995bd9ec87fb244bbb254666a971dc902a783e9da8b7711610e9664c" +checksum = 
"b1b9d12875731bd07e767be7baad95700c3137b56730ec9ddeedb52a5e5ca63b" dependencies = [ "xmlparser", ] [[package]] name = "aws-types" -version = "0.55.2" +version = "0.55.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "de0869598bfe46ec44ffe17e063ed33336e59df90356ca8ff0e8da6f7c1d994b" +checksum = "6dd209616cc8d7bfb82f87811a5c655dc97537f592689b18743bddf5dc5c4829" dependencies = [ "aws-credential-types", "aws-smithy-async", @@ -562,9 +551,9 @@ dependencies = [ [[package]] name = "axum" -version = "0.6.15" +version = "0.6.18" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3b32c5ea3aabaf4deb5f5ced2d688ec0844c881c9e6c696a8b769a05fc691e62" +checksum = "f8175979259124331c1d7bf6586ee7e0da434155e4b2d48ec2c8386281d8df39" dependencies = [ "async-trait", "axum-core", @@ -634,9 +623,9 @@ checksum = "0ea22880d78093b0cbe17c89f64a7d457941e65759157ec6cb31a31d652b05e5" [[package]] name = "base64" -version = "0.21.0" +version = "0.21.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a4a4ddaa51a5bc52a6948f74c06d20aaaddb71924eab79b8c97a8c556e942d6a" +checksum = "3f1e31e207a6b8fb791a38ea3105e6cb541f55e4d029902d3039a4ad07cc4105" [[package]] name = "base64-simd" @@ -670,13 +659,13 @@ dependencies = [ "lazycell", "log", "peeking_take_while", - "prettyplease 0.2.4", + "prettyplease 0.2.6", "proc-macro2", "quote", "regex", "rustc-hash", "shlex", - "syn 2.0.15", + "syn 2.0.16", "which", ] @@ -697,9 +686,9 @@ dependencies = [ [[package]] name = "bstr" -version = "1.4.0" +version = "1.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c3d4260bcc2e8fc9df1eac4919a720effeb63a3f0952f5bf4944adfa18897f09" +checksum = "a246e68bb43f6cd9db24bea052a53e40405417c5fb372e3d1a8a7f770a564ef5" dependencies = [ "memchr", "once_cell", @@ -709,9 +698,9 @@ dependencies = [ [[package]] name = "bumpalo" -version = "3.12.0" +version = "3.13.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0d261e256854913907f67ed06efbc3338dfe6179796deefc1ff763fc1aee5535" +checksum = "a3e2c3daef883ecc1b5d58c15adae93470a91d425f3532ba1695849656af3fc1" [[package]] name = "byteorder" @@ -780,9 +769,9 @@ dependencies = [ [[package]] name = "ciborium" -version = "0.2.0" +version = "0.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b0c137568cc60b904a7724001b35ce2630fd00d5d84805fbb608ab89509d788f" +checksum = "effd91f6c78e5a4ace8a5d3c0b6bfaec9e2baaef55f3efc00e45fb2e477ee926" dependencies = [ "ciborium-io", "ciborium-ll", @@ -791,15 +780,15 @@ dependencies = [ [[package]] name = "ciborium-io" -version = "0.2.0" +version = "0.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "346de753af073cc87b52b2083a506b38ac176a44cfb05497b622e27be899b369" +checksum = "cdf919175532b369853f5d5e20b26b43112613fd6fe7aee757e35f7a44642656" [[package]] name = "ciborium-ll" -version = "0.2.0" +version = "0.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "213030a2b5a4e0c0892b6652260cf6ccac84827b83a85a534e178e3906c4cf1b" +checksum = "defaa24ecc093c77630e6c15e17c51f5e187bf35ee514f4e2d67baaa96dae22b" dependencies = [ "ciborium-io", "half", @@ -818,9 +807,9 @@ dependencies = [ [[package]] name = "clap" -version = "3.2.23" +version = "3.2.25" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "71655c45cb9845d3270c9d6df84ebe72b4dad3c2ba3f7023ad47c144e4e473a5" +checksum = 
"4ea181bf566f71cb9a5d17a59e1871af638180a18fb0035c92ae62b705207123" dependencies = [ "bitflags", "clap_lex 0.2.4", @@ -830,9 +819,9 @@ dependencies = [ [[package]] name = "clap" -version = "4.2.2" +version = "4.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9b802d85aaf3a1cdb02b224ba472ebdea62014fccfcb269b95a4d76443b5ee5a" +checksum = "93aae7a4192245f70fe75dd9157fc7b4a5bf53e88d30bd4396f7d8f9284d5acc" dependencies = [ "clap_builder", "clap_derive", @@ -841,27 +830,27 @@ dependencies = [ [[package]] name = "clap_builder" -version = "4.2.2" +version = "4.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "14a1a858f532119338887a4b8e1af9c60de8249cd7bafd68036a489e261e37b6" +checksum = "4f423e341edefb78c9caba2d9c7f7687d0e72e89df3ce3394554754393ac3990" dependencies = [ "anstream", "anstyle", "bitflags", - "clap_lex 0.4.1", + "clap_lex 0.5.0", "strsim", ] [[package]] name = "clap_derive" -version = "4.2.0" +version = "4.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3f9644cd56d6b87dbe899ef8b053e331c0637664e9e21a33dfcdc36093f5c5c4" +checksum = "191d9573962933b4027f932c600cd252ce27a8ad5979418fe78e43c07996f27b" dependencies = [ "heck", "proc-macro2", "quote", - "syn 2.0.15", + "syn 2.0.16", ] [[package]] @@ -875,9 +864,9 @@ dependencies = [ [[package]] name = "clap_lex" -version = "0.4.1" +version = "0.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8a2dd5a6fe8c6e3502f568a6353e5273bbb15193ad9a89e457b9970798efbea1" +checksum = "2da6da31387c7e4ef160ffab6d5e7f00c42626fe39aea70a7b0f1773f7dd6c1b" [[package]] name = "close_fds" @@ -889,16 +878,6 @@ dependencies = [ "libc", ] -[[package]] -name = "codespan-reporting" -version = "0.11.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3538270d33cc669650c4b093848450d380def10c331d38c768e34cac80576e6e" -dependencies = [ - "termcolor", - "unicode-width", -] - [[package]] name = "colorchoice" version = "1.0.0" @@ -936,7 +915,7 @@ version = "0.1.0" dependencies = [ "anyhow", "chrono", - "clap 4.2.2", + "clap 4.3.0", "compute_api", "futures", "hyper", @@ -998,7 +977,7 @@ name = "control_plane" version = "0.1.0" dependencies = [ "anyhow", - "clap 4.2.2", + "clap 4.3.0", "comfy-table", "compute_api", "git-version", @@ -1041,9 +1020,9 @@ checksum = "e496a50fda8aacccc86d7529e2c1e0892dbd0f898a6b5645b5561b89c3210efa" [[package]] name = "cpufeatures" -version = "0.2.6" +version = "0.2.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "280a9f2d8b3a38871a3c8a46fb80db65e5e5ed97da80c4d08bf27fb63e35e181" +checksum = "3e4c1eaa2012c47becbbad2ab175484c2a84d1185b566fb2cc5b8707343dfe58" dependencies = [ "libc", ] @@ -1076,7 +1055,7 @@ dependencies = [ "atty", "cast", "ciborium", - "clap 3.2.23", + "clap 3.2.25", "criterion-plot", "itertools", "lazy_static", @@ -1186,55 +1165,11 @@ dependencies = [ "typenum", ] -[[package]] -name = "cxx" -version = "1.0.94" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f61f1b6389c3fe1c316bf8a4dccc90a38208354b330925bce1f74a6c4756eb93" -dependencies = [ - "cc", - "cxxbridge-flags", - "cxxbridge-macro", - "link-cplusplus", -] - -[[package]] -name = "cxx-build" -version = "1.0.94" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "12cee708e8962df2aeb38f594aae5d827c022b6460ac71a7a3e2c3c2aae5a07b" -dependencies = [ - "cc", - "codespan-reporting", - "once_cell", - "proc-macro2", - "quote", - "scratch", - 
"syn 2.0.15", -] - -[[package]] -name = "cxxbridge-flags" -version = "1.0.94" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7944172ae7e4068c533afbb984114a56c46e9ccddda550499caa222902c7f7bb" - -[[package]] -name = "cxxbridge-macro" -version = "1.0.94" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2345488264226bf682893e25de0769f3360aac9957980ec49361b083ddaa5bc5" -dependencies = [ - "proc-macro2", - "quote", - "syn 2.0.15", -] - [[package]] name = "darling" -version = "0.14.4" +version = "0.20.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7b750cb3417fd1b327431a470f388520309479ab0bf5e323505daf0290cd3850" +checksum = "0558d22a7b463ed0241e993f76f09f30b126687447751a8638587b864e4b3944" dependencies = [ "darling_core", "darling_macro", @@ -1242,27 +1177,27 @@ dependencies = [ [[package]] name = "darling_core" -version = "0.14.4" +version = "0.20.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "109c1ca6e6b7f82cc233a97004ea8ed7ca123a9af07a8230878fcfda9b158bf0" +checksum = "ab8bfa2e259f8ee1ce5e97824a3c55ec4404a0d772ca7fa96bf19f0752a046eb" dependencies = [ "fnv", "ident_case", "proc-macro2", "quote", "strsim", - "syn 1.0.109", + "syn 2.0.16", ] [[package]] name = "darling_macro" -version = "0.14.4" +version = "0.20.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a4aab4dbc9f7611d8b55048a3a16d2d010c2c8334e46304b40ac1cc14bf3b48e" +checksum = "29a358ff9f12ec09c3e61fef9b5a9902623a695a46a917b07f269bff1445611a" dependencies = [ "darling_core", "quote", - "syn 1.0.109", + "syn 2.0.16", ] [[package]] @@ -1280,9 +1215,9 @@ dependencies = [ [[package]] name = "data-encoding" -version = "2.3.3" +version = "2.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "23d8666cb01533c39dde32bcbab8e227b4ed6679b2c925eba05feabea39508fb" +checksum = "c2e66c9d817f1720209181c316d28635c050fa304f9c79e47a520882661b7308" [[package]] name = "debugid" @@ -1310,9 +1245,9 @@ dependencies = [ [[package]] name = "digest" -version = "0.10.6" +version = "0.10.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8168378f4e5023e7218c89c891c0fd8ecdb5e5e4f18cb78f38cf245dd021e76f" +checksum = "9ed9a281f7bc9b7576e61468ba615a66a5c8cfdff42420a70aa82701a3b1e292" dependencies = [ "block-buffer", "crypto-common", @@ -1321,13 +1256,13 @@ dependencies = [ [[package]] name = "displaydoc" -version = "0.2.3" +version = "0.2.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3bf95dc3f046b9da4f2d51833c0d3547d8564ef6910f5c1ed130306a75b92886" +checksum = "487585f4d0c6655fe74905e2504d8ad6908e4db67f744eb140876906c2f3175d" dependencies = [ "proc-macro2", "quote", - "syn 1.0.109", + "syn 2.0.16", ] [[package]] @@ -1367,23 +1302,23 @@ dependencies = [ [[package]] name = "enumset" -version = "1.0.12" +version = "1.1.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "19be8061a06ab6f3a6cf21106c873578bf01bd42ad15e0311a9c76161cb1c753" +checksum = "e875f1719c16de097dee81ed675e2d9bb63096823ed3f0ca827b7dea3028bbbb" dependencies = [ "enumset_derive", ] [[package]] name = "enumset_derive" -version = "0.6.1" +version = "0.8.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "03e7b551eba279bf0fa88b83a46330168c1560a52a94f5126f892f0b364ab3e0" +checksum = "e08b6c6ab82d70f08844964ba10c7babb716de2ecaeab9be5717918a5177d3af" dependencies = [ "darling", "proc-macro2", "quote", 
- "syn 1.0.109", + "syn 2.0.16", ] [[package]] @@ -1569,7 +1504,7 @@ checksum = "89ca545a94061b6365f2c7355b4b32bd20df3ff95f02da9329b34ccc3bd6ee72" dependencies = [ "proc-macro2", "quote", - "syn 2.0.15", + "syn 2.0.16", ] [[package]] @@ -1667,9 +1602,9 @@ checksum = "d2fabcfbdc87f4758337ca535fb41a6d701b65693ce38287d856d1674551ec9b" [[package]] name = "h2" -version = "0.3.18" +version = "0.3.19" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "17f8a914c2987b688368b5138aa05321db91f4090cf26118185672ad588bce21" +checksum = "d357c7ae988e7d2182f7d7871d0b963962420b0678b0997ce7de72001aeab782" dependencies = [ "bytes", "fnv", @@ -1704,9 +1639,6 @@ name = "hashbrown" version = "0.12.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8a9ee70c43aaf417c914396645a0fa852624801b24ebb7ae78fe8272889ac888" -dependencies = [ - "ahash 0.7.6", -] [[package]] name = "hashbrown" @@ -1714,16 +1646,16 @@ version = "0.13.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "43a3c133739dddd0d2990f9a4bdf8eb4b21ef50e4851ca85ab661199821d510e" dependencies = [ - "ahash 0.8.3", + "ahash", ] [[package]] name = "hashlink" -version = "0.8.1" +version = "0.8.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "69fe1fcf8b4278d860ad0548329f892a3631fb63f82574df68275f34cdbe0ffa" +checksum = "0761a1b9491c4f2e3d66aa0f62d0fba0af9a0e2852e4d48ea506632a4b56e6aa" dependencies = [ - "hashbrown 0.12.3", + "hashbrown 0.13.2", ] [[package]] @@ -1892,6 +1824,19 @@ dependencies = [ "tokio-rustls 0.23.4", ] +[[package]] +name = "hyper-rustls" +version = "0.24.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0646026eb1b3eea4cd9ba47912ea5ce9cc07713d105b1a14698f4e6433d348b7" +dependencies = [ + "http", + "hyper", + "rustls 0.21.1", + "tokio", + "tokio-rustls 0.24.0", +] + [[package]] name = "hyper-timeout" version = "0.4.1" @@ -1933,12 +1878,11 @@ dependencies = [ [[package]] name = "iana-time-zone-haiku" -version = "0.1.1" +version = "0.1.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0703ae284fc167426161c2e3f1da3ea71d94b21bedbcc9494e92b28e334e3dca" +checksum = "f31827a206f56af32e590ba56d5d2d085f558508192593743f16b2306495269f" dependencies = [ - "cxx", - "cxx-build", + "cc", ] [[package]] @@ -1999,9 +1943,9 @@ dependencies = [ [[package]] name = "io-lifetimes" -version = "1.0.10" +version = "1.0.11" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9c66c74d2ae7e79a5a8f7ac924adbe38ee42a859c6539ad869eb51f0b52dc220" +checksum = "eae7b9aee968036d54dce06cebaefd919e4472e753296daccd6d344e3e2df0c2" dependencies = [ "hermit-abi 0.3.1", "libc", @@ -2022,7 +1966,7 @@ checksum = "adcf93614601c8129ddf72e2d5633df827ba6551541c6d8c59520a371475be1f" dependencies = [ "hermit-abi 0.3.1", "io-lifetimes", - "rustix 0.37.11", + "rustix 0.37.19", "windows-sys 0.48.0", ] @@ -2043,9 +1987,9 @@ checksum = "453ad9f582a441959e5f0d088b02ce04cfe8d51a8eaf077f12ac6d3e94164ca6" [[package]] name = "js-sys" -version = "0.3.61" +version = "0.3.63" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "445dde2150c55e483f3d8416706b97ec8e8237c307e5b7b4b8dd15e6af2a0730" +checksum = "2f37a4a5928311ac501dee68b3c7613a1037d0edb30c8e5427bd832d55d1b790" dependencies = [ "wasm-bindgen", ] @@ -2056,7 +2000,7 @@ version = "8.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = 
"6971da4d9c3aa03c3d8f3ff0f4155b534aad021292003895a469716b2a230378" dependencies = [ - "base64 0.21.0", + "base64 0.21.1", "pem", "ring", "serde", @@ -2098,9 +2042,9 @@ checksum = "830d08ce1d1d941e6b30645f1a0eb5643013d835ce3779a5fc208261dbe10f55" [[package]] name = "libc" -version = "0.2.141" +version = "0.2.144" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3304a64d199bb964be99741b7a14d26972741915b3649639149b2479bb46f4b5" +checksum = "2b00cc1c228a6782d0f076e7b232802e0c5689d41bb5df366f2a6b6621cfdfe1" [[package]] name = "libloading" @@ -2112,15 +2056,6 @@ dependencies = [ "winapi", ] -[[package]] -name = "link-cplusplus" -version = "1.0.8" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ecd207c9c713c34f95a097a5b029ac2ce6010530c7b49d7fea24d977dede04f5" -dependencies = [ - "cc", -] - [[package]] name = "linux-raw-sys" version = "0.1.4" @@ -2129,9 +2064,9 @@ checksum = "f051f77a7c8e6957c0696eac88f26b0117e54f52d3fc682ab19397a8812846a4" [[package]] name = "linux-raw-sys" -version = "0.3.1" +version = "0.3.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d59d8c75012853d2e872fb56bc8a2e53718e2cafe1a4c823143141c6d90c322f" +checksum = "ef53942eb7bf7ff43a617b3e2c1c4a5ecf5944a7c1bc12d7ee39bbb15e5c1519" [[package]] name = "lock_api" @@ -2316,9 +2251,9 @@ dependencies = [ [[package]] name = "notify" -version = "5.1.0" +version = "5.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "58ea850aa68a06e48fdb069c0ec44d0d64c8dbffa49bf3b6f7f0a901fdea1ba9" +checksum = "729f63e1ca555a43fe3efa4f3efdf4801c479da85b432242a7b726f353c88486" dependencies = [ "bitflags", "crossbeam-channel", @@ -2329,7 +2264,7 @@ dependencies = [ "libc", "mio", "walkdir", - "windows-sys 0.42.0", + "windows-sys 0.45.0", ] [[package]] @@ -2435,7 +2370,7 @@ checksum = "a948666b637a0f465e8564c73e89d4dde00d72d4d473cc972f390fc3dcee7d9c" dependencies = [ "proc-macro2", "quote", - "syn 2.0.15", + "syn 2.0.16", ] [[package]] @@ -2587,6 +2522,21 @@ version = "0.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b15813163c1d831bf4a13c3610c05c0d03b39feb07f7e09fa234dac9b15aaf39" +[[package]] +name = "pagectl" +version = "0.1.0" +dependencies = [ + "anyhow", + "bytes", + "clap 4.3.0", + "git-version", + "pageserver", + "postgres_ffi", + "svg_fmt", + "utils", + "workspace_hack", +] + [[package]] name = "pageserver" version = "0.1.0" @@ -2597,7 +2547,7 @@ dependencies = [ "byteorder", "bytes", "chrono", - "clap 4.2.2", + "clap 4.3.0", "close_fds", "const_format", "consumption_metrics", @@ -2753,22 +2703,22 @@ dependencies = [ [[package]] name = "pin-project" -version = "1.0.12" +version = "1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ad29a609b6bcd67fee905812e544992d216af9d755757c05ed2d0e15a74c6ecc" +checksum = "c95a7476719eab1e366eaf73d0260af3021184f18177925b07f54b30089ceead" dependencies = [ "pin-project-internal", ] [[package]] name = "pin-project-internal" -version = "1.0.12" +version = "1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "069bdb1e05adc7a8990dce9cc75370895fbe4e3d58b9b73bf1aee56359344a55" +checksum = "39407670928234ebc5e6e580247dd567ad73a3578460c5990f9503df207e8f07" dependencies = [ "proc-macro2", "quote", - "syn 1.0.109", + "syn 2.0.16", ] [[package]] @@ -2785,9 +2735,9 @@ checksum = "8b870d8c151b6f2fb93e84a13146138f05d02ed11c7e7c54f8826aaaf7c9f184" [[package]] name = "pkg-config" -version = "0.3.26" +version = 
"0.3.27" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6ac9a59f73473f1b8d852421e59e64809f025994837ef743615c6d0c5b305160" +checksum = "26072860ba924cbfa98ea39c8c19b4dd6a4a25423dbdf219c1eca91aa0cf6964" [[package]] name = "plotters" @@ -2961,12 +2911,12 @@ dependencies = [ [[package]] name = "prettyplease" -version = "0.2.4" +version = "0.2.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1ceca8aaf45b5c46ec7ed39fff75f57290368c1846d33d24a122ca81416ab058" +checksum = "3b69d39aab54d069e7f2fe8cb970493e7834601ca2d8c65fd7bbd183578080d1" dependencies = [ "proc-macro2", - "syn 2.0.15", + "syn 2.0.16", ] [[package]] @@ -2977,9 +2927,9 @@ checksum = "dc375e1527247fe1a97d8b7156678dfe7c1af2fc075c9a4db3690ecd2a148068" [[package]] name = "proc-macro2" -version = "1.0.56" +version = "1.0.58" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2b63bdb0cd06f1f4dedf69b254734f9b45af66e4a031e42a7480257d9898b435" +checksum = "fa1fb82fc0c281dd9671101b66b771ebbe1eaf967b96ac8740dcba4b70005ca8" dependencies = [ "unicode-ident", ] @@ -2994,7 +2944,7 @@ dependencies = [ "byteorder", "hex", "lazy_static", - "rustix 0.36.12", + "rustix 0.36.14", ] [[package]] @@ -3078,7 +3028,7 @@ dependencies = [ "bstr", "bytes", "chrono", - "clap 4.2.2", + "clap 4.3.0", "consumption_metrics", "futures", "git-version", @@ -3116,7 +3066,7 @@ dependencies = [ "serde", "serde_json", "sha2", - "socket2 0.5.2", + "socket2 0.5.3", "sync_wrapper", "thiserror", "tls-listener", @@ -3139,9 +3089,9 @@ dependencies = [ [[package]] name = "quote" -version = "1.0.26" +version = "1.0.27" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4424af4bf778aae2051a77b60283332f386554255d722233d09fbfc7e30da2fc" +checksum = "8f4f29d145265ec1c483c7c654450edde0bfe043d3938d6972630663356d9500" dependencies = [ "proc-macro2", ] @@ -3230,13 +3180,13 @@ dependencies = [ [[package]] name = "regex" -version = "1.7.3" +version = "1.8.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8b1f693b24f6ac912f4893ef08244d70b6067480d2f1a46e950c9691e6749d1d" +checksum = "d1a59b5d8e97dee33696bf13c5ba8ab85341c002922fba050069326b9c498974" dependencies = [ "aho-corasick", "memchr", - "regex-syntax", + "regex-syntax 0.7.2", ] [[package]] @@ -3245,7 +3195,7 @@ version = "0.1.10" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6c230d73fb8d8c1b9c0b3135c5142a8acee3a0558fb8db5cf1cb65f8d7862132" dependencies = [ - "regex-syntax", + "regex-syntax 0.6.29", ] [[package]] @@ -3254,6 +3204,12 @@ version = "0.6.29" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f162c6dd7b008981e4d40210aca20b4bd0f9b60ca9271061b07f78537722f2e1" +[[package]] +name = "regex-syntax" +version = "0.7.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "436b050e76ed2903236f032a59761c1eb99e1b0aead2c257922771dab1fc8c78" + [[package]] name = "remote_storage" version = "0.1.0" @@ -3283,11 +3239,11 @@ dependencies = [ [[package]] name = "reqwest" -version = "0.11.16" +version = "0.11.18" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "27b71749df584b7f4cac2c426c127a7c785a5106cc98f7a8feb044115f0fa254" +checksum = "cde824a14b7c14f85caff81225f411faacc04a2013f41670f41443742b1c1c55" dependencies = [ - "base64 0.21.0", + "base64 0.21.1", "bytes", "encoding_rs", "futures-core", @@ -3296,7 +3252,7 @@ dependencies = [ "http", "http-body", "hyper", - "hyper-rustls", + 
"hyper-rustls 0.24.0", "ipnet", "js-sys", "log", @@ -3305,13 +3261,13 @@ dependencies = [ "once_cell", "percent-encoding", "pin-project-lite", - "rustls 0.20.8", + "rustls 0.21.1", "rustls-pemfile", "serde", "serde_json", "serde_urlencoded", "tokio", - "tokio-rustls 0.23.4", + "tokio-rustls 0.24.0", "tower-service", "url", "wasm-bindgen", @@ -3323,9 +3279,9 @@ dependencies = [ [[package]] name = "reqwest-middleware" -version = "0.2.1" +version = "0.2.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "99c50db2c7ccd815f976473dd7d0bde296f8c3b77c383acf4fc021cdcf10852b" +checksum = "4531c89d50effe1fac90d095c8b133c20c5c714204feee0bfc3fd158e784209d" dependencies = [ "anyhow", "async-trait", @@ -3338,12 +3294,14 @@ dependencies = [ [[package]] name = "reqwest-tracing" -version = "0.4.1" +version = "0.4.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8a71d77945a1c5ae9604f0504901e77a1e2e71f2932b1cb8103078179ca62ff8" +checksum = "783e8130d2427ddd7897dd3f814d4a3aea31b05deb42a4fdf8c18258fe5aefd1" dependencies = [ + "anyhow", "async-trait", "getrandom", + "matchit", "opentelemetry", "reqwest", "reqwest-middleware", @@ -3417,9 +3375,9 @@ dependencies = [ [[package]] name = "rustc-demangle" -version = "0.1.22" +version = "0.1.23" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d4a36c42d1873f9a77c53bde094f9664d9891bc604a45b4798fd2c389ed12e5b" +checksum = "d626bb9dae77e28219937af045c257c28bfd3f69333c512553507f5f9798cb76" [[package]] name = "rustc-hash" @@ -3447,9 +3405,9 @@ dependencies = [ [[package]] name = "rustix" -version = "0.36.12" +version = "0.36.14" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e0af200a3324fa5bcd922e84e9b55a298ea9f431a489f01961acdebc6e908f25" +checksum = "14e4d67015953998ad0eb82887a0eb0129e18a7e2f3b7b0f6c422fddcd503d62" dependencies = [ "bitflags", "errno", @@ -3461,15 +3419,15 @@ dependencies = [ [[package]] name = "rustix" -version = "0.37.11" +version = "0.37.19" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "85597d61f83914ddeba6a47b3b8ffe7365107221c2e557ed94426489fefb5f77" +checksum = "acf8729d8542766f1b2cf77eb034d52f40d375bb8b615d0b147089946e16613d" dependencies = [ "bitflags", "errno", "io-lifetimes", "libc", - "linux-raw-sys 0.3.1", + "linux-raw-sys 0.3.8", "windows-sys 0.48.0", ] @@ -3487,9 +3445,9 @@ dependencies = [ [[package]] name = "rustls" -version = "0.21.0" +version = "0.21.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "07180898a28ed6a7f7ba2311594308f595e3dd2e3c3812fa0a80a47b45f17e5d" +checksum = "c911ba11bc8433e811ce56fde130ccf32f5127cab0e0194e9c68c5a5b671791e" dependencies = [ "log", "ring", @@ -3515,7 +3473,7 @@ version = "1.0.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d194b56d58803a43635bdc398cd17e383d6f71f9182b9a192c127ca42494a59b" dependencies = [ - "base64 0.21.0", + "base64 0.21.1", ] [[package]] @@ -3550,7 +3508,7 @@ dependencies = [ "byteorder", "bytes", "chrono", - "clap 4.2.2", + "clap 4.3.0", "const_format", "crc32c", "fs2", @@ -3624,12 +3582,6 @@ version = "1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d29ab0c6d3fc0ee92fe66e2d99f700eab17a8d57d1c1d3b748380fb20baa78cd" -[[package]] -name = "scratch" -version = "1.0.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1792db035ce95be60c3f8853017b3999209281c24e2ba5bc8e59bf97a0c590c1" - [[package]] name = "sct" 
version = "0.7.0" @@ -3642,9 +3594,9 @@ dependencies = [ [[package]] name = "security-framework" -version = "2.8.2" +version = "2.9.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a332be01508d814fed64bf28f798a146d73792121129962fdf335bb3c49a4254" +checksum = "1fc758eb7bffce5b308734e9b0c1468893cae9ff70ebf13e7090be8dcbcc83a8" dependencies = [ "bitflags", "core-foundation", @@ -3655,9 +3607,9 @@ dependencies = [ [[package]] name = "security-framework-sys" -version = "2.8.0" +version = "2.9.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "31c9bb296072e961fcbd8853511dd39c2d8be2deb1e17c6860b1d30732b323b4" +checksum = "f51d0c0d83bec45f16480d0ce0058397a69e48fcdc52d1dc8855fb68acbd31a7" dependencies = [ "core-foundation-sys", "libc", @@ -3755,22 +3707,22 @@ dependencies = [ [[package]] name = "serde" -version = "1.0.160" +version = "1.0.163" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bb2f3770c8bce3bcda7e149193a069a0f4365bda1fa5cd88e03bca26afc1216c" +checksum = "2113ab51b87a539ae008b5c6c02dc020ffa39afd2d83cffcb3f4eb2722cebec2" dependencies = [ "serde_derive", ] [[package]] name = "serde_derive" -version = "1.0.160" +version = "1.0.163" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "291a097c63d8497e00160b166a967a4a79c64f3facdd01cbd7502231688d77df" +checksum = "8c805777e3930c8883389c602315a24224bcc738b63905ef87cd1420353ea93e" dependencies = [ "proc-macro2", "quote", - "syn 2.0.15", + "syn 2.0.16", ] [[package]] @@ -3786,9 +3738,9 @@ dependencies = [ [[package]] name = "serde_spanned" -version = "0.6.1" +version = "0.6.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0efd8caf556a6cebd3b285caf480045fcc1ac04f6bd786b09a6f11af30c4fcf4" +checksum = "93107647184f6027e3b7dcb2e11034cf95ffa1e3a682c67951963ac69c1c007d" dependencies = [ "serde", ] @@ -3807,9 +3759,9 @@ dependencies = [ [[package]] name = "serde_with" -version = "2.3.2" +version = "2.3.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "331bb8c3bf9b92457ab7abecf07078c13f7d270ba490103e84e8b014490cd0b0" +checksum = "07ff71d2c147a7b57362cead5e22f772cd52f6ab31cfcd9edcd7f6aeb2a0afbe" dependencies = [ "base64 0.13.1", "chrono", @@ -3823,14 +3775,14 @@ dependencies = [ [[package]] name = "serde_with_macros" -version = "2.3.2" +version = "2.3.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "859011bddcc11f289f07f467cc1fe01c7a941daa4d8f6c40d4d1c92eb6d9319c" +checksum = "881b6f881b17d13214e5d494c939ebab463d01264ce1811e9d4ac3a882e7695f" dependencies = [ "darling", "proc-macro2", "quote", - "syn 1.0.109", + "syn 2.0.16", ] [[package]] @@ -3944,9 +3896,9 @@ dependencies = [ [[package]] name = "socket2" -version = "0.5.2" +version = "0.5.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6d283f86695ae989d1e18440a943880967156325ba025f05049946bff47bcc2b" +checksum = "2538b18701741680e0322a2302176d3253a35388e2e62f172f64f4f16605f877" dependencies = [ "libc", "windows-sys 0.48.0", @@ -3986,7 +3938,7 @@ dependencies = [ "anyhow", "async-stream", "bytes", - "clap 4.2.2", + "clap 4.3.0", "const_format", "futures", "futures-core", @@ -4000,8 +3952,8 @@ dependencies = [ "prost", "tokio", "tokio-stream", - "tonic 0.9.1", - "tonic-build 0.9.1", + "tonic 0.9.2", + "tonic-build 0.9.2", "tracing", "utils", "workspace_hack", @@ -4044,9 +3996,9 @@ dependencies = [ [[package]] name = "subtle" -version = "2.4.1" +version = "2.5.0" 
source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6bdef32e8150c2a081110b42772ffe7d7c9032b606bc226c8260fd97e0976601" +checksum = "81cdd64d312baedb58e21336b31bc043b77e01cc99033ce76ef539f78e965ebc" [[package]] name = "svg_fmt" @@ -4067,9 +4019,9 @@ dependencies = [ [[package]] name = "syn" -version = "2.0.15" +version = "2.0.16" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a34fcf3e8b60f57e6a14301a2e916d323af98b0ea63c599441eec8558660c822" +checksum = "a6f671d4b5ffdb8eadec19c0ae67fe2639df8684bd7bc4b83d986b8db549cf01" dependencies = [ "proc-macro2", "quote", @@ -4123,7 +4075,7 @@ dependencies = [ "cfg-if", "fastrand", "redox_syscall 0.3.5", - "rustix 0.37.11", + "rustix 0.37.19", "windows-sys 0.45.0", ] @@ -4190,7 +4142,7 @@ checksum = "f9456a42c5b0d803c8cd86e73dd7cc9edd429499f37a3550d286d5e86720569f" dependencies = [ "proc-macro2", "quote", - "syn 2.0.15", + "syn 2.0.16", ] [[package]] @@ -4205,9 +4157,9 @@ dependencies = [ [[package]] name = "time" -version = "0.3.20" +version = "0.3.21" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cd0cbfecb4d19b5ea75bb31ad904eb5b9fa13f21079c3b92017ebdf4999a5890" +checksum = "8f3403384eaacbca9923fa06940178ac13e4edb725486d70e8e15881d0c836cc" dependencies = [ "itoa", "serde", @@ -4217,15 +4169,15 @@ dependencies = [ [[package]] name = "time-core" -version = "0.1.0" +version = "0.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2e153e1f1acaef8acc537e68b44906d2db6436e2b35ac2c6b42640fff91f00fd" +checksum = "7300fbefb4dadc1af235a9cef3737cea692a9d97e1b9cbcd4ebdae6f8868e6fb" [[package]] name = "time-macros" -version = "0.2.8" +version = "0.2.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fd80a657e71da814b8e5d60d3374fc6d35045062245d80224748ae522dd76f36" +checksum = "372950940a5f07bf38dbe211d7283c9e6d7327df53794992d293e534c733d09b" dependencies = [ "time-core", ] @@ -4305,7 +4257,7 @@ checksum = "630bdcf245f78637c13ec01ffae6187cca34625e8c63150d424b59e55af2675e" dependencies = [ "proc-macro2", "quote", - "syn 2.0.15", + "syn 2.0.16", ] [[package]] @@ -4372,15 +4324,15 @@ version = "0.24.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e0d409377ff5b1e3ca6437aa86c1eb7d40c134bfec254e44c830defa92669db5" dependencies = [ - "rustls 0.21.0", + "rustls 0.21.1", "tokio", ] [[package]] name = "tokio-stream" -version = "0.1.12" +version = "0.1.14" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8fb52b74f05dbf495a8fba459fdc331812b96aa086d9eb78101fa0d4569c3313" +checksum = "397c988d37662c7dda6d2208364a706264bf3d6138b11d436cbac0ad38832842" dependencies = [ "futures-core", "pin-project-lite", @@ -4415,9 +4367,9 @@ dependencies = [ [[package]] name = "tokio-util" -version = "0.7.7" +version = "0.7.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5427d89453009325de0d8f342c9490009f76e999cb7672d77e46267448f7e6b2" +checksum = "806fe8c2c87eccc8b3267cbae29ed3ab2d0bd37fca70ab622e46aaa9375ddb7d" dependencies = [ "bytes", "futures-core", @@ -4429,9 +4381,9 @@ dependencies = [ [[package]] name = "toml" -version = "0.7.3" +version = "0.7.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b403acf6f2bb0859c93c7f0d967cb4a75a7ac552100f9322faf64dc047669b21" +checksum = "d6135d499e69981f9ff0ef2167955a5333c35e36f6937d382974566b3d5b94ec" dependencies = [ "serde", "serde_spanned", @@ -4441,18 +4393,18 @@ dependencies = [ 
 [[package]]
 name = "toml_datetime"
-version = "0.6.1"
+version = "0.6.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "3ab8ed2edee10b50132aed5f331333428b011c99402b5a534154ed15746f9622"
+checksum = "5a76a9312f5ba4c2dec6b9161fdf25d87ad8a09256ccea5a556fef03c706a10f"
 dependencies = [
  "serde",
 ]

 [[package]]
 name = "toml_edit"
-version = "0.19.8"
+version = "0.19.10"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "239410c8609e8125456927e6707163a3b1fdb40561e4b803bc041f466ccfdc13"
+checksum = "2380d56e8670370eee6566b0bfd4265f65b3f432e8c6d85623f728d4fa31f739"
 dependencies = [
  "indexmap",
  "serde",
@@ -4495,14 +4447,14 @@ dependencies = [

 [[package]]
 name = "tonic"
-version = "0.9.1"
+version = "0.9.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "38bd8e87955eb13c1986671838177d6792cdc52af9bffced0d2c8a9a7f741ab3"
+checksum = "3082666a3a6433f7f511c7192923fa1fe07c69332d3c6a2e6bb040b569199d5a"
 dependencies = [
  "async-stream",
  "async-trait",
  "axum",
- "base64 0.21.0",
+ "base64 0.21.1",
  "bytes",
  "futures-core",
  "futures-util",
@@ -4540,9 +4492,9 @@ dependencies = [

 [[package]]
 name = "tonic-build"
-version = "0.9.1"
+version = "0.9.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "0f60a933bbea70c95d633c04c951197ddf084958abaa2ed502a3743bdd8d8dd7"
+checksum = "a6fdaae4c2c638bb70fe42803a26fbd6fc6ac8c72f5c59f67ecc2a2dcabf4b07"
 dependencies = [
  "prettyplease 0.1.25",
  "proc-macro2",
@@ -4588,7 +4540,7 @@ name = "trace"
 version = "0.1.0"
 dependencies = [
  "anyhow",
- "clap 4.2.2",
+ "clap 4.3.0",
  "pageserver_api",
  "utils",
  "workspace_hack",
 ]
@@ -4609,20 +4561,20 @@ dependencies = [

 [[package]]
 name = "tracing-attributes"
-version = "0.1.23"
+version = "0.1.24"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "4017f8f45139870ca7e672686113917c71c7a6e02d4924eda67186083c03081a"
+checksum = "0f57e3ca2a01450b1a921183a9c9cbfda207fd822cef4ccb00a65402cbba7a74"
 dependencies = [
  "proc-macro2",
  "quote",
- "syn 1.0.109",
+ "syn 2.0.16",
 ]

 [[package]]
 name = "tracing-core"
-version = "0.1.30"
+version = "0.1.31"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "24eb03ba0eab1fd845050058ce5e616558e8f8d8fca633e6b163fe25c797213a"
+checksum = "0955b8137a1df6f1a2e9a37d8a6656291ff0297c1a97c24e0d8425fe2312f79a"
 dependencies = [
  "once_cell",
  "valuable",
@@ -4685,9 +4637,9 @@ dependencies = [

 [[package]]
 name = "tracing-subscriber"
-version = "0.3.16"
+version = "0.3.17"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "a6176eae26dd70d0c919749377897b54a9276bd7061339665dd68777926b5a70"
+checksum = "30a651bc37f915e81f087d86e62a18eec5f79550c7faff886f7090b4ea757c77"
 dependencies = [
  "matchers",
  "nu-ansi-term",
@@ -4777,9 +4729,9 @@ checksum = "92888ba5573ff080736b3648696b70cafad7d250551175acbaa4e0385b3e1460"

 [[package]]
 name = "unicode-ident"
-version = "1.0.8"
+version = "1.0.9"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "e5464a87b239f13a63a501f2701565754bae92d243d4bb7eb12f6d57d2269bf4"
+checksum = "b15811caf2415fb889178633e7724bad2509101cde276048e013b9def5e51fa0"

 [[package]]
 name = "unicode-normalization"
@@ -4899,9 +4851,9 @@ dependencies = [

 [[package]]
 name = "uuid"
-version = "1.3.1"
+version = "1.3.3"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "5b55a3fef2a1e3b3a00ce878640918820d3c51081576ac657d23af9fc7928fdb"
+checksum = "345444e32442451b267fc254ae85a209c64be56d2890e601a0c37ff0c3c5ecd2"
 dependencies = [
  "getrandom",
  "serde",
@@ -4936,7 +4888,7 @@ name = "wal_craft"
 version = "0.1.0"
 dependencies = [
  "anyhow",
- "clap 4.2.2",
+ "clap 4.3.0",
  "env_logger",
  "log",
  "once_cell",
@@ -4974,9 +4926,9 @@ checksum = "9c8d87e72b64a3b4db28d11ce29237c246188f4f51057d65a7eab63b7987e423"

 [[package]]
 name = "wasm-bindgen"
-version = "0.2.84"
+version = "0.2.86"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "31f8dcbc21f30d9b8f2ea926ecb58f6b91192c17e9d33594b3df58b2007ca53b"
+checksum = "5bba0e8cb82ba49ff4e229459ff22a191bbe9a1cb3a341610c9c33efc27ddf73"
 dependencies = [
  "cfg-if",
  "wasm-bindgen-macro",
@@ -4984,24 +4936,24 @@ dependencies = [

 [[package]]
 name = "wasm-bindgen-backend"
-version = "0.2.84"
+version = "0.2.86"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "95ce90fd5bcc06af55a641a86428ee4229e44e07033963a2290a8e241607ccb9"
+checksum = "19b04bc93f9d6bdee709f6bd2118f57dd6679cf1176a1af464fca3ab0d66d8fb"
 dependencies = [
  "bumpalo",
  "log",
  "once_cell",
  "proc-macro2",
  "quote",
- "syn 1.0.109",
+ "syn 2.0.16",
  "wasm-bindgen-shared",
 ]

 [[package]]
 name = "wasm-bindgen-futures"
-version = "0.4.34"
+version = "0.4.36"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "f219e0d211ba40266969f6dbdd90636da12f75bee4fc9d6c23d1260dadb51454"
+checksum = "2d1985d03709c53167ce907ff394f5316aa22cb4e12761295c5dc57dacb6297e"
 dependencies = [
  "cfg-if",
  "js-sys",
@@ -5011,9 +4963,9 @@ dependencies = [

 [[package]]
 name = "wasm-bindgen-macro"
-version = "0.2.84"
+version = "0.2.86"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "4c21f77c0bedc37fd5dc21f897894a5ca01e7bb159884559461862ae90c0b4c5"
+checksum = "14d6b024f1a526bb0234f52840389927257beb670610081360e5a03c5df9c258"
 dependencies = [
  "quote",
  "wasm-bindgen-macro-support",
@@ -5021,28 +4973,28 @@ dependencies = [

 [[package]]
 name = "wasm-bindgen-macro-support"
-version = "0.2.84"
+version = "0.2.86"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "2aff81306fcac3c7515ad4e177f521b5c9a15f2b08f4e32d823066102f35a5f6"
+checksum = "e128beba882dd1eb6200e1dc92ae6c5dbaa4311aa7bb211ca035779e5efc39f8"
 dependencies = [
  "proc-macro2",
  "quote",
- "syn 1.0.109",
+ "syn 2.0.16",
  "wasm-bindgen-backend",
  "wasm-bindgen-shared",
 ]

 [[package]]
 name = "wasm-bindgen-shared"
-version = "0.2.84"
+version = "0.2.86"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "0046fef7e28c3804e5e38bfa31ea2a0f73905319b677e57ebe37e49358989b5d"
+checksum = "ed9d5b4305409d1fc9482fee2d7f9bcbf24b3972bf59817ef757e23982242a93"

 [[package]]
 name = "web-sys"
-version = "0.3.61"
+version = "0.3.63"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "e33b99f4b23ba3eec1a53ac264e35a755f00e966e0065077d6027c0f575b0b97"
+checksum = "3bdd9ef4e984da1187bf8110c5cf5b845fbc87a23602cdf912386a76fcd3a7c2"
 dependencies = [
  "js-sys",
  "wasm-bindgen",
@@ -5276,9 +5228,9 @@ checksum = "1a515f5799fe4961cb532f983ce2b23082366b898e52ffbce459c86f67c8378a"

 [[package]]
 name = "winnow"
-version = "0.4.1"
+version = "0.4.6"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "ae8970b36c66498d8ff1d66685dc86b91b29db0c7739899012f63a63814b4b28"
+checksum = "61de7bac303dc551fe038e2b3cef0f571087a47571ea6e79a87692ac99b99699"
 dependencies = [
  "memchr",
 ]
@@ -5299,7 +5251,7 @@ dependencies = [
  "anyhow",
  "bytes",
  "chrono",
- "clap 4.2.2",
+ "clap 4.3.0",
"clap_builder", "crossbeam-utils", "either", @@ -5310,7 +5262,6 @@ dependencies = [ "futures-executor", "futures-sink", "futures-util", - "hashbrown 0.12.3", "itertools", "libc", "log", @@ -5322,7 +5273,7 @@ dependencies = [ "prost", "rand", "regex", - "regex-syntax", + "regex-syntax 0.7.2", "reqwest", "ring", "rustls 0.20.8", @@ -5331,7 +5282,7 @@ dependencies = [ "serde_json", "socket2 0.4.9", "syn 1.0.109", - "syn 2.0.15", + "syn 2.0.16", "tokio", "tokio-rustls 0.23.4", "tokio-util", diff --git a/Cargo.toml b/Cargo.toml index 7895459841..1cb8d65948 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -3,6 +3,7 @@ members = [ "compute_tools", "control_plane", "pageserver", + "pageserver/ctl", "proxy", "safekeeper", "storage_broker", @@ -22,7 +23,7 @@ async-stream = "0.3" async-trait = "0.1" atty = "0.2.14" aws-config = { version = "0.55", default-features = false, features=["rustls"] } -aws-sdk-s3 = "0.25" +aws-sdk-s3 = "0.27" aws-smithy-http = "0.55" aws-credential-types = "0.55" aws-types = "0.55" diff --git a/Dockerfile b/Dockerfile index 7364654641..9467e41ae4 100644 --- a/Dockerfile +++ b/Dockerfile @@ -47,8 +47,7 @@ RUN set -e \ && mold -run cargo build \ --bin pg_sni_router \ --bin pageserver \ - --bin pageserver_binutils \ - --bin draw_timeline_dir \ + --bin pagectl \ --bin safekeeper \ --bin storage_broker \ --bin proxy \ @@ -73,8 +72,7 @@ RUN set -e \ COPY --from=build --chown=neon:neon /home/nonroot/target/release/pg_sni_router /usr/local/bin COPY --from=build --chown=neon:neon /home/nonroot/target/release/pageserver /usr/local/bin -COPY --from=build --chown=neon:neon /home/nonroot/target/release/pageserver_binutils /usr/local/bin -COPY --from=build --chown=neon:neon /home/nonroot/target/release/draw_timeline_dir /usr/local/bin +COPY --from=build --chown=neon:neon /home/nonroot/target/release/pagectl /usr/local/bin COPY --from=build --chown=neon:neon /home/nonroot/target/release/safekeeper /usr/local/bin COPY --from=build --chown=neon:neon /home/nonroot/target/release/storage_broker /usr/local/bin COPY --from=build --chown=neon:neon /home/nonroot/target/release/proxy /usr/local/bin diff --git a/Dockerfile.compute-node b/Dockerfile.compute-node index 3a3dee8a8a..44e13a6c73 100644 --- a/Dockerfile.compute-node +++ b/Dockerfile.compute-node @@ -517,6 +517,22 @@ RUN wget https://github.com/kelvich/pg_tiktoken/archive/801f84f08c6881c8aa30f405 cargo pgx install --release && \ echo "trusted = true" >> /usr/local/pgsql/share/extension/pg_tiktoken.control +######################################################################################### +# +# Layer "pg-pgx-ulid-build" +# Compile "pgx_ulid" extension +# +######################################################################################### + +FROM rust-extensions-build AS pg-pgx-ulid-build + +RUN wget https://github.com/pksunkara/pgx_ulid/archive/refs/tags/v0.1.0.tar.gz -O pgx_ulid.tar.gz && \ + echo "908b7358e6f846e87db508ae5349fb56a88ee6305519074b12f3d5b0ff09f791 pgx_ulid.tar.gz" | sha256sum --check && \ + mkdir pgx_ulid-src && cd pgx_ulid-src && tar xvzf ../pgx_ulid.tar.gz --strip-components=1 -C . 
&& \ + sed -i 's/pgx = "=0.7.3"/pgx = { version = "0.7.3", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \ + cargo pgx install --release && \ + echo "trusted = true" >> /usr/local/pgsql/share/extension/ulid.control + ######################################################################################### # # Layer "neon-pg-ext-build" @@ -547,6 +563,7 @@ COPY --from=timescaledb-pg-build /usr/local/pgsql/ /usr/local/pgsql/ COPY --from=pg-hint-plan-pg-build /usr/local/pgsql/ /usr/local/pgsql/ COPY --from=kq-imcx-pg-build /usr/local/pgsql/ /usr/local/pgsql/ COPY --from=pg-cron-pg-build /usr/local/pgsql/ /usr/local/pgsql/ +COPY --from=pg-pgx-ulid-build /usr/local/pgsql/ /usr/local/pgsql/ COPY pgxn/ pgxn/ RUN make -j $(getconf _NPROCESSORS_ONLN) \ @@ -556,6 +573,10 @@ RUN make -j $(getconf _NPROCESSORS_ONLN) \ make -j $(getconf _NPROCESSORS_ONLN) \ PG_CONFIG=/usr/local/pgsql/bin/pg_config \ -C pgxn/neon_utils \ + -s install && \ + make -j $(getconf _NPROCESSORS_ONLN) \ + PG_CONFIG=/usr/local/pgsql/bin/pg_config \ + -C pgxn/hnsw \ -s install ######################################################################################### @@ -632,6 +653,7 @@ RUN apt update && \ libxml2 \ libxslt1.1 \ libzstd1 \ + libcurl4-openssl-dev \ procps && \ rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* && \ localedef -i en_US -c -f UTF-8 -A /usr/share/locale/locale.alias en_US.UTF-8 diff --git a/Makefile b/Makefile index 9d78c5d0fc..ae979b8b4c 100644 --- a/Makefile +++ b/Makefile @@ -138,6 +138,11 @@ neon-pg-ext-%: postgres-% $(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/$*/bin/pg_config CFLAGS='$(PG_CFLAGS) $(COPT)' \ -C $(POSTGRES_INSTALL_DIR)/build/neon-utils-$* \ -f $(ROOT_PROJECT_DIR)/pgxn/neon_utils/Makefile install + +@echo "Compiling hnsw $*" + mkdir -p $(POSTGRES_INSTALL_DIR)/build/hnsw-$* + $(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/$*/bin/pg_config CFLAGS='$(PG_CFLAGS) $(COPT)' \ + -C $(POSTGRES_INSTALL_DIR)/build/hnsw-$* \ + -f $(ROOT_PROJECT_DIR)/pgxn/hnsw/Makefile install .PHONY: neon-pg-ext-clean-% neon-pg-ext-clean-%: @@ -153,6 +158,9 @@ neon-pg-ext-clean-%: $(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/$*/bin/pg_config \ -C $(POSTGRES_INSTALL_DIR)/build/neon-utils-$* \ -f $(ROOT_PROJECT_DIR)/pgxn/neon_utils/Makefile clean + $(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/$*/bin/pg_config \ + -C $(POSTGRES_INSTALL_DIR)/build/hnsw-$* \ + -f $(ROOT_PROJECT_DIR)/pgxn/hnsw/Makefile clean .PHONY: neon-pg-ext neon-pg-ext: \ diff --git a/README.md b/README.md index 8e6f2cda81..efa714e5be 100644 --- a/README.md +++ b/README.md @@ -28,18 +28,19 @@ See developer documentation in [SUMMARY.md](/docs/SUMMARY.md) for more informati * On Ubuntu or Debian, this set of packages should be sufficient to build the code: ```bash apt install build-essential libtool libreadline-dev zlib1g-dev flex bison libseccomp-dev \ -libssl-dev clang pkg-config libpq-dev cmake postgresql-client protobuf-compiler +libssl-dev clang pkg-config libpq-dev cmake postgresql-client protobuf-compiler \ +libcurl4-openssl-dev ``` * On Fedora, these packages are needed: ```bash dnf install flex bison readline-devel zlib-devel openssl-devel \ libseccomp-devel perl clang cmake postgresql postgresql-contrib protobuf-compiler \ - protobuf-devel + protobuf-devel libcurl-devel ``` * On Arch based systems, these packages are needed: ```bash pacman -S base-devel readline zlib libseccomp openssl clang \ -postgresql-libs cmake postgresql protobuf +postgresql-libs cmake postgresql protobuf curl ``` Building Neon requires 3.15+ version of `protoc` 
(protobuf-compiler). If your distribution provides an older version, you can install a newer version from [here](https://github.com/protocolbuffers/protobuf/releases). diff --git a/compute_tools/src/bin/compute_ctl.rs b/compute_tools/src/bin/compute_ctl.rs index 2f515c9bf1..c6cfde1d1a 100644 --- a/compute_tools/src/bin/compute_ctl.rs +++ b/compute_tools/src/bin/compute_ctl.rs @@ -59,6 +59,9 @@ fn main() -> Result<()> { let matches = cli().get_matches(); + let http_port = *matches + .get_one::<u16>("http-port") + .expect("http-port is required"); let pgdata = matches .get_one::<String>("pgdata") .expect("PGDATA path is required"); @@ -178,7 +181,8 @@ fn main() -> Result<()> { // Launch http service first, so we were able to serve control-plane // requests, while configuration is still in progress. - let _http_handle = launch_http_server(&compute).expect("cannot launch http endpoint thread"); + let _http_handle = + launch_http_server(http_port, &compute).expect("cannot launch http endpoint thread"); if !spec_set { // No spec provided, hang waiting for it. @@ -286,6 +290,14 @@ fn cli() -> clap::Command { let version = option_env!("CARGO_PKG_VERSION").unwrap_or("unknown"); clap::Command::new("compute_ctl") .version(version) + .arg( + Arg::new("http-port") + .long("http-port") + .value_name("HTTP_PORT") + .default_value("3080") + .value_parser(clap::value_parser!(u16)) + .required(false), + ) .arg( Arg::new("connstr") .short('C') diff --git a/compute_tools/src/compute.rs b/compute_tools/src/compute.rs index da5ad00da6..617b330704 100644 --- a/compute_tools/src/compute.rs +++ b/compute_tools/src/compute.rs @@ -1,19 +1,3 @@ -// -// XXX: This starts to be scarry similar to the `PostgresNode` from `control_plane`, -// but there are several things that makes `PostgresNode` usage inconvenient in the -// cloud: -// - it inherits from `LocalEnv`, which contains **all-all** the information about -// a complete service running -// - it uses `PageServerNode` with information about http endpoint, which we do not -// need in the cloud again -// - many tiny pieces like, for example, we do not use `pg_ctl` in the cloud -// -// Thus, to use `PostgresNode` in the cloud, we need to 'mock' a bunch of required -// attributes (not required for the cloud). Yet, it is still tempting to unify these -// `PostgresNode` and `ComputeNode` and use one in both places. -// -// TODO: stabilize `ComputeNode` and think about using it in the `control_plane`. -// use std::fs; use std::os::unix::fs::PermissionsExt; use std::path::Path; @@ -106,26 +90,38 @@ pub struct ParsedSpec { impl TryFrom<ComputeSpec> for ParsedSpec { type Error = String; fn try_from(spec: ComputeSpec) -> Result<Self, Self::Error> { + // Extract the options from the spec file that are needed to connect to + // the storage system. + // + // For backwards-compatibility, the top-level fields in the spec file + // may be empty. In that case, we need to dig them from the GUCs in the + // cluster.settings field. let pageserver_connstr = spec - .cluster - .settings - .find("neon.pageserver_connstring") + .pageserver_connstring + .clone() + .or_else(|| spec.cluster.settings.find("neon.pageserver_connstring")) .ok_or("pageserver connstr should be provided")?; let storage_auth_token = spec.storage_auth_token.clone(); - let tenant_id: TenantId = spec - .cluster - .settings - .find("neon.tenant_id") - .ok_or("tenant id should be provided") - .map(|s| TenantId::from_str(&s))?
- .or(Err("invalid tenant id"))?; - let timeline_id: TimelineId = spec - .cluster - .settings - .find("neon.timeline_id") - .ok_or("timeline id should be provided") - .map(|s| TimelineId::from_str(&s))? - .or(Err("invalid timeline id"))?; + let tenant_id: TenantId = if let Some(tenant_id) = spec.tenant_id { + tenant_id + } else { + spec.cluster + .settings + .find("neon.tenant_id") + .ok_or("tenant id should be provided") + .map(|s| TenantId::from_str(&s))? + .or(Err("invalid tenant id"))? + }; + let timeline_id: TimelineId = if let Some(timeline_id) = spec.timeline_id { + timeline_id + } else { + spec.cluster + .settings + .find("neon.timeline_id") + .ok_or("timeline id should be provided") + .map(|s| TimelineId::from_str(&s))? + .or(Err("invalid timeline id"))? + }; Ok(ParsedSpec { spec, @@ -295,8 +291,8 @@ impl ComputeNode { update_pg_hba(pgdata_path)?; match spec.mode { - ComputeMode::Primary | ComputeMode::Static(..) => {} - ComputeMode::Replica => { + ComputeMode::Primary => {} + ComputeMode::Replica | ComputeMode::Static(..) => { add_standby_signal(pgdata_path)?; } } @@ -362,6 +358,8 @@ impl ComputeNode { }; // Proceed with post-startup configuration. Note, that order of operations is important. + // Disable DDL forwarding because control plane already knows about these roles/databases. + client.simple_query("SET neon.forward_ddl = false")?; let spec = &compute_state.pspec.as_ref().expect("spec must be set").spec; handle_roles(spec, &mut client)?; handle_databases(spec, &mut client)?; @@ -374,7 +372,7 @@ impl ComputeNode { info!( "finished configuration of compute for project {}", - spec.cluster.cluster_id + spec.cluster.cluster_id.as_deref().unwrap_or("None") ); Ok(()) @@ -403,7 +401,9 @@ impl ComputeNode { self.pg_reload_conf(&mut client)?; // Proceed with post-startup configuration. Note, that order of operations is important. + // Disable DDL forwarding because control plane already knows about these roles/databases. if spec.mode == ComputeMode::Primary { + client.simple_query("SET neon.forward_ddl = false")?; handle_roles(&spec, &mut client)?; handle_databases(&spec, &mut client)?; handle_role_deletions(&spec, self.connstr.as_str(), &mut client)?; @@ -430,7 +430,7 @@ impl ComputeNode { let spec = compute_state.pspec.as_ref().expect("spec must be set"); info!( "starting compute for project {}, operation {}, tenant {}, timeline {}", - spec.spec.cluster.cluster_id, + spec.spec.cluster.cluster_id.as_deref().unwrap_or("None"), spec.spec.operation_uuid.as_deref().unwrap_or("None"), spec.tenant_id, spec.timeline_id, diff --git a/compute_tools/src/config.rs b/compute_tools/src/config.rs index 1168f3876a..99346433d0 100644 --- a/compute_tools/src/config.rs +++ b/compute_tools/src/config.rs @@ -5,6 +5,7 @@ use std::path::Path; use anyhow::Result; +use crate::pg_helpers::escape_conf_value; use crate::pg_helpers::PgOptionsSerialize; use compute_api::spec::{ComputeMode, ComputeSpec}; @@ -36,10 +37,44 @@ pub fn write_postgres_conf(path: &Path, spec: &ComputeSpec) -> Result<()> { // File::create() destroys the file content if it exists. let mut file = File::create(path)?; - writeln!(file, "# Managed by compute_ctl: begin")?; + // Write the postgresql.conf content from the spec file as is. 
+ if let Some(conf) = &spec.cluster.postgresql_conf { + writeln!(file, "{}", conf)?; + } write!(file, "{}", &spec.cluster.settings.as_pg_settings())?; + // Add options for connecting to storage + writeln!(file, "# Neon storage settings")?; + if let Some(s) = &spec.pageserver_connstring { + writeln!( + file, + "neon.pageserver_connstring='{}'", + escape_conf_value(s) + )?; + } + if !spec.safekeeper_connstrings.is_empty() { + writeln!( + file, + "neon.safekeepers='{}'", + escape_conf_value(&spec.safekeeper_connstrings.join(",")) + )?; + } + if let Some(s) = &spec.tenant_id { + writeln!( + file, + "neon.tenant_id='{}'", + escape_conf_value(&s.to_string()) + )?; + } + if let Some(s) = &spec.timeline_id { + writeln!( + file, + "neon.timeline_id='{}'", + escape_conf_value(&s.to_string()) + )?; + } + match spec.mode { ComputeMode::Primary => {} ComputeMode::Static(lsn) => { @@ -53,7 +88,12 @@ pub fn write_postgres_conf(path: &Path, spec: &ComputeSpec) -> Result<()> { } } - writeln!(file, "# Managed by compute_ctl: end")?; + // If there are any extra options in the 'settings' field, append those + if spec.cluster.settings.is_some() { + writeln!(file, "# Managed by compute_ctl: begin")?; + write!(file, "{}", spec.cluster.settings.as_pg_settings())?; + writeln!(file, "# Managed by compute_ctl: end")?; + } Ok(()) } diff --git a/compute_tools/src/http/api.rs b/compute_tools/src/http/api.rs index 4468f6f5e4..afd9c2fb54 100644 --- a/compute_tools/src/http/api.rs +++ b/compute_tools/src/http/api.rs @@ -220,8 +220,8 @@ fn render_json_error(e: &str, status: StatusCode) -> Response<Body> { // Main Hyper HTTP server function that runs it and blocks waiting on it forever. #[tokio::main] -async fn serve(state: Arc<ComputeNode>) { - let addr = SocketAddr::from(([0, 0, 0, 0], 3080)); +async fn serve(port: u16, state: Arc<ComputeNode>) { + let addr = SocketAddr::from(([0, 0, 0, 0], port)); let make_service = make_service_fn(move |_conn| { let state = state.clone(); @@ -256,10 +256,10 @@ async fn serve(state: Arc<ComputeNode>) { } /// Launch a separate Hyper HTTP API server thread and return its `JoinHandle`. -pub fn launch_http_server(state: &Arc<ComputeNode>) -> Result<thread::JoinHandle<()>> { +pub fn launch_http_server(port: u16, state: &Arc<ComputeNode>) -> Result<thread::JoinHandle<()>> { let state = Arc::clone(state); Ok(thread::Builder::new() .name("http-endpoint".into()) - .spawn(move || serve(state))?) + .spawn(move || serve(port, state))?) } diff --git a/compute_tools/src/logger.rs b/compute_tools/src/logger.rs index 1b5cf647b0..f6fc882968 100644 --- a/compute_tools/src/logger.rs +++ b/compute_tools/src/logger.rs @@ -33,5 +33,7 @@ pub fn init_tracing_and_logging(default_log_level: &str) -> anyhow::Result<()> { .init(); tracing::info!("logging and tracing started"); + utils::logging::replace_panic_hook_with_tracing_panic_hook().forget(); + Ok(()) } diff --git a/compute_tools/src/pg_helpers.rs b/compute_tools/src/pg_helpers.rs index 40dbea6907..d5c845e9ea 100644 --- a/compute_tools/src/pg_helpers.rs +++ b/compute_tools/src/pg_helpers.rs @@ -23,7 +23,7 @@ fn escape_literal(s: &str) -> String { /// Escape a string so that it can be used in postgresql.conf. /// Same as escape_literal, currently. -fn escape_conf_value(s: &str) -> String { +pub fn escape_conf_value(s: &str) -> String { s.replace('\'', "''").replace('\\', "\\\\") } @@ -121,9 +121,8 @@ impl RoleExt for Role { /// string of arguments. fn to_pg_options(&self) -> String { // XXX: consider putting LOGIN as a default option somewhere higher, e.g. in control-plane. - // For now, we do not use generic `options` for roles. Once used, add - `self.options.as_pg_options()` somewhere here. - let mut params: String = "LOGIN".to_string(); + let mut params: String = self.options.as_pg_options(); + params.push_str(" LOGIN"); if let Some(pass) = &self.encrypted_password { // Some time ago we supported only md5 and treated all encrypted_password as md5. diff --git a/compute_tools/src/spec.rs b/compute_tools/src/spec.rs index bf3c407202..a2a19ae0da 100644 --- a/compute_tools/src/spec.rs +++ b/compute_tools/src/spec.rs @@ -62,7 +62,7 @@ fn do_control_plane_request( } } -/// Request spec from the control-plane by compute_id. If `NEON_CONSOLE_JWT` +/// Request spec from the control-plane by compute_id. If `NEON_CONTROL_PLANE_TOKEN` /// env variable is set, it will be used for authorization. pub fn get_spec_from_control_plane( base_uri: &str, diff --git a/compute_tools/tests/pg_helpers_tests.rs b/compute_tools/tests/pg_helpers_tests.rs index a63ee038c7..265556d3b9 100644 --- a/compute_tools/tests/pg_helpers_tests.rs +++ b/compute_tools/tests/pg_helpers_tests.rs @@ -16,7 +16,7 @@ mod pg_helpers_tests { ); assert_eq!( spec.cluster.roles.first().unwrap().to_pg_options(), - "LOGIN PASSWORD 'md56b1d16b78004bbd51fa06af9eda75972'" + " LOGIN PASSWORD 'md56b1d16b78004bbd51fa06af9eda75972'" ); } diff --git a/control_plane/src/bin/neon_local.rs b/control_plane/src/bin/neon_local.rs index 30880565ab..52af936d7b 100644 --- a/control_plane/src/bin/neon_local.rs +++ b/control_plane/src/bin/neon_local.rs @@ -41,7 +41,7 @@ const DEFAULT_PAGESERVER_ID: NodeId = NodeId(1); const DEFAULT_BRANCH_NAME: &str = "main"; project_git_version!(GIT_VERSION); -const DEFAULT_PG_VERSION: &str = "14"; +const DEFAULT_PG_VERSION: &str = "15"; fn default_conf() -> String { format!( @@ -476,10 +476,11 @@ fn handle_timeline(timeline_match: &ArgMatches, env: &mut local_env::LocalEnv) -> Result<()> { println!("Creating endpoint for imported timeline ..."); cplane.new_endpoint( - tenant_id, name, + tenant_id, timeline_id, None, + None, pg_version, ComputeMode::Primary, )?; @@ -591,7 +592,7 @@ fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> { table.add_row([ endpoint_id.as_str(), - &endpoint.address.to_string(), + &endpoint.pg_address.to_string(), &endpoint.timeline_id.to_string(), branch_name, lsn_str.as_str(), @@ -620,8 +621,8 @@ fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> { .get_branch_timeline_id(branch_name, tenant_id) .ok_or_else(|| anyhow!("Found no timeline id for branch name '{branch_name}'"))?; - let port: Option<u16> = sub_args.get_one::<u16>("port").copied(); - + let pg_port: Option<u16> = sub_args.get_one::<u16>("pg-port").copied(); + let http_port: Option<u16> = sub_args.get_one::<u16>("http-port").copied(); let pg_version = sub_args .get_one::<u32>("pg-version") .copied() (Some(_), true) => anyhow::bail!("cannot specify both lsn and hot-standby"), }; - cplane.new_endpoint(tenant_id, &endpoint_id, timeline_id, port, pg_version, mode)?; + cplane.new_endpoint( + &endpoint_id, + tenant_id, + timeline_id, + pg_port, + http_port, + pg_version, + mode, + )?; } "start" => { - let port: Option<u16> = sub_args.get_one::<u16>("port").copied(); + let pg_port: Option<u16> = sub_args.get_one::<u16>("pg-port").copied(); + let http_port: Option<u16> = sub_args.get_one::<u16>("http-port").copied(); let endpoint_id = sub_args .get_one::<String>("endpoint_id") .ok_or_else(|| anyhow!("No endpoint ID was provided to start"))?; + // If --safekeepers argument is given, use only the listed safekeeper nodes. + let safekeepers = + if let Some(safekeepers_str) = sub_args.get_one::<String>("safekeepers") { + let mut safekeepers: Vec<NodeId> = Vec::new(); + for sk_id in safekeepers_str.split(',').map(str::trim) { + let sk_id = NodeId(u64::from_str(sk_id).map_err(|_| { + anyhow!("invalid node ID \"{sk_id}\" in --safekeepers list") + })?); + safekeepers.push(sk_id); + } + safekeepers + } else { + env.safekeepers.iter().map(|sk| sk.id).collect() + }; + let endpoint = cplane.endpoints.get(endpoint_id.as_str()); let auth_token = if matches!(env.pageserver.pg_auth_type, AuthType::NeonJWT) { @@ -673,7 +698,7 @@ fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> { _ => {} } println!("Starting existing endpoint {endpoint_id}..."); - endpoint.start(&auth_token)?; + endpoint.start(&auth_token, safekeepers)?; } else { let branch_name = sub_args .get_one::<String>("branch-name") @@ -709,14 +734,15 @@ fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> { println!("Starting new endpoint {endpoint_id} (PostgreSQL v{pg_version}) on timeline {timeline_id} ..."); let ep = cplane.new_endpoint( - tenant_id, endpoint_id, + tenant_id, timeline_id, - port, + pg_port, + http_port, pg_version, mode, )?; - ep.start(&auth_token)?; + ep.start(&auth_token, safekeepers)?; } } "stop" => { @@ -944,11 +970,22 @@ fn cli() -> Command { .value_parser(value_parser!(u32)) .default_value(DEFAULT_PG_VERSION); - let port_arg = Arg::new("port") - .long("port") + let pg_port_arg = Arg::new("pg-port") + .long("pg-port") .required(false) .value_parser(value_parser!(u16)) - .value_name("port"); + .value_name("pg-port"); + + let http_port_arg = Arg::new("http-port") + .long("http-port") + .required(false) + .value_parser(value_parser!(u16)) + .value_name("http-port"); + + let safekeepers_arg = Arg::new("safekeepers") + .long("safekeepers") + .required(false) + .value_name("safekeepers"); let stop_mode_arg = Arg::new("stop-mode") .short('m') @@ -1093,7 +1130,8 @@ fn cli() -> Command { .arg(branch_name_arg.clone()) .arg(tenant_id_arg.clone()) .arg(lsn_arg.clone()) - .arg(port_arg.clone()) + .arg(pg_port_arg.clone()) + .arg(http_port_arg.clone()) .arg( Arg::new("config-only") .help("Don't do basebackup, create endpoint directory with only config files") @@ -1109,9 +1147,11 @@ fn cli() -> Command { .arg(branch_name_arg) .arg(timeline_id_arg) .arg(lsn_arg) - .arg(port_arg) + .arg(pg_port_arg) + .arg(http_port_arg) .arg(pg_version_arg) .arg(hot_standby_arg) + .arg(safekeepers_arg) ) .subcommand( Command::new("stop") diff --git a/control_plane/src/broker.rs b/control_plane/src/broker.rs index 6c0604a076..ad19dfa204 100644 --- a/control_plane/src/broker.rs +++ b/control_plane/src/broker.rs @@ -1,3 +1,9 @@ +//! Code to manage the storage broker +//! +//! In the local test environment, a single storage broker instance relays +//! information between the safekeepers and the pageservers; it keeps no +//! on-disk state of its own. +//! use anyhow::Context; use std::path::PathBuf; diff --git a/control_plane/src/endpoint.rs b/control_plane/src/endpoint.rs index cc5a7a4168..b28315a35d 100644 --- a/control_plane/src/endpoint.rs +++ b/control_plane/src/endpoint.rs @@ -1,40 +1,71 @@ +//! Code to manage compute endpoints +//! +//! In the local test environment, the data for each endpoint is stored in +//! +//! .neon/endpoints/<endpoint_id> +//! +//! Some basic information about the endpoint, like the tenant and timeline IDs, +//! is stored in the `endpoint.json` file. The `endpoint.json` file is created +//! when the endpoint is created, and doesn't change afterwards. +//! +//! The endpoint is managed by the `compute_ctl` binary. When an endpoint is +//! started, we launch `compute_ctl`. It synchronizes the safekeepers, downloads +//! the basebackup from the pageserver to initialize the data directory, and +//! finally launches the PostgreSQL process. It watches the PostgreSQL process +//! until it exits. +//! +//! When an endpoint is created, a `postgresql.conf` file is also created in +//! the endpoint's directory. The file can be modified before starting PostgreSQL. +//! However, the `postgresql.conf` file in the endpoint directory is not used directly +//! by PostgreSQL. It is passed to `compute_ctl`, and `compute_ctl` writes another +//! copy of it in the data directory. +//! +//! Directory contents: +//! +//! ```ignore
//! .neon/endpoints/main/
//! compute.log - log output of `compute_ctl` and `postgres`
//! endpoint.json - serialized `EndpointConf` struct
//! postgresql.conf - postgresql settings
//! spec.json - passed to `compute_ctl`
//! pgdata/
//! postgresql.conf - copy of postgresql.conf created by `compute_ctl`
//! zenith.signal
//!
//! ``` +//! use std::collections::BTreeMap; -use std::fs::{self, File}; -use std::io::Write; use std::net::SocketAddr; use std::net::TcpStream; -use std::os::unix::fs::PermissionsExt; use std::path::PathBuf; -use std::process::{Command, Stdio}; -use std::str::FromStr; +use std::process::Command; use std::sync::Arc; use std::time::Duration; -use anyhow::{Context, Result}; +use anyhow::{anyhow, bail, Context, Result}; use serde::{Deserialize, Serialize}; use serde_with::{serde_as, DisplayFromStr}; -use utils::{ - id::{TenantId, TimelineId}, - lsn::Lsn, -}; +use utils::id::{NodeId, TenantId, TimelineId}; use crate::local_env::LocalEnv; use crate::pageserver::PageServerNode; use crate::postgresql_conf::PostgresConf; -use compute_api::spec::ComputeMode; +use compute_api::responses::{ComputeState, ComputeStatus}; +use compute_api::spec::{Cluster, ComputeMode, ComputeSpec}; // contents of a endpoint.json file #[serde_as] #[derive(Serialize, Deserialize, PartialEq, Eq, Clone, Debug)] pub struct EndpointConf { - name: String, + endpoint_id: String, #[serde_as(as = "DisplayFromStr")] tenant_id: TenantId, #[serde_as(as = "DisplayFromStr")] timeline_id: TimelineId, mode: ComputeMode, - port: u16, + pg_port: u16, + http_port: u16, pg_version: u32, } @@ -57,11 +88,11 @@ impl ComputeControlPlane { let pageserver = Arc::new(PageServerNode::from_env(&env)); let mut endpoints = BTreeMap::default(); - for endpoint_dir in fs::read_dir(env.endpoints_path()) + for endpoint_dir in std::fs::read_dir(env.endpoints_path()) .with_context(|| format!("failed to list {}", env.endpoints_path().display()))? { let ep = Endpoint::from_dir_entry(endpoint_dir?, &env, &pageserver)?; - endpoints.insert(ep.name.clone(), Arc::new(ep)); + endpoints.insert(ep.endpoint_id.clone(), Arc::new(ep)); } Ok(ComputeControlPlane { @@ -76,25 +107,28 @@ impl ComputeControlPlane { 1 + self .endpoints .values() - .map(|ep| ep.address.port()) + .map(|ep| std::cmp::max(ep.pg_address.port(), ep.http_address.port())) .max() .unwrap_or(self.base_port) } + #[allow(clippy::too_many_arguments)] pub fn new_endpoint( &mut self, + endpoint_id: &str, tenant_id: TenantId, - name: &str, timeline_id: TimelineId, - port: Option<u16>, + pg_port: Option<u16>, + http_port: Option<u16>, pg_version: u32, mode: ComputeMode, ) -> Result<Arc<Endpoint>> { - let port = port.unwrap_or_else(|| self.get_port()); - + let pg_port = pg_port.unwrap_or_else(|| self.get_port()); + let http_port = http_port.unwrap_or_else(|| self.get_port() + 1); let ep = Arc::new(Endpoint { - name: name.to_owned(), - address: SocketAddr::new("127.0.0.1".parse().unwrap(), port), + endpoint_id: endpoint_id.to_owned(), + pg_address: SocketAddr::new("127.0.0.1".parse().unwrap(), pg_port), + http_address: SocketAddr::new("127.0.0.1".parse().unwrap(), http_port), env: self.env.clone(), pageserver: Arc::clone(&self.pageserver), timeline_id, @@ -102,21 +136,27 @@ impl ComputeControlPlane { tenant_id, pg_version, }); - ep.create_pgdata()?; + + ep.create_endpoint_dir()?; std::fs::write( ep.endpoint_path().join("endpoint.json"), serde_json::to_string_pretty(&EndpointConf { - name: name.to_string(), + endpoint_id: endpoint_id.to_string(), tenant_id, timeline_id, mode, - port, + http_port, + pg_port, pg_version, })?, )?; - ep.setup_pg_conf()?; + std::fs::write( + ep.endpoint_path().join("postgresql.conf"), + ep.setup_pg_conf()?.to_string(), + )?; - self.endpoints.insert(ep.name.clone(), Arc::clone(&ep)); + self.endpoints + .insert(ep.endpoint_id.clone(), Arc::clone(&ep)); Ok(ep) } @@ -127,13 +167,15 @@ impl ComputeControlPlane { #[derive(Debug)] pub struct Endpoint { /// used as the directory name - name: String, + endpoint_id: String, pub tenant_id: TenantId, pub timeline_id: TimelineId, pub mode: ComputeMode, - // port and address of the Postgres server - pub address: SocketAddr, + // port and address of the Postgres server and `compute_ctl`'s HTTP API + pub pg_address: SocketAddr, + pub http_address: SocketAddr, + // postgres major version in the format: 14, 15, etc.
pg_version: u32, @@ -158,16 +200,16 @@ impl Endpoint { // parse data directory name let fname = entry.file_name(); - let name = fname.to_str().unwrap().to_string(); + let endpoint_id = fname.to_str().unwrap().to_string(); // Read the endpoint.json file let conf: EndpointConf = serde_json::from_slice(&std::fs::read(entry.path().join("endpoint.json"))?)?; - // ok now Ok(Endpoint { - address: SocketAddr::new("127.0.0.1".parse().unwrap(), conf.port), - name, + pg_address: SocketAddr::new("127.0.0.1".parse().unwrap(), conf.pg_port), + http_address: SocketAddr::new("127.0.0.1".parse().unwrap(), conf.http_port), + endpoint_id, env: env.clone(), pageserver: Arc::clone(pageserver), timeline_id: conf.timeline_id, @@ -177,104 +219,17 @@ impl Endpoint { }) } - fn sync_safekeepers(&self, auth_token: &Option, pg_version: u32) -> Result { - let pg_path = self.env.pg_bin_dir(pg_version)?.join("postgres"); - let mut cmd = Command::new(pg_path); - - cmd.arg("--sync-safekeepers") - .env_clear() - .env( - "LD_LIBRARY_PATH", - self.env.pg_lib_dir(pg_version)?.to_str().unwrap(), - ) - .env( - "DYLD_LIBRARY_PATH", - self.env.pg_lib_dir(pg_version)?.to_str().unwrap(), - ) - .env("PGDATA", self.pgdata().to_str().unwrap()) - .stdout(Stdio::piped()) - // Comment this to avoid capturing stderr (useful if command hangs) - .stderr(Stdio::piped()); - - if let Some(token) = auth_token { - cmd.env("NEON_AUTH_TOKEN", token); - } - - let sync_handle = cmd - .spawn() - .expect("postgres --sync-safekeepers failed to start"); - - let sync_output = sync_handle - .wait_with_output() - .expect("postgres --sync-safekeepers failed"); - if !sync_output.status.success() { - anyhow::bail!( - "sync-safekeepers failed: '{}'", - String::from_utf8_lossy(&sync_output.stderr) - ); - } - - let lsn = Lsn::from_str(std::str::from_utf8(&sync_output.stdout)?.trim())?; - println!("Safekeepers synced on {}", lsn); - Ok(lsn) - } - - /// Get basebackup from the pageserver as a tar archive and extract it - /// to the `self.pgdata()` directory. - fn do_basebackup(&self, lsn: Option) -> Result<()> { - println!( - "Extracting base backup to create postgres instance: path={} port={}", - self.pgdata().display(), - self.address.port() - ); - - let sql = if let Some(lsn) = lsn { - format!("basebackup {} {} {}", self.tenant_id, self.timeline_id, lsn) - } else { - format!("basebackup {} {}", self.tenant_id, self.timeline_id) - }; - - let mut client = self - .pageserver - .page_server_psql_client() - .context("connecting to page server failed")?; - - let copyreader = client - .copy_out(sql.as_str()) - .context("page server 'basebackup' command failed")?; - - // Read the archive directly from the `CopyOutReader` - // - // Set `ignore_zeros` so that unpack() reads all the Copy data and - // doesn't stop at the end-of-archive marker. Otherwise, if the server - // sends an Error after finishing the tarball, we will not notice it. 
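For context on the `set_ignore_zeros` remark in the removed helper above: fetching a basebackup means untarring a stream while still watching for a trailing error from the server. A minimal sketch of that pattern using the same `tar` crate; the function name and the `reader` argument are illustrative stand-ins for the pageserver's CopyOutReader, not the PR's API:

```rust
use std::io::Read;
use std::path::Path;

// `reader` stands in for the stream carrying the basebackup tarball.
fn unpack_basebackup(reader: impl Read, pgdata: &Path) -> anyhow::Result<()> {
    let mut ar = tar::Archive::new(reader);
    // Keep reading past the end-of-archive marker: if the server sends an
    // error after finishing the tarball, stopping early would swallow it.
    ar.set_ignore_zeros(true);
    ar.unpack(pgdata)?;
    Ok(())
}
```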
- let mut ar = tar::Archive::new(copyreader); - ar.set_ignore_zeros(true); - ar.unpack(&self.pgdata()) - .context("extracting base backup failed")?; - - Ok(()) - } - - fn create_pgdata(&self) -> Result<()> { - fs::create_dir_all(self.pgdata()).with_context(|| { + fn create_endpoint_dir(&self) -> Result<()> { + std::fs::create_dir_all(self.endpoint_path()).with_context(|| { format!( - "could not create data directory {}", - self.pgdata().display() + "could not create endpoint directory {}", + self.endpoint_path().display() ) - })?; - fs::set_permissions(self.pgdata().as_path(), fs::Permissions::from_mode(0o700)) - .with_context(|| { - format!( - "could not set permissions in data directory {}", - self.pgdata().display() - ) - }) + }) } - // Write postgresql.conf with default configuration - // and PG_VERSION file to the data directory of a new endpoint. - fn setup_pg_conf(&self) -> Result<()> { + // Generate postgresql.conf with default configuration + fn setup_pg_conf(&self) -> Result { let mut conf = PostgresConf::new(); conf.append("max_wal_senders", "10"); conf.append("wal_log_hints", "off"); @@ -287,25 +242,14 @@ impl Endpoint { // wal_sender_timeout is the maximum time to wait for WAL replication. // It also defines how often the walreciever will send a feedback message to the wal sender. conf.append("wal_sender_timeout", "5s"); - conf.append("listen_addresses", &self.address.ip().to_string()); - conf.append("port", &self.address.port().to_string()); + conf.append("listen_addresses", &self.pg_address.ip().to_string()); + conf.append("port", &self.pg_address.port().to_string()); conf.append("wal_keep_size", "0"); // walproposer panics when basebackup is invalid, it is pointless to restart in this case. conf.append("restart_after_crash", "off"); - // Configure the Neon Postgres extension to fetch pages from pageserver - let pageserver_connstr = { - let config = &self.pageserver.pg_connection_config; - let (host, port) = (config.host(), config.port()); - - // NOTE: avoid spaces in connection string, because it is less error prone if we forward it somewhere. - format!("postgresql://no_user@{host}:{port}") - }; + // Load the 'neon' extension conf.append("shared_preload_libraries", "neon"); - conf.append_line(""); - conf.append("neon.pageserver_connstring", &pageserver_connstr); - conf.append("neon.tenant_id", &self.tenant_id.to_string()); - conf.append("neon.timeline_id", &self.timeline_id.to_string()); conf.append_line(""); // Replication-related configurations, such as WAL sending @@ -390,46 +334,11 @@ impl Endpoint { } } - let mut file = File::create(self.pgdata().join("postgresql.conf"))?; - file.write_all(conf.to_string().as_bytes())?; - - let mut file = File::create(self.pgdata().join("PG_VERSION"))?; - file.write_all(self.pg_version.to_string().as_bytes())?; - - Ok(()) - } - - fn load_basebackup(&self, auth_token: &Option) -> Result<()> { - let backup_lsn = match &self.mode { - ComputeMode::Primary => { - if !self.env.safekeepers.is_empty() { - // LSN 0 means that it is bootstrap and we need to download just - // latest data from the pageserver. That is a bit clumsy but whole bootstrap - // procedure evolves quite actively right now, so let's think about it again - // when things would be more stable (TODO). 
- let lsn = self.sync_safekeepers(auth_token, self.pg_version)?; - if lsn == Lsn(0) { - None - } else { - Some(lsn) - } - } else { - None - } - } - ComputeMode::Static(lsn) => Some(*lsn), - ComputeMode::Replica => { - None // Take the latest snapshot available to start with - } - }; - - self.do_basebackup(backup_lsn)?; - - Ok(()) + Ok(conf) } pub fn endpoint_path(&self) -> PathBuf { - self.env.endpoints_path().join(&self.name) + self.env.endpoints_path().join(&self.endpoint_id) } pub fn pgdata(&self) -> PathBuf { @@ -439,7 +348,7 @@ impl Endpoint { pub fn status(&self) -> &str { let timeout = Duration::from_millis(300); let has_pidfile = self.pgdata().join("postmaster.pid").exists(); - let can_connect = TcpStream::connect_timeout(&self.address, timeout).is_ok(); + let can_connect = TcpStream::connect_timeout(&self.pg_address, timeout).is_ok(); match (has_pidfile, can_connect) { (true, true) => "running", @@ -457,8 +366,6 @@ impl Endpoint { &[ "-D", self.pgdata().to_str().unwrap(), - "-l", - self.pgdata().join("pg.log").to_str().unwrap(), "-w", //wait till pg_ctl actually does what was asked ], args, @@ -494,36 +401,183 @@ impl Endpoint { Ok(()) } - pub fn start(&self, auth_token: &Option<String>) -> Result<()> { + pub fn start(&self, auth_token: &Option<String>, safekeepers: Vec<NodeId>) -> Result<()> { if self.status() == "running" { anyhow::bail!("The endpoint is already running"); } - // 1. We always start Postgres from scratch, so - // if old dir exists, preserve 'postgresql.conf' and drop the directory - let postgresql_conf_path = self.pgdata().join("postgresql.conf"); - let postgresql_conf = fs::read(&postgresql_conf_path).with_context(|| { - format!( - "failed to read config file in {}", - postgresql_conf_path.to_str().unwrap() - ) - })?; - fs::remove_dir_all(self.pgdata())?; - self.create_pgdata()?; + // Slurp the endpoints/<endpoint_id>/postgresql.conf file into + // memory. We will include it in the spec file that we pass to + // `compute_ctl`, and `compute_ctl` will write it to the postgresql.conf + // in the data directory. + let postgresql_conf_path = self.endpoint_path().join("postgresql.conf"); + let postgresql_conf = match std::fs::read(&postgresql_conf_path) { + Ok(content) => String::from_utf8(content)?, + Err(e) if e.kind() == std::io::ErrorKind::NotFound => "".to_string(), + Err(e) => { + return Err(anyhow::Error::new(e).context(format!( + "failed to read config file in {}", + postgresql_conf_path.to_str().unwrap() + ))) + } + }; - // 2. Bring back config files - fs::write(&postgresql_conf_path, postgresql_conf)?; - - // 3. Load basebackup - self.load_basebackup(auth_token)?; - - if self.mode != ComputeMode::Primary { - File::create(self.pgdata().join("standby.signal"))?; + // We always start the compute node from scratch, so if the Postgres + // data dir exists from a previous launch, remove it first. + if self.pgdata().exists() { + std::fs::remove_dir_all(self.pgdata())?; } - // 4. Finally start postgres - println!("Starting postgres at '{}'", self.connstr()); - self.pg_ctl(&["start"], auth_token) + let pageserver_connstring = { + let config = &self.pageserver.pg_connection_config; + let (host, port) = (config.host(), config.port()); + + // NOTE: avoid spaces in connection string, because it is less error prone if we forward it somewhere. + format!("postgresql://no_user@{host}:{port}") + }; + let mut safekeeper_connstrings = Vec::new(); + if self.mode == ComputeMode::Primary { + for sk_id in safekeepers { + let sk = self + .env + .safekeepers + .iter() + .find(|node| node.id == sk_id) + .ok_or_else(|| anyhow!("safekeeper {sk_id} does not exist"))?; + safekeeper_connstrings.push(format!("127.0.0.1:{}", sk.pg_port)); + } + } + + // Create spec file + let spec = ComputeSpec { + format_version: 1.0, + operation_uuid: None, + cluster: Cluster { + cluster_id: None, // project ID: not used + name: None, // project name: not used + state: None, + roles: vec![], + databases: vec![], + settings: None, + postgresql_conf: Some(postgresql_conf), + }, + delta_operations: None, + tenant_id: Some(self.tenant_id), + timeline_id: Some(self.timeline_id), + mode: self.mode, + pageserver_connstring: Some(pageserver_connstring), + safekeeper_connstrings, + storage_auth_token: auth_token.clone(), + }; + let spec_path = self.endpoint_path().join("spec.json"); + std::fs::write(spec_path, serde_json::to_string_pretty(&spec)?)?; + + // Open log file. We'll redirect the stdout and stderr of `compute_ctl` to it. + let logfile = std::fs::OpenOptions::new() + .create(true) + .append(true) + .open(self.endpoint_path().join("compute.log"))?; + + // Launch compute_ctl + println!("Starting postgres node at '{}'", self.connstr()); + let mut cmd = Command::new(self.env.neon_distrib_dir.join("compute_ctl")); + cmd.args(["--http-port", &self.http_address.port().to_string()]) + .args(["--pgdata", self.pgdata().to_str().unwrap()]) + .args(["--connstr", &self.connstr()]) + .args([ + "--spec-path", + self.endpoint_path().join("spec.json").to_str().unwrap(), + ]) + .args([ + "--pgbin", + self.env + .pg_bin_dir(self.pg_version)? + .join("postgres") + .to_str() + .unwrap(), + ]) + .stdin(std::process::Stdio::null()) + .stderr(logfile.try_clone()?) + .stdout(logfile); + let _child = cmd.spawn()?; + + // Wait for it to start + let mut attempt = 0; + const ATTEMPT_INTERVAL: Duration = Duration::from_millis(100); + const MAX_ATTEMPTS: u32 = 10 * 30; // Wait up to 30 s + loop { + attempt += 1; + match self.get_status() { + Ok(state) => { + match state.status { + ComputeStatus::Init => { + if attempt == MAX_ATTEMPTS { + bail!("compute startup timed out; still in Init state"); + } + // keep retrying + } + ComputeStatus::Running => { + // All good! + break; + } + ComputeStatus::Failed => { + bail!( + "compute startup failed: {}", + state + .error + .as_deref() + .unwrap_or("") + ); + } + ComputeStatus::Empty + | ComputeStatus::ConfigurationPending + | ComputeStatus::Configuration => { + bail!("unexpected compute status: {:?}", state.status) + } + } + } + Err(e) => { + if attempt == MAX_ATTEMPTS { + return Err(e).context( + "timed out waiting to connect to compute_ctl HTTP", + ); + } + } + } + std::thread::sleep(ATTEMPT_INTERVAL); + } + + Ok(()) + } + + // Call the /status HTTP API + pub fn get_status(&self) -> Result<ComputeState> { + let client = reqwest::blocking::Client::new(); + + let response = client + .request( + reqwest::Method::GET, + format!( + "http://{}:{}/status", + self.http_address.ip(), + self.http_address.port() + ), + ) + .send()?; + + // Interpret the response + let status = response.status(); + if !(status.is_client_error() || status.is_server_error()) { + Ok(response.json()?)
+ } else { + // reqwest does not export its error construction utility functions, so let's craft the message ourselves + let url = response.url().to_owned(); + let msg = match response.text() { + Ok(err_body) => format!("Error: {}", err_body), + Err(_) => format!("Http error ({}) at {}.", status.as_u16(), url), + }; + Err(anyhow::anyhow!(msg)) + } } pub fn stop(&self, destroy: bool) -> Result<()> { @@ -540,7 +594,7 @@ impl Endpoint { "Destroying postgres data directory '{}'", self.pgdata().to_str().unwrap() ); - fs::remove_dir_all(self.endpoint_path())?; + std::fs::remove_dir_all(self.endpoint_path())?; } else { self.pg_ctl(&["stop"], &None)?; } @@ -549,10 +603,10 @@ impl Endpoint { pub fn connstr(&self) -> String { format!( - "host={} port={} user={} dbname={}", - self.address.ip(), - self.address.port(), + "postgresql://{}@{}:{}/{}", "cloud_admin", + self.pg_address.ip(), + self.pg_address.port(), "postgres" ) } diff --git a/control_plane/src/local_env.rs b/control_plane/src/local_env.rs index 2b1eec7c4b..df70cb3139 100644 --- a/control_plane/src/local_env.rs +++ b/control_plane/src/local_env.rs @@ -24,7 +24,7 @@ use utils::{ use crate::safekeeper::SafekeeperNode; -pub const DEFAULT_PG_VERSION: u32 = 14; +pub const DEFAULT_PG_VERSION: u32 = 15; // // This data structures represents neon_local CLI config @@ -37,7 +37,7 @@ pub const DEFAULT_PG_VERSION: u32 = 14; #[derive(Serialize, Deserialize, PartialEq, Eq, Clone, Debug)] pub struct LocalEnv { // Base directory for all the nodes (the pageserver, safekeepers and - // compute nodes). + // compute endpoints). // // This is not stored in the config file. Rather, this is the path where the // config file itself is. It is read from the NEON_REPO_DIR env variable or diff --git a/control_plane/src/pageserver.rs b/control_plane/src/pageserver.rs index 6309494b71..2ff09021e5 100644 --- a/control_plane/src/pageserver.rs +++ b/control_plane/src/pageserver.rs @@ -1,3 +1,9 @@ +//! Code to manage pageservers +//! +//! In the local test environment, the pageserver stores its data directly in +//! +//! .neon/ +//! 
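The start sequence above boils down to a small client protocol: poll `compute_ctl`'s `/status` endpoint every 100 ms, for up to 30 seconds, until it reports running or failed. A rough standalone sketch of such a poller, assuming reqwest's `blocking` and `json` features; `StatusBody` is a simplified stand-in for `ComputeState`, and the statuses shown are the snake_case serializations from `compute_api`:

```rust
use std::{thread, time::Duration};

// Simplified stand-in for compute_api::responses::ComputeState.
#[derive(serde::Deserialize)]
struct StatusBody {
    status: String,
    error: Option<String>,
}

// Poll /status until the compute reports "running", mirroring the
// 30-second budget of the wait loop above.
fn wait_until_running(http_port: u16) -> anyhow::Result<()> {
    for _ in 0..300 {
        // Ignore connection errors: compute_ctl may not be listening yet.
        if let Ok(resp) = reqwest::blocking::get(format!("http://127.0.0.1:{http_port}/status")) {
            let state: StatusBody = resp.json()?;
            match state.status.as_str() {
                "running" => return Ok(()),
                "failed" => anyhow::bail!("compute startup failed: {:?}", state.error),
                _ => {} // "init" etc.: keep waiting
            }
        }
        thread::sleep(Duration::from_millis(100));
    }
    anyhow::bail!("compute startup timed out")
}
```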
use std::borrow::Cow; use std::collections::HashMap; use std::fs::File; @@ -369,7 +375,16 @@ impl PageServerNode { evictions_low_residence_duration_metric_threshold: settings .remove("evictions_low_residence_duration_metric_threshold") .map(|x| x.to_string()), + gc_feedback: settings + .remove("gc_feedback") + .map(|x| x.parse::<bool>()) + .transpose() + .context("Failed to parse 'gc_feedback' as bool")?, }; + + // If tenant ID was not specified, generate one + let new_tenant_id = new_tenant_id.unwrap_or(TenantId::generate()); + let request = models::TenantCreateRequest { new_tenant_id, config, @@ -459,6 +474,11 @@ impl PageServerNode { evictions_low_residence_duration_metric_threshold: settings .remove("evictions_low_residence_duration_metric_threshold") .map(|x| x.to_string()), + gc_feedback: settings + .remove("gc_feedback") + .map(|x| x.parse::<bool>()) + .transpose() + .context("Failed to parse 'gc_feedback' as bool")?, } }; @@ -495,6 +515,9 @@ impl PageServerNode { ancestor_timeline_id: Option<TimelineId>, pg_version: Option<u32>, ) -> anyhow::Result<TimelineInfo> { + // If timeline ID was not specified, generate one + let new_timeline_id = new_timeline_id.unwrap_or(TimelineId::generate()); + self.http_request( Method::POST, format!("{}/tenant/{}/timeline", self.http_base_url, tenant_id), diff --git a/control_plane/src/safekeeper.rs b/control_plane/src/safekeeper.rs index d358f73343..9e053ff1f1 100644 --- a/control_plane/src/safekeeper.rs +++ b/control_plane/src/safekeeper.rs @@ -1,3 +1,9 @@ +//! Code to manage safekeepers +//! +//! In the local test environment, the data for each safekeeper is stored in +//! +//! .neon/safekeepers/<safekeeper id> +//! use std::io::Write; use std::path::PathBuf; use std::process::Child; diff --git a/docker-compose/compute_wrapper/shell/compute.sh b/docker-compose/compute_wrapper/shell/compute.sh index cef2b485f3..22660a63ce 100755 --- a/docker-compose/compute_wrapper/shell/compute.sh +++ b/docker-compose/compute_wrapper/shell/compute.sh @@ -1,6 +1,14 @@ #!/bin/bash set -eux +# Generate a random tenant or timeline ID +# +# Takes a variable name as argument. The result is stored in that variable. +generate_id() { + local -n resvar=$1 + printf -v resvar '%08x%08x%08x%08x' $SRANDOM $SRANDOM $SRANDOM $SRANDOM +} + PG_VERSION=${PG_VERSION:-14} SPEC_FILE_ORG=/var/db/postgres/specs/spec.json @@ -13,29 +21,29 @@ done echo "Page server is ready." echo "Create a tenant and timeline" +generate_id tenant_id PARAMS=( -sb -X POST -H "Content-Type: application/json" - -d "{}" + -d "{\"new_tenant_id\": \"${tenant_id}\"}" http://pageserver:9898/v1/tenant/ ) -tenant_id=$(curl "${PARAMS[@]}" | sed 's/"//g') +result=$(curl "${PARAMS[@]}") +echo $result | jq . +generate_id timeline_id PARAMS=( -sb -X POST -H "Content-Type: application/json" - -d "{\"tenant_id\":\"${tenant_id}\", \"pg_version\": ${PG_VERSION}}" + -d "{\"new_timeline_id\": \"${timeline_id}\", \"pg_version\": ${PG_VERSION}}" "http://pageserver:9898/v1/tenant/${tenant_id}/timeline/" ) result=$(curl "${PARAMS[@]}") echo $result | jq .
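The shell `generate_id` above produces a random 32-character hex string from four `$SRANDOM` draws, which is the format the pageserver's HTTP API expects for tenant and timeline IDs now that the client must supply them. A rough Rust equivalent, using the `rand` crate (illustrative only, not part of the diff):

```rust
use rand::Rng;

// Four random 32-bit values, each formatted as 8 hex digits, giving a
// 32-character ID like "9ef87a305f253d85e239bbb32e0eb284".
fn generate_id() -> String {
    let mut rng = rand::thread_rng();
    (0..4).map(|_| format!("{:08x}", rng.gen::<u32>())).collect()
}

fn main() {
    println!("{}", generate_id());
}
```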
echo "Overwrite tenant id and timeline id in spec file" -tenant_id=$(echo ${result} | jq -r .tenant_id) -timeline_id=$(echo ${result} | jq -r .timeline_id) - sed "s/TENANT_ID/${tenant_id}/" ${SPEC_FILE_ORG} > ${SPEC_FILE} sed -i "s/TIMELINE_ID/${timeline_id}/" ${SPEC_FILE} diff --git a/docs/pageserver-thread-mgmt.md b/docs/pageserver-thread-mgmt.md index 0cc897f154..b911933528 100644 --- a/docs/pageserver-thread-mgmt.md +++ b/docs/pageserver-thread-mgmt.md @@ -52,9 +52,7 @@ completion, or shield the rest of the code from surprise cancellations by spawning a separate task. The code that handles incoming HTTP requests, for example, spawns a separate task for each request, because Hyper will drop the request-handling Future if the HTTP -connection is lost. (FIXME: our HTTP handlers do not do that -currently, but we should fix that. See [issue -3478](https://github.com/neondatabase/neon/issues/3478)). +connection is lost. #### How to cancel, then? diff --git a/libs/compute_api/src/responses.rs b/libs/compute_api/src/responses.rs index d181c018b1..ce73dda08a 100644 --- a/libs/compute_api/src/responses.rs +++ b/libs/compute_api/src/responses.rs @@ -5,13 +5,13 @@ use serde::{Deserialize, Serialize, Serializer}; use crate::spec::ComputeSpec; -#[derive(Serialize, Debug)] +#[derive(Serialize, Debug, Deserialize)] pub struct GenericAPIError { pub error: String, } /// Response of the /status API -#[derive(Serialize, Debug)] +#[derive(Serialize, Debug, Deserialize)] #[serde(rename_all = "snake_case")] pub struct ComputeStatusResponse { pub start_time: DateTime, @@ -23,7 +23,7 @@ pub struct ComputeStatusResponse { pub error: Option, } -#[derive(Serialize)] +#[derive(Deserialize, Serialize)] #[serde(rename_all = "snake_case")] pub struct ComputeState { pub status: ComputeStatus, @@ -33,7 +33,7 @@ pub struct ComputeState { pub error: Option, } -#[derive(Serialize, Clone, Copy, Debug, PartialEq, Eq)] +#[derive(Serialize, Clone, Copy, Debug, Deserialize, PartialEq, Eq)] #[serde(rename_all = "snake_case")] pub enum ComputeStatus { // Spec wasn't provided at start, waiting for it to be diff --git a/libs/compute_api/src/spec.rs b/libs/compute_api/src/spec.rs index 6072980ed8..4014774a7e 100644 --- a/libs/compute_api/src/spec.rs +++ b/libs/compute_api/src/spec.rs @@ -5,6 +5,7 @@ //! and connect it to the storage nodes. use serde::{Deserialize, Serialize}; use serde_with::{serde_as, DisplayFromStr}; +use utils::id::{TenantId, TimelineId}; use utils::lsn::Lsn; /// String type alias representing Postgres identifier and @@ -14,7 +15,7 @@ pub type PgIdent = String; /// Cluster spec or configuration represented as an optional number of /// delta operations + final cluster state description. #[serde_as] -#[derive(Clone, Debug, Default, Deserialize)] +#[derive(Clone, Debug, Default, Deserialize, Serialize)] pub struct ComputeSpec { pub format_version: f32, @@ -26,9 +27,32 @@ pub struct ComputeSpec { pub cluster: Cluster, pub delta_operations: Option>, + // Information needed to connect to the storage layer. + // + // `tenant_id`, `timeline_id` and `pageserver_connstring` are always needed. + // + // Depending on `mode`, this can be a primary read-write node, a read-only + // replica, or a read-only node pinned at an older LSN. + // `safekeeper_connstrings` must be set for a primary. + // + // For backwards compatibility, the control plane may leave out all of + // these, and instead set the "neon.tenant_id", "neon.timeline_id", + // etc. GUCs in cluster.settings. 
TODO: Once the control plane has been + // updated to fill these fields, we can make these non optional. + #[serde_as(as = "Option")] + pub tenant_id: Option, + #[serde_as(as = "Option")] + pub timeline_id: Option, + #[serde_as(as = "Option")] + pub pageserver_connstring: Option, + #[serde(default)] + pub safekeeper_connstrings: Vec, + #[serde(default)] pub mode: ComputeMode, + /// If set, 'storage_auth_token' is used as the password to authenticate to + /// the pageserver and safekeepers. pub storage_auth_token: Option, } @@ -47,13 +71,19 @@ pub enum ComputeMode { Replica, } -#[derive(Clone, Debug, Default, Deserialize)] +#[derive(Clone, Debug, Default, Deserialize, Serialize)] pub struct Cluster { - pub cluster_id: String, - pub name: String, + pub cluster_id: Option, + pub name: Option, pub state: Option, pub roles: Vec, pub databases: Vec, + + /// Desired contents of 'postgresql.conf' file. (The 'compute_ctl' + /// tool may add additional settings to the final file.) + pub postgresql_conf: Option, + + /// Additional settings that will be appended to the 'postgresql.conf' file. pub settings: GenericOptions, } @@ -63,7 +93,7 @@ pub struct Cluster { /// - DROP ROLE /// - ALTER ROLE name RENAME TO new_name /// - ALTER DATABASE name RENAME TO new_name -#[derive(Clone, Debug, Deserialize)] +#[derive(Clone, Debug, Deserialize, Serialize)] pub struct DeltaOp { pub action: String, pub name: PgIdent, @@ -72,7 +102,7 @@ pub struct DeltaOp { /// Rust representation of Postgres role info with only those fields /// that matter for us. -#[derive(Clone, Debug, Deserialize)] +#[derive(Clone, Debug, Deserialize, Serialize)] pub struct Role { pub name: PgIdent, pub encrypted_password: Option, @@ -81,7 +111,7 @@ pub struct Role { /// Rust representation of Postgres database info with only those fields /// that matter for us. -#[derive(Clone, Debug, Deserialize)] +#[derive(Clone, Debug, Deserialize, Serialize)] pub struct Database { pub name: PgIdent, pub owner: PgIdent, @@ -91,7 +121,7 @@ pub struct Database { /// Common type representing both SQL statement params with or without value, /// like `LOGIN` or `OWNER username` in the `CREATE/ALTER ROLE`, and config /// options like `wal_level = logical`. -#[derive(Clone, Debug, Deserialize)] +#[derive(Clone, Debug, Deserialize, Serialize)] pub struct GenericOption { pub name: String, pub value: Option, diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs index 3d98fd63a8..162bf6b294 100644 --- a/libs/pageserver_api/src/models.rs +++ b/libs/pageserver_api/src/models.rs @@ -18,7 +18,29 @@ use crate::reltag::RelTag; use anyhow::bail; use bytes::{BufMut, Bytes, BytesMut}; -/// A state of a tenant in pageserver's memory. +/// The state of a tenant in this pageserver. 
+/// +/// ```mermaid +/// stateDiagram-v2 +/// +/// [*] --> Loading: spawn_load() +/// [*] --> Attaching: spawn_attach() +/// +/// Loading --> Activating: activate() +/// Attaching --> Activating: activate() +/// Activating --> Active: infallible +/// +/// Loading --> Broken: load() failure +/// Attaching --> Broken: attach() failure +/// +/// Active --> Stopping: set_stopping(), part of shutdown & detach +/// Stopping --> Broken: late error in remove_tenant_from_memory +/// +/// Broken --> [*]: ignore / detach / shutdown +/// Stopping --> [*]: remove_from_memory complete +/// +/// Active --> Broken: cfg(testing)-only tenant break point +/// ``` #[derive( Clone, PartialEq, @@ -26,43 +48,63 @@ use bytes::{BufMut, Bytes, BytesMut}; serde::Serialize, serde::Deserialize, strum_macros::Display, - strum_macros::EnumString, strum_macros::EnumVariantNames, strum_macros::AsRefStr, strum_macros::IntoStaticStr, )] #[serde(tag = "slug", content = "data")] pub enum TenantState { - /// This tenant is being loaded from local disk + /// This tenant is being loaded from local disk. + /// + /// `set_stopping()` and `set_broken()` do not work in this state and wait for it to pass. Loading, - /// This tenant is being downloaded from cloud storage. + /// This tenant is being attached to the pageserver. + /// + /// `set_stopping()` and `set_broken()` do not work in this state and wait for it to pass. Attaching, /// The tenant is transitioning from Loading/Attaching to Active. - Activating, - /// Tenant is fully operational + /// + /// While in this state, the individual timelines are being activated. + /// + /// `set_stopping()` and `set_broken()` do not work in this state and wait for it to pass. + Activating(ActivatingFrom), + /// The tenant has finished activating and is open for business. + /// + /// Transitions out of this state are possible through `set_stopping()` and `set_broken()`. Active, - /// A tenant is recognized by pageserver, but it is being detached or the + /// The tenant is recognized by pageserver, but it is being detached or the /// system is being shut down. + /// + /// Transitions out of this state are possible through `set_broken()`. Stopping, - /// A tenant is recognized by the pageserver, but can no longer be used for - /// any operations, because it failed to be activated. + /// The tenant is recognized by the pageserver, but can no longer be used for + /// any operations. + /// + /// If the tenant fails to load or attach, it will transition to this state + /// and it is guaranteed that no background tasks are running in its name. + /// + /// The other way to transition into this state is from `Stopping` state + /// through `set_broken()` called from `remove_tenant_from_memory()`. That happens + /// if the cleanup future executed by `remove_tenant_from_memory()` fails. Broken { reason: String, backtrace: String }, } impl TenantState { pub fn attachment_status(&self) -> TenantAttachmentStatus { use TenantAttachmentStatus::*; + + // Below TenantState::Activating is used as "transient" or "transparent" state for + // attachment_status determining. match self { // The attach procedure writes the marker file before adding the Attaching tenant to the tenants map. // So, technically, we can return Attached here. // However, as soon as Console observes Attached, it will proceed with the Postgres-level health check. // But, our attach task might still be fetching the remote timelines, etc. // So, return `Maybe` while Attaching, making Console wait for the attach task to finish. 
- Self::Attaching => Maybe, + Self::Attaching | Self::Activating(ActivatingFrom::Attaching) => Maybe, // tenant mgr startup distinguishes attaching from loading via marker file. // If it's loading, there is no attach marker file, i.e., attach had finished in the past. - Self::Loading => Attached, - Self::Activating => todo!(), + Self::Loading | Self::Activating(ActivatingFrom::Loading) => Attached, // We only reach Active after successful load / attach. // So, call atttachment status Attached. Self::Active => Attached, @@ -101,6 +143,15 @@ impl std::fmt::Debug for TenantState { } } +/// The only [`TenantState`] variants we could be `TenantState::Activating` from. +#[derive(Clone, Copy, Debug, PartialEq, Eq, serde::Serialize, serde::Deserialize)] +pub enum ActivatingFrom { + /// Arrived to [`TenantState::Activating`] from [`TenantState::Loading`] + Loading, + /// Arrived to [`TenantState::Activating`] from [`TenantState::Attaching`] + Attaching, +} + /// A state of a timeline in pageserver's memory. #[derive(Debug, Clone, Copy, PartialEq, Eq, serde::Serialize, serde::Deserialize)] pub enum TimelineState { @@ -121,9 +172,8 @@ pub enum TimelineState { #[serde_as] #[derive(Serialize, Deserialize)] pub struct TimelineCreateRequest { - #[serde(default)] - #[serde_as(as = "Option")] - pub new_timeline_id: Option, + #[serde_as(as = "DisplayFromStr")] + pub new_timeline_id: TimelineId, #[serde(default)] #[serde_as(as = "Option")] pub ancestor_timeline_id: Option, @@ -134,12 +184,11 @@ pub struct TimelineCreateRequest { } #[serde_as] -#[derive(Serialize, Deserialize, Debug, Default)] +#[derive(Serialize, Deserialize, Debug)] #[serde(deny_unknown_fields)] pub struct TenantCreateRequest { - #[serde(default)] - #[serde_as(as = "Option")] - pub new_tenant_id: Option, + #[serde_as(as = "DisplayFromStr")] + pub new_tenant_id: TenantId, #[serde(flatten)] pub config: TenantConfig, // as we have a flattened field, we should reject all unknown fields in it } @@ -174,6 +223,7 @@ pub struct TenantConfig { pub eviction_policy: Option, pub min_resident_size_override: Option, pub evictions_low_residence_duration_metric_threshold: Option, + pub gc_feedback: Option, } #[serde_as] @@ -187,10 +237,10 @@ pub struct StatusResponse { } impl TenantCreateRequest { - pub fn new(new_tenant_id: Option) -> TenantCreateRequest { + pub fn new(new_tenant_id: TenantId) -> TenantCreateRequest { TenantCreateRequest { new_tenant_id, - ..Default::default() + config: TenantConfig::default(), } } } @@ -232,6 +282,7 @@ impl TenantConfigRequest { eviction_policy: None, min_resident_size_override: None, evictions_low_residence_duration_metric_threshold: None, + gc_feedback: None, }; TenantConfigRequest { tenant_id, config } } @@ -834,4 +885,55 @@ mod tests { err ); } + + #[test] + fn tenantstatus_activating_serde() { + let states = [ + TenantState::Activating(ActivatingFrom::Loading), + TenantState::Activating(ActivatingFrom::Attaching), + ]; + let expected = "[{\"slug\":\"Activating\",\"data\":\"Loading\"},{\"slug\":\"Activating\",\"data\":\"Attaching\"}]"; + + let actual = serde_json::to_string(&states).unwrap(); + + assert_eq!(actual, expected); + + let parsed = serde_json::from_str::>(&actual).unwrap(); + + assert_eq!(states.as_slice(), &parsed); + } + + #[test] + fn tenantstatus_activating_strum() { + // tests added, because we use these for metrics + let examples = [ + (line!(), TenantState::Loading, "Loading"), + (line!(), TenantState::Attaching, "Attaching"), + ( + line!(), + TenantState::Activating(ActivatingFrom::Loading), + 
"Activating", + ), + ( + line!(), + TenantState::Activating(ActivatingFrom::Attaching), + "Activating", + ), + (line!(), TenantState::Active, "Active"), + (line!(), TenantState::Stopping, "Stopping"), + ( + line!(), + TenantState::Broken { + reason: "Example".into(), + backtrace: "Looooong backtrace".into(), + }, + "Broken", + ), + ]; + + for (line, rendered, expected) in examples { + let actual: &'static str = rendered.into(); + assert_eq!(actual, expected, "example on {line}"); + } + } } diff --git a/libs/utils/src/completion.rs b/libs/utils/src/completion.rs new file mode 100644 index 0000000000..2cdaee548e --- /dev/null +++ b/libs/utils/src/completion.rs @@ -0,0 +1,33 @@ +use std::sync::Arc; + +use tokio::sync::{mpsc, Mutex}; + +/// While a reference is kept around, the associated [`Barrier::wait`] will wait. +/// +/// Can be cloned, moved and kept around in futures as "guard objects". +#[derive(Clone)] +pub struct Completion(mpsc::Sender<()>); + +/// Barrier will wait until all clones of [`Completion`] have been dropped. +#[derive(Clone)] +pub struct Barrier(Arc>>); + +impl Barrier { + pub async fn wait(self) { + self.0.lock().await.recv().await; + } + + pub async fn maybe_wait(barrier: Option) { + if let Some(b) = barrier { + b.wait().await + } + } +} + +/// Create new Guard and Barrier pair. +pub fn channel() -> (Completion, Barrier) { + let (tx, rx) = mpsc::channel::<()>(1); + let rx = Mutex::new(rx); + let rx = Arc::new(rx); + (Completion(tx), Barrier(rx)) +} diff --git a/libs/utils/src/http/endpoint.rs b/libs/utils/src/http/endpoint.rs index 4bfb5bf994..7cb96d9094 100644 --- a/libs/utils/src/http/endpoint.rs +++ b/libs/utils/src/http/endpoint.rs @@ -1,5 +1,5 @@ use crate::auth::{Claims, JwtAuth}; -use crate::http::error; +use crate::http::error::{api_error_handler, route_error_handler, ApiError}; use anyhow::{anyhow, Context}; use hyper::header::{HeaderName, AUTHORIZATION}; use hyper::http::HeaderValue; @@ -16,8 +16,6 @@ use std::future::Future; use std::net::TcpListener; use std::str::FromStr; -use super::error::ApiError; - static SERVE_METRICS_COUNT: Lazy = Lazy::new(|| { register_int_counter!( "libmetrics_metric_handler_requests_total", @@ -35,8 +33,18 @@ struct RequestId(String); /// Adds a tracing info_span! instrumentation around the handler events, /// logs the request start and end events for non-GET requests and non-200 responses. /// +/// Usage: Replace `my_handler` with `|r| request_span(r, my_handler)` +/// /// Use this to distinguish between logs of different HTTP requests: every request handler wrapped -/// in this type will get request info logged in the wrapping span, including the unique request ID. +/// with this will get request info logged in the wrapping span, including the unique request ID. +/// +/// This also handles errors, logging them and converting them to an HTTP error response. +/// +/// NB: If the client disconnects, Hyper will drop the Future, without polling it to +/// completion. In other words, the handler must be async cancellation safe! request_span +/// prints a warning to the log when that happens, so that you have some trace of it in +/// the log. +/// /// /// There could be other ways to implement similar functionality: /// @@ -54,60 +62,56 @@ struct RequestId(String); /// tries to achive with its `.instrument` used in the current approach. /// /// If needed, a declarative macro to substitute the |r| ... closure boilerplate could be introduced. 
-pub struct RequestSpan(pub H) +pub async fn request_span(request: Request, handler: H) -> R::Output where - E: Into> + 'static, - R: Future, E>> + Send + 'static, - H: Fn(Request) -> R + Send + Sync + 'static; - -impl RequestSpan -where - E: Into> + 'static, - R: Future, E>> + Send + 'static, - H: Fn(Request) -> R + Send + Sync + 'static, + R: Future, ApiError>> + Send + 'static, + H: FnOnce(Request) -> R + Send + Sync + 'static, { - /// Creates a tracing span around inner request handler and executes the request handler in the contex of that span. - /// Use as `|r| RequestSpan(my_handler).handle(r)` instead of `my_handler` as the request handler to get the span enabled. - pub async fn handle(self, request: Request) -> Result, E> { - let request_id = request.context::().unwrap_or_default().0; - let method = request.method(); - let path = request.uri().path(); - let request_span = info_span!("request", %method, %path, %request_id); + let request_id = request.context::().unwrap_or_default().0; + let method = request.method(); + let path = request.uri().path(); + let request_span = info_span!("request", %method, %path, %request_id); - let log_quietly = method == Method::GET; - async move { - let cancellation_guard = RequestCancelled::warn_when_dropped_without_responding(); - if log_quietly { - debug!("Handling request"); - } else { - info!("Handling request"); - } - - // Note that we reuse `error::handler` here and not returning and error at all, - // yet cannot use `!` directly in the method signature due to `routerify::RouterBuilder` limitation. - // Usage of the error handler also means that we expect only the `ApiError` errors to be raised in this call. - // - // Panics are not handled separately, there's a `tracing_panic_hook` from another module to do that globally. - let res = (self.0)(request).await; - - cancellation_guard.disarm(); - - match res { - Ok(response) => { - let response_status = response.status(); - if log_quietly && response_status.is_success() { - debug!("Request handled, status: {response_status}"); - } else { - info!("Request handled, status: {response_status}"); - } - Ok(response) - } - Err(e) => Ok(error::handler(e.into()).await), - } + let log_quietly = method == Method::GET; + async move { + let cancellation_guard = RequestCancelled::warn_when_dropped_without_responding(); + if log_quietly { + debug!("Handling request"); + } else { + info!("Handling request"); + } + + // No special handling for panics here. There's a `tracing_panic_hook` from another + // module to do that globally. + let res = handler(request).await; + + cancellation_guard.disarm(); + + // Log the result if needed. + // + // We also convert any errors into an Ok response with HTTP error code here. + // `make_router` sets a last-resort error handler that would do the same, but + // we prefer to do it here, before we exit the request span, so that the error + // is still logged with the span. + // + // (Because we convert errors to Ok response, we never actually return an error, + // and we could declare the function to return the never type (`!`). However, + // using `routerify::RouterBuilder` requires a proper error type.) 
+ match res { + Ok(response) => { + let response_status = response.status(); + if log_quietly && response_status.is_success() { + debug!("Request handled, status: {response_status}"); + } else { + info!("Request handled, status: {response_status}"); + } + Ok(response) + } + Err(err) => Ok(api_error_handler(err)), } - .instrument(request_span) - .await } + .instrument(request_span) + .await } /// Drop guard to WARN in case the request was dropped before completion. @@ -207,10 +211,8 @@ pub fn make_router() -> RouterBuilder { .middleware(Middleware::post_with_info( add_request_id_header_to_response, )) - .get("/metrics", |r| { - RequestSpan(prometheus_metrics_handler).handle(r) - }) - .err_handler(error::handler) + .get("/metrics", |r| request_span(r, prometheus_metrics_handler)) + .err_handler(route_error_handler) } pub fn attach_openapi_ui( @@ -220,12 +222,14 @@ pub fn attach_openapi_ui( ui_mount_path: &'static str, ) -> RouterBuilder { router_builder - .get(spec_mount_path, move |r| { - RequestSpan(move |_| async move { Ok(Response::builder().body(Body::from(spec)).unwrap()) }) - .handle(r) - }) - .get(ui_mount_path, move |r| RequestSpan( move |_| async move { - Ok(Response::builder().body(Body::from(format!(r#" + .get(spec_mount_path, + move |r| request_span(r, move |_| async move { + Ok(Response::builder().body(Body::from(spec)).unwrap()) + }) + ) + .get(ui_mount_path, + move |r| request_span(r, move |_| async move { + Ok(Response::builder().body(Body::from(format!(r#" @@ -255,7 +259,8 @@ pub fn attach_openapi_ui( "#, spec_mount_path))).unwrap()) - }).handle(r)) + }) + ) } fn parse_token(header_value: &str) -> Result<&str, ApiError> { diff --git a/libs/utils/src/http/error.rs b/libs/utils/src/http/error.rs index 3c6023eb80..4eff16b6a3 100644 --- a/libs/utils/src/http/error.rs +++ b/libs/utils/src/http/error.rs @@ -83,13 +83,24 @@ impl HttpErrorBody { } } -pub async fn handler(err: routerify::RouteError) -> Response { - let api_error = err - .downcast::() - .expect("handler should always return api error"); +pub async fn route_error_handler(err: routerify::RouteError) -> Response { + match err.downcast::() { + Ok(api_error) => api_error_handler(*api_error), + Err(other_error) => { + // We expect all the request handlers to return an ApiError, so this should + // not be reached. But just in case. + error!("Error processing HTTP request: {other_error:?}"); + HttpErrorBody::response_from_msg_and_status( + other_error.to_string(), + StatusCode::INTERNAL_SERVER_ERROR, + ) + } + } +} +pub fn api_error_handler(api_error: ApiError) -> Response { // Print a stack trace for Internal Server errors - if let ApiError::InternalServerError(_) = api_error.as_ref() { + if let ApiError::InternalServerError(_) = api_error { error!("Error processing HTTP request: {api_error:?}"); } else { error!("Error processing HTTP request: {api_error:#}"); diff --git a/libs/utils/src/lib.rs b/libs/utils/src/lib.rs index 4e4f79ab6b..69d3a1b9f2 100644 --- a/libs/utils/src/lib.rs +++ b/libs/utils/src/lib.rs @@ -60,6 +60,9 @@ pub mod tracing_span_assert; pub mod rate_limit; +/// Simple once-barrier and a guard which keeps barrier awaiting. 
+pub mod completion; + mod failpoint_macro_helpers { /// use with fail::cfg("$name", "return(2000)") diff --git a/pageserver/ctl/Cargo.toml b/pageserver/ctl/Cargo.toml new file mode 100644 index 0000000000..89e0d0486e --- /dev/null +++ b/pageserver/ctl/Cargo.toml @@ -0,0 +1,18 @@ +[package] +name = "pagectl" +version = "0.1.0" +edition.workspace = true +license.workspace = true + +# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html + +[dependencies] +anyhow.workspace = true +bytes.workspace = true +clap = { workspace = true, features = ["string"] } +git-version.workspace = true +pageserver = { path = ".." } +postgres_ffi.workspace = true +utils.workspace = true +svg_fmt.workspace = true +workspace_hack.workspace = true diff --git a/pageserver/src/bin/draw_timeline_dir.rs b/pageserver/ctl/src/draw_timeline_dir.rs similarity index 97% rename from pageserver/src/bin/draw_timeline_dir.rs rename to pageserver/ctl/src/draw_timeline_dir.rs index da13ee452c..bfde5ba054 100644 --- a/pageserver/src/bin/draw_timeline_dir.rs +++ b/pageserver/ctl/src/draw_timeline_dir.rs @@ -12,7 +12,7 @@ //! Example use: //! ``` //! $ ls test_output/test_pgbench\[neon-45-684\]/repo/tenants/$TENANT/timelines/$TIMELINE | \ -//! $ grep "__" | cargo run --release --bin draw_timeline_dir > out.svg +//! $ grep "__" | cargo run --release --bin pagectl draw-timeline-dir > out.svg //! $ firefox out.svg //! ``` //! @@ -62,7 +62,7 @@ fn parse_filename(name: &str) -> (Range, Range) { (keys, lsns) } -fn main() -> Result<()> { +pub fn main() -> Result<()> { // Parse layer filenames from stdin let mut ranges: Vec<(Range, Range)> = vec![]; let stdin = io::stdin(); diff --git a/pageserver/src/bin/layer_map_analyzer.rs b/pageserver/ctl/src/layer_map_analyzer.rs similarity index 92% rename from pageserver/src/bin/layer_map_analyzer.rs rename to pageserver/ctl/src/layer_map_analyzer.rs index e740879458..f2ced6154f 100644 --- a/pageserver/src/bin/layer_map_analyzer.rs +++ b/pageserver/ctl/src/layer_map_analyzer.rs @@ -6,7 +6,7 @@ use anyhow::Result; use std::cmp::Ordering; use std::collections::BinaryHeap; use std::ops::Range; -use std::{env, fs, path::Path, path::PathBuf, str, str::FromStr}; +use std::{fs, path::Path, str}; use pageserver::page_cache::PAGE_SZ; use pageserver::repository::{Key, KEY_SIZE}; @@ -18,12 +18,14 @@ use pageserver::virtual_file::VirtualFile; use utils::{bin_ser::BeSer, lsn::Lsn}; +use crate::AnalyzeLayerMapCmd; + const MIN_HOLE_LENGTH: i128 = (128 * 1024 * 1024 / PAGE_SZ) as i128; const DEFAULT_MAX_HOLES: usize = 10; /// Wrapper for key range to provide reverse ordering by range length for BinaryHeap #[derive(PartialEq, Eq)] -struct Hole(Range); +pub struct Hole(Range); impl Ord for Hole { fn cmp(&self, other: &Self) -> Ordering { @@ -39,11 +41,11 @@ impl PartialOrd for Hole { } } -struct LayerFile { - key_range: Range, - lsn_range: Range, - is_delta: bool, - holes: Vec, +pub(crate) struct LayerFile { + pub key_range: Range, + pub lsn_range: Range, + pub is_delta: bool, + pub holes: Vec, } impl LayerFile { @@ -67,7 +69,7 @@ impl LayerFile { } } -fn parse_filename(name: &str) -> Option { +pub(crate) fn parse_filename(name: &str) -> Option { let split: Vec<&str> = name.split("__").collect(); if split.len() != 2 { return None; @@ -127,18 +129,9 @@ fn get_holes(path: &Path, max_holes: usize) -> Result> { Ok(holes) } -fn main() -> Result<()> { - let args: Vec = env::args().collect(); - if args.len() < 2 { - println!("Usage: layer_map_analyzer PAGESERVER_DATA_DIR 
[MAX_HOLES]"); - return Ok(()); - } - let storage_path = PathBuf::from_str(&args[1])?; - let max_holes = if args.len() > 2 { - args[2].parse::().unwrap() - } else { - DEFAULT_MAX_HOLES - }; +pub(crate) fn main(cmd: &AnalyzeLayerMapCmd) -> Result<()> { + let storage_path = &cmd.path; + let max_holes = cmd.max_holes.unwrap_or(DEFAULT_MAX_HOLES); // Initialize virtual_file (file desriptor cache) and page cache which are needed to access layer persistent B-Tree. pageserver::virtual_file::init(10); diff --git a/pageserver/ctl/src/layers.rs b/pageserver/ctl/src/layers.rs new file mode 100644 index 0000000000..d77cf0908c --- /dev/null +++ b/pageserver/ctl/src/layers.rs @@ -0,0 +1,169 @@ +use std::path::{Path, PathBuf}; + +use anyhow::Result; +use clap::Subcommand; +use pageserver::tenant::block_io::BlockCursor; +use pageserver::tenant::disk_btree::DiskBtreeReader; +use pageserver::tenant::storage_layer::delta_layer::{BlobRef, Summary}; +use pageserver::{page_cache, virtual_file}; +use pageserver::{ + repository::{Key, KEY_SIZE}, + tenant::{ + block_io::FileBlockReader, disk_btree::VisitDirection, + storage_layer::delta_layer::DELTA_KEY_SIZE, + }, + virtual_file::VirtualFile, +}; +use std::fs; +use utils::bin_ser::BeSer; + +use crate::layer_map_analyzer::parse_filename; + +#[derive(Subcommand)] +pub(crate) enum LayerCmd { + /// List all tenants and timelines under the pageserver path + /// + /// Example: `cargo run --bin pagectl layer list .neon/` + List { path: PathBuf }, + /// List all layers of a given tenant and timeline + /// + /// Example: `cargo run --bin pagectl layer list .neon/` + ListLayer { + path: PathBuf, + tenant: String, + timeline: String, + }, + /// Dump all information of a layer file + DumpLayer { + path: PathBuf, + tenant: String, + timeline: String, + /// The id from list-layer command + id: usize, + }, +} + +fn read_delta_file(path: impl AsRef) -> Result<()> { + use pageserver::tenant::blob_io::BlobCursor; + use pageserver::tenant::block_io::BlockReader; + + let path = path.as_ref(); + virtual_file::init(10); + page_cache::init(100); + let file = FileBlockReader::new(VirtualFile::open(path)?); + let summary_blk = file.read_blk(0)?; + let actual_summary = Summary::des_prefix(summary_blk.as_ref())?; + let tree_reader = DiskBtreeReader::<_, DELTA_KEY_SIZE>::new( + actual_summary.index_start_blk, + actual_summary.index_root_blk, + &file, + ); + // TODO(chi): dedup w/ `delta_layer.rs` by exposing the API. + let mut all = vec![]; + tree_reader.visit( + &[0u8; DELTA_KEY_SIZE], + VisitDirection::Forwards, + |key, value_offset| { + let curr = Key::from_slice(&key[..KEY_SIZE]); + all.push((curr, BlobRef(value_offset))); + true + }, + )?; + let mut cursor = BlockCursor::new(&file); + for (k, v) in all { + let value = cursor.read_blob(v.pos())?; + println!("key:{} value_len:{}", k, value.len()); + } + // TODO(chi): special handling for last key? + Ok(()) +} + +pub(crate) fn main(cmd: &LayerCmd) -> Result<()> { + match cmd { + LayerCmd::List { path } => { + for tenant in fs::read_dir(path.join("tenants"))? { + let tenant = tenant?; + if !tenant.file_type()?.is_dir() { + continue; + } + println!("tenant {}", tenant.file_name().to_string_lossy()); + for timeline in fs::read_dir(tenant.path().join("timelines"))? 
+                {
+                    let timeline = timeline?;
+                    if !timeline.file_type()?.is_dir() {
+                        continue;
+                    }
+                    println!("- timeline {}", timeline.file_name().to_string_lossy());
+                }
+            }
+        }
+        LayerCmd::ListLayer {
+            path,
+            tenant,
+            timeline,
+        } => {
+            let timeline_path = path
+                .join("tenants")
+                .join(tenant)
+                .join("timelines")
+                .join(timeline);
+            let mut idx = 0;
+            for layer in fs::read_dir(timeline_path)? {
+                let layer = layer?;
+                if let Some(layer_file) = parse_filename(&layer.file_name().into_string().unwrap())
+                {
+                    println!(
+                        "[{:3}] key:{}-{}\n lsn:{}-{}\n delta:{}",
+                        idx,
+                        layer_file.key_range.start,
+                        layer_file.key_range.end,
+                        layer_file.lsn_range.start,
+                        layer_file.lsn_range.end,
+                        layer_file.is_delta,
+                    );
+                    idx += 1;
+                }
+            }
+        }
+        LayerCmd::DumpLayer {
+            path,
+            tenant,
+            timeline,
+            id,
+        } => {
+            let timeline_path = path
+                .join("tenants")
+                .join(tenant)
+                .join("timelines")
+                .join(timeline);
+            let mut idx = 0;
+            for layer in fs::read_dir(timeline_path)? {
+                let layer = layer?;
+                if let Some(layer_file) = parse_filename(&layer.file_name().into_string().unwrap())
+                {
+                    if *id == idx {
+                        // TODO(chi): dedup code
+                        println!(
+                            "[{:3}] key:{}-{}\n lsn:{}-{}\n delta:{}",
+                            idx,
+                            layer_file.key_range.start,
+                            layer_file.key_range.end,
+                            layer_file.lsn_range.start,
+                            layer_file.lsn_range.end,
+                            layer_file.is_delta,
+                        );
+
+                        if layer_file.is_delta {
+                            read_delta_file(layer.path())?;
+                        } else {
+                            anyhow::bail!("not supported yet :(");
+                        }
+
+                        break;
+                    }
+                    idx += 1;
+                }
+            }
+        }
+    }
+    Ok(())
+}
diff --git a/pageserver/ctl/src/main.rs b/pageserver/ctl/src/main.rs
new file mode 100644
index 0000000000..55db9eb7e7
--- /dev/null
+++ b/pageserver/ctl/src/main.rs
@@ -0,0 +1,179 @@
+//! A helper tool to manage pageserver binary files.
+//! Accepts a file as an argument, attempts to parse it in all possible ways,
+//! and prints the interpreted contents.
+//!
+//! A separate `metadata` subcommand allows printing and updating pageserver's metadata file.
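The CLI below is built with clap's derive API; the `#[command(subcommand)]` attribute on the `Layer` variant is what nests `LayerCmd`, so invocations read `pagectl layer list <path>`. A stripped-down, runnable sketch of that nesting (hypothetical `demo` binary, clap 4 derive assumed, not the full pagectl definition):

    use clap::{Parser, Subcommand};
    use std::path::PathBuf;

    #[derive(Parser)]
    struct Cli {
        #[command(subcommand)]
        command: Commands,
    }

    #[derive(Subcommand)]
    enum Commands {
        /// Nested: `demo layer <SUBCOMMAND>`
        #[command(subcommand)]
        Layer(LayerCmd),
    }

    #[derive(Subcommand)]
    enum LayerCmd {
        /// `demo layer list <PATH>`
        List { path: PathBuf },
    }

    fn main() {
        match Cli::parse().command {
            Commands::Layer(LayerCmd::List { path }) => {
                println!("would list tenants under {}", path.display());
            }
        }
    }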
+ +mod draw_timeline_dir; +mod layer_map_analyzer; +mod layers; + +use clap::{Parser, Subcommand}; +use layers::LayerCmd; +use pageserver::{ + context::{DownloadBehavior, RequestContext}, + page_cache, + task_mgr::TaskKind, + tenant::{dump_layerfile_from_path, metadata::TimelineMetadata}, + virtual_file, +}; +use postgres_ffi::ControlFileData; +use std::path::{Path, PathBuf}; +use utils::{lsn::Lsn, project_git_version}; + +project_git_version!(GIT_VERSION); + +#[derive(Parser)] +#[command( + version = GIT_VERSION, + about = "Neon Pageserver binutils", + long_about = "Reads pageserver (and related) binary files management utility" +)] +#[command(propagate_version = true)] +struct CliOpts { + #[command(subcommand)] + command: Commands, +} + +#[derive(Subcommand)] +enum Commands { + Metadata(MetadataCmd), + PrintLayerFile(PrintLayerFileCmd), + DrawTimeline {}, + AnalyzeLayerMap(AnalyzeLayerMapCmd), + #[command(subcommand)] + Layer(LayerCmd), +} + +/// Read and update pageserver metadata file +#[derive(Parser)] +struct MetadataCmd { + /// Input metadata file path + metadata_path: PathBuf, + /// Replace disk consistent Lsn + disk_consistent_lsn: Option, + /// Replace previous record Lsn + prev_record_lsn: Option, + /// Replace latest gc cuttoff + latest_gc_cuttoff: Option, +} + +#[derive(Parser)] +struct PrintLayerFileCmd { + /// Pageserver data path + path: PathBuf, +} + +#[derive(Parser)] +struct AnalyzeLayerMapCmd { + /// Pageserver data path + path: PathBuf, + /// Max holes + max_holes: Option, +} + +fn main() -> anyhow::Result<()> { + let cli = CliOpts::parse(); + + match cli.command { + Commands::Layer(cmd) => { + layers::main(&cmd)?; + } + Commands::Metadata(cmd) => { + handle_metadata(&cmd)?; + } + Commands::DrawTimeline {} => { + draw_timeline_dir::main()?; + } + Commands::AnalyzeLayerMap(cmd) => { + layer_map_analyzer::main(&cmd)?; + } + Commands::PrintLayerFile(cmd) => { + if let Err(e) = read_pg_control_file(&cmd.path) { + println!( + "Failed to read input file as a pg control one: {e:#}\n\ + Attempting to read it as layer file" + ); + print_layerfile(&cmd.path)?; + } + } + }; + Ok(()) +} + +fn read_pg_control_file(control_file_path: &Path) -> anyhow::Result<()> { + let control_file = ControlFileData::decode(&std::fs::read(control_file_path)?)?; + println!("{control_file:?}"); + let control_file_initdb = Lsn(control_file.checkPoint); + println!( + "pg_initdb_lsn: {}, aligned: {}", + control_file_initdb, + control_file_initdb.align() + ); + Ok(()) +} + +fn print_layerfile(path: &Path) -> anyhow::Result<()> { + // Basic initialization of things that don't change after startup + virtual_file::init(10); + page_cache::init(100); + let ctx = RequestContext::new(TaskKind::DebugTool, DownloadBehavior::Error); + dump_layerfile_from_path(path, true, &ctx) +} + +fn handle_metadata( + MetadataCmd { + metadata_path: path, + disk_consistent_lsn, + prev_record_lsn, + latest_gc_cuttoff, + }: &MetadataCmd, +) -> Result<(), anyhow::Error> { + let metadata_bytes = std::fs::read(path)?; + let mut meta = TimelineMetadata::from_bytes(&metadata_bytes)?; + println!("Current metadata:\n{meta:?}"); + let mut update_meta = false; + if let Some(disk_consistent_lsn) = disk_consistent_lsn { + meta = TimelineMetadata::new( + *disk_consistent_lsn, + meta.prev_record_lsn(), + meta.ancestor_timeline(), + meta.ancestor_lsn(), + meta.latest_gc_cutoff_lsn(), + meta.initdb_lsn(), + meta.pg_version(), + ); + update_meta = true; + } + if let Some(prev_record_lsn) = prev_record_lsn { + meta = TimelineMetadata::new( + 
meta.disk_consistent_lsn(), + Some(*prev_record_lsn), + meta.ancestor_timeline(), + meta.ancestor_lsn(), + meta.latest_gc_cutoff_lsn(), + meta.initdb_lsn(), + meta.pg_version(), + ); + update_meta = true; + } + if let Some(latest_gc_cuttoff) = latest_gc_cuttoff { + meta = TimelineMetadata::new( + meta.disk_consistent_lsn(), + meta.prev_record_lsn(), + meta.ancestor_timeline(), + meta.ancestor_lsn(), + *latest_gc_cuttoff, + meta.initdb_lsn(), + meta.pg_version(), + ); + update_meta = true; + } + + if update_meta { + let metadata_bytes = meta.to_bytes()?; + std::fs::write(path, metadata_bytes)?; + } + + Ok(()) +} diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs index 9da3a519a2..1fa5e4ab3b 100644 --- a/pageserver/src/bin/pageserver.rs +++ b/pageserver/src/bin/pageserver.rs @@ -275,6 +275,7 @@ fn start_pageserver( let pageserver_listener = tcp_listener::bind(pg_addr)?; // Launch broker client + // The storage_broker::connect call needs to happen inside a tokio runtime thread. let broker_client = WALRECEIVER_RUNTIME .block_on(async { // Note: we do not attempt connecting here (but validate endpoints sanity). @@ -334,13 +335,119 @@ fn start_pageserver( // Set up remote storage client let remote_storage = create_remote_storage_client(conf)?; + // Startup staging or optimizing: + // + // We want to minimize downtime for `page_service` connections, and trying not to overload + // BACKGROUND_RUNTIME by doing initial compactions and initial logical sizes at the same time. + // + // init_done_rx will notify when all initial load operations have completed. + // + // background_jobs_can_start (same name used to hold off background jobs from starting at + // consumer side) will be dropped once we can start the background jobs. Currently it is behind + // completing all initial logical size calculations (init_logical_size_done_rx) and a timeout + // (background_task_maximum_delay). + let (init_done_tx, init_done_rx) = utils::completion::channel(); + + let (init_logical_size_done_tx, init_logical_size_done_rx) = utils::completion::channel(); + + let (background_jobs_can_start, background_jobs_barrier) = utils::completion::channel(); + + let order = pageserver::InitializationOrder { + initial_tenant_load: Some(init_done_tx), + initial_logical_size_can_start: init_done_rx.clone(), + initial_logical_size_attempt: init_logical_size_done_tx, + background_jobs_can_start: background_jobs_barrier.clone(), + }; + // Scan the local 'tenants/' directory and start loading the tenants + let init_started_at = std::time::Instant::now(); + let shutdown_pageserver = tokio_util::sync::CancellationToken::new(); + BACKGROUND_RUNTIME.block_on(mgr::init_tenant_mgr( conf, broker_client.clone(), remote_storage.clone(), + order, ))?; + BACKGROUND_RUNTIME.spawn({ + let init_done_rx = init_done_rx; + let shutdown_pageserver = shutdown_pageserver.clone(); + let drive_init = async move { + // NOTE: unlike many futures in pageserver, this one is cancellation-safe + let guard = scopeguard::guard_on_success((), |_| tracing::info!("Cancelled before initial load completed")); + + init_done_rx.wait().await; + // initial logical sizes can now start, as they were waiting on init_done_rx. 
+ + scopeguard::ScopeGuard::into_inner(guard); + + let init_done = std::time::Instant::now(); + let elapsed = init_done - init_started_at; + + tracing::info!( + elapsed_millis = elapsed.as_millis(), + "Initial load completed" + ); + + let mut init_sizes_done = std::pin::pin!(init_logical_size_done_rx.wait()); + + let timeout = conf.background_task_maximum_delay; + + let guard = scopeguard::guard_on_success((), |_| tracing::info!("Cancelled before initial logical sizes completed")); + + let init_sizes_done = tokio::select! { + _ = &mut init_sizes_done => { + let now = std::time::Instant::now(); + tracing::info!( + from_init_done_millis = (now - init_done).as_millis(), + from_init_millis = (now - init_started_at).as_millis(), + "Initial logical sizes completed" + ); + None + } + _ = tokio::time::sleep(timeout) => { + tracing::info!( + timeout_millis = timeout.as_millis(), + "Initial logical size timeout elapsed; starting background jobs" + ); + Some(init_sizes_done) + } + }; + + scopeguard::ScopeGuard::into_inner(guard); + + // allow background jobs to start + drop(background_jobs_can_start); + + if let Some(init_sizes_done) = init_sizes_done { + // ending up here is not a bug; at the latest logical sizes will be queried by + // consumption metrics. + let guard = scopeguard::guard_on_success((), |_| tracing::info!("Cancelled before initial logical sizes completed")); + init_sizes_done.await; + + scopeguard::ScopeGuard::into_inner(guard); + + let now = std::time::Instant::now(); + tracing::info!( + from_init_done_millis = (now - init_done).as_millis(), + from_init_millis = (now - init_started_at).as_millis(), + "Initial logical sizes completed after timeout (background jobs already started)" + ); + + } + }; + + async move { + let mut drive_init = std::pin::pin!(drive_init); + // just race these tasks + tokio::select! { + _ = shutdown_pageserver.cancelled() => {}, + _ = &mut drive_init => {}, + } + } + }); + // shared state between the disk-usage backed eviction background task and the http endpoint // that allows triggering disk-usage based eviction manually. note that the http endpoint // is still accessible even if background task is not configured as long as remote storage has @@ -352,6 +459,7 @@ fn start_pageserver( conf, remote_storage.clone(), disk_usage_eviction_state.clone(), + background_jobs_barrier.clone(), )?; } @@ -389,6 +497,7 @@ fn start_pageserver( ); if let Some(metric_collection_endpoint) = &conf.metric_collection_endpoint { + let background_jobs_barrier = background_jobs_barrier; let metrics_ctx = RequestContext::todo_child( TaskKind::MetricsCollection, // This task itself shouldn't download anything. @@ -404,6 +513,18 @@ fn start_pageserver( "consumption metrics collection", true, async move { + // first wait until background jobs are cleared to launch. + // + // this is because we only process active tenants and timelines, and the + // Timeline::get_current_logical_size will spawn the logical size calculation, + // which will not be rate-limited. + let cancel = task_mgr::shutdown_token(); + + tokio::select! { + _ = cancel.cancelled() => { return Ok(()); }, + _ = background_jobs_barrier.wait() => {} + }; + pageserver::consumption_metrics::collect_metrics( metric_collection_endpoint, conf.metric_collection_interval, @@ -452,6 +573,8 @@ fn start_pageserver( ); } + let mut shutdown_pageserver = Some(shutdown_pageserver.drop_guard()); + // All started up! Now just sit and wait for shutdown signal. 
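The select-with-timeout above is the heart of the startup ordering: background jobs are released either when initial logical size calculations finish or when `background_task_maximum_delay` elapses, whichever comes first. A condensed, self-contained sketch of the same pattern (standalone toy example, not the actual pageserver wiring; assumes the `utils::completion` module introduced earlier):

    use std::time::Duration;
    use utils::completion;

    #[tokio::main]
    async fn main() {
        let (background_jobs_can_start, barrier) = completion::channel();

        // A background job holds a Barrier clone and runs only once the
        // Completion side has been dropped.
        let job = tokio::spawn({
            let barrier = barrier.clone();
            async move {
                barrier.wait().await;
                println!("background job starting");
            }
        });

        // Race the init work against a maximum delay.
        let init_work = tokio::time::sleep(Duration::from_millis(50)); // stand-in for init
        tokio::select! {
            _ = init_work => println!("init finished first"),
            _ = tokio::time::sleep(Duration::from_secs(10)) => println!("timeout elapsed first"),
        }

        // Either way, release the background jobs now.
        drop(background_jobs_can_start);
        job.await.unwrap();
    }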
ShutdownSignals::handle(|signal| match signal { Signal::Quit => { @@ -467,6 +590,11 @@ fn start_pageserver( "Got {}. Terminating gracefully in fast shutdown mode", signal.name() ); + + // This cancels the `shutdown_pageserver` cancellation tree. + // Right now that tree doesn't reach very far, and `task_mgr` is used instead. + // The plan is to change that over time. + shutdown_pageserver.take(); BACKGROUND_RUNTIME.block_on(pageserver::shutdown_pageserver(0)); unreachable!() } diff --git a/pageserver/src/bin/pageserver_binutils.rs b/pageserver/src/bin/pageserver_binutils.rs deleted file mode 100644 index 5e2d39d685..0000000000 --- a/pageserver/src/bin/pageserver_binutils.rs +++ /dev/null @@ -1,174 +0,0 @@ -//! A helper tool to manage pageserver binary files. -//! Accepts a file as an argument, attempts to parse it with all ways possible -//! and prints its interpreted context. -//! -//! Separate, `metadata` subcommand allows to print and update pageserver's metadata file. -use std::{ - path::{Path, PathBuf}, - str::FromStr, -}; - -use anyhow::Context; -use clap::{value_parser, Arg, Command}; - -use pageserver::{ - context::{DownloadBehavior, RequestContext}, - page_cache, - task_mgr::TaskKind, - tenant::{dump_layerfile_from_path, metadata::TimelineMetadata}, - virtual_file, -}; -use postgres_ffi::ControlFileData; -use utils::{lsn::Lsn, project_git_version}; - -project_git_version!(GIT_VERSION); - -const METADATA_SUBCOMMAND: &str = "metadata"; - -fn main() -> anyhow::Result<()> { - let arg_matches = cli().get_matches(); - - match arg_matches.subcommand() { - Some((subcommand_name, subcommand_matches)) => { - let path = subcommand_matches - .get_one::("metadata_path") - .context("'metadata_path' argument is missing")? - .to_path_buf(); - anyhow::ensure!( - subcommand_name == METADATA_SUBCOMMAND, - "Unknown subcommand {subcommand_name}" - ); - handle_metadata(&path, subcommand_matches)?; - } - None => { - let path = arg_matches - .get_one::("path") - .context("'path' argument is missing")? 
- .to_path_buf(); - println!( - "No subcommand specified, attempting to guess the format for file {}", - path.display() - ); - if let Err(e) = read_pg_control_file(&path) { - println!( - "Failed to read input file as a pg control one: {e:#}\n\ - Attempting to read it as layer file" - ); - print_layerfile(&path)?; - } - } - }; - Ok(()) -} - -fn read_pg_control_file(control_file_path: &Path) -> anyhow::Result<()> { - let control_file = ControlFileData::decode(&std::fs::read(control_file_path)?)?; - println!("{control_file:?}"); - let control_file_initdb = Lsn(control_file.checkPoint); - println!( - "pg_initdb_lsn: {}, aligned: {}", - control_file_initdb, - control_file_initdb.align() - ); - Ok(()) -} - -fn print_layerfile(path: &Path) -> anyhow::Result<()> { - // Basic initialization of things that don't change after startup - virtual_file::init(10); - page_cache::init(100); - let ctx = RequestContext::new(TaskKind::DebugTool, DownloadBehavior::Error); - dump_layerfile_from_path(path, true, &ctx) -} - -fn handle_metadata(path: &Path, arg_matches: &clap::ArgMatches) -> Result<(), anyhow::Error> { - let metadata_bytes = std::fs::read(path)?; - let mut meta = TimelineMetadata::from_bytes(&metadata_bytes)?; - println!("Current metadata:\n{meta:?}"); - let mut update_meta = false; - if let Some(disk_consistent_lsn) = arg_matches.get_one::("disk_consistent_lsn") { - meta = TimelineMetadata::new( - Lsn::from_str(disk_consistent_lsn)?, - meta.prev_record_lsn(), - meta.ancestor_timeline(), - meta.ancestor_lsn(), - meta.latest_gc_cutoff_lsn(), - meta.initdb_lsn(), - meta.pg_version(), - ); - update_meta = true; - } - if let Some(prev_record_lsn) = arg_matches.get_one::("prev_record_lsn") { - meta = TimelineMetadata::new( - meta.disk_consistent_lsn(), - Some(Lsn::from_str(prev_record_lsn)?), - meta.ancestor_timeline(), - meta.ancestor_lsn(), - meta.latest_gc_cutoff_lsn(), - meta.initdb_lsn(), - meta.pg_version(), - ); - update_meta = true; - } - if let Some(latest_gc_cuttoff) = arg_matches.get_one::("latest_gc_cuttoff") { - meta = TimelineMetadata::new( - meta.disk_consistent_lsn(), - meta.prev_record_lsn(), - meta.ancestor_timeline(), - meta.ancestor_lsn(), - Lsn::from_str(latest_gc_cuttoff)?, - meta.initdb_lsn(), - meta.pg_version(), - ); - update_meta = true; - } - - if update_meta { - let metadata_bytes = meta.to_bytes()?; - std::fs::write(path, metadata_bytes)?; - } - - Ok(()) -} - -fn cli() -> Command { - Command::new("Neon Pageserver binutils") - .about("Reads pageserver (and related) binary files management utility") - .version(GIT_VERSION) - .arg( - Arg::new("path") - .help("Input file path") - .value_parser(value_parser!(PathBuf)) - .required(false), - ) - .subcommand( - Command::new(METADATA_SUBCOMMAND) - .about("Read and update pageserver metadata file") - .arg( - Arg::new("metadata_path") - .help("Input metadata file path") - .value_parser(value_parser!(PathBuf)) - .required(false), - ) - .arg( - Arg::new("disk_consistent_lsn") - .long("disk_consistent_lsn") - .help("Replace disk consistent Lsn"), - ) - .arg( - Arg::new("prev_record_lsn") - .long("prev_record_lsn") - .help("Replace previous record Lsn"), - ) - .arg( - Arg::new("latest_gc_cuttoff") - .long("latest_gc_cuttoff") - .help("Replace latest gc cuttoff"), - ), - ) -} - -#[test] -fn verify_cli() { - cli().debug_assert(); -} diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs index 88a7f15b21..17e6e3fb2a 100644 --- a/pageserver/src/config.rs +++ b/pageserver/src/config.rs @@ -63,6 +63,7 @@ pub mod defaults { pub const 
DEFAULT_CACHED_METRIC_COLLECTION_INTERVAL: &str = "1 hour"; pub const DEFAULT_METRIC_COLLECTION_ENDPOINT: Option = None; pub const DEFAULT_SYNTHETIC_SIZE_CALCULATION_INTERVAL: &str = "10 min"; + pub const DEFAULT_BACKGROUND_TASK_MAXIMUM_DELAY: &str = "10s"; /// /// Default built-in configuration file. @@ -91,9 +92,10 @@ pub mod defaults { #cached_metric_collection_interval = '{DEFAULT_CACHED_METRIC_COLLECTION_INTERVAL}' #synthetic_size_calculation_interval = '{DEFAULT_SYNTHETIC_SIZE_CALCULATION_INTERVAL}' - #disk_usage_based_eviction = {{ max_usage_pct = .., min_avail_bytes = .., period = "10s"}} +#background_task_maximum_delay = '{DEFAULT_BACKGROUND_TASK_MAXIMUM_DELAY}' + # [tenant_config] #checkpoint_distance = {DEFAULT_CHECKPOINT_DISTANCE} # in bytes #checkpoint_timeout = {DEFAULT_CHECKPOINT_TIMEOUT} @@ -108,7 +110,7 @@ pub mod defaults { #min_resident_size_override = .. # in bytes #evictions_low_residence_duration_metric_threshold = '{DEFAULT_EVICTIONS_LOW_RESIDENCE_DURATION_METRIC_THRESHOLD}' - +#gc_feedback = false # [remote_storage] "### @@ -187,6 +189,15 @@ pub struct PageServerConf { pub test_remote_failures: u64, pub ondemand_download_behavior_treat_error_as_warn: bool, + + /// How long will background tasks be delayed at most after initial load of tenants. + /// + /// Our largest initialization completions are in the range of 100-200s, so perhaps 10s works + /// as we now isolate initial loading, initial logical size calculation and background tasks. + /// Smaller nodes will have background tasks "not running" for this long unless every timeline + /// has it's initial logical size calculated. Not running background tasks for some seconds is + /// not terrible. + pub background_task_maximum_delay: Duration, } /// We do not want to store this in a PageServerConf because the latter may be logged @@ -259,6 +270,8 @@ struct PageServerConfigBuilder { test_remote_failures: BuilderValue, ondemand_download_behavior_treat_error_as_warn: BuilderValue, + + background_task_maximum_delay: BuilderValue, } impl Default for PageServerConfigBuilder { @@ -316,6 +329,11 @@ impl Default for PageServerConfigBuilder { test_remote_failures: Set(0), ondemand_download_behavior_treat_error_as_warn: Set(false), + + background_task_maximum_delay: Set(humantime::parse_duration( + DEFAULT_BACKGROUND_TASK_MAXIMUM_DELAY, + ) + .unwrap()), } } } @@ -440,6 +458,10 @@ impl PageServerConfigBuilder { BuilderValue::Set(ondemand_download_behavior_treat_error_as_warn); } + pub fn background_task_maximum_delay(&mut self, delay: Duration) { + self.background_task_maximum_delay = BuilderValue::Set(delay); + } + pub fn build(self) -> anyhow::Result { let concurrent_tenant_size_logical_size_queries = self .concurrent_tenant_size_logical_size_queries @@ -522,6 +544,9 @@ impl PageServerConfigBuilder { .ok_or(anyhow!( "missing ondemand_download_behavior_treat_error_as_warn" ))?, + background_task_maximum_delay: self + .background_task_maximum_delay + .ok_or(anyhow!("missing background_task_maximum_delay"))?, }) } } @@ -710,6 +735,7 @@ impl PageServerConf { ) }, "ondemand_download_behavior_treat_error_as_warn" => builder.ondemand_download_behavior_treat_error_as_warn(parse_toml_bool(key, item)?), + "background_task_maximum_delay" => builder.background_task_maximum_delay(parse_toml_duration(key, item)?), _ => bail!("unrecognized pageserver option '{key}'"), } } @@ -828,6 +854,14 @@ impl PageServerConf { )?); } + if let Some(gc_feedback) = item.get("gc_feedback") { + t_conf.gc_feedback = Some( + gc_feedback + .as_bool() + 
.with_context(|| "configure option gc_feedback is not a bool".to_string())?, + ); + } + Ok(t_conf) } @@ -869,6 +903,7 @@ impl PageServerConf { disk_usage_based_eviction: None, test_remote_failures: 0, ondemand_download_behavior_treat_error_as_warn: false, + background_task_maximum_delay: Duration::ZERO, } } } @@ -1028,6 +1063,7 @@ metric_collection_endpoint = 'http://localhost:80/metrics' synthetic_size_calculation_interval = '333 s' log_format = 'json' +background_task_maximum_delay = '334 s' "#; @@ -1086,6 +1122,9 @@ log_format = 'json' disk_usage_based_eviction: None, test_remote_failures: 0, ondemand_download_behavior_treat_error_as_warn: false, + background_task_maximum_delay: humantime::parse_duration( + defaults::DEFAULT_BACKGROUND_TASK_MAXIMUM_DELAY + )?, }, "Correct defaults should be used when no config values are provided" ); @@ -1140,6 +1179,7 @@ log_format = 'json' disk_usage_based_eviction: None, test_remote_failures: 0, ondemand_download_behavior_treat_error_as_warn: false, + background_task_maximum_delay: Duration::from_secs(334), }, "Should be able to parse all basic config values correctly" ); diff --git a/pageserver/src/context.rs b/pageserver/src/context.rs index e826d28e6d..f53b7736ab 100644 --- a/pageserver/src/context.rs +++ b/pageserver/src/context.rs @@ -88,6 +88,7 @@ use crate::task_mgr::TaskKind; // The main structure of this module, see module-level comment. +#[derive(Clone, Debug)] pub struct RequestContext { task_kind: TaskKind, download_behavior: DownloadBehavior, @@ -95,7 +96,7 @@ pub struct RequestContext { /// Desired behavior if the operation requires an on-demand download /// to proceed. -#[derive(Clone, Copy, PartialEq, Eq)] +#[derive(Clone, Copy, PartialEq, Eq, Debug)] pub enum DownloadBehavior { /// Download the layer file. It can take a while. Download, diff --git a/pageserver/src/disk_usage_eviction_task.rs b/pageserver/src/disk_usage_eviction_task.rs index 9e5f644759..7869d019b1 100644 --- a/pageserver/src/disk_usage_eviction_task.rs +++ b/pageserver/src/disk_usage_eviction_task.rs @@ -54,6 +54,7 @@ use serde::{Deserialize, Serialize}; use tokio::time::Instant; use tokio_util::sync::CancellationToken; use tracing::{debug, error, info, instrument, warn, Instrument}; +use utils::completion; use utils::serde_percent::Percent; use crate::{ @@ -82,6 +83,7 @@ pub fn launch_disk_usage_global_eviction_task( conf: &'static PageServerConf, storage: GenericRemoteStorage, state: Arc, + background_jobs_barrier: completion::Barrier, ) -> anyhow::Result<()> { let Some(task_config) = &conf.disk_usage_based_eviction else { info!("disk usage based eviction task not configured"); @@ -98,14 +100,16 @@ pub fn launch_disk_usage_global_eviction_task( "disk usage based eviction", false, async move { - disk_usage_eviction_task( - &state, - task_config, - storage, - &conf.tenants_path(), - task_mgr::shutdown_token(), - ) - .await; + let cancel = task_mgr::shutdown_token(); + + // wait until initial load is complete, because we cannot evict from loading tenants. + tokio::select! 
{ + _ = cancel.cancelled() => { return Ok(()); }, + _ = background_jobs_barrier.wait() => { } + }; + + disk_usage_eviction_task(&state, task_config, storage, &conf.tenants_path(), cancel) + .await; info!("disk usage based eviction task finishing"); Ok(()) }, diff --git a/pageserver/src/http/openapi_spec.yml b/pageserver/src/http/openapi_spec.yml index e23d3f3a20..0d912c95e0 100644 --- a/pageserver/src/http/openapi_spec.yml +++ b/pageserver/src/http/openapi_spec.yml @@ -678,6 +678,8 @@ paths: application/json: schema: type: object + required: + - new_timeline_id properties: new_timeline_id: type: string @@ -936,6 +938,8 @@ components: allOf: - $ref: '#/components/schemas/TenantConfig' - type: object + required: + - new_tenant_id properties: new_tenant_id: type: string diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index 2e9f230ace..280e5327fc 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -1,3 +1,6 @@ +//! +//! Management HTTP API +//! use std::collections::HashMap; use std::sync::Arc; @@ -11,7 +14,7 @@ use storage_broker::BrokerClientChannel; use tenant_size_model::{SizeResult, StorageModel}; use tokio_util::sync::CancellationToken; use tracing::*; -use utils::http::endpoint::RequestSpan; +use utils::http::endpoint::request_span; use utils::http::json::json_request_or_empty_body; use utils::http::request::{get_request_param, must_get_query_param, parse_query_param}; @@ -25,7 +28,9 @@ use crate::metrics::{StorageTimeOperation, STORAGE_TIME_GLOBAL}; use crate::pgdatadir_mapping::LsnForTimestamp; use crate::task_mgr::TaskKind; use crate::tenant::config::TenantConfOpt; -use crate::tenant::mgr::{TenantMapInsertError, TenantStateError}; +use crate::tenant::mgr::{ + GetTenantError, SetNewTenantConfigError, TenantMapInsertError, TenantStateError, +}; use crate::tenant::size::ModelInputs; use crate::tenant::storage_layer::LayerAccessStatsReset; use crate::tenant::{LogicalSizeCalculationCause, PageReconstructError, Timeline}; @@ -44,7 +49,6 @@ use utils::{ }; // Imports only used for testing APIs -#[cfg(feature = "testing")] use super::models::ConfigureFailpointsRequest; struct State { @@ -144,6 +148,36 @@ impl From for ApiError { } } +impl From for ApiError { + fn from(tse: GetTenantError) -> ApiError { + match tse { + GetTenantError::NotFound(tid) => ApiError::NotFound(anyhow!("tenant {}", tid)), + e @ GetTenantError::NotActive(_) => { + // Why is this not `ApiError::NotFound`? + // Because we must be careful to never return 404 for a tenant if it does + // in fact exist locally. If we did, the caller could draw the conclusion + // that it can attach the tenant to another PS and we'd be in split-brain. + // + // (We can produce this variant only in `mgr::get_tenant(..., active=true)` calls). 
+ ApiError::InternalServerError(anyhow::Error::new(e)) + } + } + } +} + +impl From for ApiError { + fn from(e: SetNewTenantConfigError) -> ApiError { + match e { + SetNewTenantConfigError::GetTenant(tid) => { + ApiError::NotFound(anyhow!("tenant {}", tid)) + } + e @ SetNewTenantConfigError::Persist(_) => { + ApiError::InternalServerError(anyhow::Error::new(e)) + } + } + } +} + impl From for ApiError { fn from(value: crate::tenant::DeleteTimelineError) -> Self { use crate::tenant::DeleteTimelineError::*; @@ -163,7 +197,7 @@ impl From for ApiError { match value { // Report Precondition failed so client can distinguish between // "tenant is missing" case from "timeline is missing" - Tenant(TenantStateError::NotFound(..)) => { + Tenant(GetTenantError::NotFound(..)) => { ApiError::PreconditionFailed("Requested tenant is missing") } Tenant(t) => ApiError::from(t), @@ -258,20 +292,24 @@ async fn build_timeline_info_common( } // healthcheck handler -async fn status_handler(request: Request) -> Result, ApiError> { +async fn status_handler( + request: Request, + _cancel: CancellationToken, +) -> Result, ApiError> { check_permission(&request, None)?; let config = get_config(&request); json_response(StatusCode::OK, StatusResponse { id: config.id }) } -async fn timeline_create_handler(mut request: Request) -> Result, ApiError> { +async fn timeline_create_handler( + mut request: Request, + _cancel: CancellationToken, +) -> Result, ApiError> { let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?; let request_data: TimelineCreateRequest = json_request(&mut request).await?; check_permission(&request, Some(tenant_id))?; - let new_timeline_id = request_data - .new_timeline_id - .unwrap_or_else(TimelineId::generate); + let new_timeline_id = request_data.new_timeline_id; let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Error); @@ -299,11 +337,14 @@ async fn timeline_create_handler(mut request: Request) -> Result Err(ApiError::InternalServerError(err)), } } - .instrument(info_span!("timeline_create", tenant = %tenant_id, new_timeline = ?request_data.new_timeline_id, timeline_id = %new_timeline_id, lsn=?request_data.ancestor_start_lsn, pg_version=?request_data.pg_version)) + .instrument(info_span!("timeline_create", tenant = %tenant_id, timeline_id = %new_timeline_id, lsn=?request_data.ancestor_start_lsn, pg_version=?request_data.pg_version)) .await } -async fn timeline_list_handler(request: Request) -> Result, ApiError> { +async fn timeline_list_handler( + request: Request, + _cancel: CancellationToken, +) -> Result, ApiError> { let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?; let include_non_incremental_logical_size: Option = parse_query_param(&request, "include-non-incremental-logical-size")?; @@ -337,7 +378,10 @@ async fn timeline_list_handler(request: Request) -> Result, json_response(StatusCode::OK, response_data) } -async fn timeline_detail_handler(request: Request) -> Result, ApiError> { +async fn timeline_detail_handler( + request: Request, + _cancel: CancellationToken, +) -> Result, ApiError> { let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?; let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?; let include_non_incremental_logical_size: Option = @@ -372,7 +416,10 @@ async fn timeline_detail_handler(request: Request) -> Result) -> Result, ApiError> { +async fn get_lsn_by_timestamp_handler( + request: Request, + _cancel: CancellationToken, +) -> Result, ApiError> { let tenant_id: TenantId = 
parse_request_param(&request, "tenant_id")?; check_permission(&request, Some(tenant_id))?; @@ -396,7 +443,10 @@ async fn get_lsn_by_timestamp_handler(request: Request) -> Result) -> Result, ApiError> { +async fn tenant_attach_handler( + mut request: Request, + _cancel: CancellationToken, +) -> Result, ApiError> { let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?; check_permission(&request, Some(tenant_id))?; @@ -432,7 +482,10 @@ async fn tenant_attach_handler(mut request: Request) -> Result) -> Result, ApiError> { +async fn timeline_delete_handler( + request: Request, + _cancel: CancellationToken, +) -> Result, ApiError> { let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?; let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?; check_permission(&request, Some(tenant_id))?; @@ -446,7 +499,10 @@ async fn timeline_delete_handler(request: Request) -> Result) -> Result, ApiError> { +async fn tenant_detach_handler( + request: Request, + _cancel: CancellationToken, +) -> Result, ApiError> { let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?; check_permission(&request, Some(tenant_id))?; let detach_ignored: Option = parse_query_param(&request, "detach_ignored")?; @@ -460,7 +516,10 @@ async fn tenant_detach_handler(request: Request) -> Result, json_response(StatusCode::OK, ()) } -async fn tenant_load_handler(request: Request) -> Result, ApiError> { +async fn tenant_load_handler( + request: Request, + _cancel: CancellationToken, +) -> Result, ApiError> { let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?; check_permission(&request, Some(tenant_id))?; @@ -480,7 +539,10 @@ async fn tenant_load_handler(request: Request) -> Result, A json_response(StatusCode::ACCEPTED, ()) } -async fn tenant_ignore_handler(request: Request) -> Result, ApiError> { +async fn tenant_ignore_handler( + request: Request, + _cancel: CancellationToken, +) -> Result, ApiError> { let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?; check_permission(&request, Some(tenant_id))?; @@ -493,7 +555,10 @@ async fn tenant_ignore_handler(request: Request) -> Result, json_response(StatusCode::OK, ()) } -async fn tenant_list_handler(request: Request) -> Result, ApiError> { +async fn tenant_list_handler( + request: Request, + _cancel: CancellationToken, +) -> Result, ApiError> { check_permission(&request, None)?; let response_data = mgr::list_tenants() @@ -513,7 +578,10 @@ async fn tenant_list_handler(request: Request) -> Result, A json_response(StatusCode::OK, response_data) } -async fn tenant_status(request: Request) -> Result, ApiError> { +async fn tenant_status( + request: Request, + _cancel: CancellationToken, +) -> Result, ApiError> { let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?; check_permission(&request, Some(tenant_id))?; @@ -527,7 +595,7 @@ async fn tenant_status(request: Request) -> Result, ApiErro } let state = tenant.current_state(); - Ok(TenantInfo { + Result::<_, ApiError>::Ok(TenantInfo { id: tenant_id, state: state.clone(), current_physical_size: Some(current_physical_size), @@ -535,8 +603,7 @@ async fn tenant_status(request: Request) -> Result, ApiErro }) } .instrument(info_span!("tenant_status_handler", tenant = %tenant_id)) - .await - .map_err(ApiError::InternalServerError)?; + .await?; json_response(StatusCode::OK, tenant_info) } @@ -554,7 +621,10 @@ async fn tenant_status(request: Request) -> Result, ApiErro /// Note: we don't update the cached size and prometheus metric here. 
/// The retention period might be different, and it's nice to have a method to just calculate it /// without modifying anything anyway. -async fn tenant_size_handler(request: Request) -> Result, ApiError> { +async fn tenant_size_handler( + request: Request, + _cancel: CancellationToken, +) -> Result, ApiError> { let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?; check_permission(&request, Some(tenant_id))?; let inputs_only: Option = parse_query_param(&request, "inputs_only")?; @@ -619,7 +689,10 @@ async fn tenant_size_handler(request: Request) -> Result, A ) } -async fn layer_map_info_handler(request: Request) -> Result, ApiError> { +async fn layer_map_info_handler( + request: Request, + _cancel: CancellationToken, +) -> Result, ApiError> { let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?; let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?; let reset: LayerAccessStatsReset = @@ -633,7 +706,10 @@ async fn layer_map_info_handler(request: Request) -> Result json_response(StatusCode::OK, layer_map_info) } -async fn layer_download_handler(request: Request) -> Result, ApiError> { +async fn layer_download_handler( + request: Request, + _cancel: CancellationToken, +) -> Result, ApiError> { let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?; check_permission(&request, Some(tenant_id))?; let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?; @@ -656,7 +732,10 @@ async fn layer_download_handler(request: Request) -> Result } } -async fn evict_timeline_layer_handler(request: Request) -> Result, ApiError> { +async fn evict_timeline_layer_handler( + request: Request, + _cancel: CancellationToken, +) -> Result, ApiError> { let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?; check_permission(&request, Some(tenant_id))?; let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?; @@ -734,7 +813,12 @@ pub fn html_response(status: StatusCode, data: String) -> Result, Ok(response) } -async fn tenant_create_handler(mut request: Request) -> Result, ApiError> { +async fn tenant_create_handler( + mut request: Request, + _cancel: CancellationToken, +) -> Result, ApiError> { + let request_data: TenantCreateRequest = json_request(&mut request).await?; + let target_tenant_id = request_data.new_tenant_id; check_permission(&request, None)?; let _timer = STORAGE_TIME_GLOBAL @@ -742,17 +826,10 @@ async fn tenant_create_handler(mut request: Request) -> Result) -> Result) -> Result, ApiError> { +async fn get_tenant_config_handler( + request: Request, + _cancel: CancellationToken, +) -> Result, ApiError> { let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?; check_permission(&request, Some(tenant_id))?; @@ -812,6 +892,7 @@ async fn get_tenant_config_handler(request: Request) -> Result, + _cancel: CancellationToken, ) -> Result, ApiError> { let request_data: TenantConfigRequest = json_request(&mut request).await?; let tenant_id = request_data.tenant_id; @@ -829,8 +910,10 @@ async fn update_tenant_config_handler( } /// Testing helper to transition a tenant to [`crate::tenant::TenantState::Broken`]. 
-#[cfg(feature = "testing")] -async fn handle_tenant_break(r: Request) -> Result, ApiError> { +async fn handle_tenant_break( + r: Request, + _cancel: CancellationToken, +) -> Result, ApiError> { let tenant_id: TenantId = parse_request_param(&r, "tenant_id")?; let tenant = crate::tenant::mgr::get_tenant(tenant_id, true) @@ -842,8 +925,10 @@ async fn handle_tenant_break(r: Request) -> Result, ApiErro json_response(StatusCode::OK, ()) } -#[cfg(feature = "testing")] -async fn failpoints_handler(mut request: Request) -> Result, ApiError> { +async fn failpoints_handler( + mut request: Request, + _cancel: CancellationToken, +) -> Result, ApiError> { if !fail::has_failpoints() { return Err(ApiError::BadRequest(anyhow!( "Cannot manage failpoints because pageserver was compiled without failpoints support" @@ -876,7 +961,10 @@ async fn failpoints_handler(mut request: Request) -> Result } // Run GC immediately on given timeline. -async fn timeline_gc_handler(mut request: Request) -> Result, ApiError> { +async fn timeline_gc_handler( + mut request: Request, + _cancel: CancellationToken, +) -> Result, ApiError> { let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?; let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?; check_permission(&request, Some(tenant_id))?; @@ -895,8 +983,10 @@ async fn timeline_gc_handler(mut request: Request) -> Result) -> Result, ApiError> { +async fn timeline_compact_handler( + request: Request, + _cancel: CancellationToken, +) -> Result, ApiError> { let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?; let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?; check_permission(&request, Some(tenant_id))?; @@ -917,8 +1007,10 @@ async fn timeline_compact_handler(request: Request) -> Result) -> Result, ApiError> { +async fn timeline_checkpoint_handler( + request: Request, + _cancel: CancellationToken, +) -> Result, ApiError> { let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?; let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?; check_permission(&request, Some(tenant_id))?; @@ -942,6 +1034,7 @@ async fn timeline_checkpoint_handler(request: Request) -> Result, + _cancel: CancellationToken, ) -> Result, ApiError> { let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?; let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?; @@ -957,6 +1050,7 @@ async fn timeline_download_remote_layers_handler_post( async fn timeline_download_remote_layers_handler_get( request: Request, + _cancel: CancellationToken, ) -> Result, ApiError> { let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?; check_permission(&request, Some(tenant_id))?; @@ -981,7 +1075,10 @@ async fn active_timeline_of_active_tenant( .map_err(ApiError::NotFound) } -async fn always_panic_handler(req: Request) -> Result, ApiError> { +async fn always_panic_handler( + req: Request, + _cancel: CancellationToken, +) -> Result, ApiError> { // Deliberately cause a panic to exercise the panic hook registered via std::panic::set_hook(). // For pageserver, the relevant panic hook is `tracing_panic_hook` , and the `sentry` crate's wrapper around it. // Use catch_unwind to ensure that tokio nor hyper are distracted by our panic. 
@@ -992,7 +1089,10 @@ async fn always_panic_handler(req: Request) -> Result, ApiE json_response(StatusCode::NO_CONTENT, ()) } -async fn disk_usage_eviction_run(mut r: Request) -> Result, ApiError> { +async fn disk_usage_eviction_run( + mut r: Request, + _cancel: CancellationToken, +) -> Result, ApiError> { check_permission(&r, None)?; #[derive(Debug, Clone, Copy, serde::Serialize, serde::Deserialize)] @@ -1082,8 +1182,10 @@ async fn handler_404(_: Request) -> Result, ApiError> { ) } -#[cfg(feature = "testing")] -async fn post_tracing_event_handler(mut r: Request) -> Result, ApiError> { +async fn post_tracing_event_handler( + mut r: Request, + _cancel: CancellationToken, +) -> Result, ApiError> { #[derive(Debug, serde::Deserialize)] #[serde(rename_all = "lowercase")] enum Level { @@ -1113,6 +1215,85 @@ async fn post_tracing_event_handler(mut r: Request) -> Result(request: Request, handler: H) -> Result, ApiError> +where + R: std::future::Future, ApiError>> + Send + 'static, + H: FnOnce(Request, CancellationToken) -> R + Send + Sync + 'static, +{ + // Spawn a new task to handle the request, to protect the handler from unexpected + // async cancellations. Most pageserver functions are not async cancellation safe. + // We arm a drop-guard, so that if Hyper drops the Future, we signal the task + // with the cancellation token. + let token = CancellationToken::new(); + let cancel_guard = token.clone().drop_guard(); + let result = request_span(request, move |r| async { + let handle = tokio::spawn( + async { + let token_cloned = token.clone(); + let result = handler(r, token).await; + if token_cloned.is_cancelled() { + info!("Cancelled request finished"); + } + result + } + .in_current_span(), + ); + + match handle.await { + Ok(result) => result, + Err(e) => { + // The handler task panicked. We have a global panic handler that logs the + // panic with its backtrace, so no need to log that here. Only log a brief + // message to make it clear that we returned the error to the client. + error!("HTTP request handler task panicked: {e:#}"); + + // Don't return an Error here, because then fallback error handler that was + // installed in make_router() will print the error. Instead, construct the + // HTTP error response and return that. + Ok( + ApiError::InternalServerError(anyhow!("HTTP request handler task panicked")) + .into_response(), + ) + } + } + }) + .await; + + cancel_guard.disarm(); + + result +} + +/// Like api_handler, but returns an error response if the server is built without +/// the 'testing' feature. +async fn testing_api_handler( + desc: &str, + request: Request, + handler: H, +) -> Result, ApiError> +where + R: std::future::Future, ApiError>> + Send + 'static, + H: FnOnce(Request, CancellationToken) -> R + Send + Sync + 'static, +{ + if cfg!(feature = "testing") { + api_handler(request, handler).await + } else { + std::future::ready(Err(ApiError::BadRequest(anyhow!( + "Cannot {desc} because pageserver was compiled without testing APIs", + )))) + .await + } +} + pub fn make_router( conf: &'static PageServerConf, launch_ts: &'static LaunchTimestamp, @@ -1142,26 +1323,6 @@ pub fn make_router( .expect("construct launch timestamp header middleware"), ); - macro_rules! testing_api { - ($handler_desc:literal, $handler:path $(,)?) 
=> {{ - #[cfg(not(feature = "testing"))] - async fn cfg_disabled(_req: Request) -> Result, ApiError> { - Err(ApiError::BadRequest(anyhow!(concat!( - "Cannot ", - $handler_desc, - " because pageserver was compiled without testing APIs", - )))) - } - - #[cfg(feature = "testing")] - let handler = $handler; - #[cfg(not(feature = "testing"))] - let handler = cfg_disabled; - - move |r| RequestSpan(handler).handle(r) - }}; - } - Ok(router .data(Arc::new( State::new( @@ -1173,96 +1334,88 @@ pub fn make_router( ) .context("Failed to initialize router state")?, )) - .get("/v1/status", |r| RequestSpan(status_handler).handle(r)) - .put( - "/v1/failpoints", - testing_api!("manage failpoints", failpoints_handler), - ) - .get("/v1/tenant", |r| RequestSpan(tenant_list_handler).handle(r)) - .post("/v1/tenant", |r| { - RequestSpan(tenant_create_handler).handle(r) - }) - .get("/v1/tenant/:tenant_id", |r| { - RequestSpan(tenant_status).handle(r) + .get("/v1/status", |r| api_handler(r, status_handler)) + .put("/v1/failpoints", |r| { + testing_api_handler("manage failpoints", r, failpoints_handler) }) + .get("/v1/tenant", |r| api_handler(r, tenant_list_handler)) + .post("/v1/tenant", |r| api_handler(r, tenant_create_handler)) + .get("/v1/tenant/:tenant_id", |r| api_handler(r, tenant_status)) .get("/v1/tenant/:tenant_id/synthetic_size", |r| { - RequestSpan(tenant_size_handler).handle(r) + api_handler(r, tenant_size_handler) }) .put("/v1/tenant/config", |r| { - RequestSpan(update_tenant_config_handler).handle(r) + api_handler(r, update_tenant_config_handler) }) .get("/v1/tenant/:tenant_id/config", |r| { - RequestSpan(get_tenant_config_handler).handle(r) + api_handler(r, get_tenant_config_handler) }) .get("/v1/tenant/:tenant_id/timeline", |r| { - RequestSpan(timeline_list_handler).handle(r) + api_handler(r, timeline_list_handler) }) .post("/v1/tenant/:tenant_id/timeline", |r| { - RequestSpan(timeline_create_handler).handle(r) + api_handler(r, timeline_create_handler) }) .post("/v1/tenant/:tenant_id/attach", |r| { - RequestSpan(tenant_attach_handler).handle(r) + api_handler(r, tenant_attach_handler) }) .post("/v1/tenant/:tenant_id/detach", |r| { - RequestSpan(tenant_detach_handler).handle(r) + api_handler(r, tenant_detach_handler) }) .post("/v1/tenant/:tenant_id/load", |r| { - RequestSpan(tenant_load_handler).handle(r) + api_handler(r, tenant_load_handler) }) .post("/v1/tenant/:tenant_id/ignore", |r| { - RequestSpan(tenant_ignore_handler).handle(r) + api_handler(r, tenant_ignore_handler) }) .get("/v1/tenant/:tenant_id/timeline/:timeline_id", |r| { - RequestSpan(timeline_detail_handler).handle(r) + api_handler(r, timeline_detail_handler) }) .get( "/v1/tenant/:tenant_id/timeline/:timeline_id/get_lsn_by_timestamp", - |r| RequestSpan(get_lsn_by_timestamp_handler).handle(r), + |r| api_handler(r, get_lsn_by_timestamp_handler), ) .put("/v1/tenant/:tenant_id/timeline/:timeline_id/do_gc", |r| { - RequestSpan(timeline_gc_handler).handle(r) + api_handler(r, timeline_gc_handler) + }) + .put("/v1/tenant/:tenant_id/timeline/:timeline_id/compact", |r| { + testing_api_handler("run timeline compaction", r, timeline_compact_handler) }) - .put( - "/v1/tenant/:tenant_id/timeline/:timeline_id/compact", - testing_api!("run timeline compaction", timeline_compact_handler), - ) .put( "/v1/tenant/:tenant_id/timeline/:timeline_id/checkpoint", - testing_api!("run timeline checkpoint", timeline_checkpoint_handler), + |r| testing_api_handler("run timeline checkpoint", r, timeline_checkpoint_handler), ) .post( 
"/v1/tenant/:tenant_id/timeline/:timeline_id/download_remote_layers", - |r| RequestSpan(timeline_download_remote_layers_handler_post).handle(r), + |r| api_handler(r, timeline_download_remote_layers_handler_post), ) .get( "/v1/tenant/:tenant_id/timeline/:timeline_id/download_remote_layers", - |r| RequestSpan(timeline_download_remote_layers_handler_get).handle(r), + |r| api_handler(r, timeline_download_remote_layers_handler_get), ) .delete("/v1/tenant/:tenant_id/timeline/:timeline_id", |r| { - RequestSpan(timeline_delete_handler).handle(r) + api_handler(r, timeline_delete_handler) }) .get("/v1/tenant/:tenant_id/timeline/:timeline_id/layer", |r| { - RequestSpan(layer_map_info_handler).handle(r) + api_handler(r, layer_map_info_handler) }) .get( "/v1/tenant/:tenant_id/timeline/:timeline_id/layer/:layer_file_name", - |r| RequestSpan(layer_download_handler).handle(r), + |r| api_handler(r, layer_download_handler), ) .delete( "/v1/tenant/:tenant_id/timeline/:timeline_id/layer/:layer_file_name", - |r| RequestSpan(evict_timeline_layer_handler).handle(r), + |r| api_handler(r, evict_timeline_layer_handler), ) .put("/v1/disk_usage_eviction/run", |r| { - RequestSpan(disk_usage_eviction_run).handle(r) + api_handler(r, disk_usage_eviction_run) + }) + .put("/v1/tenant/:tenant_id/break", |r| { + testing_api_handler("set tenant state to broken", r, handle_tenant_break) + }) + .get("/v1/panic", |r| api_handler(r, always_panic_handler)) + .post("/v1/tracing/event", |r| { + testing_api_handler("emit a tracing event", r, post_tracing_event_handler) }) - .put( - "/v1/tenant/:tenant_id/break", - testing_api!("set tenant state to broken", handle_tenant_break), - ) - .get("/v1/panic", |r| RequestSpan(always_panic_handler).handle(r)) - .post( - "/v1/tracing/event", - testing_api!("emit a tracing event", post_tracing_event_handler), - ) .any(handler_404)) } diff --git a/pageserver/src/lib.rs b/pageserver/src/lib.rs index 4349f0e2ea..5831091098 100644 --- a/pageserver/src/lib.rs +++ b/pageserver/src/lib.rs @@ -35,7 +35,7 @@ use tracing::info; /// backwards-compatible changes to the metadata format. pub const STORAGE_FORMAT_VERSION: u16 = 3; -pub const DEFAULT_PG_VERSION: u32 = 14; +pub const DEFAULT_PG_VERSION: u32 = 15; // Magic constants used to identify different kinds of files pub const IMAGE_FILE_MAGIC: u16 = 0x5A60; @@ -45,6 +45,7 @@ static ZERO_PAGE: bytes::Bytes = bytes::Bytes::from_static(&[0u8; 8192]); pub use crate::metrics::preinitialize_metrics; +#[tracing::instrument] pub async fn shutdown_pageserver(exit_code: i32) { // Shut down the libpq endpoint task. This prevents new connections from // being accepted. @@ -57,12 +58,6 @@ pub async fn shutdown_pageserver(exit_code: i32) { // the checkpoint and GC tasks. tenant::mgr::shutdown_all_tenants().await; - // Stop syncing with remote storage. - // - // FIXME: Does this wait for the sync tasks to finish syncing what's queued up? - // Should it? - task_mgr::shutdown_tasks(Some(TaskKind::RemoteUploadTask), None, None).await; - // Shut down the HTTP endpoint last, so that you can still check the server's // status while it's shutting down. // FIXME: We should probably stop accepting commands like attach/detach earlier. @@ -137,6 +132,29 @@ pub fn is_uninit_mark(path: &Path) -> bool { } } +/// During pageserver startup, we need to order operations not to exhaust tokio worker threads by +/// blocking. +/// +/// The instances of this value exist only during startup, otherwise `None` is provided, meaning no +/// delaying is needed. 
+#[derive(Clone)] +pub struct InitializationOrder { + /// Each initial tenant load task carries this until completion. + pub initial_tenant_load: Option, + + /// Barrier for when we can start initial logical size calculations. + pub initial_logical_size_can_start: utils::completion::Barrier, + + /// Each timeline owns a clone of this to be consumed on the initial logical size calculation + /// attempt. It is important to drop this once the attempt has completed. + pub initial_logical_size_attempt: utils::completion::Completion, + + /// Barrier for when we can start any background jobs. + /// + /// This can be broken up later on, but right now there is just one class of a background job. + pub background_jobs_can_start: utils::completion::Barrier, +} + #[cfg(test)] mod backoff_defaults_tests { use super::*; diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs index 75bea9dbab..cc444c479a 100644 --- a/pageserver/src/metrics.rs +++ b/pageserver/src/metrics.rs @@ -84,6 +84,16 @@ pub static STORAGE_TIME_GLOBAL: Lazy = Lazy::new(|| { .expect("failed to define a metric") }); +static READ_NUM_FS_LAYERS: Lazy = Lazy::new(|| { + register_histogram_vec!( + "pageserver_read_num_fs_layers", + "Number of persistent layers accessed for processing a read request, including those in the cache", + &["tenant_id", "timeline_id"], + vec![1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 10.0, 20.0, 50.0, 100.0], + ) + .expect("failed to define a metric") +}); + // Metrics collected on operations on the storage repository. static RECONSTRUCT_TIME: Lazy = Lazy::new(|| { register_histogram_vec!( @@ -95,6 +105,25 @@ static RECONSTRUCT_TIME: Lazy = Lazy::new(|| { .expect("failed to define a metric") }); +static MATERIALIZED_PAGE_CACHE_HIT_DIRECT: Lazy = Lazy::new(|| { + register_int_counter_vec!( + "pageserver_materialized_cache_hits_direct_total", + "Number of cache hits from materialized page cache without redo", + &["tenant_id", "timeline_id"] + ) + .expect("failed to define a metric") +}); + +static GET_RECONSTRUCT_DATA_TIME: Lazy = Lazy::new(|| { + register_histogram_vec!( + "pageserver_getpage_get_reconstruct_data_seconds", + "Time spent in get_reconstruct_value_data", + &["tenant_id", "timeline_id"], + CRITICAL_OP_BUCKETS.into(), + ) + .expect("failed to define a metric") +}); + static MATERIALIZED_PAGE_CACHE_HIT: Lazy = Lazy::new(|| { register_int_counter_vec!( "pageserver_materialized_cache_hits_total", @@ -354,6 +383,7 @@ const STORAGE_IO_TIME_BUCKETS: &[f64] = &[ 0.001000, // 1000 usec 0.030, // 30 ms 1.000, // 1000 ms + 30.000, // 30000 ms ]; const STORAGE_IO_TIME_OPERATIONS: &[&str] = &[ @@ -622,7 +652,7 @@ pub static WAL_REDO_TIME: Lazy = Lazy::new(|| { pub static WAL_REDO_WAIT_TIME: Lazy = Lazy::new(|| { register_histogram!( "pageserver_wal_redo_wait_seconds", - "Time spent waiting for access to the WAL redo process", + "Time spent waiting for access to the Postgres WAL redo process", redo_histogram_time_buckets!(), ) .expect("failed to define a metric") @@ -631,7 +661,7 @@ pub static WAL_REDO_WAIT_TIME: Lazy = Lazy::new(|| { pub static WAL_REDO_RECORDS_HISTOGRAM: Lazy = Lazy::new(|| { register_histogram!( "pageserver_wal_redo_records_histogram", - "Histogram of number of records replayed per redo", + "Histogram of number of records replayed per redo in the Postgres WAL redo process", redo_histogram_count_buckets!(), ) .expect("failed to define a metric") @@ -640,7 +670,7 @@ pub static WAL_REDO_RECORDS_HISTOGRAM: Lazy = Lazy::new(|| { pub static WAL_REDO_BYTES_HISTOGRAM: Lazy = Lazy::new(|| { 
register_histogram!( "pageserver_wal_redo_bytes_histogram", - "Histogram of number of records replayed per redo", + "Histogram of number of records replayed per redo sent to Postgres", redo_bytes_histogram_count_buckets!(), ) .expect("failed to define a metric") @@ -723,7 +753,9 @@ pub struct TimelineMetrics { tenant_id: String, timeline_id: String, pub reconstruct_time_histo: Histogram, + pub get_reconstruct_data_time_histo: Histogram, pub materialized_page_cache_hit_counter: GenericCounter, + pub materialized_page_cache_hit_upon_request_counter: GenericCounter, pub flush_time_histo: StorageTimeMetrics, pub compact_time_histo: StorageTimeMetrics, pub create_images_time_histo: StorageTimeMetrics, @@ -734,6 +766,7 @@ pub struct TimelineMetrics { pub last_record_gauge: IntGauge, pub wait_lsn_time_histo: Histogram, pub resident_physical_size_gauge: UIntGauge, + pub read_num_fs_layers: Histogram, /// copy of LayeredTimeline.current_logical_size pub current_logical_size_gauge: UIntGauge, pub num_persistent_files_created: IntCounter, @@ -753,6 +786,9 @@ impl TimelineMetrics { let reconstruct_time_histo = RECONSTRUCT_TIME .get_metric_with_label_values(&[&tenant_id, &timeline_id]) .unwrap(); + let get_reconstruct_data_time_histo = GET_RECONSTRUCT_DATA_TIME + .get_metric_with_label_values(&[&tenant_id, &timeline_id]) + .unwrap(); let materialized_page_cache_hit_counter = MATERIALIZED_PAGE_CACHE_HIT .get_metric_with_label_values(&[&tenant_id, &timeline_id]) .unwrap(); @@ -794,6 +830,12 @@ impl TimelineMetrics { let evictions = EVICTIONS .get_metric_with_label_values(&[&tenant_id, &timeline_id]) .unwrap(); + let read_num_fs_layers = READ_NUM_FS_LAYERS + .get_metric_with_label_values(&[&tenant_id, &timeline_id]) + .unwrap(); + let materialized_page_cache_hit_upon_request_counter = MATERIALIZED_PAGE_CACHE_HIT_DIRECT + .get_metric_with_label_values(&[&tenant_id, &timeline_id]) + .unwrap(); let evictions_with_low_residence_duration = evictions_with_low_residence_duration_builder.build(&tenant_id, &timeline_id); @@ -801,7 +843,9 @@ impl TimelineMetrics { tenant_id, timeline_id, reconstruct_time_histo, + get_reconstruct_data_time_histo, materialized_page_cache_hit_counter, + materialized_page_cache_hit_upon_request_counter, flush_time_histo, compact_time_histo, create_images_time_histo, @@ -819,6 +863,7 @@ impl TimelineMetrics { evictions_with_low_residence_duration: std::sync::RwLock::new( evictions_with_low_residence_duration, ), + read_num_fs_layers, } } } @@ -828,7 +873,9 @@ impl Drop for TimelineMetrics { let tenant_id = &self.tenant_id; let timeline_id = &self.timeline_id; let _ = RECONSTRUCT_TIME.remove_label_values(&[tenant_id, timeline_id]); + let _ = GET_RECONSTRUCT_DATA_TIME.remove_label_values(&[tenant_id, timeline_id]); let _ = MATERIALIZED_PAGE_CACHE_HIT.remove_label_values(&[tenant_id, timeline_id]); + let _ = MATERIALIZED_PAGE_CACHE_HIT_DIRECT.remove_label_values(&[tenant_id, timeline_id]); let _ = LAST_RECORD_LSN.remove_label_values(&[tenant_id, timeline_id]); let _ = WAIT_LSN_TIME.remove_label_values(&[tenant_id, timeline_id]); let _ = RESIDENT_PHYSICAL_SIZE.remove_label_values(&[tenant_id, timeline_id]); @@ -836,6 +883,8 @@ impl Drop for TimelineMetrics { let _ = NUM_PERSISTENT_FILES_CREATED.remove_label_values(&[tenant_id, timeline_id]); let _ = PERSISTENT_BYTES_WRITTEN.remove_label_values(&[tenant_id, timeline_id]); let _ = EVICTIONS.remove_label_values(&[tenant_id, timeline_id]); + let _ = READ_NUM_FS_LAYERS.remove_label_values(&[tenant_id, timeline_id]); + 
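// The TimelineMetrics changes above follow a consistent lifecycle: resolve each
// labelled child once at construction, and remove the label set on Drop so the
// exporter stops emitting series for deleted tenants and timelines. A reduced
// sketch of that lifecycle, assuming the prometheus crate directly (the
// pageserver goes through its own `metrics` wrapper; all names here are
// illustrative):
use once_cell::sync::Lazy;
use prometheus::{register_histogram_vec, Histogram, HistogramVec};

static EXAMPLE_SECONDS: Lazy<HistogramVec> = Lazy::new(|| {
    register_histogram_vec!(
        "example_seconds",
        "Example per-timeline histogram",
        &["tenant_id", "timeline_id"]
    )
    .expect("failed to define a metric")
});

struct ExampleMetrics {
    tenant_id: String,
    timeline_id: String,
    histo: Histogram,
}

impl ExampleMetrics {
    fn new(tenant_id: String, timeline_id: String) -> Self {
        // Resolve the labelled child once; looking it up per observation is slower.
        let histo = EXAMPLE_SECONDS
            .get_metric_with_label_values(&[&tenant_id, &timeline_id])
            .unwrap();
        ExampleMetrics {
            tenant_id,
            timeline_id,
            histo,
        }
    }

    fn observe_seconds(&self, secs: f64) {
        self.histo.observe(secs);
    }
}

impl Drop for ExampleMetrics {
    fn drop(&mut self) {
        // Without this, stale series for dead timelines linger in the registry forever.
        let _ = EXAMPLE_SECONDS.remove_label_values(&[&self.tenant_id, &self.timeline_id]);
    }
}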
self.evictions_with_low_residence_duration .write() .unwrap() diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index 03799553a0..d25463fe3e 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -50,7 +50,9 @@ use crate::import_datadir::import_wal_from_tar; use crate::metrics::{LIVE_CONNECTIONS_COUNT, SMGR_QUERY_TIME}; use crate::task_mgr; use crate::task_mgr::TaskKind; +use crate::tenant; use crate::tenant::mgr; +use crate::tenant::mgr::GetTenantError; use crate::tenant::{Tenant, Timeline}; use crate::trace::Tracer; @@ -1150,7 +1152,9 @@ enum GetActiveTenantError { wait_time: Duration, }, #[error(transparent)] - Other(#[from] anyhow::Error), + NotFound(GetTenantError), + #[error(transparent)] + WaitTenantActive(tenant::WaitToBecomeActiveError), } impl From for QueryError { @@ -1159,7 +1163,8 @@ impl From for QueryError { GetActiveTenantError::WaitForActiveTimeout { .. } => QueryError::Disconnected( ConnectionError::Io(io::Error::new(io::ErrorKind::TimedOut, e.to_string())), ), - GetActiveTenantError::Other(e) => QueryError::Other(e), + GetActiveTenantError::WaitTenantActive(e) => QueryError::Other(anyhow::Error::new(e)), + GetActiveTenantError::NotFound(e) => QueryError::Other(anyhow::Error::new(e)), } } } @@ -1175,13 +1180,16 @@ async fn get_active_tenant_with_timeout( ) -> Result, GetActiveTenantError> { let tenant = match mgr::get_tenant(tenant_id, false).await { Ok(tenant) => tenant, - Err(e) => return Err(GetActiveTenantError::Other(e.into())), + Err(e @ GetTenantError::NotFound(_)) => return Err(GetActiveTenantError::NotFound(e)), + Err(GetTenantError::NotActive(_)) => { + unreachable!("we're calling get_tenant with active=false") + } }; let wait_time = Duration::from_secs(30); match tokio::time::timeout(wait_time, tenant.wait_to_become_active()).await { Ok(Ok(())) => Ok(tenant), // no .context(), the error message is good enough and some tests depend on it - Ok(Err(wait_error)) => Err(GetActiveTenantError::Other(wait_error)), + Ok(Err(e)) => Err(GetActiveTenantError::WaitTenantActive(e)), Err(_) => { let latest_state = tenant.current_state(); if latest_state == TenantState::Active { @@ -1196,13 +1204,35 @@ async fn get_active_tenant_with_timeout( } } +#[derive(Debug, thiserror::Error)] +enum GetActiveTimelineError { + #[error(transparent)] + Tenant(GetActiveTenantError), + #[error(transparent)] + Timeline(anyhow::Error), +} + +impl From for QueryError { + fn from(e: GetActiveTimelineError) -> Self { + match e { + GetActiveTimelineError::Tenant(e) => e.into(), + GetActiveTimelineError::Timeline(e) => QueryError::Other(e), + } + } +} + /// Shorthand for getting a reference to a Timeline of an Active tenant. 
async fn get_active_tenant_timeline( tenant_id: TenantId, timeline_id: TimelineId, ctx: &RequestContext, -) -> Result, GetActiveTenantError> { - let tenant = get_active_tenant_with_timeout(tenant_id, ctx).await?; - let timeline = tenant.get_timeline(timeline_id, true).await?; +) -> Result, GetActiveTimelineError> { + let tenant = get_active_tenant_with_timeout(tenant_id, ctx) + .await + .map_err(GetActiveTimelineError::Tenant)?; + let timeline = tenant + .get_timeline(timeline_id, true) + .await + .map_err(GetActiveTimelineError::Timeline)?; Ok(timeline) } diff --git a/pageserver/src/task_mgr.rs b/pageserver/src/task_mgr.rs index 82aebc6c07..4df0e4e6f2 100644 --- a/pageserver/src/task_mgr.rs +++ b/pageserver/src/task_mgr.rs @@ -476,18 +476,35 @@ pub async fn shutdown_tasks( && (timeline_id.is_none() || task_mut.timeline_id == timeline_id) { task.cancel.cancel(); - victim_tasks.push(Arc::clone(task)); + victim_tasks.push(( + Arc::clone(task), + task.kind, + task_mut.tenant_id, + task_mut.timeline_id, + )); } } } - for task in victim_tasks { + let log_all = kind.is_none() && tenant_id.is_none() && timeline_id.is_none(); + + for (task, task_kind, tenant_id, timeline_id) in victim_tasks { let join_handle = { let mut task_mut = task.mutable.lock().unwrap(); task_mut.join_handle.take() }; if let Some(mut join_handle) = join_handle { + if log_all { + if tenant_id.is_none() { + // there are quite few of these + info!(name = task.name, kind = ?task_kind, "stopping global task"); + } else { + // warn to catch these in tests; there shouldn't be any + warn!(name = task.name, tenant_id = ?tenant_id, timeline_id = ?timeline_id, kind = ?task_kind, "stopping left-over"); + } + } let completed = tokio::select! { + biased; _ = &mut join_handle => { true }, _ = tokio::time::sleep(std::time::Duration::from_secs(1)) => { // allow some time to elapse before logging to cut down the number of log diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 7f589cb971..94376b2ac8 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -20,6 +20,7 @@ use storage_broker::BrokerClientChannel; use tokio::sync::watch; use tokio::task::JoinSet; use tracing::*; +use utils::completion; use utils::crashsafe::path_with_suffix_extension; use std::cmp::min; @@ -63,6 +64,7 @@ use crate::tenant::remote_timeline_client::PersistIndexPartWithDeletedFlagError; use crate::tenant::storage_layer::DeltaLayer; use crate::tenant::storage_layer::ImageLayer; use crate::tenant::storage_layer::Layer; +use crate::InitializationOrder; use crate::virtual_file::VirtualFile; use crate::walredo::PostgresRedoManager; @@ -77,7 +79,7 @@ use utils::{ lsn::{Lsn, RecordLsn}, }; -mod blob_io; +pub mod blob_io; pub mod block_io; pub mod disk_btree; pub(crate) mod ephemeral_file; @@ -268,7 +270,7 @@ impl UninitializedTimeline<'_> { let tl = self .initialize_with_lock(ctx, &mut timelines, false) .await?; - tl.activate(broker_client, ctx); + tl.activate(broker_client, None, ctx); Ok(tl) } @@ -449,11 +451,48 @@ pub enum DeleteTimelineError { Other(#[from] anyhow::Error), } +pub enum SetStoppingError { + AlreadyStopping, + Broken, +} + struct RemoteStartupData { index_part: IndexPart, remote_metadata: TimelineMetadata, } +#[derive(Debug, thiserror::Error)] +pub(crate) enum WaitToBecomeActiveError { + WillNotBecomeActive { + tenant_id: TenantId, + state: TenantState, + }, + TenantDropped { + tenant_id: TenantId, + }, +} + +impl std::fmt::Display for WaitToBecomeActiveError { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> 
std::fmt::Result { + match self { + WaitToBecomeActiveError::WillNotBecomeActive { tenant_id, state } => { + write!( + f, + "Tenant {} will not become active. Current state: {:?}", + tenant_id, state + ) + } + WaitToBecomeActiveError::TenantDropped { tenant_id } => { + write!(f, "Tenant {tenant_id} will not become active (dropped)") + } + } + } +} + +pub(crate) enum ShutdownError { + AlreadyStopping, +} + impl Tenant { /// Yet another helper for timeline initialization. /// Contains the common part of `load_local_timeline` and `load_remote_timeline`. @@ -474,6 +513,7 @@ impl Tenant { local_metadata: Option, ancestor: Option>, first_save: bool, + init_order: Option<&InitializationOrder>, ctx: &RequestContext, ) -> anyhow::Result<()> { let tenant_id = self.tenant_id; @@ -499,6 +539,7 @@ impl Tenant { up_to_date_metadata, ancestor.clone(), remote_client, + init_order, )?; let timeline = UninitializedTimeline { @@ -527,6 +568,7 @@ impl Tenant { up_to_date_metadata, ancestor.clone(), None, + None, ) .with_context(|| { format!("creating broken timeline data for {tenant_id}/{timeline_id}") @@ -622,16 +664,17 @@ impl Tenant { "attach tenant", false, async move { - let doit = async { - tenant_clone.attach(&ctx).await?; - tenant_clone.activate(broker_client, &ctx).await?; - anyhow::Ok(()) - }; - match doit.await { - Ok(_) => {} + match tenant_clone.attach(&ctx).await { + Ok(()) => { + info!("attach finished, activating"); + tenant_clone.activate(broker_client, None, &ctx).await; + } Err(e) => { - tenant_clone.set_broken(e.to_string()).await; - error!("error attaching tenant: {:?}", e); + error!("attach failed, setting tenant state to Broken: {:?}", e); + tenant_clone.state.send_modify(|state| { + assert_eq!(*state, TenantState::Attaching, "the attach task owns the tenant state until activation is complete"); + *state = TenantState::broken_from_reason(e.to_string()); + }); } } Ok(()) @@ -648,6 +691,8 @@ impl Tenant { /// /// Background task that downloads all data for a tenant and brings it to Active state. /// + /// No background tasks are started as part of this routine. + /// async fn attach(self: &Arc, ctx: &RequestContext) -> anyhow::Result<()> { debug_assert_current_span_has_tenant_id(); @@ -822,6 +867,7 @@ impl Tenant { local_metadata, ancestor, true, + None, ctx, ) .await @@ -847,7 +893,6 @@ impl Tenant { )) } - /// /// Load a tenant that's available on local disk /// /// This is used at pageserver startup, to rebuild the in-memory @@ -857,15 +902,17 @@ impl Tenant { /// /// If the loading fails for some reason, the Tenant will go into Broken /// state. - /// - #[instrument(skip(conf, remote_storage, ctx), fields(tenant_id=%tenant_id))] + #[instrument(skip_all, fields(tenant_id=%tenant_id))] pub fn spawn_load( conf: &'static PageServerConf, tenant_id: TenantId, broker_client: storage_broker::BrokerClientChannel, remote_storage: Option, + init_order: Option, ctx: &RequestContext, ) -> Arc { + debug_assert_current_span_has_tenant_id(); + let tenant_conf = match Self::load_tenant_config(conf, tenant_id) { Ok(conf) => conf, Err(e) => { @@ -897,20 +944,27 @@ impl Tenant { "initial tenant load", false, async move { - let doit = async { - tenant_clone.load(&ctx).await?; - tenant_clone.activate(broker_client, &ctx).await?; - anyhow::Ok(()) - }; - match doit.await { - Ok(()) => {} + let mut init_order = init_order; + + // take the completion because initial tenant loading will complete when all of + // these tasks complete. 
+ let _completion = init_order.as_mut().and_then(|x| x.initial_tenant_load.take()); + + match tenant_clone.load(init_order.as_ref(), &ctx).await { + Ok(()) => { + debug!("load finished, activating"); + let background_jobs_can_start = init_order.as_ref().map(|x| &x.background_jobs_can_start); + tenant_clone.activate(broker_client, background_jobs_can_start, &ctx).await; + } Err(err) => { - tenant_clone.set_broken(err.to_string()).await; - error!("could not load tenant {tenant_id}: {err:?}"); + error!("load failed, setting tenant state to Broken: {err:?}"); + tenant_clone.state.send_modify(|state| { + assert_eq!(*state, TenantState::Loading, "the loading task owns the tenant state until activation is complete"); + *state = TenantState::broken_from_reason(err.to_string()); + }); } } - info!("initial load for tenant {tenant_id} finished!"); - Ok(()) + Ok(()) } .instrument({ let span = tracing::info_span!(parent: None, "load", tenant_id=%tenant_id); @@ -919,8 +973,6 @@ impl Tenant { }), ); - info!("spawned load into background"); - tenant } @@ -928,10 +980,15 @@ impl Tenant { /// Background task to load in-memory data structures for this tenant, from /// files on disk. Used at pageserver startup. /// - async fn load(self: &Arc, ctx: &RequestContext) -> anyhow::Result<()> { + /// No background tasks are started as part of this routine. + async fn load( + self: &Arc, + init_order: Option<&InitializationOrder>, + ctx: &RequestContext, + ) -> anyhow::Result<()> { debug_assert_current_span_has_tenant_id(); - info!("loading tenant task"); + debug!("loading tenant task"); utils::failpoint_sleep_millis_async!("before-loading-tenant"); @@ -941,112 +998,119 @@ impl Tenant { // // Scan the directory, peek into the metadata file of each timeline, and // collect a list of timelines and their ancestors. - let mut timelines_to_load: HashMap = HashMap::new(); - let timelines_dir = self.conf.timelines_path(&self.tenant_id); - for entry in std::fs::read_dir(&timelines_dir).with_context(|| { - format!( - "Failed to list timelines directory for tenant {}", - self.tenant_id - ) - })? { - let entry = entry.with_context(|| { - format!("cannot read timeline dir entry for {}", self.tenant_id) - })?; - let timeline_dir = entry.path(); + let tenant_id = self.tenant_id; + let conf = self.conf; + let span = info_span!("blocking"); - if crate::is_temporary(&timeline_dir) { - info!( - "Found temporary timeline directory, removing: {}", - timeline_dir.display() - ); - if let Err(e) = std::fs::remove_dir_all(&timeline_dir) { - error!( - "Failed to remove temporary directory '{}': {:?}", - timeline_dir.display(), - e + let sorted_timelines: Vec<(_, _)> = tokio::task::spawn_blocking(move || { + let _g = span.entered(); + let mut timelines_to_load: HashMap = HashMap::new(); + let timelines_dir = conf.timelines_path(&tenant_id); + + for entry in + std::fs::read_dir(&timelines_dir).context("list timelines directory for tenant")? 
+ { + let entry = entry.context("read timeline dir entry")?; + let timeline_dir = entry.path(); + + if crate::is_temporary(&timeline_dir) { + info!( + "Found temporary timeline directory, removing: {}", + timeline_dir.display() ); - } - } else if is_uninit_mark(&timeline_dir) { - let timeline_uninit_mark_file = &timeline_dir; - info!( - "Found an uninit mark file {}, removing the timeline and its uninit mark", - timeline_uninit_mark_file.display() - ); - let timeline_id = timeline_uninit_mark_file - .file_stem() - .and_then(OsStr::to_str) - .unwrap_or_default() - .parse::() - .with_context(|| { - format!( + if let Err(e) = std::fs::remove_dir_all(&timeline_dir) { + error!( + "Failed to remove temporary directory '{}': {:?}", + timeline_dir.display(), + e + ); + } + } else if is_uninit_mark(&timeline_dir) { + let timeline_uninit_mark_file = &timeline_dir; + info!( + "Found an uninit mark file {}, removing the timeline and its uninit mark", + timeline_uninit_mark_file.display() + ); + let timeline_id = timeline_uninit_mark_file + .file_stem() + .and_then(OsStr::to_str) + .unwrap_or_default() + .parse::() + .with_context(|| { + format!( "Could not parse timeline id out of the timeline uninit mark name {}", timeline_uninit_mark_file.display() ) - })?; - let timeline_dir = self.conf.timeline_path(&timeline_id, &self.tenant_id); - if let Err(e) = - remove_timeline_and_uninit_mark(&timeline_dir, timeline_uninit_mark_file) - { - error!("Failed to clean up uninit marked timeline: {e:?}"); - } - } else { - let timeline_id = timeline_dir - .file_name() - .and_then(OsStr::to_str) - .unwrap_or_default() - .parse::() - .with_context(|| { - format!( - "Could not parse timeline id out of the timeline dir name {}", - timeline_dir.display() - ) - })?; - let timeline_uninit_mark_file = self - .conf - .timeline_uninit_mark_file_path(self.tenant_id, timeline_id); - if timeline_uninit_mark_file.exists() { - info!( - "Found an uninit mark file for timeline {}/{}, removing the timeline and its uninit mark", - self.tenant_id, timeline_id - ); + })?; + let timeline_dir = conf.timeline_path(&timeline_id, &tenant_id); if let Err(e) = - remove_timeline_and_uninit_mark(&timeline_dir, &timeline_uninit_mark_file) + remove_timeline_and_uninit_mark(&timeline_dir, timeline_uninit_mark_file) { error!("Failed to clean up uninit marked timeline: {e:?}"); } - continue; - } - - let file_name = entry.file_name(); - if let Ok(timeline_id) = - file_name.to_str().unwrap_or_default().parse::() - { - let metadata = load_metadata(self.conf, timeline_id, self.tenant_id) - .context("failed to load metadata")?; - timelines_to_load.insert(timeline_id, metadata); } else { - // A file or directory that doesn't look like a timeline ID - warn!( - "unexpected file or directory in timelines directory: {}", - file_name.to_string_lossy() - ); + let timeline_id = timeline_dir + .file_name() + .and_then(OsStr::to_str) + .unwrap_or_default() + .parse::() + .with_context(|| { + format!( + "Could not parse timeline id out of the timeline dir name {}", + timeline_dir.display() + ) + })?; + let timeline_uninit_mark_file = + conf.timeline_uninit_mark_file_path(tenant_id, timeline_id); + if timeline_uninit_mark_file.exists() { + info!( + %timeline_id, + "Found an uninit mark file, removing the timeline and its uninit mark", + ); + if let Err(e) = remove_timeline_and_uninit_mark( + &timeline_dir, + &timeline_uninit_mark_file, + ) { + error!("Failed to clean up uninit marked timeline: {e:?}"); + } + continue; + } + + let file_name = entry.file_name(); + if 
let Ok(timeline_id) = + file_name.to_str().unwrap_or_default().parse::() + { + let metadata = load_metadata(conf, timeline_id, tenant_id) + .context("failed to load metadata")?; + timelines_to_load.insert(timeline_id, metadata); + } else { + // A file or directory that doesn't look like a timeline ID + warn!( + "unexpected file or directory in timelines directory: {}", + file_name.to_string_lossy() + ); + } } } - } - // Sort the array of timeline IDs into tree-order, so that parent comes before - // all its children. - let sorted_timelines = tree_sort_timelines(timelines_to_load)?; + // Sort the array of timeline IDs into tree-order, so that parent comes before + // all its children. + tree_sort_timelines(timelines_to_load) + }) + .await + .context("load spawn_blocking") + .and_then(|res| res)?; + // FIXME original collect_timeline_files contained one more check: // 1. "Timeline has no ancestor and no layer files" for (timeline_id, local_metadata) in sorted_timelines { - self.load_local_timeline(timeline_id, local_metadata, ctx) + self.load_local_timeline(timeline_id, local_metadata, init_order, ctx) .await .with_context(|| format!("load local timeline {timeline_id}"))?; } - info!("Done"); + trace!("Done"); Ok(()) } @@ -1059,6 +1123,7 @@ impl Tenant { &self, timeline_id: TimelineId, local_metadata: TimelineMetadata, + init_order: Option<&InitializationOrder>, ctx: &RequestContext, ) -> anyhow::Result<()> { debug_assert_current_span_has_tenant_id(); @@ -1128,6 +1193,7 @@ impl Tenant { Some(local_metadata), ancestor, false, + init_order, ctx, ) .await @@ -1316,7 +1382,7 @@ impl Tenant { } }; - loaded_timeline.activate(broker_client, ctx); + loaded_timeline.activate(broker_client, None, ctx); if let Some(remote_client) = loaded_timeline.remote_client.as_ref() { // Wait for the upload of the 'index_part.json` file to finish, so that when we return @@ -1352,6 +1418,7 @@ impl Tenant { pitr: Duration, ctx: &RequestContext, ) -> anyhow::Result { + // there is a global allowed_error for this anyhow::ensure!( self.is_active(), "Cannot run GC iteration on inactive tenant" @@ -1395,31 +1462,70 @@ impl Tenant { Ok(()) } - /// Flush all in-memory data to disk. + /// Flush all in-memory data to disk and remote storage, if any. /// /// Used at graceful shutdown. - /// - pub async fn freeze_and_flush(&self) -> anyhow::Result<()> { - // Scan through the hashmap and collect a list of all the timelines, - // while holding the lock. Then drop the lock and actually perform the - // flushing. We don't want to block everything else while the - // flushing is performed. - let timelines_to_flush = { + async fn freeze_and_flush_on_shutdown(&self) { + let mut js = tokio::task::JoinSet::new(); + + // execute on each timeline on the JoinSet, join after. + let per_timeline = |timeline_id: TimelineId, timeline: Arc| { + async move { + debug_assert_current_span_has_tenant_and_timeline_id(); + + match timeline.freeze_and_flush().await { + Ok(()) => {} + Err(e) => { + warn!("failed to freeze and flush: {e:#}"); + return; + } + } + + let res = if let Some(client) = timeline.remote_client.as_ref() { + // if we did not wait for completion here, it might be our shutdown process + // didn't wait for remote uploads to complete at all, as new tasks can forever + // be spawned. + // + // what is problematic is the shutting down of RemoteTimelineClient, because + // obviously it does not make sense to stop while we wait for it, but what + // about corner cases like s3 suddenly hanging up? 
+ client.wait_completion().await + } else { + Ok(()) + }; + + if let Err(e) = res { + warn!("failed to await for frozen and flushed uploads: {e:#}"); + } + } + .instrument(tracing::info_span!("freeze_and_flush_on_shutdown", %timeline_id)) + }; + + { let timelines = self.timelines.lock().await; timelines .iter() - .map(|(_id, timeline)| Arc::clone(timeline)) - .collect::>() + .map(|(id, tl)| (*id, Arc::clone(tl))) + .for_each(|(timeline_id, timeline)| { + js.spawn(per_timeline(timeline_id, timeline)); + }) }; - for timeline in &timelines_to_flush { - timeline.freeze_and_flush().await?; + while let Some(res) = js.join_next().await { + match res { + Ok(()) => {} + Err(je) if je.is_cancelled() => unreachable!("no cancelling used"), + Err(je) if je.is_panic() => { /* logged already */ } + Err(je) => warn!("unexpected JoinError: {je:?}"), + } } - - Ok(()) } - /// Removes timeline-related in-memory data + /// Shuts down a timeline's tasks, removes its in-memory structures, and deletes its + /// data from disk. + /// + /// This doesn't currently delete all data from S3, but sets a flag in its + /// index_part.json file to mark it as deleted. pub async fn delete_timeline( &self, timeline_id: TimelineId, @@ -1429,7 +1535,11 @@ impl Tenant { // Transition the timeline into TimelineState::Stopping. // This should prevent new operations from starting. - let timeline = { + // + // Also grab the Timeline's delete_lock to prevent another deletion from starting. + let timeline; + let mut delete_lock_guard; + { let mut timelines = self.timelines.lock().await; // Ensure that there are no child timelines **attached to that pageserver**, @@ -1447,20 +1557,36 @@ impl Tenant { Entry::Vacant(_) => return Err(DeleteTimelineError::NotFound), }; - let timeline = Arc::clone(timeline_entry.get()); + timeline = Arc::clone(timeline_entry.get()); + + // Prevent two tasks from trying to delete the timeline at the same time. + // + // XXX: We should perhaps return an HTTP "202 Accepted" to signal that the caller + // needs to poll until the operation has finished. But for now, we return an + // error, because the control plane knows to retry errors. + delete_lock_guard = timeline.delete_lock.try_lock().map_err(|_| { + DeleteTimelineError::Other(anyhow::anyhow!( + "timeline deletion is already in progress" + )) + })?; + + // If another task finished the deletion just before we acquired the lock, + // return success. + if *delete_lock_guard { + return Ok(()); + } + timeline.set_state(TimelineState::Stopping); drop(timelines); - timeline - }; + } // Now that the Timeline is in Stopping state, request all the related tasks to // shut down. // - // NB: If you call delete_timeline multiple times concurrently, they will - // all go through the motions here. Make sure the code here is idempotent, - // and don't error out if some of the shutdown tasks have already been - // completed! + // NB: If this fails half-way through, and is retried, the retry will go through + // all the same steps again. Make sure the code here is idempotent, and don't + // error out if some of the shutdown tasks have already been completed! // Stop the walreceiver first. 
debug!("waiting for wal receiver to shutdown"); @@ -1501,6 +1627,10 @@ impl Tenant { // If we (now, or already) marked it successfully as deleted, we can proceed Ok(()) | Err(PersistIndexPartWithDeletedFlagError::AlreadyDeleted(_)) => (), // Bail out otherwise + // + // AlreadyInProgress shouldn't happen, because the 'delete_lock' prevents + // two tasks from performing the deletion at the same time. The first task + // that starts deletion should run it to completion. Err(e @ PersistIndexPartWithDeletedFlagError::AlreadyInProgress(_)) | Err(e @ PersistIndexPartWithDeletedFlagError::Other(_)) => { return Err(DeleteTimelineError::Other(anyhow::anyhow!(e))); @@ -1511,14 +1641,12 @@ impl Tenant { { // Grab the layer_removal_cs lock, and actually perform the deletion. // - // This lock prevents multiple concurrent delete_timeline calls from - // stepping on each other's toes, while deleting the files. It also - // prevents GC or compaction from running at the same time. + // This lock prevents prevents GC or compaction from running at the same time. + // The GC task doesn't register itself with the timeline it's operating on, + // so it might still be running even though we called `shutdown_tasks`. // // Note that there are still other race conditions between - // GC, compaction and timeline deletion. GC task doesn't - // register itself properly with the timeline it's - // operating on. See + // GC, compaction and timeline deletion. See // https://github.com/neondatabase/neon/issues/2671 // // No timeout here, GC & Compaction should be responsive to the @@ -1580,37 +1708,27 @@ impl Tenant { }); // Remove the timeline from the map. - let mut timelines = self.timelines.lock().await; - let children_exist = timelines - .iter() - .any(|(_, entry)| entry.get_ancestor_timeline_id() == Some(timeline_id)); - // XXX this can happen because `branch_timeline` doesn't check `TimelineState::Stopping`. - // We already deleted the layer files, so it's probably best to panic. - // (Ideally, above remove_dir_all is atomic so we don't see this timeline after a restart) - if children_exist { - panic!("Timeline grew children while we removed layer files"); + { + let mut timelines = self.timelines.lock().await; + + let children_exist = timelines + .iter() + .any(|(_, entry)| entry.get_ancestor_timeline_id() == Some(timeline_id)); + // XXX this can happen because `branch_timeline` doesn't check `TimelineState::Stopping`. + // We already deleted the layer files, so it's probably best to panic. + // (Ideally, above remove_dir_all is atomic so we don't see this timeline after a restart) + if children_exist { + panic!("Timeline grew children while we removed layer files"); + } + + timelines.remove(&timeline_id).expect( + "timeline that we were deleting was concurrently removed from 'timelines' map", + ); } - let removed_timeline = timelines.remove(&timeline_id); - if removed_timeline.is_none() { - // This can legitimately happen if there's a concurrent call to this function. - // T1 T2 - // lock - // unlock - // lock - // unlock - // remove files - // lock - // remove from map - // unlock - // return - // remove files - // lock - // remove from map observes empty map - // unlock - // return - debug!("concurrent call to this function won the race"); - } - drop(timelines); + + // All done! 
Mark the deletion as completed and release the delete_lock + *delete_lock_guard = true; + drop(delete_lock_guard); Ok(()) } @@ -1624,49 +1742,36 @@ impl Tenant { } /// Changes tenant status to active, unless shutdown was already requested. + /// + /// `background_jobs_can_start` is an optional barrier set to a value during pageserver startup + /// to delay background jobs. Background jobs can be started right away when None is given. async fn activate( - &self, + self: &Arc, broker_client: BrokerClientChannel, + background_jobs_can_start: Option<&completion::Barrier>, ctx: &RequestContext, - ) -> anyhow::Result<()> { + ) { debug_assert_current_span_has_tenant_id(); - let mut result = Ok(()); let mut activating = false; self.state.send_modify(|current_state| { + use pageserver_api::models::ActivatingFrom; match &*current_state { - TenantState::Activating => { - // activate() was called on an already Activating tenant. Shouldn't happen. - result = Err(anyhow::anyhow!("Tenant is already activating")); + TenantState::Activating(_) | TenantState::Active | TenantState::Broken { .. } | TenantState::Stopping => { + panic!("caller is responsible for calling activate() only on Loading / Attaching tenants, got {state:?}", state = current_state); } - TenantState::Active => { - // activate() was called on an already Active tenant. Shouldn't happen. - result = Err(anyhow::anyhow!("Tenant is already active")); + TenantState::Loading => { + *current_state = TenantState::Activating(ActivatingFrom::Loading); } - TenantState::Broken { reason, .. } => { - // This shouldn't happen either - result = Err(anyhow::anyhow!( - "Could not activate tenant because it is in broken state due to: {reason}", - )); - } - TenantState::Stopping => { - // The tenant was detached, or system shutdown was requested, while we were - // loading or attaching the tenant. - info!("Tenant is already in Stopping state, skipping activation"); - } - TenantState::Loading | TenantState::Attaching => { - *current_state = TenantState::Activating; - debug!(tenant_id = %self.tenant_id, "Activating tenant"); - activating = true; - // Continue outside the closure. We need to grab timelines.lock() - // and we plan to turn it into a tokio::sync::Mutex in a future patch. + TenantState::Attaching => { + *current_state = TenantState::Activating(ActivatingFrom::Attaching); } } + debug!(tenant_id = %self.tenant_id, "Activating tenant"); + activating = true; + // Continue outside the closure. We need to grab timelines.lock() + // and we plan to turn it into a tokio::sync::Mutex in a future patch. }); - if let Err(e) = result { - assert!(!activating, "transition into Activating is infallible"); - return Err(e); - } if activating { let timelines_accessor = self.timelines.lock().await; @@ -1676,18 +1781,18 @@ impl Tenant { // Spawn gc and compaction loops. The loops will shut themselves // down when they notice that the tenant is inactive. 
-        tasks::start_background_loops(self.tenant_id);
+        tasks::start_background_loops(self, background_jobs_can_start);
 
         let mut activated_timelines = 0;
 
         for timeline in not_broken_timelines {
-            timeline.activate(broker_client.clone(), ctx);
+            timeline.activate(broker_client.clone(), background_jobs_can_start, ctx);
             activated_timelines += 1;
         }
 
         self.state.send_modify(move |current_state| {
             assert!(
-                *current_state == TenantState::Activating,
+                matches!(current_state, TenantState::Activating(_)),
                 "set_stopping and set_broken wait for us to leave Activating state",
             );
             *current_state = TenantState::Active;
@@ -1706,96 +1811,188 @@ impl Tenant {
                 "activation attempt finished"
             );
         });
+        }
+    }
+
+    /// Shutdown the tenant and join all of the spawned tasks.
+    ///
+    /// The method caters for all use-cases:
+    /// - pageserver shutdown (freeze_and_flush == true)
+    /// - detach + ignore (freeze_and_flush == false)
+    ///
+    /// This will attempt to shut down even if the tenant is broken.
+    pub(crate) async fn shutdown(&self, freeze_and_flush: bool) -> Result<(), ShutdownError> {
+        debug_assert_current_span_has_tenant_id();
+        // Set tenant (and its timelines) to Stopping state.
+        //
+        // Since we can only transition into Stopping state after activation is complete,
+        // run it in a JoinSet so all tenants have a chance to stop before we get SIGKILLed.
+        //
+        // Transitioning tenants to Stopping state has a couple of non-obvious side effects:
+        // 1. Lock out any new requests to the tenants.
+        // 2. Signal cancellation to WAL receivers (we wait on it below).
+        // 3. Signal cancellation for other tenant background loops.
+        // 4. ???
+        //
+        // The waiting for the cancellation is not done uniformly.
+        // We certainly wait for WAL receivers to shut down.
+        // That is necessary so that no new data comes in before the freeze_and_flush.
+        // But the tenant background loops are joined-on in our caller.
+        // It's messed up.
+        // We just ignore the failure to stop.
+        match self.set_stopping().await {
+            Ok(()) => {}
+            Err(SetStoppingError::Broken) => {
+                // assume that this is acceptable
+            }
+            Err(SetStoppingError::AlreadyStopping) => return Err(ShutdownError::AlreadyStopping),
+        };
+
+        if freeze_and_flush {
+            // Walreceivers have already begun to shut down with TenantState::Stopping,
+            // but we need to wait for them to stop.
+            task_mgr::shutdown_tasks(
+                Some(TaskKind::WalReceiverManager),
+                Some(self.tenant_id),
+                None,
+            )
+            .await;
+
+            // This will wait for uploads to complete; in the past, it was done outside tenant
+            // shutdown in pageserver::shutdown_pageserver.
+            self.freeze_and_flush_on_shutdown().await;
+        }
+
+        // Shut down all tenant and timeline tasks: gc, compaction, page service.
+        // No new tasks will be started for this tenant because it's in `Stopping` state.
+        //
+        // This will additionally shut down and await all timeline tasks.
+        task_mgr::shutdown_tasks(None, Some(self.tenant_id), None).await;
+        Ok(())
     }
 
     /// Change tenant status to Stopping, to mark that it is being shut down.
     ///
+    /// This function waits for the tenant to become active if it isn't already, before transitioning it into Stopping state.
+    ///
     /// This function is not cancel-safe!
-    pub async fn set_stopping(&self) {
-        // Get the rx before checking state inside send_if_modified.
-        // This way, when we later rx.changed().await, we won't have missed
-        // any state changes.
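// The rewritten set_stopping/set_broken below lean on two tokio watch-channel
// operations: Receiver::wait_for parks until activation has finished, and
// Sender::send_if_modified performs the transition exactly once even when
// several tasks race. A self-contained sketch of that pattern with a toy State
// enum (assumes a tokio version that provides Receiver::wait_for):
use tokio::sync::watch;

#[derive(Clone, Debug, PartialEq)]
enum State {
    Activating,
    Active,
    Stopping,
}

/// Returns true iff this caller won the race to move Active -> Stopping.
async fn stop_once(tx: &watch::Sender<State>) -> bool {
    let mut rx = tx.subscribe();
    // Park until activation is over; the predicate is re-checked on every change.
    rx.wait_for(|s| *s != State::Activating)
        .await
        .expect("sender is kept alive by the caller");
    tx.send_if_modified(|s| {
        if *s == State::Active {
            *s = State::Stopping;
            true // value changed: receivers get notified
        } else {
            false // lost the race (already Stopping): leave the value untouched
        }
    })
}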
+ async fn set_stopping(&self) -> Result<(), SetStoppingError> { let mut rx = self.state.subscribe(); - while *rx.borrow() == TenantState::Activating { - rx.changed() - .await - .expect("we're a method on Tenant, so, we're keeping self.state alive here"); - } - let mut stopping = false; - self.state.send_modify(|current_state| { - match current_state { - TenantState::Activating => unreachable!("we checked above and never transition back into Activating state"), - // FIXME: If the tenant is still Loading or Attaching, new timelines - // might be created after this. That's harmless, as the Timelines - // won't be accessible to anyone, when the Tenant is in Stopping - // state. - TenantState::Active | TenantState::Loading | TenantState::Attaching => { - *current_state = TenantState::Stopping; - stopping = true; - // Continue outside the closure. We need to grab timelines.lock() - // and we plan to turn it into a tokio::sync::Mutex in a future patch. - } - TenantState::Broken { reason, .. } => { - info!("Cannot set tenant to Stopping state, it is in Broken state due to: {reason}"); - } - TenantState::Stopping => { - // The tenant was detached, or system shutdown was requested, while we were - // loading or attaching the tenant. - info!("Tenant is already in Stopping state"); - } + + // cannot stop before we're done activating, so wait out until we're done activating + rx.wait_for(|state| match state { + TenantState::Activating(_) | TenantState::Loading | TenantState::Attaching => { + info!( + "waiting for {} to turn Active|Broken|Stopping", + <&'static str>::from(state) + ); + false + } + TenantState::Active | TenantState::Broken { .. } | TenantState::Stopping {} => true, + }) + .await + .expect("cannot drop self.state while on a &self method"); + + // we now know we're done activating, let's see whether this task is the winner to transition into Stopping + let mut err = None; + let stopping = self.state.send_if_modified(|current_state| match current_state { + TenantState::Activating(_) | TenantState::Loading | TenantState::Attaching => { + unreachable!("we ensured above that we're done with activation, and, there is no re-activation") + } + TenantState::Active => { + // FIXME: due to time-of-check vs time-of-use issues, it can happen that new timelines + // are created after the transition to Stopping. That's harmless, as the Timelines + // won't be accessible to anyone afterwards, because the Tenant is in Stopping state. + *current_state = TenantState::Stopping; + // Continue stopping outside the closure. We need to grab timelines.lock() + // and we plan to turn it into a tokio::sync::Mutex in a future patch. + true + } + TenantState::Broken { reason, .. 
+            } => {
+                info!(
+                    "Cannot set tenant to Stopping state, it is in Broken state due to: {reason}"
+                );
+                err = Some(SetStoppingError::Broken);
+                false
+            }
+            TenantState::Stopping => {
+                info!("Tenant is already in Stopping state");
+                err = Some(SetStoppingError::AlreadyStopping);
+                false
+            }
         });
-
-        if stopping {
-            let timelines_accessor = self.timelines.lock().await;
-            let not_broken_timelines = timelines_accessor
-                .values()
-                .filter(|timeline| timeline.current_state() != TimelineState::Broken);
-            for timeline in not_broken_timelines {
-                timeline.set_state(TimelineState::Stopping);
-            }
+        match (stopping, err) {
+            (true, None) => {} // continue
+            (false, Some(err)) => return Err(err),
+            (true, Some(_)) => unreachable!(
+                "send_if_modified closure must error out if not transitioning to Stopping"
+            ),
+            (false, None) => unreachable!(
+                "send_if_modified closure must return true if transitioning to Stopping"
+            ),
         }
+
+        let timelines_accessor = self.timelines.lock().await;
+        let not_broken_timelines = timelines_accessor
+            .values()
+            .filter(|timeline| timeline.current_state() != TimelineState::Broken);
+        for timeline in not_broken_timelines {
+            timeline.set_state(TimelineState::Stopping);
+        }
+        Ok(())
     }
 
-    pub async fn set_broken(&self, reason: String) {
+    /// Method for tenant::mgr to transition us into Broken state in case of a late failure in
+    /// `remove_tenant_from_memory`
+    ///
+    /// This function waits for the tenant to become active if it isn't already, before transitioning it into Broken state.
+    ///
+    /// In tests, we also use this to set tenants to Broken state on purpose.
+    pub(crate) async fn set_broken(&self, reason: String) {
         let mut rx = self.state.subscribe();
-        while *rx.borrow() == TenantState::Activating {
-            rx.changed()
-                .await
-                .expect("we're a method on Tenant, so, we're keeping self.state alive here");
-        }
+
+        // The load & attach routines own the tenant state until it has reached `Active`.
+        // So, wait until it's done.
+        rx.wait_for(|state| match state {
+            TenantState::Activating(_) | TenantState::Loading | TenantState::Attaching => {
+                info!(
+                    "waiting for {} to turn Active|Broken|Stopping",
+                    <&'static str>::from(state)
+                );
+                false
+            }
+            TenantState::Active | TenantState::Broken { .. } | TenantState::Stopping {} => true,
+        })
+        .await
+        .expect("cannot drop self.state while on a &self method");
+
+        // we now know we're done activating, let's see whether this task is the winner to transition into Broken
         self.state.send_modify(|current_state| {
             match *current_state {
-                TenantState::Activating => {
-                    unreachable!("we checked above and never transition back into Activating state")
+                TenantState::Activating(_) | TenantState::Loading | TenantState::Attaching => {
+                    unreachable!("we ensured above that we're done with activation, and, there is no re-activation")
                 }
                 TenantState::Active => {
-                    // Broken tenants can currently only used for fatal errors that happen
-                    // while loading or attaching a tenant. A tenant that has already been
-                    // activated should never be marked as broken. We cope with it the best
-                    // we can, but it shouldn't happen.
-                    warn!("Changing Active tenant to Broken state, reason: {}", reason);
-                    *current_state = TenantState::broken_from_reason(reason);
+                    if cfg!(feature = "testing") {
+                        warn!("Changing Active tenant to Broken state, reason: {}", reason);
+                        *current_state = TenantState::broken_from_reason(reason);
+                    } else {
+                        unreachable!("not allowed to call set_broken on Active tenants in non-testing builds")
+                    }
                 }
                 TenantState::Broken { ..
} => { - // This shouldn't happen either warn!("Tenant is already in Broken state"); } + // This is the only "expected" path, any other path is a bug. TenantState::Stopping => { - // This shouldn't happen either warn!( "Marking Stopping tenant as Broken state, reason: {}", reason ); *current_state = TenantState::broken_from_reason(reason); } - TenantState::Loading | TenantState::Attaching => { - info!("Setting tenant as Broken state, reason: {}", reason); - *current_state = TenantState::broken_from_reason(reason); - } - } + } }); } @@ -1803,25 +2000,30 @@ impl Tenant { self.state.subscribe() } - pub async fn wait_to_become_active(&self) -> anyhow::Result<()> { + pub(crate) async fn wait_to_become_active(&self) -> Result<(), WaitToBecomeActiveError> { let mut receiver = self.state.subscribe(); loop { let current_state = receiver.borrow_and_update().clone(); match current_state { - TenantState::Loading | TenantState::Attaching | TenantState::Activating => { + TenantState::Loading | TenantState::Attaching | TenantState::Activating(_) => { // in these states, there's a chance that we can reach ::Active - receiver.changed().await?; + receiver.changed().await.map_err( + |_e: tokio::sync::watch::error::RecvError| { + WaitToBecomeActiveError::TenantDropped { + tenant_id: self.tenant_id, + } + }, + )?; } TenantState::Active { .. } => { return Ok(()); } TenantState::Broken { .. } | TenantState::Stopping => { // There's no chance the tenant can transition back into ::Active - anyhow::bail!( - "Tenant {} will not become active. Current state: {:?}", - self.tenant_id, - ¤t_state, - ); + return Err(WaitToBecomeActiveError::WillNotBecomeActive { + tenant_id: self.tenant_id, + state: current_state, + }); } } } @@ -1975,6 +2177,7 @@ impl Tenant { new_metadata: &TimelineMetadata, ancestor: Option>, remote_client: Option, + init_order: Option<&InitializationOrder>, ) -> anyhow::Result> { if let Some(ancestor_timeline_id) = new_metadata.ancestor_timeline() { anyhow::ensure!( @@ -1983,6 +2186,9 @@ impl Tenant { ) } + let initial_logical_size_can_start = init_order.map(|x| &x.initial_logical_size_can_start); + let initial_logical_size_attempt = init_order.map(|x| &x.initial_logical_size_attempt); + let pg_version = new_metadata.pg_version(); Ok(Timeline::new( self.conf, @@ -1994,6 +2200,8 @@ impl Tenant { Arc::clone(&self.walredo_mgr), remote_client, pg_version, + initial_logical_size_can_start.cloned(), + initial_logical_size_attempt.cloned(), )) } @@ -2675,7 +2883,7 @@ impl Tenant { remote_client: Option, ) -> anyhow::Result> { let timeline_data = self - .create_timeline_data(new_timeline_id, new_metadata, ancestor, remote_client) + .create_timeline_data(new_timeline_id, new_metadata, ancestor, remote_client, None) .context("Failed to create timeline data structure")?; crashsafe::create_dir_all(timeline_path).context("Failed to create timeline directory")?; @@ -3134,6 +3342,7 @@ pub mod harness { evictions_low_residence_duration_metric_threshold: Some( tenant_conf.evictions_low_residence_duration_metric_threshold, ), + gc_feedback: Some(tenant_conf.gc_feedback), } } } @@ -3242,7 +3451,7 @@ pub mod harness { timelines_to_load.insert(timeline_id, timeline_metadata); } tenant - .load(ctx) + .load(None, ctx) .instrument(info_span!("try_load", tenant_id=%self.tenant_id)) .await?; tenant.state.send_replace(TenantState::Active); diff --git a/pageserver/src/tenant/config.rs b/pageserver/src/tenant/config.rs index 50de316bc4..80d153661a 100644 --- a/pageserver/src/tenant/config.rs +++ b/pageserver/src/tenant/config.rs 
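// The config.rs hunk below extends the per-tenant override pattern with
// `gc_feedback`: every optional field in TenantConfOpt falls back to the global
// default when unset. A reduced sketch of that merge rule (struct and field
// names here are illustrative, not the real definitions):
#[derive(Clone, Copy)]
struct GlobalConf {
    gc_feedback: bool,
}

#[derive(Default, Clone, Copy)]
struct ConfOpt {
    gc_feedback: Option<bool>,
}

impl ConfOpt {
    fn merge(&self, global: &GlobalConf) -> GlobalConf {
        GlobalConf {
            // Some(x) is an explicit per-tenant override; None inherits the global value.
            gc_feedback: self.gc_feedback.unwrap_or(global.gc_feedback),
        }
    }
}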
@@ -99,6 +99,7 @@ pub struct TenantConf { // See the corresponding metric's help string. #[serde(with = "humantime_serde")] pub evictions_low_residence_duration_metric_threshold: Duration, + pub gc_feedback: bool, } /// Same as TenantConf, but this struct preserves the information about @@ -175,6 +176,10 @@ pub struct TenantConfOpt { #[serde(with = "humantime_serde")] #[serde(default)] pub evictions_low_residence_duration_metric_threshold: Option, + + #[serde(skip_serializing_if = "Option::is_none")] + #[serde(default)] + pub gc_feedback: Option, } #[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)] @@ -242,6 +247,7 @@ impl TenantConfOpt { evictions_low_residence_duration_metric_threshold: self .evictions_low_residence_duration_metric_threshold .unwrap_or(global_conf.evictions_low_residence_duration_metric_threshold), + gc_feedback: self.gc_feedback.unwrap_or(global_conf.gc_feedback), } } } @@ -278,6 +284,7 @@ impl Default for TenantConf { DEFAULT_EVICTIONS_LOW_RESIDENCE_DURATION_METRIC_THRESHOLD, ) .expect("cannot parse default evictions_low_residence_duration_metric_threshold"), + gc_feedback: false, } } } @@ -372,6 +379,7 @@ impl TryFrom<&'_ models::TenantConfig> for TenantConfOpt { ))?, ); } + tenant_conf.gc_feedback = request_data.gc_feedback; Ok(tenant_conf) } diff --git a/pageserver/src/tenant/layer_map/historic_layer_coverage.rs b/pageserver/src/tenant/layer_map/historic_layer_coverage.rs index b63c361314..49dcbc63c2 100644 --- a/pageserver/src/tenant/layer_map/historic_layer_coverage.rs +++ b/pageserver/src/tenant/layer_map/historic_layer_coverage.rs @@ -204,6 +204,35 @@ fn test_off_by_one() { assert_eq!(version.image_coverage.query(5), None); } +/// White-box regression test, checking for incorrect removal of node at key.end +#[test] +fn test_regression() { + let mut map = HistoricLayerCoverage::::new(); + map.insert( + LayerKey { + key: 0..5, + lsn: 0..5, + is_image: false, + }, + "Layer 1".to_string(), + ); + map.insert( + LayerKey { + key: 0..5, + lsn: 1..2, + is_image: false, + }, + "Layer 2".to_string(), + ); + + // If an insertion operation improperly deletes the endpoint of a previous layer + // (which is more likely to happen with layers that collide on key.end), we will + // end up with an infinite layer, covering the entire keyspace. Here we assert + // that there's no layer at key 100 because we didn't insert any layer there. + let version = map.get_version(100).unwrap(); + assert_eq!(version.delta_coverage.query(100), None); +} + /// Cover edge cases where layers begin or end on the same key #[test] fn test_key_collision() { diff --git a/pageserver/src/tenant/layer_map/layer_coverage.rs b/pageserver/src/tenant/layer_map/layer_coverage.rs index 4e3b4516dc..47aace97a5 100644 --- a/pageserver/src/tenant/layer_map/layer_coverage.rs +++ b/pageserver/src/tenant/layer_map/layer_coverage.rs @@ -1,8 +1,8 @@ use std::ops::Range; -// TODO the `im` crate has 20x more downloads and also has -// persistent/immutable BTree. It also runs a bit faster but -// results are not the same on some tests. +// NOTE the `im` crate has 20x more downloads and also has +// persistent/immutable BTree. 
But it's bugged so rpds is a +// better choice https://github.com/neondatabase/neon/issues/3395 use rpds::RedBlackTreeMapSync; /// Data structure that can efficiently: @@ -10,19 +10,22 @@ use rpds::RedBlackTreeMapSync; /// - iterate the latest layers in a key range /// - insert layers in non-decreasing lsn.start order /// -/// The struct is parameterized over Value for easier -/// testing, but in practice it's some sort of layer. +/// For a detailed explanation and justification of this approach, see: +/// https://neon.tech/blog/persistent-structures-in-neons-wal-indexing +/// +/// NOTE The struct is parameterized over Value for easier +/// testing, but in practice it's some sort of layer. pub struct LayerCoverage { /// For every change in coverage (as we sweep the key space) /// we store (lsn.end, value). /// - /// We use an immutable/persistent tree so that we can keep historic - /// versions of this coverage without cloning the whole thing and - /// incurring quadratic memory cost. See HistoricLayerCoverage. + /// NOTE We use an immutable/persistent tree so that we can keep historic + /// versions of this coverage without cloning the whole thing and + /// incurring quadratic memory cost. See HistoricLayerCoverage. /// - /// We use the Sync version of the map because we want Self to - /// be Sync. Using nonsync might be faster, if we can work with - /// that. + /// NOTE We use the Sync version of the map because we want Self to + /// be Sync. Using nonsync might be faster, if we can work with + /// that. nodes: RedBlackTreeMapSync>, } @@ -41,6 +44,13 @@ impl LayerCoverage { /// Helper function to subdivide the key range without changing any values /// + /// This operation has no semantic effect by itself. It only helps us pin in + /// place the part of the coverage we don't want to change when inserting. + /// + /// As an analogy, think of a polygon. If you add a vertex along one of the + /// segments, the polygon is still the same, but it behaves differently when + /// we move or delete one of the other points. + /// /// Complexity: O(log N) fn add_node(&mut self, key: i128) { let value = match self.nodes.range(..=key).last() { @@ -74,7 +84,7 @@ impl LayerCoverage { let mut to_update = Vec::new(); let mut to_remove = Vec::new(); let mut prev_covered = false; - for (k, node) in self.nodes.range(key.clone()) { + for (k, node) in self.nodes.range(key) { let needs_cover = match node { None => true, Some((h, _)) => h < &lsn.end, @@ -87,9 +97,8 @@ impl LayerCoverage { } prev_covered = needs_cover; } - if !prev_covered { - to_remove.push(key.end); - } + // TODO check if the nodes inserted at key.start and key.end are safe + // to remove. It's fine to keep them but they could be redundant. 
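To make the change-point representation above concrete, here is a minimal, self-contained sketch of the same idea using a plain `BTreeMap`. The real `LayerCoverage` keeps this shape in a persistent `rpds` map precisely so `HistoricLayerCoverage` can snapshot old versions cheaply; `ToyCoverage`, the `String` layer names, and the `u64` LSNs here are illustration-only stand-ins, and the insert logic is deliberately naive.

```rust
use std::collections::BTreeMap;

/// Toy, mutable coverage index: each change point in the key space maps to
/// the layer (if any) covering keys from that point onward.
struct ToyCoverage {
    nodes: BTreeMap<i128, Option<(u64, String)>>,
}

impl ToyCoverage {
    fn new() -> Self {
        ToyCoverage { nodes: BTreeMap::new() }
    }

    /// The latest change point at or before `key` decides the covering layer: O(log N).
    fn query(&self, key: i128) -> Option<String> {
        self.nodes
            .range(..=key)
            .next_back()
            .and_then(|(_, v)| v.as_ref())
            .map(|(_, layer)| layer.clone())
    }

    /// Insert a layer over [start, end), preserving whatever previously covered `end`.
    fn insert(&mut self, start: i128, end: i128, lsn_end: u64, layer: String) {
        // Pin the coverage state at `end` so keys past the range are unaffected
        // (the "extra vertex on the polygon" from the comment above).
        let at_end = self.nodes.range(..=end).next_back().and_then(|(_, v)| v.clone());
        self.nodes.insert(end, at_end);
        // Naive version: overwrite every change point inside the range. The real
        // code only overwrites nodes whose lsn.end is below the new layer's.
        let inside: Vec<i128> = self.nodes.range(start..end).map(|(k, _)| *k).collect();
        for k in inside {
            self.nodes.insert(k, Some((lsn_end, layer.clone())));
        }
        self.nodes.insert(start, Some((lsn_end, layer)));
    }
}
```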
for k in to_update { self.nodes.insert_mut(k, Some((lsn.end, value.clone()))); } diff --git a/pageserver/src/tenant/mgr.rs b/pageserver/src/tenant/mgr.rs index b3be6061b3..5786db72f6 100644 --- a/pageserver/src/tenant/mgr.rs +++ b/pageserver/src/tenant/mgr.rs @@ -10,6 +10,7 @@ use tokio::fs; use anyhow::Context; use once_cell::sync::Lazy; use tokio::sync::RwLock; +use tokio::task::JoinSet; use tracing::*; use remote_storage::GenericRemoteStorage; @@ -20,7 +21,7 @@ use crate::context::{DownloadBehavior, RequestContext}; use crate::task_mgr::{self, TaskKind}; use crate::tenant::config::TenantConfOpt; use crate::tenant::{create_tenant_files, CreateTenantFilesMode, Tenant, TenantState}; -use crate::IGNORED_TENANT_FILE_NAME; +use crate::{InitializationOrder, IGNORED_TENANT_FILE_NAME}; use utils::fs_ext::PathExt; use utils::id::{TenantId, TimelineId}; @@ -63,6 +64,7 @@ pub async fn init_tenant_mgr( conf: &'static PageServerConf, broker_client: storage_broker::BrokerClientChannel, remote_storage: Option, + init_order: InitializationOrder, ) -> anyhow::Result<()> { // Scan local filesystem for attached tenants let tenants_dir = conf.tenants_path(); @@ -119,6 +121,7 @@ pub async fn init_tenant_mgr( &tenant_dir_path, broker_client.clone(), remote_storage.clone(), + Some(init_order.clone()), &ctx, ) { Ok(tenant) => { @@ -154,6 +157,7 @@ pub fn schedule_local_tenant_processing( tenant_path: &Path, broker_client: storage_broker::BrokerClientChannel, remote_storage: Option, + init_order: Option, ctx: &RequestContext, ) -> anyhow::Result> { anyhow::ensure!( @@ -207,7 +211,14 @@ pub fn schedule_local_tenant_processing( } else { info!("tenant {tenant_id} is assumed to be loadable, starting load operation"); // Start loading the tenant into memory. It will initially be in Loading state. - Tenant::spawn_load(conf, tenant_id, broker_client, remote_storage, ctx) + Tenant::spawn_load( + conf, + tenant_id, + broker_client, + remote_storage, + init_order, + ctx, + ) }; Ok(tenant) } @@ -222,6 +233,7 @@ pub fn schedule_local_tenant_processing( /// That could be easily misinterpreted by control plane, the consumer of the /// management API. For example, it could attach the tenant on a different pageserver. /// We would then be in split-brain once this pageserver restarts. +#[instrument] pub async fn shutdown_all_tenants() { // Prevent new tenants from being created. let tenants_to_shut_down = { @@ -238,39 +250,51 @@ pub async fn shutdown_all_tenants() { tenants_clone } TenantsMap::ShuttingDown(_) => { + // TODO: it is possible that detach and shutdown happen at the same time. as a + // result, during shutdown we do not wait for detach. 
error!("already shutting down, this function isn't supposed to be called more than once"); return; } } }; - let mut tenants_to_freeze_and_flush = Vec::with_capacity(tenants_to_shut_down.len()); - for (_, tenant) in tenants_to_shut_down { - if tenant.is_active() { - // updates tenant state, forbidding new GC and compaction iterations from starting - tenant.set_stopping().await; - tenants_to_freeze_and_flush.push(tenant); + let mut join_set = JoinSet::new(); + for (tenant_id, tenant) in tenants_to_shut_down { + join_set.spawn( + async move { + let freeze_and_flush = true; + + match tenant.shutdown(freeze_and_flush).await { + Ok(()) => debug!("tenant successfully stopped"), + Err(super::ShutdownError::AlreadyStopping) => { + warn!("tenant was already shutting down") + } + } + } + .instrument(info_span!("shutdown", %tenant_id)), + ); + } + + let mut panicked = 0; + + while let Some(res) = join_set.join_next().await { + match res { + Ok(()) => {} + Err(join_error) if join_error.is_cancelled() => { + unreachable!("we are not cancelling any of the futures"); + } + Err(join_error) if join_error.is_panic() => { + // cannot really do anything, as this panic is likely a bug + panicked += 1; + } + Err(join_error) => { + warn!("unknown kind of JoinError: {join_error}"); + } } } - // Shut down all existing walreceiver connections and stop accepting the new ones. - task_mgr::shutdown_tasks(Some(TaskKind::WalReceiverManager), None, None).await; - - // Ok, no background tasks running anymore. Flush any remaining data in - // memory to disk. - // - // We assume that any incoming connections that might request pages from - // the tenant have already been terminated by the caller, so there - // should be no more activity in any of the repositories. - // - // On error, log it but continue with the shutdown for other tenants. - for tenant in tenants_to_freeze_and_flush { - let tenant_id = tenant.tenant_id(); - debug!("shutdown tenant {tenant_id}"); - - if let Err(err) = tenant.freeze_and_flush().await { - error!("Could not checkpoint tenant {tenant_id} during shutdown: {err:?}"); - } + if panicked > 0 { + warn!(panicked, "observed panicks while shutting down tenants"); } } @@ -291,7 +315,7 @@ pub async fn create_tenant( // See https://github.com/neondatabase/neon/issues/4233 let created_tenant = - schedule_local_tenant_processing(conf, &tenant_directory, broker_client, remote_storage, ctx)?; + schedule_local_tenant_processing(conf, &tenant_directory, broker_client, remote_storage, None, ctx)?; // TODO: tenant object & its background loops remain, untracked in tenant map, if we fail here. 
@@ -304,11 +328,19 @@ pub async fn create_tenant(
    }).await
}

+#[derive(Debug, thiserror::Error)]
+pub enum SetNewTenantConfigError {
+    #[error(transparent)]
+    GetTenant(#[from] GetTenantError),
+    #[error(transparent)]
+    Persist(anyhow::Error),
+}
+
pub async fn set_new_tenant_config(
    conf: &'static PageServerConf,
    new_tenant_conf: TenantConfOpt,
    tenant_id: TenantId,
-) -> Result<(), TenantStateError> {
+) -> Result<(), SetNewTenantConfigError> {
    info!("configuring tenant {tenant_id}");
    let tenant = get_tenant(tenant_id, true).await?;

@@ -318,23 +350,32 @@ pub async fn set_new_tenant_config(
        &tenant_config_path,
        new_tenant_conf,
        false,
-    )?;
+    )
+    .map_err(SetNewTenantConfigError::Persist)?;
    tenant.set_new_tenant_config(new_tenant_conf).await;
    Ok(())
}

+#[derive(Debug, thiserror::Error)]
+pub enum GetTenantError {
+    #[error("Tenant {0} not found")]
+    NotFound(TenantId),
+    #[error("Tenant {0} is not active")]
+    NotActive(TenantId),
+}
+
/// Gets the tenant from the in-memory data, erroring if it's absent or is not fitting to the query.
/// `active_only = true` allows to query only tenants that are ready for operations, erroring on other kinds of tenants.
pub async fn get_tenant(
    tenant_id: TenantId,
    active_only: bool,
-) -> Result<Arc<Tenant>, TenantStateError> {
+) -> Result<Arc<Tenant>, GetTenantError> {
    let m = TENANTS.read().await;
    let tenant = m
        .get(&tenant_id)
-        .ok_or(TenantStateError::NotFound(tenant_id))?;
+        .ok_or(GetTenantError::NotFound(tenant_id))?;
    if active_only && !tenant.is_active() {
-        Err(TenantStateError::NotActive(tenant_id))
+        Err(GetTenantError::NotActive(tenant_id))
    } else {
        Ok(Arc::clone(tenant))
    }
@@ -343,7 +384,7 @@ pub async fn get_tenant(
#[derive(Debug, thiserror::Error)]
pub enum DeleteTimelineError {
    #[error("Tenant {0}")]
-    Tenant(#[from] TenantStateError),
+    Tenant(#[from] GetTenantError),

    #[error("Timeline {0}")]
    Timeline(#[from] crate::tenant::DeleteTimelineError),
@@ -420,7 +461,7 @@ pub async fn load_tenant(
            .with_context(|| format!("Failed to remove tenant ignore mark {tenant_ignore_mark:?} during tenant loading"))?;
    }

-    let new_tenant = schedule_local_tenant_processing(conf, &tenant_path, broker_client, remote_storage, ctx)
+    let new_tenant = schedule_local_tenant_processing(conf, &tenant_path, broker_client, remote_storage, None, ctx)
        .with_context(|| {
            format!("Failed to schedule tenant processing in path {tenant_path:?}")
        })?;
@@ -493,7 +534,7 @@ pub async fn attach_tenant(
        .context("check for attach marker file existence")?;
    anyhow::ensure!(marker_file_exists, "create_tenant_files should have created the attach marker file");

-    let attached_tenant = schedule_local_tenant_processing(conf, &tenant_dir, broker_client, Some(remote_storage), ctx)?;
+    let attached_tenant = schedule_local_tenant_processing(conf, &tenant_dir, broker_client, Some(remote_storage), None, ctx)?;
    // TODO: tenant object & its background loops remain, untracked in tenant map, if we fail here.
    // See https://github.com/neondatabase/neon/issues/4233
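The `SetNewTenantConfigError`/`GetTenantError` split above follows a common `thiserror` pattern: small, purpose-specific enums with `#[from]` conversions so `?` keeps working across layers. A standalone sketch of that pattern, with hypothetical names and `u64` in place of `TenantId`:

```rust
#[derive(Debug, thiserror::Error)]
pub enum LookupError {
    #[error("Tenant {0} not found")]
    NotFound(u64),
    #[error("Tenant {0} is not active")]
    NotActive(u64),
}

#[derive(Debug, thiserror::Error)]
pub enum ConfigureError {
    // `#[from]` lets `?` convert a LookupError automatically;
    // `transparent` forwards Display and source() to the inner error.
    #[error(transparent)]
    Lookup(#[from] LookupError),
    #[error(transparent)]
    Persist(anyhow::Error),
}

fn lookup(id: u64) -> Result<(), LookupError> {
    Err(LookupError::NotFound(id))
}

pub fn configure(id: u64) -> Result<(), ConfigureError> {
    lookup(id)?; // LookupError -> ConfigureError via #[from]
    Ok(())
}
```

Narrow enums like these let the HTTP layer map each variant to a precise status code instead of pattern-matching on a catch-all error type.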
@@ -569,26 +610,26 @@ where
    // The exclusive lock here ensures we don't miss the tenant state updates before trying another removal.
    // tenant-wide cleanup operations may take some time (removing the entire tenant directory), so we want to
    // avoid holding the lock for the entire process.
-    {
-        let tenants_accessor = TENANTS.write().await;
-        match tenants_accessor.get(&tenant_id) {
-            Some(tenant) => match tenant.current_state() {
-                TenantState::Attaching
-                | TenantState::Loading
-                | TenantState::Activating
-                | TenantState::Broken { .. }
-                | TenantState::Active => tenant.set_stopping().await,
-                TenantState::Stopping => return Err(TenantStateError::IsStopping(tenant_id)),
-            },
-            None => return Err(TenantStateError::NotFound(tenant_id)),
+    let tenant = {
+        TENANTS
+            .write()
+            .await
+            .get(&tenant_id)
+            .cloned()
+            .ok_or(TenantStateError::NotFound(tenant_id))?
+    };
+
+    let freeze_and_flush = false;
+
+    // shutdown is sure to transition tenant to stopping, and wait for all tasks to complete, so
+    // that we can continue safely to cleanup.
+    match tenant.shutdown(freeze_and_flush).await {
+        Ok(()) => {}
+        Err(super::ShutdownError::AlreadyStopping) => {
+            return Err(TenantStateError::IsStopping(tenant_id))
        }
    }

-    // shutdown all tenant and timeline tasks: gc, compaction, page service)
-    // No new tasks will be started for this tenant because it's in `Stopping` state.
-    // Hence, once we're done here, the `tenant_cleanup` callback can mutate tenant on-disk state freely.
-    task_mgr::shutdown_tasks(None, Some(tenant_id), None).await;
-
    match tenant_cleanup
        .await
        .with_context(|| format!("Failed to run cleanup for tenant {tenant_id}"))
@@ -670,7 +711,6 @@ pub async fn immediate_gc(
    Ok(wait_task_done)
}

-#[cfg(feature = "testing")]
pub async fn immediate_compact(
    tenant_id: TenantId,
    timeline_id: TimelineId,
diff --git a/pageserver/src/tenant/par_fsync.rs b/pageserver/src/tenant/par_fsync.rs
index 0b0217ab58..3cbcfe8774 100644
--- a/pageserver/src/tenant/par_fsync.rs
+++ b/pageserver/src/tenant/par_fsync.rs
@@ -19,14 +19,8 @@ fn parallel_worker(paths: &[PathBuf], next_path_idx: &AtomicUsize) -> io::Result
    Ok(())
}

-pub fn par_fsync(paths: &[PathBuf]) -> io::Result<()> {
-    const PARALLEL_PATH_THRESHOLD: usize = 1;
-    if paths.len() <= PARALLEL_PATH_THRESHOLD {
-        for path in paths {
-            fsync_path(path)?;
-        }
-        return Ok(());
-    }
+fn fsync_in_thread_pool(paths: &[PathBuf]) -> io::Result<()> {
+    // TODO: remove this function in favor of `par_fsync_async` once we asyncify everything.

    /// Use at most this number of threads.
    /// Increasing this limit will
@@ -36,11 +30,11 @@ pub fn par_fsync(paths: &[PathBuf]) -> io::Result<()> {
    let num_threads = paths.len().min(MAX_NUM_THREADS);
    let next_path_idx = AtomicUsize::new(0);

-    crossbeam_utils::thread::scope(|s| -> io::Result<()> {
+    std::thread::scope(|s| -> io::Result<()> {
        let mut handles = vec![];
        // Spawn `num_threads - 1`, as the current thread is also a worker.
        for _ in 1..num_threads {
-            handles.push(s.spawn(|_| parallel_worker(paths, &next_path_idx)));
+            handles.push(s.spawn(|| parallel_worker(paths, &next_path_idx)));
        }

        parallel_worker(paths, &next_path_idx)?;
@@ -51,5 +45,41 @@ pub fn par_fsync(paths: &[PathBuf]) -> io::Result<()> {
        Ok(())
    })
-    .unwrap()
+}
+
+/// Parallel fsync all files. Can be used in non-async contexts; it runs the
+/// fsyncs on a scoped thread pool.
+pub fn par_fsync(paths: &[PathBuf]) -> io::Result<()> {
+    if paths.len() == 1 {
+        fsync_path(&paths[0])?;
+        return Ok(());
+    }
+
+    fsync_in_thread_pool(paths)
+}
+
+/// Parallel fsync asynchronously. Each fsync runs on the blocking thread pool via
+/// `spawn_blocking`, with at most `MAX_CONCURRENT_FSYNC` of them in flight at a time.
+pub async fn par_fsync_async(paths: &[PathBuf]) -> io::Result<()> { + const MAX_CONCURRENT_FSYNC: usize = 64; + let mut next = paths.iter().peekable(); + let mut js = tokio::task::JoinSet::new(); + loop { + while js.len() < MAX_CONCURRENT_FSYNC && next.peek().is_some() { + let next = next.next().expect("just peeked"); + let next = next.to_owned(); + js.spawn_blocking(move || fsync_path(&next)); + } + + // now the joinset has been filled up, wait for next to complete + if let Some(res) = js.join_next().await { + res??; + } else { + // last item had already completed + assert!( + next.peek().is_none(), + "joinset emptied, we shouldn't have more work" + ); + return Ok(()); + } + } } diff --git a/pageserver/src/tenant/storage_layer.rs b/pageserver/src/tenant/storage_layer.rs index d30d6c5c6e..7c071463de 100644 --- a/pageserver/src/tenant/storage_layer.rs +++ b/pageserver/src/tenant/storage_layer.rs @@ -4,6 +4,7 @@ pub mod delta_layer; mod filename; mod image_layer; mod inmemory_layer; +mod layer_desc; mod remote_layer; use crate::config::PageServerConf; @@ -37,6 +38,7 @@ pub use delta_layer::{DeltaLayer, DeltaLayerWriter}; pub use filename::{DeltaFileName, ImageFileName, LayerFileName}; pub use image_layer::{ImageLayer, ImageLayerWriter}; pub use inmemory_layer::InMemoryLayer; +pub use layer_desc::PersistentLayerDesc; pub use remote_layer::RemoteLayer; use super::layer_map::BatchedUpdates; @@ -406,14 +408,23 @@ pub type LayerKeyIter<'i> = Box + 'i>; /// An image layer is a snapshot of all the data in a key-range, at a single /// LSN. pub trait PersistentLayer: Layer { - fn get_tenant_id(&self) -> TenantId; + /// Get the layer descriptor. + fn layer_desc(&self) -> &PersistentLayerDesc; + + fn get_tenant_id(&self) -> TenantId { + self.layer_desc().tenant_id + } /// Identify the timeline this layer belongs to - fn get_timeline_id(&self) -> TimelineId; + fn get_timeline_id(&self) -> TimelineId { + self.layer_desc().timeline_id + } /// File name used for this layer, both in the pageserver's local filesystem /// state as well as in the remote storage. - fn filename(&self) -> LayerFileName; + fn filename(&self) -> LayerFileName { + self.layer_desc().filename() + } // Path to the layer file in the local filesystem. // `None` for `RemoteLayer`. @@ -542,7 +553,7 @@ impl From for LayerDescriptor { /// /// This is used by DeltaLayer and ImageLayer. Normally, this holds a reference to the /// global config, and paths to layer files are constructed using the tenant/timeline -/// path from the config. But in the 'pageserver_binutils' binary, we need to construct a Layer +/// path from the config. But in the 'pagectl' binary, we need to construct a Layer /// struct for a file on disk, without having a page server running, so that we have no /// config. In that case, we use the Path variant to hold the full path to the file on /// disk. 
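The `PersistentLayer` change above turns `get_tenant_id`, `get_timeline_id`, and `filename` into default methods derived from a single required `layer_desc()` accessor, so each layer type only has to store and return its descriptor. A toy, self-contained sketch of that pattern (all names and types here are hypothetical stand-ins):

```rust
#[derive(Clone, Debug)]
struct Desc {
    tenant: u64,
    timeline: u64,
    name: String,
}

trait PersistentThing {
    // The one required method: hand out the descriptor.
    fn desc(&self) -> &Desc;

    // Everything else becomes a default method, removing per-impl boilerplate.
    fn tenant(&self) -> u64 {
        self.desc().tenant
    }
    fn timeline(&self) -> u64 {
        self.desc().timeline
    }
    fn file_name(&self) -> String {
        self.desc().name.clone()
    }
}

struct OnDiskLayer {
    desc: Desc,
}

impl PersistentThing for OnDiskLayer {
    fn desc(&self) -> &Desc {
        &self.desc
    }
}
```

The payoff is the same as in the diff: delta, image, and remote layers can no longer disagree about how a file name or id is derived, because there is exactly one code path.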
diff --git a/pageserver/src/tenant/storage_layer/delta_layer.rs b/pageserver/src/tenant/storage_layer/delta_layer.rs index ba3ab6dd4c..5f2fb1ebea 100644 --- a/pageserver/src/tenant/storage_layer/delta_layer.rs +++ b/pageserver/src/tenant/storage_layer/delta_layer.rs @@ -56,8 +56,8 @@ use utils::{ }; use super::{ - DeltaFileName, Layer, LayerAccessStats, LayerAccessStatsReset, LayerFileName, LayerIter, - LayerKeyIter, PathOrConf, + DeltaFileName, Layer, LayerAccessStats, LayerAccessStatsReset, LayerIter, LayerKeyIter, + PathOrConf, PersistentLayerDesc, }; /// @@ -89,10 +89,10 @@ impl From<&DeltaLayer> for Summary { magic: DELTA_FILE_MAGIC, format_version: STORAGE_FORMAT_VERSION, - tenant_id: layer.tenant_id, - timeline_id: layer.timeline_id, - key_range: layer.key_range.clone(), - lsn_range: layer.lsn_range.clone(), + tenant_id: layer.desc.tenant_id, + timeline_id: layer.desc.timeline_id, + key_range: layer.desc.key_range.clone(), + lsn_range: layer.desc.lsn_range.clone(), index_start_blk: 0, index_root_blk: 0, @@ -110,7 +110,7 @@ const WILL_INIT: u64 = 1; /// reading/deserializing records themselves. /// #[derive(Debug, Serialize, Deserialize, Copy, Clone)] -struct BlobRef(u64); +pub struct BlobRef(pub u64); impl BlobRef { pub fn will_init(&self) -> bool { @@ -180,10 +180,7 @@ impl DeltaKey { pub struct DeltaLayer { path_or_conf: PathOrConf, - pub tenant_id: TenantId, - pub timeline_id: TimelineId, - pub key_range: Range, - pub lsn_range: Range, + pub desc: PersistentLayerDesc, pub file_size: u64, @@ -197,8 +194,8 @@ impl std::fmt::Debug for DeltaLayer { use super::RangeDisplayDebug; f.debug_struct("DeltaLayer") - .field("key_range", &RangeDisplayDebug(&self.key_range)) - .field("lsn_range", &self.lsn_range) + .field("key_range", &RangeDisplayDebug(&self.desc.key_range)) + .field("lsn_range", &self.desc.lsn_range) .field("file_size", &self.file_size) .field("inner", &self.inner) .finish() @@ -228,30 +225,16 @@ impl std::fmt::Debug for DeltaLayerInner { } impl Layer for DeltaLayer { - fn get_key_range(&self) -> Range { - self.key_range.clone() - } - - fn get_lsn_range(&self) -> Range { - self.lsn_range.clone() - } - fn is_incremental(&self) -> bool { - true - } - - fn short_id(&self) -> String { - self.filename().file_name() - } /// debugging function to print out the contents of the layer fn dump(&self, verbose: bool, ctx: &RequestContext) -> Result<()> { println!( "----- delta layer for ten {} tli {} keys {}-{} lsn {}-{} ----", - self.tenant_id, - self.timeline_id, - self.key_range.start, - self.key_range.end, - self.lsn_range.start, - self.lsn_range.end + self.desc.tenant_id, + self.desc.timeline_id, + self.desc.key_range.start, + self.desc.key_range.end, + self.desc.lsn_range.start, + self.desc.lsn_range.end ); if !verbose { @@ -324,10 +307,10 @@ impl Layer for DeltaLayer { reconstruct_state: &mut ValueReconstructState, ctx: &RequestContext, ) -> anyhow::Result { - ensure!(lsn_range.start >= self.lsn_range.start); + ensure!(lsn_range.start >= self.desc.lsn_range.start); let mut need_image = true; - ensure!(self.key_range.contains(&key)); + ensure!(self.desc.key_range.contains(&key)); { // Open the file and lock the metadata in memory @@ -402,19 +385,31 @@ impl Layer for DeltaLayer { Ok(ValueReconstructResult::Complete) } } + + /// Boilerplate to implement the Layer trait, always use layer_desc for persistent layers. + fn get_key_range(&self) -> Range { + self.layer_desc().key_range.clone() + } + + /// Boilerplate to implement the Layer trait, always use layer_desc for persistent layers. 
+ fn get_lsn_range(&self) -> Range { + self.layer_desc().lsn_range.clone() + } + + /// Boilerplate to implement the Layer trait, always use layer_desc for persistent layers. + fn is_incremental(&self) -> bool { + self.layer_desc().is_incremental + } + + /// Boilerplate to implement the Layer trait, always use layer_desc for persistent layers. + fn short_id(&self) -> String { + self.layer_desc().short_id() + } } impl PersistentLayer for DeltaLayer { - fn get_tenant_id(&self) -> TenantId { - self.tenant_id - } - - fn get_timeline_id(&self) -> TimelineId { - self.timeline_id - } - - fn filename(&self) -> LayerFileName { - self.layer_name().into() + fn layer_desc(&self) -> &PersistentLayerDesc { + &self.desc } fn local_path(&self) -> Option { @@ -602,10 +597,12 @@ impl DeltaLayer { ) -> DeltaLayer { DeltaLayer { path_or_conf: PathOrConf::Conf(conf), - timeline_id, - tenant_id, - key_range: filename.key_range.clone(), - lsn_range: filename.lsn_range.clone(), + desc: PersistentLayerDesc::new_delta( + tenant_id, + timeline_id, + filename.key_range.clone(), + filename.lsn_range.clone(), + ), file_size, access_stats, inner: RwLock::new(DeltaLayerInner { @@ -619,7 +616,7 @@ impl DeltaLayer { /// Create a DeltaLayer struct representing an existing file on disk. /// - /// This variant is only used for debugging purposes, by the 'pageserver_binutils' binary. + /// This variant is only used for debugging purposes, by the 'pagectl' binary. pub fn new_for_path(path: &Path, file: File) -> Result { let mut summary_buf = Vec::new(); summary_buf.resize(PAGE_SZ, 0); @@ -632,10 +629,12 @@ impl DeltaLayer { Ok(DeltaLayer { path_or_conf: PathOrConf::Path(path.to_path_buf()), - timeline_id: summary.timeline_id, - tenant_id: summary.tenant_id, - key_range: summary.key_range, - lsn_range: summary.lsn_range, + desc: PersistentLayerDesc::new_delta( + summary.tenant_id, + summary.timeline_id, + summary.key_range, + summary.lsn_range, + ), file_size: metadata.len(), access_stats: LayerAccessStats::empty_will_record_residence_event_later(), inner: RwLock::new(DeltaLayerInner { @@ -648,18 +647,14 @@ impl DeltaLayer { } fn layer_name(&self) -> DeltaFileName { - DeltaFileName { - key_range: self.key_range.clone(), - lsn_range: self.lsn_range.clone(), - } + self.desc.delta_file_name() } - /// Path to the layer file in pageserver workdir. pub fn path(&self) -> PathBuf { Self::path_for( &self.path_or_conf, - self.timeline_id, - self.tenant_id, + self.desc.timeline_id, + self.desc.tenant_id, &self.layer_name(), ) } @@ -803,10 +798,12 @@ impl DeltaLayerWriterInner { // set inner.file here. The first read will have to re-open it. 
let layer = DeltaLayer { path_or_conf: PathOrConf::Conf(self.conf), - tenant_id: self.tenant_id, - timeline_id: self.timeline_id, - key_range: self.key_start..key_end, - lsn_range: self.lsn_range.clone(), + desc: PersistentLayerDesc::new_delta( + self.tenant_id, + self.timeline_id, + self.key_start..key_end, + self.lsn_range.clone(), + ), file_size: metadata.len(), access_stats: LayerAccessStats::empty_will_record_residence_event_later(), inner: RwLock::new(DeltaLayerInner { diff --git a/pageserver/src/tenant/storage_layer/filename.rs b/pageserver/src/tenant/storage_layer/filename.rs index e2112fc388..5dcd54689e 100644 --- a/pageserver/src/tenant/storage_layer/filename.rs +++ b/pageserver/src/tenant/storage_layer/filename.rs @@ -9,6 +9,8 @@ use std::str::FromStr; use utils::lsn::Lsn; +use super::PersistentLayerDesc; + // Note: Timeline::load_layer_map() relies on this sort order #[derive(PartialEq, Eq, Clone, Hash)] pub struct DeltaFileName { @@ -153,7 +155,7 @@ impl Ord for ImageFileName { impl ImageFileName { pub fn lsn_as_range(&self) -> Range { // Saves from having to copypaste this all over - self.lsn..(self.lsn + 1) + PersistentLayerDesc::image_layer_lsn_range(self.lsn) } } diff --git a/pageserver/src/tenant/storage_layer/image_layer.rs b/pageserver/src/tenant/storage_layer/image_layer.rs index d298b3e852..b55dd08a6d 100644 --- a/pageserver/src/tenant/storage_layer/image_layer.rs +++ b/pageserver/src/tenant/storage_layer/image_layer.rs @@ -52,8 +52,8 @@ use utils::{ lsn::Lsn, }; -use super::filename::{ImageFileName, LayerFileName}; -use super::{Layer, LayerAccessStatsReset, LayerIter, PathOrConf}; +use super::filename::ImageFileName; +use super::{Layer, LayerAccessStatsReset, LayerIter, PathOrConf, PersistentLayerDesc}; /// /// Header stored in the beginning of the file @@ -84,9 +84,9 @@ impl From<&ImageLayer> for Summary { Self { magic: IMAGE_FILE_MAGIC, format_version: STORAGE_FORMAT_VERSION, - tenant_id: layer.tenant_id, - timeline_id: layer.timeline_id, - key_range: layer.key_range.clone(), + tenant_id: layer.desc.tenant_id, + timeline_id: layer.desc.timeline_id, + key_range: layer.desc.key_range.clone(), lsn: layer.lsn, index_start_blk: 0, @@ -104,14 +104,13 @@ impl From<&ImageLayer> for Summary { /// and it needs to be loaded before using it in queries. 
pub struct ImageLayer { path_or_conf: PathOrConf, - pub tenant_id: TenantId, - pub timeline_id: TimelineId, - pub key_range: Range, - pub file_size: u64, - // This entry contains an image of all pages as of this LSN + pub desc: PersistentLayerDesc, + // This entry contains an image of all pages as of this LSN, should be the same as desc.lsn pub lsn: Lsn, + pub file_size: u64, + access_stats: LayerAccessStats, inner: RwLock, @@ -122,7 +121,7 @@ impl std::fmt::Debug for ImageLayer { use super::RangeDisplayDebug; f.debug_struct("ImageLayer") - .field("key_range", &RangeDisplayDebug(&self.key_range)) + .field("key_range", &RangeDisplayDebug(&self.desc.key_range)) .field("file_size", &self.file_size) .field("lsn", &self.lsn) .field("inner", &self.inner) @@ -153,27 +152,15 @@ impl std::fmt::Debug for ImageLayerInner { } impl Layer for ImageLayer { - fn get_key_range(&self) -> Range { - self.key_range.clone() - } - - fn get_lsn_range(&self) -> Range { - // End-bound is exclusive - self.lsn..(self.lsn + 1) - } - fn is_incremental(&self) -> bool { - false - } - - fn short_id(&self) -> String { - self.filename().file_name() - } - /// debugging function to print out the contents of the layer fn dump(&self, verbose: bool, ctx: &RequestContext) -> Result<()> { println!( "----- image layer for ten {} tli {} key {}-{} at {} ----", - self.tenant_id, self.timeline_id, self.key_range.start, self.key_range.end, self.lsn + self.desc.tenant_id, + self.desc.timeline_id, + self.desc.key_range.start, + self.desc.key_range.end, + self.lsn ); if !verbose { @@ -203,7 +190,7 @@ impl Layer for ImageLayer { reconstruct_state: &mut ValueReconstructState, ctx: &RequestContext, ) -> anyhow::Result { - assert!(self.key_range.contains(&key)); + assert!(self.desc.key_range.contains(&key)); assert!(lsn_range.start >= self.lsn); assert!(lsn_range.end >= self.lsn); @@ -230,24 +217,37 @@ impl Layer for ImageLayer { Ok(ValueReconstructResult::Missing) } } + + /// Boilerplate to implement the Layer trait, always use layer_desc for persistent layers. + fn get_key_range(&self) -> Range { + self.layer_desc().key_range.clone() + } + + /// Boilerplate to implement the Layer trait, always use layer_desc for persistent layers. + fn get_lsn_range(&self) -> Range { + self.layer_desc().lsn_range.clone() + } + + /// Boilerplate to implement the Layer trait, always use layer_desc for persistent layers. + fn is_incremental(&self) -> bool { + self.layer_desc().is_incremental + } + + /// Boilerplate to implement the Layer trait, always use layer_desc for persistent layers. + fn short_id(&self) -> String { + self.layer_desc().short_id() + } } impl PersistentLayer for ImageLayer { - fn filename(&self) -> LayerFileName { - self.layer_name().into() + fn layer_desc(&self) -> &PersistentLayerDesc { + &self.desc } fn local_path(&self) -> Option { Some(self.path()) } - fn get_tenant_id(&self) -> TenantId { - self.tenant_id - } - - fn get_timeline_id(&self) -> TimelineId { - self.timeline_id - } fn iter(&self, _ctx: &RequestContext) -> Result> { unimplemented!(); } @@ -405,9 +405,13 @@ impl ImageLayer { ) -> ImageLayer { ImageLayer { path_or_conf: PathOrConf::Conf(conf), - timeline_id, - tenant_id, - key_range: filename.key_range.clone(), + desc: PersistentLayerDesc::new_img( + tenant_id, + timeline_id, + filename.key_range.clone(), + filename.lsn, + false, + ), // Now we assume image layer ALWAYS covers the full range. This may change in the future. 
lsn: filename.lsn, file_size, access_stats, @@ -422,7 +426,7 @@ impl ImageLayer { /// Create an ImageLayer struct representing an existing file on disk. /// - /// This variant is only used for debugging purposes, by the 'pageserver_binutils' binary. + /// This variant is only used for debugging purposes, by the 'pagectl' binary. pub fn new_for_path(path: &Path, file: File) -> Result { let mut summary_buf = Vec::new(); summary_buf.resize(PAGE_SZ, 0); @@ -433,9 +437,13 @@ impl ImageLayer { .context("get file metadata to determine size")?; Ok(ImageLayer { path_or_conf: PathOrConf::Path(path.to_path_buf()), - timeline_id: summary.timeline_id, - tenant_id: summary.tenant_id, - key_range: summary.key_range, + desc: PersistentLayerDesc::new_img( + summary.tenant_id, + summary.timeline_id, + summary.key_range, + summary.lsn, + false, + ), // Now we assume image layer ALWAYS covers the full range. This may change in the future. lsn: summary.lsn, file_size: metadata.len(), access_stats: LayerAccessStats::empty_will_record_residence_event_later(), @@ -449,18 +457,15 @@ impl ImageLayer { } fn layer_name(&self) -> ImageFileName { - ImageFileName { - key_range: self.key_range.clone(), - lsn: self.lsn, - } + self.desc.image_file_name() } /// Path to the layer file in pageserver workdir. pub fn path(&self) -> PathBuf { Self::path_for( &self.path_or_conf, - self.timeline_id, - self.tenant_id, + self.desc.timeline_id, + self.desc.tenant_id, &self.layer_name(), ) } @@ -484,6 +489,7 @@ struct ImageLayerWriterInner { tenant_id: TenantId, key_range: Range, lsn: Lsn, + is_incremental: bool, blob_writer: WriteBlobWriter, tree: DiskBtreeBuilder, @@ -499,6 +505,7 @@ impl ImageLayerWriterInner { tenant_id: TenantId, key_range: &Range, lsn: Lsn, + is_incremental: bool, ) -> anyhow::Result { // Create the file initially with a temporary filename. // We'll atomically rename it to the final name when we're done. @@ -533,6 +540,7 @@ impl ImageLayerWriterInner { lsn, tree: tree_builder, blob_writer, + is_incremental, }; Ok(writer) @@ -570,6 +578,14 @@ impl ImageLayerWriterInner { file.write_all(buf.as_ref())?; } + let desc = PersistentLayerDesc::new_img( + self.tenant_id, + self.timeline_id, + self.key_range.clone(), + self.lsn, + self.is_incremental, // for now, image layer ALWAYS covers the full range + ); + // Fill in the summary on blk 0 let summary = Summary { magic: IMAGE_FILE_MAGIC, @@ -593,9 +609,7 @@ impl ImageLayerWriterInner { // set inner.file here. The first read will have to re-open it. 
let layer = ImageLayer {
            path_or_conf: PathOrConf::Conf(self.conf),
-            timeline_id: self.timeline_id,
-            tenant_id: self.tenant_id,
-            key_range: self.key_range.clone(),
+            desc,
            lsn: self.lsn,
            file_size: metadata.len(),
            access_stats: LayerAccessStats::empty_will_record_residence_event_later(),
@@ -667,6 +681,7 @@ impl ImageLayerWriter {
        tenant_id: TenantId,
        key_range: &Range<Key>,
        lsn: Lsn,
+        is_incremental: bool,
    ) -> anyhow::Result<Self> {
        Ok(Self {
            inner: Some(ImageLayerWriterInner::new(
@@ -675,6 +690,7 @@ impl ImageLayerWriter {
                tenant_id,
                key_range,
                lsn,
+                is_incremental,
            )?),
        })
    }
diff --git a/pageserver/src/tenant/storage_layer/layer_desc.rs b/pageserver/src/tenant/storage_layer/layer_desc.rs
new file mode 100644
index 0000000000..a9859681d3
--- /dev/null
+++ b/pageserver/src/tenant/storage_layer/layer_desc.rs
@@ -0,0 +1,109 @@
+use std::ops::Range;
+use utils::{
+    id::{TenantId, TimelineId},
+    lsn::Lsn,
+};
+
+use crate::repository::Key;
+
+use super::{DeltaFileName, ImageFileName, LayerFileName};
+
+/// A unique identifier of a persistent layer. This is different from `LayerDescriptor`, which is only used in the
+/// benchmarks. This struct contains all necessary information to find the image / delta layer. It also provides
+/// a unified way to generate layer information like file name.
+#[derive(Debug, PartialEq, Eq, Clone)]
+pub struct PersistentLayerDesc {
+    pub tenant_id: TenantId,
+    pub timeline_id: TimelineId,
+    pub key_range: Range<Key>,
+    /// For image layer, this is `[lsn, lsn+1)`.
+    pub lsn_range: Range<Lsn>,
+    /// Whether this is a delta layer.
+    pub is_delta: bool,
+    /// Whether this layer only contains page images for part of the keys in the range. In the current implementation, this should
+    /// always be equal to `is_delta`. If we land the partial image layer PR someday, image layer could also be
+    /// incremental.
+    pub is_incremental: bool,
+}
+
+impl PersistentLayerDesc {
+    pub fn short_id(&self) -> String {
+        self.filename().file_name()
+    }
+
+    pub fn new_img(
+        tenant_id: TenantId,
+        timeline_id: TimelineId,
+        key_range: Range<Key>,
+        lsn: Lsn,
+        is_incremental: bool,
+    ) -> Self {
+        Self {
+            tenant_id,
+            timeline_id,
+            key_range,
+            lsn_range: Self::image_layer_lsn_range(lsn),
+            is_delta: false,
+            is_incremental,
+        }
+    }
+
+    pub fn new_delta(
+        tenant_id: TenantId,
+        timeline_id: TimelineId,
+        key_range: Range<Key>,
+        lsn_range: Range<Lsn>,
+    ) -> Self {
+        Self {
+            tenant_id,
+            timeline_id,
+            key_range,
+            lsn_range,
+            is_delta: true,
+            is_incremental: true,
+        }
+    }
+
+    /// Get the LSN that the image layer covers.
+    pub fn image_layer_lsn(&self) -> Lsn {
+        assert!(!self.is_delta);
+        assert!(self.lsn_range.start + 1 == self.lsn_range.end);
+        self.lsn_range.start
+    }
+
+    /// Get the LSN range corresponding to a single image layer LSN.
+    pub fn image_layer_lsn_range(lsn: Lsn) -> Range<Lsn> {
+        lsn..(lsn + 1)
+    }
+
+    /// Get a delta file name for this layer.
+    ///
+    /// Panic: if this is not a delta layer.
+    pub fn delta_file_name(&self) -> DeltaFileName {
+        assert!(self.is_delta);
+        DeltaFileName {
+            key_range: self.key_range.clone(),
+            lsn_range: self.lsn_range.clone(),
+        }
+    }
+
+    /// Get an image file name for this layer.
+ /// + /// Panic: if this is not an image layer, or the lsn range is invalid + pub fn image_file_name(&self) -> ImageFileName { + assert!(!self.is_delta); + assert!(self.lsn_range.start + 1 == self.lsn_range.end); + ImageFileName { + key_range: self.key_range.clone(), + lsn: self.lsn_range.start, + } + } + + pub fn filename(&self) -> LayerFileName { + if self.is_delta { + self.delta_file_name().into() + } else { + self.image_file_name().into() + } + } +} diff --git a/pageserver/src/tenant/storage_layer/remote_layer.rs b/pageserver/src/tenant/storage_layer/remote_layer.rs index 2106587ab2..ff0f44da92 100644 --- a/pageserver/src/tenant/storage_layer/remote_layer.rs +++ b/pageserver/src/tenant/storage_layer/remote_layer.rs @@ -18,11 +18,10 @@ use utils::{ lsn::Lsn, }; -use super::filename::{DeltaFileName, ImageFileName, LayerFileName}; -use super::image_layer::ImageLayer; +use super::filename::{DeltaFileName, ImageFileName}; use super::{ - DeltaLayer, LayerAccessStats, LayerAccessStatsReset, LayerIter, LayerKeyIter, - LayerResidenceStatus, PersistentLayer, + DeltaLayer, ImageLayer, LayerAccessStats, LayerAccessStatsReset, LayerIter, LayerKeyIter, + LayerResidenceStatus, PersistentLayer, PersistentLayerDesc, }; /// RemoteLayer is a not yet downloaded [`ImageLayer`] or @@ -34,19 +33,10 @@ use super::{ /// /// See: [`crate::context::RequestContext`] for authorization to download pub struct RemoteLayer { - tenantid: TenantId, - timelineid: TimelineId, - key_range: Range, - lsn_range: Range, - - pub file_name: LayerFileName, + pub desc: PersistentLayerDesc, pub layer_metadata: LayerFileMetadata, - is_delta: bool, - - is_incremental: bool, - access_stats: LayerAccessStats, pub(crate) ongoing_download: Arc, @@ -66,22 +56,14 @@ pub struct RemoteLayer { impl std::fmt::Debug for RemoteLayer { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { f.debug_struct("RemoteLayer") - .field("file_name", &self.file_name) + .field("file_name", &self.desc.filename()) .field("layer_metadata", &self.layer_metadata) - .field("is_incremental", &self.is_incremental) + .field("is_incremental", &self.desc.is_incremental) .finish() } } impl Layer for RemoteLayer { - fn get_key_range(&self) -> Range { - self.key_range.clone() - } - - fn get_lsn_range(&self) -> Range { - self.lsn_range.clone() - } - fn get_value_reconstruct_data( &self, _key: Key, @@ -95,53 +77,45 @@ impl Layer for RemoteLayer { ); } - fn is_incremental(&self) -> bool { - self.is_incremental - } - /// debugging function to print out the contents of the layer fn dump(&self, _verbose: bool, _ctx: &RequestContext) -> Result<()> { println!( "----- remote layer for ten {} tli {} keys {}-{} lsn {}-{} ----", - self.tenantid, - self.timelineid, - self.key_range.start, - self.key_range.end, - self.lsn_range.start, - self.lsn_range.end + self.desc.tenant_id, + self.desc.timeline_id, + self.desc.key_range.start, + self.desc.key_range.end, + self.desc.lsn_range.start, + self.desc.lsn_range.end ); Ok(()) } + /// Boilerplate to implement the Layer trait, always use layer_desc for persistent layers. + fn get_key_range(&self) -> Range { + self.layer_desc().key_range.clone() + } + + /// Boilerplate to implement the Layer trait, always use layer_desc for persistent layers. + fn get_lsn_range(&self) -> Range { + self.layer_desc().lsn_range.clone() + } + + /// Boilerplate to implement the Layer trait, always use layer_desc for persistent layers. 
+ fn is_incremental(&self) -> bool { + self.layer_desc().is_incremental + } + + /// Boilerplate to implement the Layer trait, always use layer_desc for persistent layers. fn short_id(&self) -> String { - self.filename().file_name() + self.layer_desc().short_id() } } impl PersistentLayer for RemoteLayer { - fn get_tenant_id(&self) -> TenantId { - self.tenantid - } - - fn get_timeline_id(&self) -> TimelineId { - self.timelineid - } - - fn filename(&self) -> LayerFileName { - if self.is_delta { - DeltaFileName { - key_range: self.key_range.clone(), - lsn_range: self.lsn_range.clone(), - } - .into() - } else { - ImageFileName { - key_range: self.key_range.clone(), - lsn: self.lsn_range.start, - } - .into() - } + fn layer_desc(&self) -> &PersistentLayerDesc { + &self.desc } fn local_path(&self) -> Option { @@ -176,7 +150,7 @@ impl PersistentLayer for RemoteLayer { let layer_file_name = self.filename().file_name(); let lsn_range = self.get_lsn_range(); - if self.is_delta { + if self.desc.is_delta { HistoricLayerInfo::Delta { layer_file_name, layer_file_size: self.layer_metadata.file_size(), @@ -210,13 +184,13 @@ impl RemoteLayer { access_stats: LayerAccessStats, ) -> RemoteLayer { RemoteLayer { - tenantid, - timelineid, - key_range: fname.key_range.clone(), - lsn_range: fname.lsn_as_range(), - is_delta: false, - is_incremental: false, - file_name: fname.to_owned().into(), + desc: PersistentLayerDesc::new_img( + tenantid, + timelineid, + fname.key_range.clone(), + fname.lsn, + false, + ), layer_metadata: layer_metadata.clone(), ongoing_download: Arc::new(tokio::sync::Semaphore::new(1)), download_replacement_failure: std::sync::atomic::AtomicBool::default(), @@ -232,13 +206,12 @@ impl RemoteLayer { access_stats: LayerAccessStats, ) -> RemoteLayer { RemoteLayer { - tenantid, - timelineid, - key_range: fname.key_range.clone(), - lsn_range: fname.lsn_range.clone(), - is_delta: true, - is_incremental: true, - file_name: fname.to_owned().into(), + desc: PersistentLayerDesc::new_delta( + tenantid, + timelineid, + fname.key_range.clone(), + fname.lsn_range.clone(), + ), layer_metadata: layer_metadata.clone(), ongoing_download: Arc::new(tokio::sync::Semaphore::new(1)), download_replacement_failure: std::sync::atomic::AtomicBool::default(), @@ -256,15 +229,12 @@ impl RemoteLayer { where L: ?Sized + Layer, { - if self.is_delta { - let fname = DeltaFileName { - key_range: self.key_range.clone(), - lsn_range: self.lsn_range.clone(), - }; + if self.desc.is_delta { + let fname = self.desc.delta_file_name(); Arc::new(DeltaLayer::new( conf, - self.timelineid, - self.tenantid, + self.desc.timeline_id, + self.desc.tenant_id, &fname, file_size, self.access_stats.clone_for_residence_change( @@ -273,14 +243,11 @@ impl RemoteLayer { ), )) } else { - let fname = ImageFileName { - key_range: self.key_range.clone(), - lsn: self.lsn_range.start, - }; + let fname = self.desc.image_file_name(); Arc::new(ImageLayer::new( conf, - self.timelineid, - self.tenantid, + self.desc.timeline_id, + self.desc.tenant_id, &fname, file_size, self.access_stats.clone_for_residence_change( diff --git a/pageserver/src/tenant/tasks.rs b/pageserver/src/tenant/tasks.rs index 6bf26f1da1..360818b5a7 100644 --- a/pageserver/src/tenant/tasks.rs +++ b/pageserver/src/tenant/tasks.rs @@ -9,13 +9,17 @@ use crate::context::{DownloadBehavior, RequestContext}; use crate::metrics::TENANT_TASK_EVENTS; use crate::task_mgr; use crate::task_mgr::{TaskKind, BACKGROUND_RUNTIME}; -use crate::tenant::mgr; use crate::tenant::{Tenant, TenantState}; use 
tokio_util::sync::CancellationToken; use tracing::*; -use utils::id::TenantId; +use utils::completion; -pub fn start_background_loops(tenant_id: TenantId) { +/// Start per tenant background loops: compaction and gc. +pub fn start_background_loops( + tenant: &Arc, + background_jobs_can_start: Option<&completion::Barrier>, +) { + let tenant_id = tenant.tenant_id; task_mgr::spawn( BACKGROUND_RUNTIME.handle(), TaskKind::Compaction, @@ -23,11 +27,20 @@ pub fn start_background_loops(tenant_id: TenantId) { None, &format!("compactor for tenant {tenant_id}"), false, - async move { - compaction_loop(tenant_id) - .instrument(info_span!("compaction_loop", tenant_id = %tenant_id)) - .await; - Ok(()) + { + let tenant = Arc::clone(tenant); + let background_jobs_can_start = background_jobs_can_start.cloned(); + async move { + let cancel = task_mgr::shutdown_token(); + tokio::select! { + _ = cancel.cancelled() => { return Ok(()) }, + _ = completion::Barrier::maybe_wait(background_jobs_can_start) => {} + }; + compaction_loop(tenant, cancel) + .instrument(info_span!("compaction_loop", tenant_id = %tenant_id)) + .await; + Ok(()) + } }, ); task_mgr::spawn( @@ -37,11 +50,20 @@ pub fn start_background_loops(tenant_id: TenantId) { None, &format!("garbage collector for tenant {tenant_id}"), false, - async move { - gc_loop(tenant_id) - .instrument(info_span!("gc_loop", tenant_id = %tenant_id)) - .await; - Ok(()) + { + let tenant = Arc::clone(tenant); + let background_jobs_can_start = background_jobs_can_start.cloned(); + async move { + let cancel = task_mgr::shutdown_token(); + tokio::select! { + _ = cancel.cancelled() => { return Ok(()) }, + _ = completion::Barrier::maybe_wait(background_jobs_can_start) => {} + }; + gc_loop(tenant, cancel) + .instrument(info_span!("gc_loop", tenant_id = %tenant_id)) + .await; + Ok(()) + } }, ); } @@ -49,27 +71,26 @@ pub fn start_background_loops(tenant_id: TenantId) { /// /// Compaction task's main loop /// -async fn compaction_loop(tenant_id: TenantId) { +async fn compaction_loop(tenant: Arc, cancel: CancellationToken) { let wait_duration = Duration::from_secs(2); info!("starting"); TENANT_TASK_EVENTS.with_label_values(&["start"]).inc(); async { - let cancel = task_mgr::shutdown_token(); let ctx = RequestContext::todo_child(TaskKind::Compaction, DownloadBehavior::Download); let mut first = true; loop { trace!("waking up"); - let tenant = tokio::select! { + tokio::select! { _ = cancel.cancelled() => { info!("received cancellation request"); return; }, - tenant_wait_result = wait_for_active_tenant(tenant_id, wait_duration) => match tenant_wait_result { + tenant_wait_result = wait_for_active_tenant(&tenant) => match tenant_wait_result { ControlFlow::Break(()) => return, - ControlFlow::Continue(tenant) => tenant, + ControlFlow::Continue(()) => (), }, - }; + } let period = tenant.get_compaction_period(); @@ -119,29 +140,29 @@ async fn compaction_loop(tenant_id: TenantId) { /// /// GC task's main loop /// -async fn gc_loop(tenant_id: TenantId) { +async fn gc_loop(tenant: Arc, cancel: CancellationToken) { let wait_duration = Duration::from_secs(2); info!("starting"); TENANT_TASK_EVENTS.with_label_values(&["start"]).inc(); async { - let cancel = task_mgr::shutdown_token(); // GC might require downloading, to find the cutoff LSN that corresponds to the // cutoff specified as time. 
- let ctx = RequestContext::todo_child(TaskKind::GarbageCollector, DownloadBehavior::Download); + let ctx = + RequestContext::todo_child(TaskKind::GarbageCollector, DownloadBehavior::Download); let mut first = true; loop { trace!("waking up"); - let tenant = tokio::select! { + tokio::select! { _ = cancel.cancelled() => { info!("received cancellation request"); return; }, - tenant_wait_result = wait_for_active_tenant(tenant_id, wait_duration) => match tenant_wait_result { + tenant_wait_result = wait_for_active_tenant(&tenant) => match tenant_wait_result { ControlFlow::Break(()) => return, - ControlFlow::Continue(tenant) => tenant, + ControlFlow::Continue(()) => (), }, - }; + } let period = tenant.get_gc_period(); @@ -161,7 +182,9 @@ async fn gc_loop(tenant_id: TenantId) { Duration::from_secs(10) } else { // Run gc - let res = tenant.gc_iteration(None, gc_horizon, tenant.get_pitr_interval(), &ctx).await; + let res = tenant + .gc_iteration(None, gc_horizon, tenant.get_pitr_interval(), &ctx) + .await; if let Err(e) = res { error!("Gc failed, retrying in {:?}: {e:?}", wait_duration); wait_duration @@ -187,23 +210,10 @@ async fn gc_loop(tenant_id: TenantId) { trace!("GC loop stopped."); } -async fn wait_for_active_tenant( - tenant_id: TenantId, - wait: Duration, -) -> ControlFlow<(), Arc> { - let tenant = loop { - match mgr::get_tenant(tenant_id, false).await { - Ok(tenant) => break tenant, - Err(e) => { - error!("Failed to get a tenant {tenant_id}: {e:#}"); - tokio::time::sleep(wait).await; - } - } - }; - +async fn wait_for_active_tenant(tenant: &Arc) -> ControlFlow<()> { // if the tenant has a proper status already, no need to wait for anything if tenant.current_state() == TenantState::Active { - ControlFlow::Continue(tenant) + ControlFlow::Continue(()) } else { let mut tenant_state_updates = tenant.subscribe_for_state_updates(); loop { @@ -213,7 +223,7 @@ async fn wait_for_active_tenant( match new_state { TenantState::Active => { debug!("Tenant state changed to active, continuing the task loop"); - return ControlFlow::Continue(tenant); + return ControlFlow::Continue(()); } state => { debug!("Not running the task loop, tenant is not active: {state:?}"); diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index ecb01708ac..1d603494ee 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -57,6 +57,7 @@ use pageserver_api::reltag::RelTag; use postgres_connection::PgConnectionConfig; use postgres_ffi::to_pg_timestamp; use utils::{ + completion, id::{TenantId, TimelineId}, lsn::{AtomicLsn, Lsn, RecordLsn}, seqwait::SeqWait, @@ -119,7 +120,7 @@ pub struct Timeline { pub pg_version: u32, - pub(crate) layers: tokio::sync::RwLock>, + pub(crate) layers: Arc>>, /// Set of key ranges which should be covered by image layers to /// allow GC to remove old layers. This set is created by GC and its cutoff LSN is also stored. @@ -195,8 +196,9 @@ pub struct Timeline { /// Layer removal lock. /// A lock to ensure that no layer of the timeline is removed concurrently by other tasks. /// This lock is acquired in [`Timeline::gc`], [`Timeline::compact`], - /// and [`Tenant::delete_timeline`]. - pub(super) layer_removal_cs: tokio::sync::Mutex<()>, + /// and [`Tenant::delete_timeline`]. This is an `Arc` lock because we need an owned + /// lock guard in functions that will be spawned to tokio I/O pool (which requires `'static`). 
+ pub(super) layer_removal_cs: Arc>, // Needed to ensure that we can't create a branch at a point that was already garbage collected pub latest_gc_cutoff_lsn: Rcu, @@ -235,7 +237,18 @@ pub struct Timeline { state: watch::Sender, + /// Prevent two tasks from deleting the timeline at the same time. If held, the + /// timeline is being deleted. If 'true', the timeline has already been deleted. + pub delete_lock: tokio::sync::Mutex, + eviction_task_timeline_state: tokio::sync::Mutex, + + /// Barrier to wait before doing initial logical size calculation. Used only during startup. + initial_logical_size_can_start: Option, + + /// Completion shared between all timelines loaded during startup; used to delay heavier + /// background tasks until some logical sizes have been calculated. + initial_logical_size_attempt: Mutex>, } type LayerMapWriteLockGuard<'t> = tokio::sync::RwLockWriteGuard<'t, LayerMap>; @@ -522,7 +535,12 @@ impl Timeline { Some((cached_lsn, cached_img)) => { match cached_lsn.cmp(&lsn) { Ordering::Less => {} // there might be WAL between cached_lsn and lsn, we need to check - Ordering::Equal => return Ok(cached_img), // exact LSN match, return the image + Ordering::Equal => { + self.metrics + .materialized_page_cache_hit_upon_request_counter + .inc(); + return Ok(cached_img); // exact LSN match, return the image + } Ordering::Greater => { unreachable!("the returned lsn should never be after the requested lsn") } @@ -537,8 +555,10 @@ impl Timeline { img: cached_page_img, }; + let timer = self.metrics.get_reconstruct_data_time_histo.start_timer(); self.get_reconstruct_data(key, lsn, &mut reconstruct_state, ctx) .await?; + timer.stop_and_record(); self.metrics .reconstruct_time_histo @@ -624,7 +644,7 @@ impl Timeline { { Ok(()) => Ok(()), Err(e) => { - // walreceiver.status() locks internally, don't count that towards the wait_lsn_time_histo + // don't count the time spent waiting for lock below, and also in walreceiver.status(), towards the wait_lsn_time_histo drop(_timer); let walreceiver_status = { match &*self.walreceiver.lock().unwrap() { @@ -671,7 +691,7 @@ impl Timeline { } /// Outermost timeline compaction operation; downloads needed layers. - pub async fn compact(&self, ctx: &RequestContext) -> anyhow::Result<()> { + pub async fn compact(self: &Arc, ctx: &RequestContext) -> anyhow::Result<()> { const ROUNDS: usize = 2; let last_record_lsn = self.get_last_record_lsn(); @@ -760,7 +780,7 @@ impl Timeline { } /// Compaction which might need to be retried after downloading remote layers. - async fn compact_inner(&self, ctx: &RequestContext) -> Result<(), CompactionError> { + async fn compact_inner(self: &Arc, ctx: &RequestContext) -> Result<(), CompactionError> { // // High level strategy for compaction / image creation: // @@ -795,7 +815,7 @@ impl Timeline { // Below are functions compact_level0() and create_image_layers() // but they are a bit ad hoc and don't quite work like it's explained // above. Rewrite it. - let layer_removal_cs = self.layer_removal_cs.lock().await; + let layer_removal_cs = Arc::new(self.layer_removal_cs.clone().lock_owned().await); // Is the timeline being deleted? let state = *self.state.borrow(); if state == TimelineState::Stopping { @@ -829,7 +849,7 @@ impl Timeline { // 3. 
Compact
            let timer = self.metrics.compact_time_histo.start_timer();
-            self.compact_level0(&layer_removal_cs, target_file_size, ctx)
+            self.compact_level0(layer_removal_cs.clone(), target_file_size, ctx)
                .await?;
            timer.stop_and_record();
        }
@@ -918,10 +938,15 @@
        Ok(())
    }

-    pub fn activate(self: &Arc<Self>, broker_client: BrokerClientChannel, ctx: &RequestContext) {
+    pub fn activate(
+        self: &Arc<Self>,
+        broker_client: BrokerClientChannel,
+        background_jobs_can_start: Option<&completion::Barrier>,
+        ctx: &RequestContext,
+    ) {
        self.launch_wal_receiver(ctx, broker_client);
        self.set_state(TimelineState::Active);
-        self.launch_eviction_task();
+        self.launch_eviction_task(background_jobs_can_start);
    }

    pub fn set_state(&self, new_state: TimelineState) {
@@ -939,6 +964,14 @@
                error!("Not activating a Stopping timeline");
            }
            (_, new_state) => {
+                if matches!(new_state, TimelineState::Stopping | TimelineState::Broken) {
+                    // drop the completion guard, if any; it might be holding off the completion
+                    // forever needlessly
+                    self.initial_logical_size_attempt
+                        .lock()
+                        .unwrap_or_else(|e| e.into_inner())
+                        .take();
+                }
                self.state.send_replace(new_state);
            }
        }
@@ -1288,6 +1321,13 @@ impl Timeline {
            .unwrap_or(default_tenant_conf.evictions_low_residence_duration_metric_threshold)
    }

+    fn get_gc_feedback(&self) -> bool {
+        let tenant_conf = self.tenant_conf.read().unwrap();
+        tenant_conf
+            .gc_feedback
+            .unwrap_or(self.conf.default_tenant_conf.gc_feedback)
+    }
+
    pub(super) fn tenant_conf_updated(&self) {
        // NB: Most tenant conf options are read by background loops, so,
        // changes will automatically be picked up.
@@ -1322,6 +1362,8 @@ impl Timeline {
        walredo_mgr: Arc,
        remote_client: Option<RemoteTimelineClient>,
        pg_version: u32,
+        initial_logical_size_can_start: Option<completion::Barrier>,
+        initial_logical_size_attempt: Option<completion::Completion>,
    ) -> Arc<Self> {
        let disk_consistent_lsn = metadata.disk_consistent_lsn();
        let (state, _) = watch::channel(TimelineState::Loading);
@@ -1346,7 +1388,7 @@ impl Timeline {
            timeline_id,
            tenant_id,
            pg_version,
-            layers: tokio::sync::RwLock::new(LayerMap::default()),
+            layers: Arc::new(tokio::sync::RwLock::new(LayerMap::default())),
            wanted_image_layers: Mutex::new(None),

            walredo_mgr,
@@ -1415,6 +1457,10 @@ impl Timeline {
            eviction_task_timeline_state: tokio::sync::Mutex::new(
                EvictionTaskTimelineState::default(),
            ),
+            delete_lock: tokio::sync::Mutex::new(false),
+
+            initial_logical_size_can_start,
+            initial_logical_size_attempt: Mutex::new(initial_logical_size_attempt),
        };
        result.repartition_threshold = result.get_checkpoint_distance() / 10;
        result
@@ -1903,9 +1949,30 @@ impl Timeline {
            false,
            // NB: don't log errors here, task_mgr will do that.
            async move {
-                // no cancellation here, because nothing really waits for this to complete compared
+
+                let cancel = task_mgr::shutdown_token();
+
+                // in case we were created during pageserver initialization, wait for
+                // initialization to complete before proceeding. startup time init runs on the same
+                // runtime.
+                tokio::select! {
+                    _ = cancel.cancelled() => { return Ok(()); },
+                    _ = completion::Barrier::maybe_wait(self_clone.initial_logical_size_can_start.clone()) => {}
+                };
+
+                // hold off background tasks from starting until all timelines get to try initial
+                // logical size calculation at least once; though a retry will rarely be useful.
+                // holding off is done because heavier tasks execute blockingly on the same
+                // runtime.
+ // + // dropping this at every outcome is probably better than trying to cling on to it, + // delay will be terminated by a timeout regardless. + let _completion = { self_clone.initial_logical_size_attempt.lock().expect("unexpected initial_logical_size_attempt poisoned").take() }; + + // no extra cancellation here, because nothing really waits for this to complete compared // to spawn_ondemand_logical_size_calculation. let cancel = CancellationToken::new(); + let calculated_size = match self_clone .logical_size_calculation_task(lsn, LogicalSizeCalculationCause::Initial, &background_ctx, cancel) .await @@ -2170,7 +2237,7 @@ impl Timeline { fn delete_historic_layer( &self, // we cannot remove layers otherwise, since gc and compaction will race - _layer_removal_cs: &tokio::sync::MutexGuard<'_, ()>, + _layer_removal_cs: Arc>, layer: Arc, updates: &mut BatchedUpdates<'_, dyn PersistentLayer>, ) -> anyhow::Result<()> { @@ -2249,6 +2316,9 @@ impl Timeline { let mut timeline_owned; let mut timeline = self; + let mut read_count = + scopeguard::guard(0, |cnt| self.metrics.read_num_fs_layers.observe(cnt as f64)); + // For debugging purposes, collect the path of layers that we traversed // through. It's included in the error message if we fail to find the key. let mut traversal_path = Vec::::new(); @@ -2383,6 +2453,7 @@ impl Timeline { Err(e) => return Err(PageReconstructError::from(e)), }; cont_lsn = lsn_floor; + // metrics: open_layer does not count as fs access, so we are not updating `read_count` traversal_path.push(( result, cont_lsn, @@ -2409,6 +2480,7 @@ impl Timeline { Err(e) => return Err(PageReconstructError::from(e)), }; cont_lsn = lsn_floor; + // metrics: open_layer does not count as fs access, so we are not updating `read_count` traversal_path.push(( result, cont_lsn, @@ -2443,6 +2515,7 @@ impl Timeline { Err(e) => return Err(PageReconstructError::from(e)), }; cont_lsn = lsn_floor; + *read_count += 1; traversal_path.push(( result, cont_lsn, @@ -2508,7 +2581,7 @@ impl Timeline { (DownloadBehavior::Error, false) => { return Err(PageReconstructError::NeedsDownload( TenantTimelineId::new(self.tenant_id, self.timeline_id), - remote_layer.file_name.clone(), + remote_layer.filename(), )) } } @@ -2654,7 +2727,7 @@ impl Timeline { /// Layer flusher task's main loop. async fn flush_loop( - &self, + self: &Arc, mut layer_flush_start_rx: tokio::sync::watch::Receiver, ctx: &RequestContext, ) { @@ -2743,9 +2816,9 @@ impl Timeline { } /// Flush one frozen in-memory layer to disk, as a new delta layer. - #[instrument(skip(self, frozen_layer, ctx), fields(tenant_id=%self.tenant_id, timeline_id=%self.timeline_id, layer=%frozen_layer.short_id()))] + #[instrument(skip_all, fields(tenant_id=%self.tenant_id, timeline_id=%self.timeline_id, layer=%frozen_layer.short_id()))] async fn flush_frozen_layer( - &self, + self: &Arc, frozen_layer: Arc, ctx: &RequestContext, ) -> anyhow::Result<()> { @@ -2869,26 +2942,41 @@ impl Timeline { // Write out the given frozen in-memory layer as a new L0 delta file async fn create_delta_layer( - &self, + self: &Arc, frozen_layer: &InMemoryLayer, ) -> anyhow::Result<(LayerFileName, LayerFileMetadata)> { - // Write it out - let new_delta = frozen_layer.write_to_disk()?; - let new_delta_path = new_delta.path(); - let new_delta_filename = new_delta.filename(); + // TODO figure out how to use spawn_blocking. 
Can't use it because frozen_layer is not 'static + let (new_delta, sz): (DeltaLayer, _) = tokio::task::block_in_place({ + let self_clone = Arc::clone(self); + move || { + // Write it out + let new_delta = frozen_layer.write_to_disk()?; + let new_delta_path = new_delta.path(); - // Sync it to disk. - // - // We must also fsync the timeline dir to ensure the directory entries for - // new layer files are durable - // - // TODO: If we're running inside 'flush_frozen_layers' and there are multiple - // files to flush, it might be better to first write them all, and then fsync - // them all in parallel. - par_fsync::par_fsync(&[ - new_delta_path.clone(), - self.conf.timeline_path(&self.timeline_id, &self.tenant_id), - ])?; + // Sync it to disk. + // + // We must also fsync the timeline dir to ensure the directory entries for + // new layer files are durable + // + // TODO: If we're running inside 'flush_frozen_layers' and there are multiple + // files to flush, it might be better to first write them all, and then fsync + // them all in parallel. + + // First sync the delta layer. We still use par_fsync here to keep everything consistent. Feel free to replace + // this with a single fsync in future refactors. + par_fsync::par_fsync(&[new_delta_path.clone()]).context("fsync of delta layer")?; + // Then sync the parent directory. + par_fsync::par_fsync(&[self_clone + .conf + .timeline_path(&self_clone.timeline_id, &self_clone.tenant_id)]) + .context("fsync of timeline dir")?; + + let sz = new_delta_path.metadata()?.len(); + + anyhow::Ok((new_delta, sz)) + } + })?; + let new_delta_name = new_delta.filename(); // Add it to the layer map let l = Arc::new(new_delta); @@ -2903,14 +2991,12 @@ impl Timeline { batch_updates.flush(); // update the timeline's physical size - let sz = new_delta_path.metadata()?.len(); - self.metrics.resident_physical_size_gauge.add(sz); // update metrics self.metrics.num_persistent_files_created.inc_by(1); self.metrics.persistent_bytes_written.inc_by(sz); - Ok((new_delta_filename, LayerFileMetadata::new(sz))) + Ok((new_delta_name, LayerFileMetadata::new(sz))) } async fn repartition( @@ -3053,6 +3139,7 @@ impl Timeline { self.tenant_id, &img_range, lsn, + false, // image layer always covers the full range )?; fail_point!("image-layer-writer-fail-before-finish", |_| { @@ -3116,17 +3203,22 @@ impl Timeline { let all_paths = image_layers .iter() .map(|layer| layer.path()) - .chain(std::iter::once( - self.conf.timeline_path(&self.timeline_id, &self.tenant_id), - )) .collect::>(); - par_fsync::par_fsync(&all_paths).context("fsync of newly created layer files")?; + + par_fsync::par_fsync_async(&all_paths) + .await + .context("fsync of newly created layer files")?; + + par_fsync::par_fsync_async(&[self.conf.timeline_path(&self.timeline_id, &self.tenant_id)]) + .await + .context("fsync of timeline dir")?; let mut layer_paths_to_upload = HashMap::with_capacity(image_layers.len()); let mut layers = self.layers.write().await; let mut updates = layers.batch_update(); let timeline_path = self.conf.timeline_path(&self.timeline_id, &self.tenant_id); + for l in image_layers { let path = l.filename(); let metadata = timeline_path @@ -3185,13 +3277,13 @@ impl Timeline { /// This method takes the `_layer_removal_cs` guard to highlight that it is required; downloads /// that turn out to be required are returned as an error. If the `layer_removal_cs` boundary is /// changed so that it is not taken at the start of level 0 file compaction, the on-demand download /// should be revisited as well.
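The sync-ification below takes the other route around that restriction: instead of borrowing, it hands `spawn_blocking` owned values, using `read_owned()` on an `Arc<tokio::sync::RwLock<...>>` so the guard itself is `'static`. A minimal sketch of the pattern (assuming only `tokio` and `anyhow`; the `Vec<u64>` payload is a placeholder):

```rust
use std::sync::Arc;
use tokio::sync::RwLock;

// An OwnedRwLockReadGuard keeps the Arc alive instead of borrowing the lock,
// which is what lets it cross into the 'static spawn_blocking closure.
async fn blocking_phase(layers: Arc<RwLock<Vec<u64>>>) -> anyhow::Result<usize> {
    let guard = layers.read_owned().await;
    let n = tokio::task::spawn_blocking(move || {
        // CPU-bound work runs on the blocking pool with the read lock held.
        guard.len()
    })
    .await?; // a JoinError converts into anyhow::Error via `?`
    Ok(n)
}
```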
- async fn compact_level0_phase1( + fn compact_level0_phase1( &self, - _layer_removal_cs: &tokio::sync::MutexGuard<'_, ()>, + _layer_removal_cs: Arc>, + layers: tokio::sync::OwnedRwLockReadGuard>, target_file_size: u64, ctx: &RequestContext, ) -> Result { - let layers = self.layers.read().await; let mut level0_deltas = layers.get_level0_deltas()?; // Only compact if enough layers have accumulated. @@ -3498,13 +3590,13 @@ impl Timeline { if !new_layers.is_empty() { let mut layer_paths: Vec = new_layers.iter().map(|l| l.path()).collect(); - // also sync the directory - layer_paths.push(self.conf.timeline_path(&self.timeline_id, &self.tenant_id)); - // Fsync all the layer files and directory using multiple threads to // minimize latency. par_fsync::par_fsync(&layer_paths).context("fsync all new layers")?; + par_fsync::par_fsync(&[self.conf.timeline_path(&self.timeline_id, &self.tenant_id)]) + .context("fsync of timeline dir")?; + layer_paths.pop().unwrap(); } @@ -3521,17 +3613,27 @@ impl Timeline { /// as Level 1 files. /// async fn compact_level0( - &self, - layer_removal_cs: &tokio::sync::MutexGuard<'_, ()>, + self: &Arc, + layer_removal_cs: Arc>, target_file_size: u64, ctx: &RequestContext, ) -> Result<(), CompactionError> { + let this = self.clone(); + let ctx_inner = ctx.clone(); + let layer_removal_cs_inner = layer_removal_cs.clone(); + let layers = Arc::clone(&self.layers).read_owned().await; + let span = tracing::info_span!("blocking"); let CompactLevel0Phase1Result { new_layers, deltas_to_compact, - } = self - .compact_level0_phase1(layer_removal_cs, target_file_size, ctx) - .await?; + } = tokio::task::spawn_blocking(move || { + let _g = span.entered(); + this.compact_level0_phase1(layer_removal_cs_inner, layers, target_file_size, &ctx_inner) + }) + .await + .context("compact_level0_phase1 spawn_blocking") + .map_err(CompactionError::Other) + .and_then(|res| res)?; if new_layers.is_empty() && deltas_to_compact.is_empty() { // nothing to do @@ -3589,7 +3691,7 @@ impl Timeline { let mut layer_names_to_delete = Vec::with_capacity(deltas_to_compact.len()); for l in deltas_to_compact { layer_names_to_delete.push(l.filename()); - self.delete_historic_layer(layer_removal_cs, l, &mut updates)?; + self.delete_historic_layer(layer_removal_cs.clone(), l, &mut updates)?; } updates.flush(); drop(layers); @@ -3709,10 +3811,11 @@ impl Timeline { fail_point!("before-timeline-gc"); - let layer_removal_cs = self.layer_removal_cs.lock().await; + let layer_removal_cs = Arc::new(self.layer_removal_cs.clone().lock_owned().await); // Is the timeline being deleted? let state = *self.state.borrow(); if state == TimelineState::Stopping { + // there's a global allowed_error for this anyhow::bail!("timeline is Stopping"); } @@ -3729,7 +3832,7 @@ impl Timeline { let res = self .gc_timeline( - &layer_removal_cs, + layer_removal_cs.clone(), horizon_cutoff, pitr_cutoff, retain_lsns, @@ -3748,7 +3851,7 @@ impl Timeline { async fn gc_timeline( &self, - layer_removal_cs: &tokio::sync::MutexGuard<'_, ()>, + layer_removal_cs: Arc>, horizon_cutoff: Lsn, pitr_cutoff: Lsn, retain_lsns: Vec, @@ -3888,7 +3991,7 @@ impl Timeline { // delta layers. Image layers can form "stairs" preventing old images from being deleted. // But image layers are in any case less sparse than delta layers. Also we need some // protection from replacing recent image layers with a new one after each GC iteration.
- if l.is_incremental() && !LayerMap::is_l0(&*l) { + if self.get_gc_feedback() && l.is_incremental() && !LayerMap::is_l0(&*l) { wanted_image_layers.add_range(l.get_key_range()); } result.layers_not_updated += 1; @@ -3921,7 +4024,11 @@ impl Timeline { { for doomed_layer in layers_to_remove { layer_names_to_delete.push(doomed_layer.filename()); - self.delete_historic_layer(layer_removal_cs, doomed_layer, &mut updates)?; // FIXME: schedule succeeded deletions before returning? + self.delete_historic_layer( + layer_removal_cs.clone(), + doomed_layer, + &mut updates, + )?; // FIXME: schedule succeeded deletions before returning? result.layers_removed += 1; } } @@ -4093,7 +4200,7 @@ impl Timeline { // Does retries + exponential back-off internally. // When this fails, don't layer further retry attempts here. let result = remote_client - .download_layer_file(&remote_layer.file_name, &remote_layer.layer_metadata) + .download_layer_file(&remote_layer.filename(), &remote_layer.layer_metadata) .await; if let Ok(size) = &result { diff --git a/pageserver/src/tenant/timeline/eviction_task.rs b/pageserver/src/tenant/timeline/eviction_task.rs index a7f24c52ed..80c5210211 100644 --- a/pageserver/src/tenant/timeline/eviction_task.rs +++ b/pageserver/src/tenant/timeline/eviction_task.rs @@ -34,6 +34,8 @@ use crate::{ }, }; +use utils::completion; + use super::Timeline; #[derive(Default)] @@ -47,8 +49,12 @@ pub struct EvictionTaskTenantState { } impl Timeline { - pub(super) fn launch_eviction_task(self: &Arc) { + pub(super) fn launch_eviction_task( + self: &Arc, + background_tasks_can_start: Option<&completion::Barrier>, + ) { let self_clone = Arc::clone(self); + let background_tasks_can_start = background_tasks_can_start.cloned(); task_mgr::spawn( BACKGROUND_RUNTIME.handle(), TaskKind::Eviction, @@ -57,7 +63,13 @@ impl Timeline { &format!("layer eviction for {}/{}", self.tenant_id, self.timeline_id), false, async move { - self_clone.eviction_task(task_mgr::shutdown_token()).await; + let cancel = task_mgr::shutdown_token(); + tokio::select! { + _ = cancel.cancelled() => { return Ok(()); } + _ = completion::Barrier::maybe_wait(background_tasks_can_start) => {} + }; + + self_clone.eviction_task(cancel).await; info!("eviction task finishing"); Ok(()) }, diff --git a/pageserver/src/tenant/timeline/walreceiver.rs b/pageserver/src/tenant/timeline/walreceiver.rs index 7ebf3cf172..ccff735c3c 100644 --- a/pageserver/src/tenant/timeline/walreceiver.rs +++ b/pageserver/src/tenant/timeline/walreceiver.rs @@ -25,6 +25,7 @@ mod walreceiver_connection; use crate::context::{DownloadBehavior, RequestContext}; use crate::task_mgr::{self, TaskKind, WALRECEIVER_RUNTIME}; +use crate::tenant::debug_assert_current_span_has_tenant_and_timeline_id; use crate::tenant::timeline::walreceiver::connection_manager::{ connection_manager_loop_step, ConnectionManagerState, }; @@ -85,7 +86,8 @@ impl WalReceiver { &format!("walreceiver for timeline {tenant_id}/{timeline_id}"), false, async move { - info!("WAL receiver manager started, connecting to broker"); + debug_assert_current_span_has_tenant_and_timeline_id(); + debug!("WAL receiver manager started, connecting to broker"); let mut connection_manager_state = ConnectionManagerState::new( timeline, conf, @@ -93,7 +95,7 @@ impl WalReceiver { loop { select! 
{ _ = task_mgr::shutdown_watcher() => { - info!("WAL receiver shutdown requested, shutting down"); + trace!("WAL receiver shutdown requested, shutting down"); break; }, loop_step_result = connection_manager_loop_step( @@ -104,7 +106,7 @@ impl WalReceiver { ) => match loop_step_result { ControlFlow::Continue(()) => continue, ControlFlow::Break(()) => { - info!("Connection manager loop ended, shutting down"); + trace!("Connection manager loop ended, shutting down"); break; } }, @@ -115,7 +117,7 @@ impl WalReceiver { *loop_status.write().unwrap() = None; Ok(()) } - .instrument(info_span!(parent: None, "wal_connection_manager", tenant = %tenant_id, timeline = %timeline_id)) + .instrument(info_span!(parent: None, "wal_connection_manager", tenant_id = %tenant_id, timeline_id = %timeline_id)) ); Self { @@ -198,29 +200,19 @@ impl TaskHandle { TaskEvent::End(match self.join_handle.as_mut() { Some(jh) => { if !jh.is_finished() { - // Barring any implementation errors in this module, we can - // only arrive here while the task that executes the future - // passed to `Self::spawn()` is still execution. Cf the comment - // in Self::spawn(). - // - // This was logging at warning level in earlier versions, presumably - // to leave some breadcrumbs in case we had an implementation - // error that would would make us get stuck in `jh.await`. - // - // There hasn't been such a bug so far. - // But in a busy system, e.g., during pageserver restart, - // we arrive here often enough that the warning-level logs - // became a distraction. - // So, tone them down to info-level. - // - // XXX: rewrite this module to eliminate the race condition. - info!("sender is dropped while join handle is still alive"); + // See: https://github.com/neondatabase/neon/issues/2885 + trace!("sender is dropped while join handle is still alive"); } - let res = jh - .await - .map_err(|e| anyhow::anyhow!("Failed to join task: {e}")) - .and_then(|x| x); + let res = match jh.await { + Ok(res) => res, + Err(je) if je.is_cancelled() => unreachable!("not used"), + Err(je) if je.is_panic() => { + // already logged + Ok(()) + } + Err(je) => Err(anyhow::Error::new(je).context("join walreceiver task")), + }; // For cancellation-safety, drop join_handle only after successful .await. 
self.join_handle = None; @@ -243,12 +235,12 @@ impl TaskHandle { match jh.await { Ok(Ok(())) => debug!("Shutdown success"), Ok(Err(e)) => error!("Shutdown task error: {e:?}"), - Err(join_error) => { - if join_error.is_cancelled() { - error!("Shutdown task was cancelled"); - } else { - error!("Shutdown task join error: {join_error}") - } + Err(je) if je.is_cancelled() => unreachable!("not used"), + Err(je) if je.is_panic() => { + // already logged + } + Err(je) => { + error!("Shutdown task join error: {je}") } } } diff --git a/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs b/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs index c25eea1b70..dd2bd200ac 100644 --- a/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs +++ b/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs @@ -18,7 +18,7 @@ use crate::metrics::{ WALRECEIVER_CANDIDATES_REMOVED, WALRECEIVER_SWITCHES, }; use crate::task_mgr::TaskKind; -use crate::tenant::Timeline; +use crate::tenant::{debug_assert_current_span_has_tenant_and_timeline_id, Timeline}; use anyhow::Context; use chrono::{NaiveDateTime, Utc}; use pageserver_api::models::TimelineState; @@ -55,8 +55,11 @@ pub(super) async fn connection_manager_loop_step( .await { Ok(()) => {} - Err(_) => { - info!("Timeline dropped state updates sender before becoming active, stopping wal connection manager loop"); + Err(new_state) => { + debug!( + ?new_state, + "state changed, stopping wal connection manager loop" + ); return ControlFlow::Break(()); } } @@ -79,7 +82,7 @@ pub(super) async fn connection_manager_loop_step( // with other streams on this client (other connection managers). When // object goes out of scope, stream finishes in drop() automatically. let mut broker_subscription = subscribe_for_timeline_updates(broker_client, id).await; - info!("Subscribed for broker timeline updates"); + debug!("Subscribed for broker timeline updates"); loop { let time_until_next_retry = connection_manager_state.time_until_next_retry(); @@ -151,12 +154,12 @@ pub(super) async fn connection_manager_loop_step( // we're already active as walreceiver, no need to reactivate TimelineState::Active => continue, TimelineState::Broken | TimelineState::Stopping => { - info!("timeline entered terminal state {new_state:?}, stopping wal connection manager loop"); + debug!("timeline entered terminal state {new_state:?}, stopping wal connection manager loop"); return ControlFlow::Break(()); } TimelineState::Loading => { warn!("timeline transitioned back to Loading state, that should not happen"); - return ControlFlow::Continue(new_state); + return ControlFlow::Continue(()); } } } @@ -164,12 +167,11 @@ pub(super) async fn connection_manager_loop_step( } } } => match new_event { - ControlFlow::Continue(new_state) => { - info!("observed timeline state change, new state is {new_state:?}"); + ControlFlow::Continue(()) => { return ControlFlow::Continue(()); } ControlFlow::Break(()) => { - info!("Timeline dropped state updates sender, stopping wal connection manager loop"); + debug!("Timeline is no longer active, stopping wal connection manager loop"); return ControlFlow::Break(()); } }, @@ -390,7 +392,6 @@ impl ConnectionManagerState { self.drop_old_connection(true).await; - let id = self.id; let node_id = new_sk.safekeeper_id; let connect_timeout = self.conf.wal_connect_timeout; let timeline = Arc::clone(&self.timeline); @@ -398,9 +399,13 @@ impl ConnectionManagerState { TaskKind::WalReceiverConnectionHandler, DownloadBehavior::Download, ); + + let span = 
info_span!("connection", %node_id); let connection_handle = TaskHandle::spawn(move |events_sender, cancellation| { async move { - super::walreceiver_connection::handle_walreceiver_connection( + debug_assert_current_span_has_tenant_and_timeline_id(); + + let res = super::walreceiver_connection::handle_walreceiver_connection( timeline, new_sk.wal_source_connconf, events_sender, @@ -409,12 +414,23 @@ cancellation, ctx, node_id, ) - .await - .context("walreceiver connection handling failure") + .await; + + match res { + Ok(()) => Ok(()), + Err(e) => { + use super::walreceiver_connection::ExpectedError; + if e.is_expected() { + info!("walreceiver connection handling ended: {e:#}"); + Ok(()) + } else { + // give out an error to have task_mgr give it really verbose logging + Err(e).context("walreceiver connection handling failure") + } + } + } } - .instrument( - info_span!("walreceiver_connection", tenant_id = %id.tenant_id, timeline_id = %id.timeline_id, %node_id), - ) + .instrument(span) }); let now = Utc::now().naive_utc(); diff --git a/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs b/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs index 91ff60603a..1c1fe87305 100644 --- a/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs +++ b/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs @@ -21,16 +21,16 @@ use postgres_types::PgLsn; use tokio::{select, sync::watch, time}; use tokio_postgres::{replication::ReplicationStream, Client}; use tokio_util::sync::CancellationToken; -use tracing::{debug, error, info, trace, warn}; +use tracing::{debug, error, info, trace, warn, Instrument}; use super::TaskStateUpdate; -use crate::metrics::LIVE_CONNECTIONS_COUNT; -use crate::{context::RequestContext, metrics::WALRECEIVER_STARTED_CONNECTIONS}; use crate::{ + context::RequestContext, + metrics::{LIVE_CONNECTIONS_COUNT, WALRECEIVER_STARTED_CONNECTIONS}, task_mgr, task_mgr::TaskKind, task_mgr::WALRECEIVER_RUNTIME, - tenant::{Timeline, WalReceiverInfo}, + tenant::{debug_assert_current_span_has_tenant_and_timeline_id, Timeline, WalReceiverInfo}, walingest::WalIngest, walrecord::DecodedWALRecord, }; @@ -81,13 +81,8 @@ pub(super) async fn handle_walreceiver_connection( config.application_name("pageserver"); config.replication_mode(tokio_postgres::config::ReplicationMode::Physical); match time::timeout(connect_timeout, config.connect(postgres::NoTls)).await { - Ok(Ok(client_and_conn)) => client_and_conn, - Ok(Err(conn_err)) => { - let expected_error = ignore_expected_errors(conn_err)?; - info!("DB connection stream finished: {expected_error}"); - return Ok(()); - } - Err(_) => { + Ok(client_and_conn) => client_and_conn?, + Err(_elapsed) => { // Timing out while connecting to a safekeeper node can happen for many // reasons that the pageserver cannot control. // Do not produce an error, but make it visible that timeouts happen, by logging the event. @@ -97,7 +92,7 @@ pub(super) async fn handle_walreceiver_connection( } }; - info!("connected!"); + debug!("connected!"); let mut connection_status = WalConnectionStatus { is_connected: true, has_processed_wal: false, @@ -127,20 +122,25 @@ pub(super) async fn handle_walreceiver_connection( "walreceiver connection", false, async move { + debug_assert_current_span_has_tenant_and_timeline_id(); + select!
{ connection_result = connection => match connection_result { - Ok(()) => info!("Walreceiver db connection closed"), + Ok(()) => debug!("Walreceiver db connection closed"), Err(connection_error) => { - if let Err(e) = ignore_expected_errors(connection_error) { - warn!("Connection aborted: {e:#}") + if connection_error.is_expected() { + // silence, because most likely we've already exited the outer call + // with a similar error. + } else { + warn!("Connection aborted: {connection_error:#}") } } }, - // Future: replace connection_cancellation with connection_ctx cancellation - _ = connection_cancellation.cancelled() => info!("Connection cancelled"), + _ = connection_cancellation.cancelled() => debug!("Connection cancelled"), } Ok(()) - }, + } + .instrument(tracing::info_span!("poller")), ); // Immediately increment the gauge, then create a job to decrement it on task exit. @@ -203,20 +203,13 @@ pub(super) async fn handle_walreceiver_connection( while let Some(replication_message) = { select! { _ = cancellation.cancelled() => { - info!("walreceiver interrupted"); + debug!("walreceiver interrupted"); None } replication_message = physical_stream.next() => replication_message, } } { - let replication_message = match replication_message { - Ok(message) => message, - Err(replication_error) => { - let expected_error = ignore_expected_errors(replication_error)?; - info!("Replication stream finished: {expected_error}"); - return Ok(()); - } - }; + let replication_message = replication_message?; let now = Utc::now().naive_utc(); let last_rec_lsn_before_msg = last_rec_lsn; @@ -261,8 +254,6 @@ pub(super) async fn handle_walreceiver_connection( let mut decoded = DecodedWALRecord::default(); let mut modification = timeline.begin_modification(endlsn); while let Some((lsn, recdata)) = waldecoder.poll_decode()? { - // let _enter = info_span!("processing record", lsn = %lsn).entered(); - // It is important to deal with the aligned records as lsn in getPage@LSN is // aligned and can be several bytes bigger. Without this alignment we are // at risk of hitting a deadlock. @@ -424,31 +415,50 @@ async fn identify_system(client: &mut Client) -> anyhow::Result } } -/// We don't want to report connectivity problems as real errors towards connection manager because -/// 1. they happen frequently enough to make server logs hard to read and -/// 2. the connection manager can retry other safekeeper. -/// -/// If this function returns `Ok(pg_error)`, it's such an error. -/// The caller should log it at info level and then report to connection manager that we're done handling this connection. -/// Connection manager will then handle reconnections. -/// -/// If this function returns an `Err()`, the caller can bubble it up using `?`. -/// The connection manager will log the error at ERROR level. -fn ignore_expected_errors(pg_error: postgres::Error) -> anyhow::Result { - if pg_error.is_closed() - || pg_error - .source() - .and_then(|source| source.downcast_ref::()) - .map(is_expected_io_error) - .unwrap_or(false) - { - return Ok(pg_error); - } else if let Some(db_error) = pg_error.as_db_error() { - if db_error.code() == &SqlState::SUCCESSFUL_COMPLETION - && db_error.message().contains("ending streaming") - { - return Ok(pg_error); - } - } - Err(pg_error).context("connection error")
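The `ExpectedError` trait introduced below generalizes this function, and it rests on one trick: walking an `anyhow::Error`'s source chain and downcasting each cause. A toy version of just that trick (an illustrative classifier, not the walreceiver's actual rules):

```rust
// An anyhow::Error counts as "benign" here if it, or any error in its source
// chain, is an io::Error of a connection-ish kind. `chain()` yields the
// outermost error first, then each underlying cause.
fn is_benign(err: &anyhow::Error) -> bool {
    err.chain()
        .filter_map(|cause| cause.downcast_ref::<std::io::Error>())
        .any(|io| {
            matches!(
                io.kind(),
                std::io::ErrorKind::ConnectionReset | std::io::ErrorKind::BrokenPipe
            )
        })
}
```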
+/// Trait for avoiding reports of walreceiver-specific expected ("normal", "ok") errors. +pub(super) trait ExpectedError { + /// Test if this error is an ok error. + /// + /// We don't want to report connectivity problems as real errors towards connection manager because + /// 1. they happen frequently enough to make server logs hard to read and + /// 2. the connection manager can retry other safekeeper. + /// + /// If this function returns `true`, it's such an error. + /// The caller should log it at info level and then report to connection manager that we're done handling this connection. + /// Connection manager will then handle reconnections. + /// + /// If this function returns `false`, the error should be propagated and the connection manager + /// will log the error at ERROR level. + fn is_expected(&self) -> bool; +} + +impl ExpectedError for postgres::Error { + fn is_expected(&self) -> bool { + self.is_closed() + || self + .source() + .and_then(|source| source.downcast_ref::()) + .map(is_expected_io_error) + .unwrap_or(false) + || self + .as_db_error() + .filter(|db_error| { + db_error.code() == &SqlState::SUCCESSFUL_COMPLETION + && db_error.message().contains("ending streaming") + }) + .is_some() + } +} + +impl ExpectedError for anyhow::Error { + fn is_expected(&self) -> bool { + let head = self.downcast_ref::(); + + let tail = self + .chain() + .filter_map(|e| e.downcast_ref::()); + + // check if self or any of the chained/sourced errors are expected + head.into_iter().chain(tail).any(|e| e.is_expected()) + } } diff --git a/pgxn/hnsw/Makefile b/pgxn/hnsw/Makefile new file mode 100644 index 0000000000..66436b5920 --- /dev/null +++ b/pgxn/hnsw/Makefile @@ -0,0 +1,26 @@ +EXTENSION = hnsw +EXTVERSION = 0.1.0 + +MODULE_big = hnsw +DATA = $(wildcard *--*.sql) +OBJS = hnsw.o hnswalg.o + +TESTS = $(wildcard test/sql/*.sql) +REGRESS = $(patsubst test/sql/%.sql,%,$(TESTS)) +REGRESS_OPTS = --inputdir=test --load-extension=hnsw + +# For auto-vectorization: +# - GCC (needs -ftree-vectorize OR -O3) - https://gcc.gnu.org/projects/tree-ssa/vectorization.html +PG_CFLAGS += -O3 +PG_CXXFLAGS += -O3 -std=c++11 +PG_LDFLAGS += -lstdc++ + +all: $(EXTENSION)--$(EXTVERSION).sql + +PG_CONFIG ?= pg_config +PGXS := $(shell $(PG_CONFIG) --pgxs) +include $(PGXS) + +dist: + mkdir -p dist + git archive --format zip --prefix=$(EXTENSION)-$(EXTVERSION)/ --output dist/$(EXTENSION)-$(EXTVERSION).zip master diff --git a/pgxn/hnsw/README.md b/pgxn/hnsw/README.md new file mode 100644 index 0000000000..bc9c8d571c --- /dev/null +++ b/pgxn/hnsw/README.md @@ -0,0 +1,25 @@ +# Revisiting the Inverted Indices for Billion-Scale Approximate Nearest Neighbors + +This ANN extension of Postgres is based +on the [ivf-hnsw](https://github.com/dbaranchuk/ivf-hnsw.git) implementation of [HNSW](https://www.pinecone.io/learn/hnsw), +the code for the current state-of-the-art billion-scale nearest neighbor search system presented in the paper: + +[Revisiting the Inverted Indices for Billion-Scale Approximate Nearest Neighbors](http://openaccess.thecvf.com/content_ECCV_2018/html/Dmitry_Baranchuk_Revisiting_the_Inverted_ECCV_2018_paper.html),
+Dmitry Baranchuk, Artem Babenko, Yury Malkov + +# Postgres extension + +The HNSW index is held in memory (built on demand) and its maximal size is limited +by the `maxelements` index parameter. Another required parameter is the number of dimensions (if it is not specified in the column type). +The optional parameter `ef` specifies the number of neighbors considered during index construction and search (corresponding to the `efConstruction` and `efSearch` parameters +described in the article). + +# Example of usage: + +``` +create extension hnsw; +create table embeddings(id integer primary key, payload real[]); +create index on embeddings using hnsw(payload) with (maxelements=1000000, dims=100, m=32); +select id from embeddings order by payload <-> array[1.0, 2.0,...] limit 100; +``` \ No newline at end of file diff --git a/pgxn/hnsw/hnsw--0.1.0.sql b/pgxn/hnsw/hnsw--0.1.0.sql new file mode 100644 index 0000000000..ebf424326d --- /dev/null +++ b/pgxn/hnsw/hnsw--0.1.0.sql @@ -0,0 +1,29 @@ +-- complain if script is sourced in psql, rather than via CREATE EXTENSION +\echo Use "CREATE EXTENSION hnsw" to load this file. \quit + +-- functions + +CREATE FUNCTION l2_distance(real[], real[]) RETURNS real + AS 'MODULE_PATHNAME' LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE; + +-- operators + +CREATE OPERATOR <-> ( + LEFTARG = real[], RIGHTARG = real[], PROCEDURE = l2_distance, + COMMUTATOR = '<->' +); + +-- access method + +CREATE FUNCTION hnsw_handler(internal) RETURNS index_am_handler + AS 'MODULE_PATHNAME' LANGUAGE C; + +CREATE ACCESS METHOD hnsw TYPE INDEX HANDLER hnsw_handler; + +COMMENT ON ACCESS METHOD hnsw IS 'hnsw index access method'; + +-- opclasses + +CREATE OPERATOR CLASS knn_ops + DEFAULT FOR TYPE real[] USING hnsw AS + OPERATOR 1 <-> (real[], real[]) FOR ORDER BY float_ops; diff --git a/pgxn/hnsw/hnsw.c b/pgxn/hnsw/hnsw.c new file mode 100644 index 0000000000..434f4986f8 --- /dev/null +++ b/pgxn/hnsw/hnsw.c @@ -0,0 +1,551 @@ +#include "postgres.h" + +#include "access/amapi.h" +#include "access/generic_xlog.h" +#include "access/relation.h" +#include "access/reloptions.h" +#include "access/tableam.h" +#include "catalog/index.h" +#include "commands/vacuum.h" +#include "nodes/execnodes.h" +#include "storage/bufmgr.h" +#include "utils/guc.h" +#include "utils/selfuncs.h" + +#include +#include + +#include "hnsw.h" + +PG_MODULE_MAGIC; + +typedef struct { + int32 vl_len_; /* varlena header (do not touch directly!)
*/ + int dims; + int maxelements; + int efConstruction; + int efSearch; + int M; +} HnswOptions; + +static relopt_kind hnsw_relopt_kind; + +typedef struct { + HierarchicalNSW* hnsw; + size_t curr; + size_t n_results; + ItemPointer results; +} HnswScanOpaqueData; + +typedef HnswScanOpaqueData* HnswScanOpaque; + +typedef struct { + Oid relid; + uint32 status; + HierarchicalNSW* hnsw; +} HnswHashEntry; + + +#define SH_PREFIX hnsw_index +#define SH_ELEMENT_TYPE HnswHashEntry +#define SH_KEY_TYPE Oid +#define SH_KEY relid +#define SH_STORE_HASH +#define SH_GET_HASH(tb, a) ((a)->relid) +#define SH_HASH_KEY(tb, key) (key) +#define SH_EQUAL(tb, a, b) ((a) == (b)) +#define SH_SCOPE static inline +#define SH_DEFINE +#define SH_DECLARE +#include "lib/simplehash.h" + +#define INDEX_HASH_SIZE 11 + +#define DEFAULT_EF_SEARCH 64 + +PGDLLEXPORT void _PG_init(void); + +static hnsw_index_hash *hnsw_indexes; + +/* + * Initialize index options and variables + */ +void +_PG_init(void) +{ + hnsw_relopt_kind = add_reloption_kind(); + add_int_reloption(hnsw_relopt_kind, "dims", "Number of dimensions", + 0, 0, INT_MAX, AccessExclusiveLock); + add_int_reloption(hnsw_relopt_kind, "maxelements", "Maximal number of elements", + 0, 0, INT_MAX, AccessExclusiveLock); + add_int_reloption(hnsw_relopt_kind, "m", "Number of neighbors of each vertex", + 100, 0, INT_MAX, AccessExclusiveLock); + add_int_reloption(hnsw_relopt_kind, "efconstruction", "Number of inspected neighbors during index construction", + 16, 1, INT_MAX, AccessExclusiveLock); + add_int_reloption(hnsw_relopt_kind, "efsearch", "Number of inspected neighbors during index search", + 64, 1, INT_MAX, AccessExclusiveLock); + hnsw_indexes = hnsw_index_create(TopMemoryContext, INDEX_HASH_SIZE, NULL); +} + + +static void +hnsw_build_callback(Relation index, ItemPointer tid, Datum *values, + bool *isnull, bool tupleIsAlive, void *state) +{ + HierarchicalNSW* hnsw = (HierarchicalNSW*) state; + ArrayType* array; + int n_items; + label_t label = 0; + + /* Skip nulls */ + if (isnull[0]) + return; + + array = DatumGetArrayTypeP(values[0]); + n_items = ArrayGetNItems(ARR_NDIM(array), ARR_DIMS(array)); + if (n_items != hnsw_dimensions(hnsw)) + { + elog(ERROR, "Wrong number of dimensions: %d instead of %d expected", + n_items, hnsw_dimensions(hnsw)); + } + + memcpy(&label, tid, sizeof(*tid)); + hnsw_add_point(hnsw, (coord_t*)ARR_DATA_PTR(array), label); +} + +static void +hnsw_populate(HierarchicalNSW* hnsw, Relation indexRel, Relation heapRel) +{ + IndexInfo* indexInfo = BuildIndexInfo(indexRel); + Assert(indexInfo->ii_NumIndexAttrs == 1); + table_index_build_scan(heapRel, indexRel, indexInfo, + true, true, hnsw_build_callback, (void *) hnsw, NULL); +} + +static HierarchicalNSW* +hnsw_get_index(Relation indexRel, Relation heapRel) +{ + HierarchicalNSW* hnsw; + Oid indexoid = RelationGetRelid(indexRel); + HnswHashEntry* entry = hnsw_index_lookup(hnsw_indexes, indexoid); + if (entry == NULL) + { + size_t dims, maxelements; + size_t M; + size_t maxM; + size_t size_links_level0; + size_t size_data_per_element; + size_t data_size; + dsm_handle handle = indexoid << 1; /* make it even */ + void* impl_private = NULL; + void* mapped_address = NULL; + Size mapped_size = 0; + Size shmem_size; + bool exists = true; + bool found; + HnswOptions *opts = (HnswOptions *) indexRel->rd_options; + if (opts == NULL || opts->maxelements == 0 || opts->dims == 0) { + elog(ERROR, "HNSW index requires 'maxelements' and 'dims' to be specified"); + } + dims = opts->dims; + maxelements = 
opts->maxelements; + M = opts->M; + maxM = M * 2; + data_size = dims * sizeof(coord_t); + size_links_level0 = (maxM + 1) * sizeof(idx_t); + size_data_per_element = size_links_level0 + data_size + sizeof(label_t); + shmem_size = hnsw_sizeof() + maxelements * size_data_per_element; + + /* first try to attach to an existing index */ + if (!dsm_impl_op(DSM_OP_ATTACH, handle, 0, &impl_private, + &mapped_address, &mapped_size, DEBUG1)) + { + /* the index doesn't exist: try to create it */ + if (!dsm_impl_op(DSM_OP_CREATE, handle, shmem_size, &impl_private, + &mapped_address, &mapped_size, DEBUG1)) + { + /* We do this under a shared lock, so some other backend may also be + * trying to initialize the index. If creation failed because the index was + * already created by somebody else, try to attach to it once again + */ + if (!dsm_impl_op(DSM_OP_ATTACH, handle, 0, &impl_private, + &mapped_address, &mapped_size, ERROR)) + { + return NULL; + } + } + else + { + exists = false; + } + } + Assert(mapped_size == shmem_size); + hnsw = (HierarchicalNSW*)mapped_address; + + if (!exists) + { + hnsw_init(hnsw, dims, maxelements, M, maxM, opts->efConstruction); + hnsw_populate(hnsw, indexRel, heapRel); + } + entry = hnsw_index_insert(hnsw_indexes, indexoid, &found); + Assert(!found); + entry->hnsw = hnsw; + } + else + { + hnsw = entry->hnsw; + } + return hnsw; +} + +/* + * Start or restart an index scan + */ +static IndexScanDesc +hnsw_beginscan(Relation index, int nkeys, int norderbys) +{ + IndexScanDesc scan = RelationGetIndexScan(index, nkeys, norderbys); + HnswScanOpaque so = (HnswScanOpaque) palloc(sizeof(HnswScanOpaqueData)); + Relation heap = relation_open(index->rd_index->indrelid, NoLock); + so->hnsw = hnsw_get_index(index, heap); + relation_close(heap, NoLock); + so->curr = 0; + so->n_results = 0; + so->results = NULL; + scan->opaque = so; + return scan; +} + +/* + * Start or restart an index scan + */ +static void +hnsw_rescan(IndexScanDesc scan, ScanKey keys, int nkeys, ScanKey orderbys, int norderbys) +{ + HnswScanOpaque so = (HnswScanOpaque) scan->opaque; + if (so->results) + { + pfree(so->results); + so->results = NULL; + } + so->curr = 0; + if (orderbys && scan->numberOfOrderBys > 0) + memmove(scan->orderByData, orderbys, scan->numberOfOrderBys * sizeof(ScanKeyData)); +} + +/* + * Fetch the next tuple in the given scan + */ +static bool +hnsw_gettuple(IndexScanDesc scan, ScanDirection dir) +{ + HnswScanOpaque so = (HnswScanOpaque) scan->opaque; + + /* + * Index can be used to scan backward, but Postgres doesn't support + * backward scan on operators + */ + Assert(ScanDirectionIsForward(dir)); + + if (so->curr == 0) + { + Datum value; + ArrayType* array; + int n_items; + size_t n_results; + label_t* results; + HnswOptions *opts = (HnswOptions *) scan->indexRelation->rd_options; + size_t efSearch = opts ?
opts->efSearch : DEFAULT_EF_SEARCH; + + /* Safety check */ + if (scan->orderByData == NULL) + elog(ERROR, "cannot scan HNSW index without order"); + + /* No items will match if null */ + if (scan->orderByData->sk_flags & SK_ISNULL) + return false; + + value = scan->orderByData->sk_argument; + array = DatumGetArrayTypeP(value); + n_items = ArrayGetNItems(ARR_NDIM(array), ARR_DIMS(array)); + if (n_items != hnsw_dimensions(so->hnsw)) + { + elog(ERROR, "Wrong number of dimensions: %d instead of %d expected", + n_items, hnsw_dimensions(so->hnsw)); + } + + if (!hnsw_search(so->hnsw, (coord_t*)ARR_DATA_PTR(array), efSearch, &n_results, &results)) + elog(ERROR, "HNSW index search failed"); + so->results = (ItemPointer)palloc(n_results*sizeof(ItemPointerData)); + so->n_results = n_results; + for (size_t i = 0; i < n_results; i++) + { + memcpy(&so->results[i], &results[i], sizeof(so->results[i])); + } + free(results); + } + if (so->curr >= so->n_results) + { + return false; + } + else + { + scan->xs_heaptid = so->results[so->curr++]; + scan->xs_recheckorderby = false; + return true; + } +} + +/* + * End a scan and release resources + */ +static void +hnsw_endscan(IndexScanDesc scan) +{ + HnswScanOpaque so = (HnswScanOpaque) scan->opaque; + if (so->results) + pfree(so->results); + pfree(so); + scan->opaque = NULL; +} + + +/* + * Estimate the cost of an index scan + */ +static void +hnsw_costestimate(PlannerInfo *root, IndexPath *path, double loop_count, + Cost *indexStartupCost, Cost *indexTotalCost, + Selectivity *indexSelectivity, double *indexCorrelation + ,double *indexPages +) +{ + GenericCosts costs; + + /* Never use index without order */ + if (path->indexorderbys == NULL) + { + *indexStartupCost = DBL_MAX; + *indexTotalCost = DBL_MAX; + *indexSelectivity = 0; + *indexCorrelation = 0; + *indexPages = 0; + return; + } + + MemSet(&costs, 0, sizeof(costs)); + + genericcostestimate(root, path, loop_count, &costs); + + /* Startup cost and total cost are same */ + *indexStartupCost = costs.indexTotalCost; + *indexTotalCost = costs.indexTotalCost; + *indexSelectivity = costs.indexSelectivity; + *indexCorrelation = costs.indexCorrelation; + *indexPages = costs.numIndexPages; +} + +/* + * Parse and validate the reloptions + */ +static bytea * +hnsw_options(Datum reloptions, bool validate) +{ + static const relopt_parse_elt tab[] = { + {"dims", RELOPT_TYPE_INT, offsetof(HnswOptions, dims)}, + {"maxelements", RELOPT_TYPE_INT, offsetof(HnswOptions, maxelements)}, + {"efconstruction", RELOPT_TYPE_INT, offsetof(HnswOptions, efConstruction)}, + {"efsearch", RELOPT_TYPE_INT, offsetof(HnswOptions, efSearch)}, + {"m", RELOPT_TYPE_INT, offsetof(HnswOptions, M)} + }; + + return (bytea *) build_reloptions(reloptions, validate, + hnsw_relopt_kind, + sizeof(HnswOptions), + tab, lengthof(tab)); +} + +/* + * Validate catalog entries for the specified operator class + */ +static bool +hnsw_validate(Oid opclassoid) +{ + return true; +} + +/* + * Build the index for a logged table + */ +static IndexBuildResult * +hnsw_build(Relation heap, Relation index, IndexInfo *indexInfo) +{ + HierarchicalNSW* hnsw = hnsw_get_index(index, heap); + IndexBuildResult* result = (IndexBuildResult *) palloc(sizeof(IndexBuildResult)); + result->heap_tuples = result->index_tuples = hnsw_count(hnsw); + + return result; +} + +/* + * Insert a tuple into the index + */ +static bool +hnsw_insert(Relation index, Datum *values, bool *isnull, ItemPointer heap_tid, + Relation heap, IndexUniqueCheck checkUnique, + bool indexUnchanged, + IndexInfo 
*indexInfo) +{ + HierarchicalNSW* hnsw = hnsw_get_index(index, heap); + Datum value; + ArrayType* array; + int n_items; + label_t label = 0; + + /* Skip nulls */ + if (isnull[0]) + return false; + + /* Detoast value */ + value = PointerGetDatum(PG_DETOAST_DATUM(values[0])); + array = DatumGetArrayTypeP(value); + n_items = ArrayGetNItems(ARR_NDIM(array), ARR_DIMS(array)); + if (n_items != hnsw_dimensions(hnsw)) + { + elog(ERROR, "Wrong number of dimensions: %d instead of %d expected", + n_items, hnsw_dimensions(hnsw)); + } + memcpy(&label, heap_tid, sizeof(*heap_tid)); + if (!hnsw_add_point(hnsw, (coord_t*)ARR_DATA_PTR(array), label)) + elog(ERROR, "HNSW index insert failed"); + return true; +} + +/* + * Build the index for an unlogged table + */ +static void +hnsw_buildempty(Relation index) +{ + /* index will be constructed on demand when accessed */ +} + +/* + * Clean up after a VACUUM operation + */ +static IndexBulkDeleteResult * +hnsw_vacuumcleanup(IndexVacuumInfo *info, IndexBulkDeleteResult *stats) +{ + Relation rel = info->index; + + if (stats == NULL) + return NULL; + + stats->num_pages = RelationGetNumberOfBlocks(rel); + + return stats; +} + +/* + * Bulk delete tuples from the index + */ +static IndexBulkDeleteResult * +hnsw_bulkdelete(IndexVacuumInfo *info, IndexBulkDeleteResult *stats, + IndexBulkDeleteCallback callback, void *callback_state) +{ + if (stats == NULL) + stats = (IndexBulkDeleteResult *) palloc0(sizeof(IndexBulkDeleteResult)); + return stats; +} + +/* + * Define index handler + * + * See https://www.postgresql.org/docs/current/index-api.html + */ +PGDLLEXPORT PG_FUNCTION_INFO_V1(hnsw_handler); +Datum +hnsw_handler(PG_FUNCTION_ARGS) +{ + IndexAmRoutine *amroutine = makeNode(IndexAmRoutine); + + amroutine->amstrategies = 0; + amroutine->amsupport = 0; + amroutine->amoptsprocnum = 0; + amroutine->amcanorder = false; + amroutine->amcanorderbyop = true; + amroutine->amcanbackward = false; /* can change direction mid-scan */ + amroutine->amcanunique = false; + amroutine->amcanmulticol = false; + amroutine->amoptionalkey = true; + amroutine->amsearcharray = false; + amroutine->amsearchnulls = false; + amroutine->amstorage = false; + amroutine->amclusterable = false; + amroutine->ampredlocks = false; + amroutine->amcanparallel = false; + amroutine->amcaninclude = false; + amroutine->amusemaintenanceworkmem = false; /* not used during VACUUM */ + amroutine->amparallelvacuumoptions = VACUUM_OPTION_PARALLEL_BULKDEL; + amroutine->amkeytype = InvalidOid; + + /* Interface functions */ + amroutine->ambuild = hnsw_build; + amroutine->ambuildempty = hnsw_buildempty; + amroutine->aminsert = hnsw_insert; + amroutine->ambulkdelete = hnsw_bulkdelete; + amroutine->amvacuumcleanup = hnsw_vacuumcleanup; + amroutine->amcanreturn = NULL; /* tuple not included in heapsort */ + amroutine->amcostestimate = hnsw_costestimate; + amroutine->amoptions = hnsw_options; + amroutine->amproperty = NULL; /* TODO AMPROP_DISTANCE_ORDERABLE */ + amroutine->ambuildphasename = NULL; + amroutine->amvalidate = hnsw_validate; + amroutine->amadjustmembers = NULL; + amroutine->ambeginscan = hnsw_beginscan; + amroutine->amrescan = hnsw_rescan; + amroutine->amgettuple = hnsw_gettuple; + amroutine->amgetbitmap = NULL; + amroutine->amendscan = hnsw_endscan; + amroutine->ammarkpos = NULL; + amroutine->amrestrpos = NULL; + + /* Interface functions to support parallel index scans */ + amroutine->amestimateparallelscan = NULL; + amroutine->aminitparallelscan = NULL; + amroutine->amparallelrescan = NULL; + 
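+ /* The callbacks left NULL above are genuinely unsupported paths: this AM
+ * offers no bitmap scans, no mark/restore, and no parallel scans; all
+ * access goes through ordered forward amgettuple scans driven by <->. */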
PG_RETURN_POINTER(amroutine); +} + +/* + * Get the L2 distance between vectors + */ +PGDLLEXPORT PG_FUNCTION_INFO_V1(l2_distance); +Datum +l2_distance(PG_FUNCTION_ARGS) +{ + ArrayType *a = PG_GETARG_ARRAYTYPE_P(0); + ArrayType *b = PG_GETARG_ARRAYTYPE_P(1); + int a_dim = ArrayGetNItems(ARR_NDIM(a), ARR_DIMS(a)); + int b_dim = ArrayGetNItems(ARR_NDIM(b), ARR_DIMS(b)); + dist_t distance = 0.0; + dist_t diff; + coord_t *ax = (coord_t*)ARR_DATA_PTR(a); + coord_t *bx = (coord_t*)ARR_DATA_PTR(b); + + if (a_dim != b_dim) + { + ereport(ERROR, + (errcode(ERRCODE_DATA_EXCEPTION), + errmsg("different array dimensions %d and %d", a_dim, b_dim))); + } + + for (int i = 0; i < a_dim; i++) + { + diff = ax[i] - bx[i]; + distance += diff * diff; + } + + PG_RETURN_FLOAT4((dist_t)sqrt(distance)); +} diff --git a/pgxn/hnsw/hnsw.control b/pgxn/hnsw/hnsw.control new file mode 100644 index 0000000000..b292b96026 --- /dev/null +++ b/pgxn/hnsw/hnsw.control @@ -0,0 +1,5 @@ +comment = 'HNSW index' +default_version = '0.1.0' +module_pathname = '$libdir/hnsw' +relocatable = true +trusted = true diff --git a/pgxn/hnsw/hnsw.h b/pgxn/hnsw/hnsw.h new file mode 100644 index 0000000000..d4065ab8fe --- /dev/null +++ b/pgxn/hnsw/hnsw.h @@ -0,0 +1,15 @@ +#pragma once + +typedef float coord_t; +typedef float dist_t; +typedef uint32_t idx_t; +typedef uint64_t label_t; + +typedef struct HierarchicalNSW HierarchicalNSW; + +bool hnsw_search(HierarchicalNSW* hnsw, const coord_t *point, size_t efSearch, size_t* n_results, label_t** results); +bool hnsw_add_point(HierarchicalNSW* hnsw, const coord_t *point, label_t label); +void hnsw_init(HierarchicalNSW* hnsw, size_t dim, size_t maxelements, size_t M, size_t maxM, size_t efConstruction); +int hnsw_dimensions(HierarchicalNSW* hnsw); +size_t hnsw_count(HierarchicalNSW* hnsw); +size_t hnsw_sizeof(void); diff --git a/pgxn/hnsw/hnswalg.cpp b/pgxn/hnsw/hnswalg.cpp new file mode 100644 index 0000000000..f6de3b8314 --- /dev/null +++ b/pgxn/hnsw/hnswalg.cpp @@ -0,0 +1,379 @@ +#include "hnswalg.h" + +#if defined(__GNUC__) +#define PORTABLE_ALIGN32 __attribute__((aligned(32))) +#define PREFETCH(addr,hint) __builtin_prefetch(addr, 0, hint) +#else +#define PORTABLE_ALIGN32 __declspec(align(32)) +#define PREFETCH(addr,hint) +#endif + +HierarchicalNSW::HierarchicalNSW(size_t dim_, size_t maxelements_, size_t M_, size_t maxM_, size_t efConstruction_) +{ + dim = dim_; + data_size = dim * sizeof(coord_t); + + efConstruction = efConstruction_; + + maxelements = maxelements_; + M = M_; + maxM = maxM_; + size_links_level0 = (maxM + 1) * sizeof(idx_t); + size_data_per_element = size_links_level0 + data_size + sizeof(label_t); + offset_data = size_links_level0; + offset_label = offset_data + data_size; + + enterpoint_node = 0; + cur_element_count = 0; +#ifdef __x86_64__ + use_avx2 = __builtin_cpu_supports("avx2"); +#endif +} + +std::priority_queue> HierarchicalNSW::searchBaseLayer(const coord_t *point, size_t ef) +{ + std::vector visited; + visited.resize((cur_element_count + 31) >> 5); + + std::priority_queue> topResults; + std::priority_queue> candidateSet; + + dist_t dist = fstdistfunc(point, getDataByInternalId(enterpoint_node)); + + topResults.emplace(dist, enterpoint_node); + candidateSet.emplace(-dist, enterpoint_node); + visited[enterpoint_node >> 5] = 1 << (enterpoint_node & 31); + dist_t lowerBound = dist; + + while (!candidateSet.empty()) + { + std::pair curr_el_pair = candidateSet.top(); + if (-curr_el_pair.first > lowerBound) + break; + + candidateSet.pop(); + idx_t curNodeNum =
curr_el_pair.second; + + idx_t* data = get_linklist0(curNodeNum); + size_t size = *data++; + + PREFETCH(getDataByInternalId(*data), 0); + + for (size_t j = 0; j < size; ++j) { + size_t tnum = *(data + j); + + PREFETCH(getDataByInternalId(*(data + j + 1)), 0); + + if (!(visited[tnum >> 5] & (1 << (tnum & 31)))) { + visited[tnum >> 5] |= 1 << (tnum & 31); + + dist = fstdistfunc(point, getDataByInternalId(tnum)); + + if (topResults.top().first > dist || topResults.size() < ef) { + candidateSet.emplace(-dist, tnum); + + PREFETCH(get_linklist0(candidateSet.top().second), 0); + topResults.emplace(dist, tnum); + + if (topResults.size() > ef) + topResults.pop(); + + lowerBound = topResults.top().first; + } + } + } + } + return topResults; +} + + +void HierarchicalNSW::getNeighborsByHeuristic(std::priority_queue> &topResults, size_t NN) +{ + if (topResults.size() < NN) + return; + + std::priority_queue> resultSet; + std::vector> returnlist; + + while (topResults.size() > 0) { + resultSet.emplace(-topResults.top().first, topResults.top().second); + topResults.pop(); + } + + while (resultSet.size()) { + if (returnlist.size() >= NN) + break; + std::pair curen = resultSet.top(); + dist_t dist_to_query = -curen.first; + resultSet.pop(); + bool good = true; + for (std::pair curen2 : returnlist) { + dist_t curdist = fstdistfunc(getDataByInternalId(curen2.second), + getDataByInternalId(curen.second)); + if (curdist < dist_to_query) { + good = false; + break; + } + } + if (good) returnlist.push_back(curen); + } + for (std::pair elem : returnlist) + topResults.emplace(-elem.first, elem.second); +} + +void HierarchicalNSW::mutuallyConnectNewElement(const coord_t *point, idx_t cur_c, + std::priority_queue> topResults) +{ + getNeighborsByHeuristic(topResults, M); + + std::vector res; + res.reserve(M); + while (topResults.size() > 0) { + res.push_back(topResults.top().second); + topResults.pop(); + } + { + idx_t* data = get_linklist0(cur_c); + if (*data) + throw std::runtime_error("Should be blank"); + + *data++ = res.size(); + + for (size_t idx = 0; idx < res.size(); idx++) { + if (data[idx]) + throw std::runtime_error("Should be blank"); + data[idx] = res[idx]; + } + } + for (size_t idx = 0; idx < res.size(); idx++) { + if (res[idx] == cur_c) + throw std::runtime_error("Connection to the same element"); + + size_t resMmax = maxM; + idx_t *ll_other = get_linklist0(res[idx]); + idx_t sz_link_list_other = *ll_other; + + if (sz_link_list_other > resMmax || sz_link_list_other < 0) + throw std::runtime_error("Bad sz_link_list_other"); + + if (sz_link_list_other < resMmax) { + idx_t *data = ll_other + 1; + data[sz_link_list_other] = cur_c; + *ll_other = sz_link_list_other + 1; + } else { + // finding the "weakest" element to replace it with the new one + idx_t *data = ll_other + 1; + dist_t d_max = fstdistfunc(getDataByInternalId(cur_c), getDataByInternalId(res[idx])); + // Heuristic: + std::priority_queue> candidates; + candidates.emplace(d_max, cur_c); + + for (size_t j = 0; j < sz_link_list_other; j++) + candidates.emplace(fstdistfunc(getDataByInternalId(data[j]), getDataByInternalId(res[idx])), data[j]); + + getNeighborsByHeuristic(candidates, resMmax); + + size_t indx = 0; + while (!candidates.empty()) { + data[indx] = candidates.top().second; + candidates.pop(); + indx++; + } + *ll_other = indx; + } + } +} + +void HierarchicalNSW::addPoint(const coord_t *point, label_t label) +{ + if (cur_element_count >= maxelements) { + throw std::runtime_error("The number of elements exceeds the specified limit"); + } + 
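+ /* Append the new element at slot cur_c: zero its level-0 link block, copy
+ * in the coordinates and external label, then (for every element after the
+ * first) search the base layer with ef = efConstruction and mutually link
+ * the new point to the surviving candidates; mutuallyConnectNewElement
+ * trims them to M with the neighbor heuristic. */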
idx_t cur_c = cur_element_count++; + memset((char *) get_linklist0(cur_c), 0, size_data_per_element); + memcpy(getDataByInternalId(cur_c), point, data_size); + memcpy(getExternalLabel(cur_c), &label, sizeof label); + + // Do nothing for the first element + if (cur_c != 0) { + std::priority_queue > topResults = searchBaseLayer(point, efConstruction); + mutuallyConnectNewElement(point, cur_c, topResults); + } +}; + +std::priority_queue> HierarchicalNSW::searchKnn(const coord_t *query, size_t k) +{ + std::priority_queue> topResults; + auto topCandidates = searchBaseLayer(query, k); + while (topCandidates.size() > k) { + topCandidates.pop(); + } + while (!topCandidates.empty()) { + std::pair rez = topCandidates.top(); + label_t label; + memcpy(&label, getExternalLabel(rez.second), sizeof(label)); + topResults.push(std::pair(rez.first, label)); + topCandidates.pop(); + } + + return topResults; +}; + +dist_t fstdistfunc_scalar(const coord_t *x, const coord_t *y, size_t n) +{ + dist_t distance = 0.0; + + for (size_t i = 0; i < n; i++) + { + dist_t diff = x[i] - y[i]; + distance += diff * diff; + } + return distance; + +} + +#ifdef __x86_64__ +#include + +__attribute__((target("avx2"))) +dist_t fstdistfunc_avx2(const coord_t *x, const coord_t *y, size_t n) +{ + const size_t TmpResSz = sizeof(__m256) / sizeof(float); + float PORTABLE_ALIGN32 TmpRes[TmpResSz]; + size_t qty16 = n / 16; + const float *pEnd1 = x + (qty16 * 16); + __m256 diff, v1, v2; + __m256 sum = _mm256_set1_ps(0); + + while (x < pEnd1) { + v1 = _mm256_loadu_ps(x); + x += 8; + v2 = _mm256_loadu_ps(y); + y += 8; + diff = _mm256_sub_ps(v1, v2); + sum = _mm256_add_ps(sum, _mm256_mul_ps(diff, diff)); + + v1 = _mm256_loadu_ps(x); + x += 8; + v2 = _mm256_loadu_ps(y); + y += 8; + diff = _mm256_sub_ps(v1, v2); + sum = _mm256_add_ps(sum, _mm256_mul_ps(diff, diff)); + } + _mm256_store_ps(TmpRes, sum); + float res = TmpRes[0] + TmpRes[1] + TmpRes[2] + TmpRes[3] + TmpRes[4] + TmpRes[5] + TmpRes[6] + TmpRes[7]; + return (res); +} + +dist_t fstdistfunc_sse(const coord_t *x, const coord_t *y, size_t n) +{ + const size_t TmpResSz = sizeof(__m128) / sizeof(float); + float PORTABLE_ALIGN32 TmpRes[TmpResSz]; + size_t qty16 = n / 16; + const float *pEnd1 = x + (qty16 * 16); + + __m128 diff, v1, v2; + __m128 sum = _mm_set1_ps(0); + + while (x < pEnd1) { + v1 = _mm_loadu_ps(x); + x += 4; + v2 = _mm_loadu_ps(y); + y += 4; + diff = _mm_sub_ps(v1, v2); + sum = _mm_add_ps(sum, _mm_mul_ps(diff, diff)); + + v1 = _mm_loadu_ps(x); + x += 4; + v2 = _mm_loadu_ps(y); + y += 4; + diff = _mm_sub_ps(v1, v2); + sum = _mm_add_ps(sum, _mm_mul_ps(diff, diff)); + + v1 = _mm_loadu_ps(x); + x += 4; + v2 = _mm_loadu_ps(y); + y += 4; + diff = _mm_sub_ps(v1, v2); + sum = _mm_add_ps(sum, _mm_mul_ps(diff, diff)); + + v1 = _mm_loadu_ps(x); + x += 4; + v2 = _mm_loadu_ps(y); + y += 4; + diff = _mm_sub_ps(v1, v2); + sum = _mm_add_ps(sum, _mm_mul_ps(diff, diff)); + } + _mm_store_ps(TmpRes, sum); + float res = TmpRes[0] + TmpRes[1] + TmpRes[2] + TmpRes[3]; + return res; +} +#endif + +dist_t HierarchicalNSW::fstdistfunc(const coord_t *x, const coord_t *y) +{ +#ifndef __x86_64__ + return fstdistfunc_scalar(x, y, dim); +#else + if(use_avx2) + return fstdistfunc_avx2(x, y, dim); + + return fstdistfunc_sse(x, y, dim); +#endif +} + +bool hnsw_search(HierarchicalNSW* hnsw, const coord_t *point, size_t efSearch, size_t* n_results, label_t** results) +{ + try + { + auto result = hnsw->searchKnn(point, efSearch); + size_t nResults = result.size(); + *results = 
(label_t*)malloc(nResults*sizeof(label_t)); + for (size_t i = nResults; i-- != 0;) + { + (*results)[i] = result.top().second; + result.pop(); + } + *n_results = nResults; + return true; + } + catch (std::exception& x) + { + return false; + } +} + +bool hnsw_add_point(HierarchicalNSW* hnsw, const coord_t *point, label_t label) +{ + try + { + hnsw->addPoint(point, label); + return true; + } + catch (std::exception& x) + { + fprintf(stderr, "Catch %s\n", x.what()); + return false; + } +} + +void hnsw_init(HierarchicalNSW* hnsw, size_t dims, size_t maxelements, size_t M, size_t maxM, size_t efConstruction) +{ + new ((void*)hnsw) HierarchicalNSW(dims, maxelements, M, maxM, efConstruction); +} + + +int hnsw_dimensions(HierarchicalNSW* hnsw) +{ + return (int)hnsw->dim; +} + +size_t hnsw_count(HierarchicalNSW* hnsw) +{ + return hnsw->cur_element_count; +} + +size_t hnsw_sizeof(void) +{ + return sizeof(HierarchicalNSW); +} diff --git a/pgxn/hnsw/hnswalg.h b/pgxn/hnsw/hnswalg.h new file mode 100644 index 0000000000..f38aeac362 --- /dev/null +++ b/pgxn/hnsw/hnswalg.h @@ -0,0 +1,69 @@ +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +extern "C" { +#include "hnsw.h" +} + +struct HierarchicalNSW +{ + size_t maxelements; + size_t cur_element_count; + + idx_t enterpoint_node; + + size_t dim; + size_t data_size; + size_t offset_data; + size_t offset_label; + size_t size_data_per_element; + size_t M; + size_t maxM; + size_t size_links_level0; + size_t efConstruction; + +#ifdef __x86_64__ + bool use_avx2; +#endif + + char data_level0_memory[0]; // varying size + + public: + HierarchicalNSW(size_t dim, size_t maxelements, size_t M, size_t maxM, size_t efConstruction); + ~HierarchicalNSW(); + + + inline coord_t *getDataByInternalId(idx_t internal_id) const { + return (coord_t *)&data_level0_memory[internal_id * size_data_per_element + offset_data]; + } + + inline idx_t *get_linklist0(idx_t internal_id) const { + return (idx_t*)&data_level0_memory[internal_id * size_data_per_element]; + } + + inline label_t *getExternalLabel(idx_t internal_id) const { + return (label_t *)&data_level0_memory[internal_id * size_data_per_element + offset_label]; + } + + std::priority_queue> searchBaseLayer(const coord_t *x, size_t ef); + + void getNeighborsByHeuristic(std::priority_queue> &topResults, size_t NN); + + void mutuallyConnectNewElement(const coord_t *x, idx_t id, std::priority_queue> topResults); + + void addPoint(const coord_t *point, label_t label); + + std::priority_queue> searchKnn(const coord_t *query_data, size_t k); + + dist_t fstdistfunc(const coord_t *x, const coord_t *y); +}; diff --git a/pgxn/hnsw/test/expected/knn.out b/pgxn/hnsw/test/expected/knn.out new file mode 100644 index 0000000000..a1cee4525e --- /dev/null +++ b/pgxn/hnsw/test/expected/knn.out @@ -0,0 +1,28 @@ +SET enable_seqscan = off; +CREATE TABLE t (val real[]); +INSERT INTO t (val) VALUES ('{0,0,0}'), ('{1,2,3}'), ('{1,1,1}'), (NULL); +CREATE INDEX ON t USING hnsw (val) WITH (maxelements = 10, dims=3, m=3); +INSERT INTO t (val) VALUES (array[1,2,4]); +explain SELECT * FROM t ORDER BY val <-> array[3,3,3]; + QUERY PLAN +-------------------------------------------------------------------- + Index Scan using t_val_idx on t (cost=4.02..8.06 rows=3 width=36) + Order By: (val <-> '{3,3,3}'::real[]) +(2 rows) + +SELECT * FROM t ORDER BY val <-> array[3,3,3]; + val +--------- + {1,2,3} + {1,2,4} + {1,1,1} + {0,0,0} +(4 rows) + +SELECT COUNT(*) FROM t; + count +------- + 5 +(1 row) + 
+DROP TABLE t; diff --git a/pgxn/hnsw/test/sql/knn.sql b/pgxn/hnsw/test/sql/knn.sql new file mode 100644 index 0000000000..0635bda4a2 --- /dev/null +++ b/pgxn/hnsw/test/sql/knn.sql @@ -0,0 +1,13 @@ +SET enable_seqscan = off; + +CREATE TABLE t (val real[]); +INSERT INTO t (val) VALUES ('{0,0,0}'), ('{1,2,3}'), ('{1,1,1}'), (NULL); +CREATE INDEX ON t USING hnsw (val) WITH (maxelements = 10, dims=3, m=3); + +INSERT INTO t (val) VALUES (array[1,2,4]); + +explain SELECT * FROM t ORDER BY val <-> array[3,3,3]; +SELECT * FROM t ORDER BY val <-> array[3,3,3]; +SELECT COUNT(*) FROM t; + +DROP TABLE t; diff --git a/pgxn/neon/Makefile b/pgxn/neon/Makefile index ec377dbb1e..1948023472 100644 --- a/pgxn/neon/Makefile +++ b/pgxn/neon/Makefile @@ -11,10 +11,12 @@ OBJS = \ pagestore_smgr.o \ relsize_cache.o \ walproposer.o \ - walproposer_utils.o + walproposer_utils.o \ + control_plane_connector.o PG_CPPFLAGS = -I$(libpq_srcdir) SHLIB_LINK_INTERNAL = $(libpq) +SHLIB_LINK = -lcurl EXTENSION = neon DATA = neon--1.0.sql diff --git a/pgxn/neon/control_plane_connector.c b/pgxn/neon/control_plane_connector.c new file mode 100644 index 0000000000..82e4af4b4a --- /dev/null +++ b/pgxn/neon/control_plane_connector.c @@ -0,0 +1,830 @@ +/*------------------------------------------------------------------------- + * + * control_plane_connector.c + * Captures updates to roles/databases using ProcessUtility_hook and + * sends them to the control plane. The changes are sent + * via HTTP to the URL specified by the GUC neon.console_url when the + * transaction commits. Forwarding may be disabled temporarily by + * setting neon.forward_ddl to false. + * + * Currently, the transaction may abort AFTER + * changes have already been forwarded, and that case is not handled. + * Subtransactions are handled using a stack of hash tables, which + * accumulate changes. On subtransaction commit, the top of the stack + * is merged with the table below it. + * + * IDENTIFICATION + * contrib/neon/control_plane_connector.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" +#include "tcop/pquery.h" +#include "tcop/utility.h" +#include "access/xact.h" +#include "utils/hsearch.h" +#include "utils/memutils.h" +#include "commands/defrem.h" +#include "miscadmin.h" +#include "utils/acl.h" +#include "fmgr.h" +#include "utils/guc.h" +#include "port.h" +#include +#include "utils/jsonb.h" + +static ProcessUtility_hook_type PreviousProcessUtilityHook = NULL; + +/* GUCs */ +static char *ConsoleURL = NULL; +static bool ForwardDDL = true; + +/* Curl structures for sending the HTTP requests */ +static CURL * CurlHandle; +static struct curl_slist *ContentHeader = NULL; + +/* + * CURL docs say that this buffer must exist until we call curl_easy_cleanup + * (which we never do), so we make this a static + */ +static char CurlErrorBuf[CURL_ERROR_SIZE]; + +typedef enum +{ + Op_Set, /* An upsert: Either a creation or an alter */ + Op_Delete, +} OpType; + +typedef struct +{ + char name[NAMEDATALEN]; + Oid owner; + char old_name[NAMEDATALEN]; + OpType type; +} DbEntry; + +typedef struct +{ + char name[NAMEDATALEN]; + char old_name[NAMEDATALEN]; + const char *password; + OpType type; +} RoleEntry; + +/* + * We keep one of these for each subtransaction in a stack. When a subtransaction + * commits, we merge the top of the stack into the table below it. It is allocated in the + * subtransaction's context.
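+ * For example (illustrative names): under SAVEPOINT s1, CREATE ROLE alice
+ * records an entry in the top-of-stack table; releasing s1 merges that
+ * entry into the table below it, and the merged set is what gets forwarded
+ * when the top-level transaction commits.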
+ */ +typedef struct DdlHashTable +{ + struct DdlHashTable *prev_table; + HTAB *db_table; + HTAB *role_table; +} DdlHashTable; + +static DdlHashTable RootTable; +static DdlHashTable * CurrentDdlTable = &RootTable; + +static void +PushKeyValue(JsonbParseState **state, char *key, char *value) +{ + JsonbValue k, + v; + + k.type = jbvString; + k.val.string.len = strlen(key); + k.val.string.val = key; + v.type = jbvString; + v.val.string.len = strlen(value); + v.val.string.val = value; + pushJsonbValue(state, WJB_KEY, &k); + pushJsonbValue(state, WJB_VALUE, &v); +} + +static char * +ConstructDeltaMessage() +{ + JsonbParseState *state = NULL; + + pushJsonbValue(&state, WJB_BEGIN_OBJECT, NULL); + if (RootTable.db_table) + { + JsonbValue dbs; + + dbs.type = jbvString; + dbs.val.string.val = "dbs"; + dbs.val.string.len = strlen(dbs.val.string.val); + pushJsonbValue(&state, WJB_KEY, &dbs); + pushJsonbValue(&state, WJB_BEGIN_ARRAY, NULL); + + HASH_SEQ_STATUS status; + DbEntry *entry; + + hash_seq_init(&status, RootTable.db_table); + while ((entry = hash_seq_search(&status)) != NULL) + { + pushJsonbValue(&state, WJB_BEGIN_OBJECT, NULL); + PushKeyValue(&state, "op", entry->type == Op_Set ? "set" : "del"); + PushKeyValue(&state, "name", entry->name); + if (entry->owner != InvalidOid) + { + PushKeyValue(&state, "owner", GetUserNameFromId(entry->owner, false)); + } + if (entry->old_name[0] != '\0') + { + PushKeyValue(&state, "old_name", entry->old_name); + } + pushJsonbValue(&state, WJB_END_OBJECT, NULL); + } + pushJsonbValue(&state, WJB_END_ARRAY, NULL); + } + + if (RootTable.role_table) + { + JsonbValue roles; + + roles.type = jbvString; + roles.val.string.val = "roles"; + roles.val.string.len = strlen(roles.val.string.val); + pushJsonbValue(&state, WJB_KEY, &roles); + pushJsonbValue(&state, WJB_BEGIN_ARRAY, NULL); + + HASH_SEQ_STATUS status; + RoleEntry *entry; + + hash_seq_init(&status, RootTable.role_table); + while ((entry = hash_seq_search(&status)) != NULL) + { + pushJsonbValue(&state, WJB_BEGIN_OBJECT, NULL); + PushKeyValue(&state, "op", entry->type == Op_Set ? 
"set" : "del"); + PushKeyValue(&state, "name", entry->name); + if (entry->password) + { + PushKeyValue(&state, "password", (char *) entry->password); + } + if (entry->old_name[0] != '\0') + { + PushKeyValue(&state, "old_name", entry->old_name); + } + pushJsonbValue(&state, WJB_END_OBJECT, NULL); + } + pushJsonbValue(&state, WJB_END_ARRAY, NULL); + } + JsonbValue *result = pushJsonbValue(&state, WJB_END_OBJECT, NULL); + Jsonb *jsonb = JsonbValueToJsonb(result); + + return JsonbToCString(NULL, &jsonb->root, 0 /* estimated_len */ ); +} + +#define ERROR_SIZE 1024 + +typedef struct +{ + char str[ERROR_SIZE]; + size_t size; +} ErrorString; + +static size_t +ErrorWriteCallback(char *ptr, size_t size, size_t nmemb, void *userdata) +{ + /* Docs say size is always 1 */ + ErrorString *str = userdata; + + size_t to_write = nmemb; + + /* +1 for null terminator */ + if (str->size + nmemb + 1 >= ERROR_SIZE) + to_write = ERROR_SIZE - str->size - 1; + + /* Ignore everyrthing past the first ERROR_SIZE bytes */ + if (to_write == 0) + return nmemb; + memcpy(str->str + str->size, ptr, to_write); + str->size += to_write; + str->str[str->size] = '\0'; + return nmemb; +} + +static void +SendDeltasToControlPlane() +{ + if (!RootTable.db_table && !RootTable.role_table) + return; + if (!ConsoleURL) + { + elog(LOG, "ConsoleURL not set, skipping forwarding"); + return; + } + if (!ForwardDDL) + return; + + char *message = ConstructDeltaMessage(); + ErrorString str = {}; + + curl_easy_setopt(CurlHandle, CURLOPT_CUSTOMREQUEST, "PATCH"); + curl_easy_setopt(CurlHandle, CURLOPT_HTTPHEADER, ContentHeader); + curl_easy_setopt(CurlHandle, CURLOPT_POSTFIELDS, message); + curl_easy_setopt(CurlHandle, CURLOPT_URL, ConsoleURL); + curl_easy_setopt(CurlHandle, CURLOPT_ERRORBUFFER, CurlErrorBuf); + curl_easy_setopt(CurlHandle, CURLOPT_TIMEOUT, 3L /* seconds */ ); + curl_easy_setopt(CurlHandle, CURLOPT_WRITEDATA, &str); + curl_easy_setopt(CurlHandle, CURLOPT_WRITEFUNCTION, ErrorWriteCallback); + + const int num_retries = 5; + int curl_status; + + for (int i = 0; i < num_retries; i++) + { + if ((curl_status = curl_easy_perform(CurlHandle)) == 0) + break; + elog(LOG, "Curl request failed on attempt %d: %s", i, CurlErrorBuf); + pg_usleep(1000 * 1000); + } + if (curl_status != 0) + { + elog(ERROR, "Failed to perform curl request: %s", CurlErrorBuf); + } + else + { + long response_code; + + if (curl_easy_getinfo(CurlHandle, CURLINFO_RESPONSE_CODE, &response_code) != CURLE_UNKNOWN_OPTION) + { + bool error_exists = str.size != 0; + + if (response_code != 200) + { + if (error_exists) + { + elog(ERROR, + "Received HTTP code %ld from control plane: %s", + response_code, + str.str); + } + else + { + elog(ERROR, + "Received HTTP code %ld from control plane", + response_code); + } + } + } + } +} + +static void +InitDbTableIfNeeded() +{ + if (!CurrentDdlTable->db_table) + { + HASHCTL db_ctl = {}; + + db_ctl.keysize = NAMEDATALEN; + db_ctl.entrysize = sizeof(DbEntry); + db_ctl.hcxt = CurTransactionContext; + CurrentDdlTable->db_table = hash_create( + "Dbs Created", + 4, + &db_ctl, + HASH_ELEM | HASH_STRINGS | HASH_CONTEXT); + } +} + +static void +InitRoleTableIfNeeded() +{ + if (!CurrentDdlTable->role_table) + { + HASHCTL role_ctl = {}; + + role_ctl.keysize = NAMEDATALEN; + role_ctl.entrysize = sizeof(RoleEntry); + role_ctl.hcxt = CurTransactionContext; + CurrentDdlTable->role_table = hash_create( + "Roles Created", + 4, + &role_ctl, + HASH_ELEM | HASH_STRINGS | HASH_CONTEXT); + } +} + +static void +PushTable() +{ + DdlHashTable *new_table = 
MemoryContextAlloc(CurTransactionContext, sizeof(DdlHashTable)); + + new_table->prev_table = CurrentDdlTable; + new_table->role_table = NULL; + new_table->db_table = NULL; + CurrentDdlTable = new_table; +} + +static void +MergeTable() +{ + DdlHashTable *old_table = CurrentDdlTable; + + CurrentDdlTable = old_table->prev_table; + + if (old_table->db_table) + { + InitDbTableIfNeeded(); + DbEntry *entry; + HASH_SEQ_STATUS status; + + hash_seq_init(&status, old_table->db_table); + while ((entry = hash_seq_search(&status)) != NULL) + { + DbEntry *to_write = hash_search( + CurrentDdlTable->db_table, + entry->name, + HASH_ENTER, + NULL); + + to_write->type = entry->type; + if (entry->owner != InvalidOid) + to_write->owner = entry->owner; + strlcpy(to_write->old_name, entry->old_name, NAMEDATALEN); + if (entry->old_name[0] != '\0') + { + bool found_old = false; + DbEntry *old = hash_search( + CurrentDdlTable->db_table, + entry->old_name, + HASH_FIND, + &found_old); + + if (found_old) + { + if (old->old_name[0] != '\0') + strlcpy(to_write->old_name, old->old_name, NAMEDATALEN); + else + strlcpy(to_write->old_name, entry->old_name, NAMEDATALEN); + hash_search( + CurrentDdlTable->db_table, + entry->old_name, + HASH_REMOVE, + NULL); + } + } + } + hash_destroy(old_table->db_table); + } + + if (old_table->role_table) + { + InitRoleTableIfNeeded(); + RoleEntry *entry; + HASH_SEQ_STATUS status; + + hash_seq_init(&status, old_table->role_table); + while ((entry = hash_seq_search(&status)) != NULL) + { + RoleEntry *to_write = hash_search( + CurrentDdlTable->role_table, + entry->name, + HASH_ENTER, + NULL); + + to_write->type = entry->type; + if (entry->password) + to_write->password = entry->password; + strlcpy(to_write->old_name, entry->old_name, NAMEDATALEN); + if (entry->old_name[0] != '\0') + { + bool found_old = false; + RoleEntry *old = hash_search( + CurrentDdlTable->role_table, + entry->old_name, + HASH_FIND, + &found_old); + + if (found_old) + { + if (old->old_name[0] != '\0') + strlcpy(to_write->old_name, old->old_name, NAMEDATALEN); + else + strlcpy(to_write->old_name, entry->old_name, NAMEDATALEN); + hash_search(CurrentDdlTable->role_table, + entry->old_name, + HASH_REMOVE, + NULL); + } + } + } + hash_destroy(old_table->role_table); + } +} + +static void +PopTable() +{ + /* + * Current table gets freed because it is allocated in aborted + * subtransaction's memory context. 
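+ * (The same holds for the password strings copied with
+ * MemoryContextStrdup elsewhere in this file: they live in that same
+ * context, so no explicit cleanup is needed here.)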
+ */ + CurrentDdlTable = CurrentDdlTable->prev_table; +} + +static void +NeonSubXactCallback( + SubXactEvent event, + SubTransactionId mySubid, + SubTransactionId parentSubid, + void *arg) +{ + switch (event) + { + case SUBXACT_EVENT_START_SUB: + return PushTable(); + case SUBXACT_EVENT_COMMIT_SUB: + return MergeTable(); + case SUBXACT_EVENT_ABORT_SUB: + return PopTable(); + default: + return; + } +} + +static void +NeonXactCallback(XactEvent event, void *arg) +{ + if (event == XACT_EVENT_PRE_COMMIT || event == XACT_EVENT_PARALLEL_PRE_COMMIT) + { + SendDeltasToControlPlane(); + } + RootTable.role_table = NULL; + RootTable.db_table = NULL; + Assert(CurrentDdlTable == &RootTable); +} + +static void +HandleCreateDb(CreatedbStmt *stmt) +{ + InitDbTableIfNeeded(); + DefElem *downer = NULL; + ListCell *option; + + foreach(option, stmt->options) + { + DefElem *defel = lfirst(option); + + if (strcmp(defel->defname, "owner") == 0) + downer = defel; + } + bool found = false; + DbEntry *entry = hash_search( + CurrentDdlTable->db_table, + stmt->dbname, + HASH_ENTER, + &found); + + if (!found) + memset(entry->old_name, 0, sizeof(entry->old_name)); + + entry->type = Op_Set; + if (downer && downer->arg) + entry->owner = get_role_oid(defGetString(downer), false); + else + entry->owner = GetUserId(); +} + +static void +HandleAlterOwner(AlterOwnerStmt *stmt) +{ + if (stmt->objectType != OBJECT_DATABASE) + return; + InitDbTableIfNeeded(); + const char *name = strVal(stmt->object); + bool found = false; + DbEntry *entry = hash_search( + CurrentDdlTable->db_table, + name, + HASH_ENTER, + &found); + + if (!found) + memset(entry->old_name, 0, sizeof(entry->old_name)); + + entry->owner = get_role_oid(get_rolespec_name(stmt->newowner), false); + entry->type = Op_Set; +} + +static void +HandleDbRename(RenameStmt *stmt) +{ + Assert(stmt->renameType == OBJECT_DATABASE); + InitDbTableIfNeeded(); + bool found = false; + DbEntry *entry = hash_search( + CurrentDdlTable->db_table, + stmt->subname, + HASH_FIND, + &found); + DbEntry *entry_for_new_name = hash_search( + CurrentDdlTable->db_table, + stmt->newname, + HASH_ENTER, + NULL); + + entry_for_new_name->type = Op_Set; + if (found) + { + if (entry->old_name[0] != '\0') + strlcpy(entry_for_new_name->old_name, entry->old_name, NAMEDATALEN); + else + strlcpy(entry_for_new_name->old_name, entry->name, NAMEDATALEN); + entry_for_new_name->owner = entry->owner; + hash_search( + CurrentDdlTable->db_table, + stmt->subname, + HASH_REMOVE, + NULL); + } + else + { + strlcpy(entry_for_new_name->old_name, stmt->subname, NAMEDATALEN); + entry_for_new_name->owner = InvalidOid; + } +} + +static void +HandleDropDb(DropdbStmt *stmt) +{ + InitDbTableIfNeeded(); + bool found = false; + DbEntry *entry = hash_search( + CurrentDdlTable->db_table, + stmt->dbname, + HASH_ENTER, + &found); + + entry->type = Op_Delete; + entry->owner = InvalidOid; + if (!found) + memset(entry->old_name, 0, sizeof(entry->old_name)); +} + +static void +HandleCreateRole(CreateRoleStmt *stmt) +{ + InitRoleTableIfNeeded(); + bool found = false; + RoleEntry *entry = hash_search( + CurrentDdlTable->role_table, + stmt->role, + HASH_ENTER, + &found); + DefElem *dpass = NULL; + ListCell *option; + + foreach(option, stmt->options) + { + DefElem *defel = lfirst(option); + + if (strcmp(defel->defname, "password") == 0) + dpass = defel; + } + if (!found) + memset(entry->old_name, 0, sizeof(entry->old_name)); + if (dpass && dpass->arg) + entry->password = MemoryContextStrdup(CurTransactionContext, strVal(dpass->arg)); + else + 
entry->password = NULL;
+	entry->type = Op_Set;
+}
+
+static void
+HandleAlterRole(AlterRoleStmt *stmt)
+{
+	InitRoleTableIfNeeded();
+	DefElem    *dpass = NULL;
+	ListCell   *option;
+
+	foreach(option, stmt->options)
+	{
+		DefElem    *defel = lfirst(option);
+
+		if (strcmp(defel->defname, "password") == 0)
+			dpass = defel;
+	}
+	/* We only care about updates to the password */
+	if (!dpass)
+		return;
+	bool		found = false;
+	RoleEntry  *entry = hash_search(
+		CurrentDdlTable->role_table,
+		stmt->role->rolename,
+		HASH_ENTER,
+		&found);
+
+	if (!found)
+		memset(entry->old_name, 0, sizeof(entry->old_name));
+	if (dpass->arg)
+		entry->password = MemoryContextStrdup(CurTransactionContext, strVal(dpass->arg));
+	else
+		entry->password = NULL;
+	entry->type = Op_Set;
+}
+
+static void
+HandleRoleRename(RenameStmt *stmt)
+{
+	InitRoleTableIfNeeded();
+	Assert(stmt->renameType == OBJECT_ROLE);
+	bool		found = false;
+	RoleEntry  *entry = hash_search(
+		CurrentDdlTable->role_table,
+		stmt->subname,
+		HASH_FIND,
+		&found);
+
+	RoleEntry  *entry_for_new_name = hash_search(
+		CurrentDdlTable->role_table,
+		stmt->newname,
+		HASH_ENTER,
+		NULL);
+
+	entry_for_new_name->type = Op_Set;
+	if (found)
+	{
+		if (entry->old_name[0] != '\0')
+			strlcpy(entry_for_new_name->old_name, entry->old_name, NAMEDATALEN);
+		else
+			strlcpy(entry_for_new_name->old_name, entry->name, NAMEDATALEN);
+		entry_for_new_name->password = entry->password;
+		hash_search(
+			CurrentDdlTable->role_table,
+			entry->name,
+			HASH_REMOVE,
+			NULL);
+	}
+	else
+	{
+		strlcpy(entry_for_new_name->old_name, stmt->subname, NAMEDATALEN);
+		entry_for_new_name->password = NULL;
+	}
+}
+
+static void
+HandleDropRole(DropRoleStmt *stmt)
+{
+	InitRoleTableIfNeeded();
+	ListCell   *item;
+
+	foreach(item, stmt->roles)
+	{
+		RoleSpec   *spec = lfirst(item);
+		bool		found = false;
+		RoleEntry  *entry = hash_search(
+			CurrentDdlTable->role_table,
+			spec->rolename,
+			HASH_ENTER,
+			&found);
+
+		entry->type = Op_Delete;
+		entry->password = NULL;
+		if (!found)
+			memset(entry->old_name, 0, sizeof(entry->old_name));
+	}
+}
+
+static void
+HandleRename(RenameStmt *stmt)
+{
+	if (stmt->renameType == OBJECT_DATABASE)
+		return HandleDbRename(stmt);
+	else if (stmt->renameType == OBJECT_ROLE)
+		return HandleRoleRename(stmt);
+}
+
+static void
+NeonProcessUtility(
+	PlannedStmt *pstmt,
+	const char *queryString,
+	bool readOnlyTree,
+	ProcessUtilityContext context,
+	ParamListInfo params,
+	QueryEnvironment *queryEnv,
+	DestReceiver *dest,
+	QueryCompletion *qc)
+{
+	Node	   *parseTree = pstmt->utilityStmt;
+
+	switch (nodeTag(parseTree))
+	{
+		case T_CreatedbStmt:
+			HandleCreateDb(castNode(CreatedbStmt, parseTree));
+			break;
+		case T_AlterOwnerStmt:
+			HandleAlterOwner(castNode(AlterOwnerStmt, parseTree));
+			break;
+		case T_RenameStmt:
+			HandleRename(castNode(RenameStmt, parseTree));
+			break;
+		case T_DropdbStmt:
+			HandleDropDb(castNode(DropdbStmt, parseTree));
+			break;
+		case T_CreateRoleStmt:
+			HandleCreateRole(castNode(CreateRoleStmt, parseTree));
+			break;
+		case T_AlterRoleStmt:
+			HandleAlterRole(castNode(AlterRoleStmt, parseTree));
+			break;
+		case T_DropRoleStmt:
+			HandleDropRole(castNode(DropRoleStmt, parseTree));
+			break;
+		default:
+			break;
+	}
+
+	if (PreviousProcessUtilityHook)
+	{
+		PreviousProcessUtilityHook(
+			pstmt,
+			queryString,
+			readOnlyTree,
+			context,
+			params,
+			queryEnv,
+			dest,
+			qc);
+	}
+	else
+	{
+		standard_ProcessUtility(
+			pstmt,
+			queryString,
+			readOnlyTree,
+			context,
+			params,
+			queryEnv,
+			dest,
+			qc);
+	}
+}
+
+extern void
+InitControlPlaneConnector()
+{
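+	/*
+	 * Standard hook chaining: we stash whatever ProcessUtility_hook was
+	 * installed before us, and NeonProcessUtility above delegates to it
+	 * (or to standard_ProcessUtility when we are first in the chain).
+	 *
+	 * For reference, ConstructDeltaMessage() produces a JSON document
+	 * shaped like this (names and values are illustrative only):
+	 *
+	 *   {"dbs":   [{"op": "set", "name": "db1", "owner": "alice"}],
+	 *    "roles": [{"op": "del", "name": "r1"}]}
+	 */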
+	PreviousProcessUtilityHook = ProcessUtility_hook;
+	ProcessUtility_hook = NeonProcessUtility;
+	RegisterXactCallback(NeonXactCallback, NULL);
+	RegisterSubXactCallback(NeonSubXactCallback, NULL);
+
+	DefineCustomStringVariable(
+		"neon.console_url",
+		"URL of the Neon Console, to which changes to dbs and roles will be forwarded",
+		NULL,
+		&ConsoleURL,
+		NULL,
+		PGC_POSTMASTER,
+		0,
+		NULL,
+		NULL,
+		NULL);
+
+	DefineCustomBoolVariable(
+		"neon.forward_ddl",
+		"Controls whether to forward DDL to the control plane",
+		NULL,
+		&ForwardDDL,
+		true,
+		PGC_SUSET,
+		0,
+		NULL,
+		NULL,
+		NULL);
+
+	const char *jwt_token = getenv("NEON_CONTROL_PLANE_TOKEN");
+
+	if (!jwt_token)
+	{
+		elog(LOG, "Missing NEON_CONTROL_PLANE_TOKEN environment variable, forwarding will not be authenticated");
+	}
+
+	if (curl_global_init(CURL_GLOBAL_DEFAULT))
+	{
+		elog(ERROR, "Failed to initialize curl");
+	}
+	if ((CurlHandle = curl_easy_init()) == NULL)
+	{
+		elog(ERROR, "Failed to initialize curl handle");
+	}
+	if ((ContentHeader = curl_slist_append(ContentHeader, "Content-Type: application/json")) == NULL)
+	{
+		elog(ERROR, "Failed to initialize content header");
+	}
+
+	if (jwt_token)
+	{
+		char		auth_header[8192];
+
+		snprintf(auth_header, sizeof(auth_header), "Authorization: Bearer %s", jwt_token);
+		if ((ContentHeader = curl_slist_append(ContentHeader, auth_header)) == NULL)
+		{
+			elog(ERROR, "Failed to initialize authorization header");
+		}
+	}
+}
diff --git a/pgxn/neon/control_plane_connector.h b/pgxn/neon/control_plane_connector.h
new file mode 100644
index 0000000000..12d6a97562
--- /dev/null
+++ b/pgxn/neon/control_plane_connector.h
@@ -0,0 +1,6 @@
+#ifndef CONTROL_PLANE_CONNECTOR_H
+#define CONTROL_PLANE_CONNECTOR_H
+
+void InitControlPlaneConnector();
+
+#endif
diff --git a/pgxn/neon/neon.c b/pgxn/neon/neon.c
index 217c1974a0..b45d7cfc32 100644
--- a/pgxn/neon/neon.c
+++ b/pgxn/neon/neon.c
@@ -25,6 +25,7 @@
 #include "neon.h"
 #include "walproposer.h"
 #include "pagestore_client.h"
+#include "control_plane_connector.h"
 
 PG_MODULE_MAGIC;
 void		_PG_init(void);
@@ -34,7 +35,11 @@ _PG_init(void)
 {
 	pg_init_libpagestore();
 	pg_init_walproposer();
+	InitControlPlaneConnector();
 
+	// Important: this must run after the rest of the extension has
+	// registered its GUCs; otherwise any values set for those GUCs before
+	// the library was loaded would be removed.
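+	// (Background: before a real GUC is defined, neon.* settings from the
+	// config file exist only as placeholders; EmitWarningsOnPlaceholders
+	// then flags any leftover placeholders in the "neon" namespace.)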
EmitWarningsOnPlaceholders("neon"); } diff --git a/pgxn/neon/walproposer.c b/pgxn/neon/walproposer.c index a99be40955..64d980d2e4 100644 --- a/pgxn/neon/walproposer.c +++ b/pgxn/neon/walproposer.c @@ -254,20 +254,20 @@ nwp_register_gucs(void) DefineCustomIntVariable( "neon.safekeeper_reconnect_timeout", - "Timeout for reconnecting to offline wal acceptor.", + "Walproposer reconnects to offline safekeepers once in this interval.", NULL, &wal_acceptor_reconnect_timeout, - 1000, 0, INT_MAX, /* default, min, max */ + 5000, 0, INT_MAX, /* default, min, max */ PGC_SIGHUP, /* context */ GUC_UNIT_MS, /* flags */ NULL, NULL, NULL); DefineCustomIntVariable( "neon.safekeeper_connect_timeout", - "Timeout for connection establishement and it's maintenance against safekeeper", + "Connection or connection attempt to safekeeper is terminated if no message is received (or connection attempt doesn't finish) within this period.", NULL, &wal_acceptor_connection_timeout, - 5000, 0, INT_MAX, + 10000, 0, INT_MAX, PGC_SIGHUP, GUC_UNIT_MS, NULL, NULL, NULL); @@ -441,7 +441,7 @@ WalProposerPoll(void) if (TimestampDifferenceExceeds(sk->latestMsgReceivedAt, now, wal_acceptor_connection_timeout)) { - elog(WARNING, "failed to connect to node '%s:%s' in '%s' state: exceeded connection timeout %dms", + elog(WARNING, "terminating connection to safekeeper '%s:%s' in '%s' state: no messages received during the last %dms or connection attempt took longer than that", sk->host, sk->port, FormatSafekeeperState(sk->state), wal_acceptor_connection_timeout); ShutdownConnection(sk); } @@ -1035,9 +1035,16 @@ RecvAcceptorGreeting(Safekeeper *sk) if (!AsyncReadMessage(sk, (AcceptorProposerMessage *) & sk->greetResponse)) return; + elog(LOG, "received AcceptorGreeting from safekeeper %s:%s", sk->host, sk->port); + /* Protocol is all good, move to voting. */ sk->state = SS_VOTING; + /* + * Note: it would be better to track the counter on per safekeeper basis, + * but at worst walproposer would restart with 'term rejected', so leave as + * is for now. + */ ++n_connected; if (n_connected <= quorum) { diff --git a/proxy/src/bin/pg_sni_router.rs b/proxy/src/bin/pg_sni_router.rs index bba2d51caf..a5f50cc7c1 100644 --- a/proxy/src/bin/pg_sni_router.rs +++ b/proxy/src/bin/pg_sni_router.rs @@ -17,7 +17,7 @@ use tokio::io::{AsyncRead, AsyncWrite}; use tokio_util::sync::CancellationToken; use utils::{project_git_version, sentry_init::init_sentry}; -use tracing::{error, info, warn}; +use tracing::{error, info, warn, Instrument}; project_git_version!(GIT_VERSION); @@ -141,7 +141,6 @@ async fn task_main( tokio::select! { accept_result = listener.accept() => { let (socket, peer_addr) = accept_result?; - info!("accepted postgres client connection from {peer_addr}"); let session_id = uuid::Uuid::new_v4(); let tls_config = Arc::clone(&tls_config); @@ -149,18 +148,18 @@ async fn task_main( connections.spawn( async move { - info!("spawned a task for {peer_addr}"); - socket .set_nodelay(true) .context("failed to set socket option")?; - handle_client(dest_suffix, tls_config, session_id, socket).await + info!(%peer_addr, "serving"); + handle_client(dest_suffix, tls_config, socket).await } .unwrap_or_else(|e| { // Acknowledge that the task has finished with an error. 
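                    // (Editorial note: the .instrument() call below attaches
                    // the handle_client span to the whole task future, so
                    // this error line is also emitted inside that span.)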
error!("per-client task finished with an error: {e:#}"); - }), + }) + .instrument(tracing::info_span!("handle_client", ?session_id)) ); } _ = cancellation_token.cancelled() => { @@ -192,7 +191,6 @@ async fn ssl_handshake( let mut stream = PqStream::new(Stream::from_raw(raw_stream)); let msg = stream.read_startup_packet().await?; - info!("received {msg:?}"); use pq_proto::FeStartupPacket::*; match msg { @@ -215,15 +213,19 @@ async fn ssl_handshake( } Ok(raw.upgrade(tls_config).await?) } - _ => stream.throw_error_str(ERR_INSECURE_CONNECTION).await?, + unexpected => { + info!( + ?unexpected, + "unexpected startup packet, rejecting connection" + ); + stream.throw_error_str(ERR_INSECURE_CONNECTION).await? + } } } -#[tracing::instrument(fields(session_id = ?session_id), skip_all)] async fn handle_client( dest_suffix: Arc, tls_config: Arc, - session_id: uuid::Uuid, stream: impl AsyncRead + AsyncWrite + Unpin, ) -> anyhow::Result<()> { let tls_stream = ssl_handshake(stream, tls_config).await?; diff --git a/proxy/src/console.rs b/proxy/src/console.rs index 1f3ef99555..0e5eaaf845 100644 --- a/proxy/src/console.rs +++ b/proxy/src/console.rs @@ -1,5 +1,5 @@ -///! Various stuff for dealing with the Neon Console. -///! Later we might move some API wrappers here. +//! Various stuff for dealing with the Neon Console. +//! Later we might move some API wrappers here. /// Payloads used in the console's APIs. pub mod messages; diff --git a/proxy/src/proxy/tests.rs b/proxy/src/proxy/tests.rs index 60acb588dc..3373c49676 100644 --- a/proxy/src/proxy/tests.rs +++ b/proxy/src/proxy/tests.rs @@ -1,4 +1,4 @@ -///! A group of high-level tests for connection establishing logic and auth. +//! A group of high-level tests for connection establishing logic and auth. use super::*; use crate::{auth, sasl, scram}; use async_trait::async_trait; diff --git a/safekeeper/src/broker.rs b/safekeeper/src/broker.rs index 5e25d22ec1..48c56ee58f 100644 --- a/safekeeper/src/broker.rs +++ b/safekeeper/src/broker.rs @@ -19,8 +19,10 @@ use tokio::task::JoinHandle; use tokio::{runtime, time::sleep}; use tracing::*; +use crate::metrics::BROKER_ITERATION_TIMELINES; use crate::metrics::BROKER_PULLED_UPDATES; use crate::metrics::BROKER_PUSHED_UPDATES; +use crate::metrics::BROKER_PUSH_ALL_UPDATES_SECONDS; use crate::GlobalTimelines; use crate::SafeKeeperConf; @@ -61,8 +63,14 @@ async fn push_loop(conf: SafeKeeperConf) -> anyhow::Result<()> { BROKER_PUSHED_UPDATES.inc(); } let elapsed = now.elapsed(); - // Log duration every second. Should be about 10MB of logs per day. 
- info!("pushed {} timeline updates to broker in {:?}", active_tlis.len(), elapsed); + + BROKER_PUSH_ALL_UPDATES_SECONDS.observe(elapsed.as_secs_f64()); + BROKER_ITERATION_TIMELINES.observe(active_tlis.len() as f64); + + if elapsed > push_interval / 2 { + info!("broker push is too long, pushed {} timeline updates to broker in {:?}", active_tlis.len(), elapsed); + } + sleep(push_interval).await; } }; diff --git a/safekeeper/src/metrics.rs b/safekeeper/src/metrics.rs index 189af2b044..235a88501d 100644 --- a/safekeeper/src/metrics.rs +++ b/safekeeper/src/metrics.rs @@ -125,6 +125,25 @@ pub static BACKUP_ERRORS: Lazy = Lazy::new(|| { ) .expect("Failed to register safekeeper_backup_errors_total counter") }); +pub static BROKER_PUSH_ALL_UPDATES_SECONDS: Lazy = Lazy::new(|| { + register_histogram!( + "safekeeper_broker_push_update_seconds", + "Seconds to push all timeline updates to the broker", + DISK_WRITE_SECONDS_BUCKETS.to_vec() + ) + .expect("Failed to register safekeeper_broker_push_update_seconds histogram vec") +}); +pub const TIMELINES_COUNT_BUCKETS: &[f64] = &[ + 1.0, 10.0, 50.0, 100.0, 200.0, 500.0, 1000.0, 2000.0, 5000.0, 10000.0, 20000.0, 50000.0, +]; +pub static BROKER_ITERATION_TIMELINES: Lazy = Lazy::new(|| { + register_histogram!( + "safekeeper_broker_iteration_timelines", + "Count of timelines pushed to the broker in a single iteration", + TIMELINES_COUNT_BUCKETS.to_vec() + ) + .expect("Failed to register safekeeper_broker_iteration_timelines histogram vec") +}); pub const LABEL_UNKNOWN: &str = "unknown"; diff --git a/safekeeper/src/safekeeper.rs b/safekeeper/src/safekeeper.rs index 33da0c8e5a..eb434136d4 100644 --- a/safekeeper/src/safekeeper.rs +++ b/safekeeper/src/safekeeper.rs @@ -634,7 +634,8 @@ where } // system_id will be updated on mismatch - if self.state.server.system_id != msg.system_id { + // sync-safekeepers doesn't know sysid and sends 0, ignore it + if self.state.server.system_id != msg.system_id && msg.system_id != 0 { if self.state.server.system_id != 0 { warn!( "unexpected system ID arrived, got {}, expected {}", diff --git a/safekeeper/src/wal_storage.rs b/safekeeper/src/wal_storage.rs index 1b82bd754e..644c956fc1 100644 --- a/safekeeper/src/wal_storage.rs +++ b/safekeeper/src/wal_storage.rs @@ -379,6 +379,12 @@ impl Storage for PhysicalStorage { ); } + // Quick exit if nothing to do to avoid writing up to 16 MiB of zeros on + // disk (this happens on each connect). + if end_pos == self.write_lsn { + return Ok(()); + } + // Close previously opened file, if any if let Some(mut unflushed_file) = self.file.take() { self.fdatasync_file(&mut unflushed_file)?; diff --git a/scripts/pr-comment-test-report.js b/scripts/comment-test-report.js similarity index 85% rename from scripts/pr-comment-test-report.js rename to scripts/comment-test-report.js index 3a7bba0daa..a7fd5b0bef 100644 --- a/scripts/pr-comment-test-report.js +++ b/scripts/comment-test-report.js @@ -1,5 +1,5 @@ // -// The script parses Allure reports and posts a comment with a summary of the test results to the PR. +// The script parses Allure reports and posts a comment with a summary of the test results to the PR or to the latest commit in the branch. // // The comment is updated on each run with the latest results. 
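 // When the run is not triggered by a pull_request event, the summary is
 // posted as a commit comment on the latest commit of the branch instead
 // (see `isPullRequest` below).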
// @@ -7,7 +7,7 @@ // - uses: actions/github-script@v6 // with: // script: | -// const script = require("./scripts/pr-comment-test-report.js") +// const script = require("./scripts/comment-test-report.js") // await script({ // github, // context, @@ -35,8 +35,12 @@ class DefaultMap extends Map { module.exports = async ({ github, context, fetch, report }) => { // Marker to find the comment in the subsequent runs const startMarker = `` + // If we run the script in the PR or in the branch (main/release/...) + const isPullRequest = !!context.payload.pull_request + // Latest commit in PR or in the branch + const commitSha = isPullRequest ? context.payload.pull_request.head.sha : context.sha // Let users know that the comment is updated automatically - const autoupdateNotice = `
The comment gets automatically updated with the latest test results<br>${context.payload.pull_request.head.sha} at ${new Date().toISOString()} :recycle:`
+  const autoupdateNotice = `The comment gets automatically updated with the latest test results<br>${commitSha} at ${new Date().toISOString()} :recycle:`
 // GitHub bot id taken from (https://api.github.com/users/github-actions[bot])
 const githubActionsBotId = 41898282
 // Comment body itself
@@ -166,22 +170,39 @@ module.exports = async ({ github, context, fetch, report }) => {
 
   commentBody += autoupdateNotice
 
-  const { data: comments } = await github.rest.issues.listComments({
-    issue_number: context.payload.number,
+  let createCommentFn, listCommentsFn, updateCommentFn, issueNumberOrSha
+  if (isPullRequest) {
+    createCommentFn = github.rest.issues.createComment
+    listCommentsFn = github.rest.issues.listComments
+    updateCommentFn = github.rest.issues.updateComment
+    issueNumberOrSha = {
+      issue_number: context.payload.number,
+    }
+  } else {
+    updateCommentFn = github.rest.repos.updateCommitComment
+    listCommentsFn = github.rest.repos.listCommentsForCommit
+    createCommentFn = github.rest.repos.createCommitComment
+    issueNumberOrSha = {
+      commit_sha: commitSha,
+    }
+  }
+
+  const { data: comments } = await listCommentsFn({
+    ...issueNumberOrSha,
     ...ownerRepoParams,
   })
 
   const comment = comments.find(comment => comment.user.id === githubActionsBotId && comment.body.startsWith(startMarker))
   if (comment) {
-    await github.rest.issues.updateComment({
+    await updateCommentFn({
      comment_id: comment.id,
      body: commentBody,
      ...ownerRepoParams,
    })
  } else {
-    await github.rest.issues.createComment({
-      issue_number: context.payload.number,
+    await createCommentFn({
      body: commentBody,
+      ...issueNumberOrSha,
      ...ownerRepoParams,
    })
  }
diff --git a/scripts/coverage b/scripts/coverage
index 1dc92e57cc..52a69c93b9 100755
--- a/scripts/coverage
+++ b/scripts/coverage
@@ -156,7 +156,9 @@ class LLVM:
              profdata: Path,
              objects: List[str],
              sources: List[str],
-             demangler: Optional[Path] = None) -> None:
+             demangler: Optional[Path] = None,
+             output_file: Optional[Path] = None,
+             ) -> None:
 
         cwd = self.cargo.cwd
         objects = list(intersperse('-object', objects))
@@ -180,14 +182,18 @@ class LLVM:
             *objects,
             *sources,
         ]
-        subprocess.check_call(cmd, cwd=cwd)
+        if output_file is not None:
+            with output_file.open('w') as outfile:
+                subprocess.check_call(cmd, cwd=cwd, stdout=outfile)
+        else:
+            subprocess.check_call(cmd, cwd=cwd)
 
     def cov_report(self, **kwargs) -> None:
         self._cov(subcommand='report', **kwargs)
 
-    def cov_export(self, *, kind: str, **kwargs) -> None:
+    def cov_export(self, *, kind: str, output_file: Optional[Path], **kwargs) -> None:
         extras = (f'-format={kind}', )
-        self._cov(subcommand='export', *extras, **kwargs)
+        self._cov(subcommand='export', *extras, output_file=output_file, **kwargs)
 
     def cov_show(self, *, kind: str, output_dir: Optional[Path] = None, **kwargs) -> None:
         extras = [f'-format={kind}']
@@ -283,9 +289,12 @@ class TextReport(Report):
         self.llvm.cov_show(kind='text', **self._common_kwargs())
 
 
+@dataclass
 class LcovReport(Report):
+    output_file: Path
+
     def generate(self) -> None:
-        self.llvm.cov_export(kind='lcov', **self._common_kwargs())
+        self.llvm.cov_export(kind='lcov', output_file=self.output_file, **self._common_kwargs())
 
 
 @dataclass
@@ -475,7 +484,7 @@ class State:
             'text':
             lambda: TextReport(**params),
             'lcov':
-            lambda: LcovReport(**params),
+            lambda: LcovReport(**params, output_file=self.report_dir / 'lcov.info'),
             'summary':
             lambda: SummaryReport(**params),
             'github':
diff --git a/scripts/export_import_between_pageservers.py b/scripts/export_import_between_pageservers.py
index 4b599ce9b6..d95878b341 100755
--- a/scripts/export_import_between_pageservers.py
+++ b/scripts/export_import_between_pageservers.py
@@ -535,8 +535,8 @@ def export_timeline(
 
 def
main(args: argparse.Namespace): - # any psql version will do here. use current DEFAULT_PG_VERSION = 14 - psql_path = str(Path(args.pg_distrib_dir) / "v14" / "bin" / "psql") + # any psql version will do here. use current DEFAULT_PG_VERSION = 15 + psql_path = str(Path(args.pg_distrib_dir) / "v15" / "bin" / "psql") old_pageserver_host = args.old_pageserver_host new_pageserver_host = args.new_pageserver_host diff --git a/storage_broker/src/lib.rs b/storage_broker/src/lib.rs index 8441aaf625..4bc561449d 100644 --- a/storage_broker/src/lib.rs +++ b/storage_broker/src/lib.rs @@ -40,6 +40,9 @@ pub type BrokerClientChannel = BrokerServiceClient; // Create connection object configured to run TLS if schema starts with https:// // and plain text otherwise. Connection is lazy, only endpoint sanity is // validated here. +// +// NB: this function is not async, but still must be run on a tokio runtime thread +// because that's a requirement of tonic_endpoint.connect_lazy()'s Channel::new call. pub fn connect(endpoint: U, keepalive_interval: Duration) -> anyhow::Result where U: std::convert::TryInto, diff --git a/test_runner/fixtures/metrics.py b/test_runner/fixtures/metrics.py index 0e958ddd06..b4c237cfa6 100644 --- a/test_runner/fixtures/metrics.py +++ b/test_runner/fixtures/metrics.py @@ -65,12 +65,19 @@ PAGESERVER_PER_TENANT_METRICS: Tuple[str, ...] = ( "pageserver_getpage_reconstruct_seconds_bucket", "pageserver_getpage_reconstruct_seconds_count", "pageserver_getpage_reconstruct_seconds_sum", + "pageserver_getpage_get_reconstruct_data_seconds_bucket", + "pageserver_getpage_get_reconstruct_data_seconds_count", + "pageserver_getpage_get_reconstruct_data_seconds_sum", "pageserver_io_operations_bytes_total", "pageserver_io_operations_seconds_bucket", "pageserver_io_operations_seconds_count", "pageserver_io_operations_seconds_sum", "pageserver_last_record_lsn", "pageserver_materialized_cache_hits_total", + "pageserver_materialized_cache_hits_direct_total", + "pageserver_read_num_fs_layers_bucket", + "pageserver_read_num_fs_layers_count", + "pageserver_read_num_fs_layers_sum", "pageserver_smgr_query_seconds_bucket", "pageserver_smgr_query_seconds_count", "pageserver_smgr_query_seconds_sum", diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 59afc104e6..551faa116e 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -629,7 +629,7 @@ class NeonEnvBuilder: assert self.env is not None, "environment is not already initialized, call init() first" self.env.start() - def init_start(self) -> NeonEnv: + def init_start(self, initial_tenant_conf: Optional[Dict[str, str]] = None) -> NeonEnv: env = self.init_configs() self.start() @@ -638,7 +638,9 @@ class NeonEnvBuilder: log.info( f"Services started, creating initial tenant {env.initial_tenant} and its initial timeline" ) - initial_tenant, initial_timeline = env.neon_cli.create_tenant(tenant_id=env.initial_tenant) + initial_tenant, initial_timeline = env.neon_cli.create_tenant( + tenant_id=env.initial_tenant, conf=initial_tenant_conf + ) env.initial_timeline = initial_timeline log.info(f"Initial timeline {initial_tenant}/{initial_timeline} created successfully") @@ -1444,11 +1446,12 @@ class NeonCli(AbstractNeonCli): def endpoint_create( self, branch_name: str, + pg_port: int, + http_port: int, endpoint_id: Optional[str] = None, tenant_id: Optional[TenantId] = None, hot_standby: bool = False, lsn: Optional[Lsn] = None, - port: Optional[int] = None, ) -> 
"subprocess.CompletedProcess[str]": args = [ "endpoint", @@ -1462,8 +1465,10 @@ class NeonCli(AbstractNeonCli): ] if lsn is not None: args.extend(["--lsn", str(lsn)]) - if port is not None: - args.extend(["--port", str(port)]) + if pg_port is not None: + args.extend(["--pg-port", str(pg_port)]) + if http_port is not None: + args.extend(["--http-port", str(http_port)]) if endpoint_id is not None: args.append(endpoint_id) if hot_standby: @@ -1476,9 +1481,11 @@ class NeonCli(AbstractNeonCli): def endpoint_start( self, endpoint_id: str, + pg_port: int, + http_port: int, + safekeepers: Optional[List[int]] = None, tenant_id: Optional[TenantId] = None, lsn: Optional[Lsn] = None, - port: Optional[int] = None, ) -> "subprocess.CompletedProcess[str]": args = [ "endpoint", @@ -1490,8 +1497,10 @@ class NeonCli(AbstractNeonCli): ] if lsn is not None: args.append(f"--lsn={lsn}") - if port is not None: - args.append(f"--port={port}") + args.extend(["--pg-port", str(pg_port)]) + args.extend(["--http-port", str(http_port)]) + if safekeepers is not None: + args.extend(["--safekeepers", (",".join(map(str, safekeepers)))]) if endpoint_id is not None: args.append(endpoint_id) @@ -1583,13 +1592,11 @@ class NeonPageserver(PgProtocol): ".*serving compute connection task.*exited with error: Postgres connection error.*", ".*serving compute connection task.*exited with error: Connection reset by peer.*", ".*serving compute connection task.*exited with error: Postgres query error.*", - ".*Connection aborted: connection error: error communicating with the server: Broken pipe.*", - ".*Connection aborted: connection error: error communicating with the server: Transport endpoint is not connected.*", - ".*Connection aborted: connection error: error communicating with the server: Connection reset by peer.*", + ".*Connection aborted: error communicating with the server: Transport endpoint is not connected.*", # FIXME: replication patch for tokio_postgres regards any but CopyDone/CopyData message in CopyBoth stream as unexpected - ".*Connection aborted: connection error: unexpected message from server*", + ".*Connection aborted: unexpected message from server*", ".*kill_and_wait_impl.*: wait successful.*", - ".*Replication stream finished: db error:.*ending streaming to Some*", + ".*: db error:.*ending streaming to Some.*", ".*query handler for 'pagestream.*failed: Broken pipe.*", # pageserver notices compute shut down ".*query handler for 'pagestream.*failed: Connection reset by peer.*", # pageserver notices compute shut down # safekeeper connection can fail with this, in the window between timeline creation @@ -1603,24 +1610,25 @@ class NeonPageserver(PgProtocol): # https://github.com/neondatabase/neon/issues/2442 ".*could not remove ephemeral file.*No such file or directory.*", # FIXME: These need investigation - ".*gc_loop.*Failed to get a tenant .* Tenant .* not found.*", - ".*compaction_loop.*Failed to get a tenant .* Tenant .* not found.*", ".*manual_gc.*is_shutdown_requested\\(\\) called in an unexpected task or thread.*", ".*tenant_list: timeline is not found in remote index while it is present in the tenants registry.*", ".*Removing intermediate uninit mark file.*", - # FIXME: known race condition in TaskHandle: https://github.com/neondatabase/neon/issues/2885 - ".*sender is dropped while join handle is still alive.*", # Tenant::delete_timeline() can cause any of the four following errors. 
# FIXME: we shouldn't be considering it an error: https://github.com/neondatabase/neon/issues/2946 ".*could not flush frozen layer.*queue is in state Stopped", # when schedule layer upload fails because queued got closed before compaction got killed ".*wait for layer upload ops to complete.*", # .*Caused by:.*wait_completion aborted because upload queue was stopped ".*gc_loop.*Gc failed, retrying in.*timeline is Stopping", # When gc checks timeline state after acquiring layer_removal_cs + ".*gc_loop.*Gc failed, retrying in.*: Cannot run GC iteration on inactive tenant", # Tenant::gc precondition ".*compaction_loop.*Compaction failed, retrying in.*timeline is Stopping", # When compaction checks timeline state after acquiring layer_removal_cs ".*query handler for 'pagestream.*failed: Timeline .* was not found", # postgres reconnects while timeline_delete doesn't hold the tenant's timelines.lock() ".*query handler for 'pagestream.*failed: Timeline .* is not active", # timeline delete in progress ".*task iteration took longer than the configured period.*", # this is until #3501 ".*Compaction failed, retrying in [^:]+: Cannot run compaction iteration on inactive tenant", + # these can happen anytime we do compactions from background task and shutdown pageserver + r".*ERROR.*ancestor timeline \S+ is being stopped", + # this is expected given our collaborative shutdown approach for the UploadQueue + ".*Compaction failed, retrying in .*: queue is in state Stopped.*", ] def start( @@ -1688,6 +1696,9 @@ class NeonPageserver(PgProtocol): else: errors.append(line) + for error in errors: + log.info(f"not allowed error: {error.strip()}") + assert not errors def log_contains(self, pattern: str) -> Optional[str]: @@ -2280,17 +2291,24 @@ class Endpoint(PgProtocol): """An object representing a Postgres compute endpoint managed by the control plane.""" def __init__( - self, env: NeonEnv, tenant_id: TenantId, port: int, check_stop_result: bool = True + self, + env: NeonEnv, + tenant_id: TenantId, + pg_port: int, + http_port: int, + check_stop_result: bool = True, ): - super().__init__(host="localhost", port=port, user="cloud_admin", dbname="postgres") + super().__init__(host="localhost", port=pg_port, user="cloud_admin", dbname="postgres") self.env = env self.running = False self.branch_name: Optional[str] = None # dubious self.endpoint_id: Optional[str] = None # dubious, see asserts below self.pgdata_dir: Optional[str] = None # Path to computenode PGDATA self.tenant_id = tenant_id - self.port = port + self.pg_port = pg_port + self.http_port = http_port self.check_stop_result = check_stop_result + self.active_safekeepers: List[int] = list(map(lambda sk: sk.id, env.safekeepers)) # path to conf is /endpoints//pgdata/postgresql.conf def create( @@ -2320,7 +2338,8 @@ class Endpoint(PgProtocol): tenant_id=self.tenant_id, lsn=lsn, hot_standby=hot_standby, - port=self.port, + pg_port=self.pg_port, + http_port=self.http_port, ) path = Path("endpoints") / self.endpoint_id / "pgdata" self.pgdata_dir = os.path.join(self.env.repo_dir, path) @@ -2345,7 +2364,13 @@ class Endpoint(PgProtocol): log.info(f"Starting postgres endpoint {self.endpoint_id}") - self.env.neon_cli.endpoint_start(self.endpoint_id, tenant_id=self.tenant_id, port=self.port) + self.env.neon_cli.endpoint_start( + self.endpoint_id, + pg_port=self.pg_port, + http_port=self.http_port, + tenant_id=self.tenant_id, + safekeepers=self.active_safekeepers, + ) self.running = True return self @@ -2369,32 +2394,8 @@ class Endpoint(PgProtocol): return 
os.path.join(self.pg_data_dir_path(), "pg_twophase") def config_file_path(self) -> str: - """Path to postgresql.conf""" - return os.path.join(self.pg_data_dir_path(), "postgresql.conf") - - def adjust_for_safekeepers(self, safekeepers: str) -> "Endpoint": - """ - Adjust instance config for working with wal acceptors instead of - pageserver (pre-configured by CLI) directly. - """ - - # TODO: reuse config() - with open(self.config_file_path(), "r") as f: - cfg_lines = f.readlines() - with open(self.config_file_path(), "w") as f: - for cfg_line in cfg_lines: - # walproposer uses different application_name - if ( - "synchronous_standby_names" in cfg_line - or - # don't repeat safekeepers/wal_acceptors multiple times - "neon.safekeepers" in cfg_line - ): - continue - f.write(cfg_line) - f.write("synchronous_standby_names = 'walproposer'\n") - f.write("neon.safekeepers = '{}'\n".format(safekeepers)) - return self + """Path to the postgresql.conf in the endpoint directory (not the one in pgdata)""" + return os.path.join(self.endpoint_path(), "postgresql.conf") def config(self, lines: List[str]) -> "Endpoint": """ @@ -2499,7 +2500,8 @@ class EndpointFactory: ep = Endpoint( self.env, tenant_id=tenant_id or self.env.initial_tenant, - port=self.env.port_distributor.get_port(), + pg_port=self.env.port_distributor.get_port(), + http_port=self.env.port_distributor.get_port(), ) self.num_instances += 1 self.endpoints.append(ep) @@ -2524,7 +2526,8 @@ class EndpointFactory: ep = Endpoint( self.env, tenant_id=tenant_id or self.env.initial_tenant, - port=self.env.port_distributor.get_port(), + pg_port=self.env.port_distributor.get_port(), + http_port=self.env.port_distributor.get_port(), ) if endpoint_id is None: @@ -2907,6 +2910,7 @@ SKIP_FILES = frozenset( "pg_internal.init", "pg.log", "zenith.signal", + "pg_hba.conf", "postgresql.conf", "postmaster.opts", "postmaster.pid", diff --git a/test_runner/fixtures/pageserver/http.py b/test_runner/fixtures/pageserver/http.py index 1272047881..f258a3a24d 100644 --- a/test_runner/fixtures/pageserver/http.py +++ b/test_runner/fixtures/pageserver/http.py @@ -155,14 +155,14 @@ class PageserverHttpClient(requests.Session): return res_json def tenant_create( - self, new_tenant_id: Optional[TenantId] = None, conf: Optional[Dict[str, Any]] = None + self, new_tenant_id: TenantId, conf: Optional[Dict[str, Any]] = None ) -> TenantId: if conf is not None: assert "new_tenant_id" not in conf.keys() res = self.post( f"http://localhost:{self.port}/v1/tenant", json={ - "new_tenant_id": str(new_tenant_id) if new_tenant_id else None, + "new_tenant_id": str(new_tenant_id), **(conf or {}), }, ) @@ -293,13 +293,13 @@ class PageserverHttpClient(requests.Session): self, pg_version: PgVersion, tenant_id: TenantId, - new_timeline_id: Optional[TimelineId] = None, + new_timeline_id: TimelineId, ancestor_timeline_id: Optional[TimelineId] = None, ancestor_start_lsn: Optional[Lsn] = None, **kwargs, ) -> Dict[Any, Any]: body: Dict[str, Any] = { - "new_timeline_id": str(new_timeline_id) if new_timeline_id else None, + "new_timeline_id": str(new_timeline_id), "ancestor_start_lsn": str(ancestor_start_lsn) if ancestor_start_lsn else None, "ancestor_timeline_id": str(ancestor_timeline_id) if ancestor_timeline_id else None, } diff --git a/test_runner/fixtures/pg_version.py b/test_runner/fixtures/pg_version.py index d67f088365..14ae88cc2c 100644 --- a/test_runner/fixtures/pg_version.py +++ b/test_runner/fixtures/pg_version.py @@ -27,6 +27,10 @@ class PgVersion(str, enum.Enum): def __repr__(self) -> str: 
return f"'{self.value}'" + # Make this explicit for Python 3.11 compatibility, which changes the behavior of enums + def __str__(self) -> str: + return self.value + # In GitHub workflows we use Postgres version with v-prefix (e.g. v14 instead of just 14), # sometime we need to do so in tests. @property @@ -78,11 +82,11 @@ def pytest_addoption(parser: Parser): @pytest.fixture(scope="session") def pg_version(request: FixtureRequest) -> Iterator[PgVersion]: if v := request.config.getoption("--pg-version"): - version, source = v, "from --pg-version commad-line argument" + version, source = v, "from --pg-version command-line argument" elif v := os.environ.get("DEFAULT_PG_VERSION"): version, source = PgVersion(v), "from DEFAULT_PG_VERSION environment variable" else: - version, source = DEFAULT_VERSION, "default verson" + version, source = DEFAULT_VERSION, "default version" log.info(f"pg_version is {version} ({source})") yield version diff --git a/test_runner/regress/test_attach_tenant_config.py b/test_runner/regress/test_attach_tenant_config.py index eb2ba3e9ed..4df5ae18d6 100644 --- a/test_runner/regress/test_attach_tenant_config.py +++ b/test_runner/regress/test_attach_tenant_config.py @@ -20,6 +20,11 @@ def positive_env(neon_env_builder: NeonEnvBuilder) -> NeonEnv: test_name="test_attach_tenant_config", ) env = neon_env_builder.init_start() + + # eviction might be the first one after an attach to access the layers + env.pageserver.allowed_errors.append( + ".*unexpectedly on-demand downloading remote layer remote.* for task kind Eviction" + ) assert isinstance(env.remote_storage, LocalFsStorage) return env @@ -158,6 +163,7 @@ def test_fully_custom_config(positive_env: NeonEnv): "threshold": "23h", }, "evictions_low_residence_duration_metric_threshold": "2days", + "gc_feedback": True, "gc_horizon": 23 * (1024 * 1024), "gc_period": "2h 13m", "image_creation_threshold": 7, diff --git a/test_runner/regress/test_auth.py b/test_runner/regress/test_auth.py index 3e4a0bfbbb..fb79748832 100644 --- a/test_runner/regress/test_auth.py +++ b/test_runner/regress/test_auth.py @@ -3,7 +3,7 @@ from contextlib import closing import pytest from fixtures.neon_fixtures import NeonEnvBuilder, PgProtocol from fixtures.pageserver.http import PageserverApiException -from fixtures.types import TenantId +from fixtures.types import TenantId, TimelineId def test_pageserver_auth(neon_env_builder: NeonEnvBuilder): @@ -25,21 +25,19 @@ def test_pageserver_auth(neon_env_builder: NeonEnvBuilder): ps.safe_psql("set FOO", password=tenant_token) ps.safe_psql("set FOO", password=pageserver_token) - new_timeline_id = env.neon_cli.create_branch( - "test_pageserver_auth", tenant_id=env.initial_tenant - ) - # tenant can create branches tenant_http_client.timeline_create( pg_version=env.pg_version, tenant_id=env.initial_tenant, - ancestor_timeline_id=new_timeline_id, + new_timeline_id=TimelineId.generate(), + ancestor_timeline_id=env.initial_timeline, ) # console can create branches for tenant pageserver_http_client.timeline_create( pg_version=env.pg_version, tenant_id=env.initial_tenant, - ancestor_timeline_id=new_timeline_id, + new_timeline_id=TimelineId.generate(), + ancestor_timeline_id=env.initial_timeline, ) # fail to create branch using token with different tenant_id @@ -49,18 +47,19 @@ def test_pageserver_auth(neon_env_builder: NeonEnvBuilder): invalid_tenant_http_client.timeline_create( pg_version=env.pg_version, tenant_id=env.initial_tenant, - ancestor_timeline_id=new_timeline_id, + new_timeline_id=TimelineId.generate(), + 
ancestor_timeline_id=env.initial_timeline, ) # create tenant using management token - pageserver_http_client.tenant_create() + pageserver_http_client.tenant_create(TenantId.generate()) # fail to create tenant using tenant token with pytest.raises( PageserverApiException, match="Forbidden: Attempt to access management api with tenant scope. Permission denied", ): - tenant_http_client.tenant_create() + tenant_http_client.tenant_create(TenantId.generate()) def test_compute_auth_to_pageserver(neon_env_builder: NeonEnvBuilder): diff --git a/test_runner/regress/test_broken_timeline.py b/test_runner/regress/test_broken_timeline.py index fb592bfbc3..0fb3b4f262 100644 --- a/test_runner/regress/test_broken_timeline.py +++ b/test_runner/regress/test_broken_timeline.py @@ -20,7 +20,7 @@ def test_broken_timeline(neon_env_builder: NeonEnvBuilder): ".*is not active. Current state: Broken.*", ".*will not become active. Current state: Broken.*", ".*failed to load metadata.*", - ".*could not load tenant.*load local timeline.*", + ".*load failed.*load local timeline.*", ] ) diff --git a/test_runner/regress/test_compatibility.py b/test_runner/regress/test_compatibility.py index fe8dc293c1..2635dbd93c 100644 --- a/test_runner/regress/test_compatibility.py +++ b/test_runner/regress/test_compatibility.py @@ -383,6 +383,9 @@ def check_neon_works( cli_target = NeonCli(config_target) # And the current binaries to launch computes + snapshot_config["neon_distrib_dir"] = str(neon_current_binpath) + with (snapshot_config_toml).open("w") as f: + toml.dump(snapshot_config, f) config_current = copy.copy(config) config_current.neon_binpath = neon_current_binpath cli_current = NeonCli(config_current) @@ -391,7 +394,8 @@ def check_neon_works( request.addfinalizer(lambda: cli_target.raw_cli(["stop"])) pg_port = port_distributor.get_port() - cli_current.endpoint_start("main", port=pg_port) + http_port = port_distributor.get_port() + cli_current.endpoint_start("main", pg_port=pg_port, http_port=http_port) request.addfinalizer(lambda: cli_current.endpoint_stop("main")) connstr = f"host=127.0.0.1 port={pg_port} user=cloud_admin dbname=postgres" diff --git a/test_runner/regress/test_compute_ctl.py b/test_runner/regress/test_compute_ctl.py deleted file mode 100644 index d72ffe078d..0000000000 --- a/test_runner/regress/test_compute_ctl.py +++ /dev/null @@ -1,253 +0,0 @@ -import os -from pathlib import Path -from subprocess import TimeoutExpired - -from fixtures.log_helper import log -from fixtures.neon_fixtures import ComputeCtl, NeonEnvBuilder, PgBin - - -# Test that compute_ctl works and prints "--sync-safekeepers" logs. -def test_sync_safekeepers_logs(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin): - neon_env_builder.num_safekeepers = 3 - env = neon_env_builder.init_start() - ctl = ComputeCtl(env) - - env.neon_cli.create_branch("test_compute_ctl", "main") - endpoint = env.endpoints.create_start("test_compute_ctl") - endpoint.safe_psql("CREATE TABLE t(key int primary key, value text)") - - with open(endpoint.config_file_path(), "r") as f: - cfg_lines = f.readlines() - cfg_map = {} - for line in cfg_lines: - if "=" in line: - k, v = line.split("=") - cfg_map[k] = v.strip("\n '\"") - log.info(f"postgres config: {cfg_map}") - pgdata = endpoint.pg_data_dir_path() - pg_bin_path = os.path.join(pg_bin.pg_bin_path, "postgres") - - endpoint.stop_and_destroy() - - # stop_and_destroy removes the whole endpoint directory. Recreate it. 
- Path(pgdata).mkdir(parents=True) - - spec = ( - """ -{ - "format_version": 1.0, - - "timestamp": "2021-05-23T18:25:43.511Z", - "operation_uuid": "0f657b36-4b0f-4a2d-9c2e-1dcd615e7d8b", - - "cluster": { - "cluster_id": "test-cluster-42", - "name": "Neon Test", - "state": "restarted", - "roles": [ - ], - "databases": [ - ], - "settings": [ - { - "name": "fsync", - "value": "off", - "vartype": "bool" - }, - { - "name": "wal_level", - "value": "replica", - "vartype": "enum" - }, - { - "name": "neon.safekeepers", - "value": """ - + f'"{cfg_map["neon.safekeepers"]}"' - + """, - "vartype": "string" - }, - { - "name": "wal_log_hints", - "value": "on", - "vartype": "bool" - }, - { - "name": "log_connections", - "value": "on", - "vartype": "bool" - }, - { - "name": "shared_buffers", - "value": "32768", - "vartype": "integer" - }, - { - "name": "port", - "value": """ - + f'"{cfg_map["port"]}"' - + """, - "vartype": "integer" - }, - { - "name": "max_connections", - "value": "100", - "vartype": "integer" - }, - { - "name": "max_wal_senders", - "value": "10", - "vartype": "integer" - }, - { - "name": "listen_addresses", - "value": "0.0.0.0", - "vartype": "string" - }, - { - "name": "wal_sender_timeout", - "value": "0", - "vartype": "integer" - }, - { - "name": "password_encryption", - "value": "md5", - "vartype": "enum" - }, - { - "name": "maintenance_work_mem", - "value": "65536", - "vartype": "integer" - }, - { - "name": "max_parallel_workers", - "value": "8", - "vartype": "integer" - }, - { - "name": "max_worker_processes", - "value": "8", - "vartype": "integer" - }, - { - "name": "neon.tenant_id", - "value": """ - + f'"{cfg_map["neon.tenant_id"]}"' - + """, - "vartype": "string" - }, - { - "name": "max_replication_slots", - "value": "10", - "vartype": "integer" - }, - { - "name": "neon.timeline_id", - "value": """ - + f'"{cfg_map["neon.timeline_id"]}"' - + """, - "vartype": "string" - }, - { - "name": "shared_preload_libraries", - "value": "neon", - "vartype": "string" - }, - { - "name": "synchronous_standby_names", - "value": "walproposer", - "vartype": "string" - }, - { - "name": "neon.pageserver_connstring", - "value": """ - + f'"{cfg_map["neon.pageserver_connstring"]}"' - + """, - "vartype": "string" - } - ] - }, - "delta_operations": [ - ] -} -""" - ) - - ps_connstr = cfg_map["neon.pageserver_connstring"] - log.info(f"ps_connstr: {ps_connstr}, pgdata: {pgdata}") - - # run compute_ctl and wait for 10s - try: - ctl.raw_cli( - [ - "--connstr", - "postgres://invalid/", - "--pgdata", - pgdata, - "--spec", - spec, - "--pgbin", - pg_bin_path, - ], - timeout=10, - ) - except TimeoutExpired as exc: - ctl_logs = (exc.stderr or b"").decode("utf-8") - log.info(f"compute_ctl stderr:\n{ctl_logs}") - - with ExternalProcessManager(Path(pgdata) / "postmaster.pid"): - start = "starting safekeepers syncing" - end = "safekeepers synced at LSN" - start_pos = ctl_logs.index(start) - assert start_pos != -1 - end_pos = ctl_logs.index(end, start_pos) - assert end_pos != -1 - sync_safekeepers_logs = ctl_logs[start_pos : end_pos + len(end)] - log.info("sync_safekeepers_logs:\n" + sync_safekeepers_logs) - - # assert that --sync-safekeepers logs are present in the output - assert "connecting with node" in sync_safekeepers_logs - assert "connected with node" in sync_safekeepers_logs - assert "proposer connected to quorum (2)" in sync_safekeepers_logs - assert "got votes from majority (2)" in sync_safekeepers_logs - assert "sending elected msg to node" in sync_safekeepers_logs - - -class ExternalProcessManager: - """ - 
Context manager that kills a process with a pid file on exit. - """ - - def __init__(self, pid_file: Path): - self.path = pid_file - self.pid_file = open(pid_file, "r") - self.pid = int(self.pid_file.readline().strip()) - - def __enter__(self): - return self - - def leave_alive(self): - self.pid_file.close() - - def __exit__(self, _type, _value, _traceback): - import signal - import time - - if self.pid_file.closed: - return - - with self.pid_file: - try: - os.kill(self.pid, signal.SIGTERM) - except OSError as e: - if not self.path.is_file(): - return - log.info(f"Failed to kill {self.pid}, but the pidfile remains: {e}") - return - - for _ in range(20): - if not self.path.is_file(): - return - time.sleep(0.2) - - log.info("Process failed to stop after SIGTERM: {self.pid}") - os.kill(self.pid, signal.SIGKILL) diff --git a/test_runner/regress/test_ddl_forwarding.py b/test_runner/regress/test_ddl_forwarding.py new file mode 100644 index 0000000000..6bfa8fdbe7 --- /dev/null +++ b/test_runner/regress/test_ddl_forwarding.py @@ -0,0 +1,210 @@ +from types import TracebackType +from typing import Any, Dict, List, Optional, Tuple, Type + +import psycopg2 +import pytest +from fixtures.log_helper import log +from fixtures.neon_fixtures import VanillaPostgres +from pytest_httpserver import HTTPServer +from werkzeug.wrappers.request import Request +from werkzeug.wrappers.response import Response + + +def handle_db(dbs, roles, operation): + if operation["op"] == "set": + if "old_name" in operation and operation["old_name"] in dbs: + dbs[operation["name"]] = dbs[operation["old_name"]] + dbs.pop(operation["old_name"]) + if "owner" in operation: + dbs[operation["name"]] = operation["owner"] + elif operation["op"] == "del": + dbs.pop(operation["name"]) + else: + raise ValueError("Invalid op") + + +def handle_role(dbs, roles, operation): + if operation["op"] == "set": + if "old_name" in operation and operation["old_name"] in roles: + roles[operation["name"]] = roles[operation["old_name"]] + roles.pop(operation["old_name"]) + for db, owner in dbs.items(): + if owner == operation["old_name"]: + dbs[db] = operation["name"] + if "password" in operation: + roles[operation["name"]] = operation["password"] + elif operation["op"] == "del": + if "old_name" in operation: + roles.pop(operation["old_name"]) + roles.pop(operation["name"]) + else: + raise ValueError("Invalid op") + + +fail = False + + +def ddl_forward_handler(request: Request, dbs: Dict[str, str], roles: Dict[str, str]) -> Response: + log.info(f"Received request with data {request.get_data(as_text=True)}") + if fail: + log.info("FAILING") + return Response(status=500, response="Failed just cuz") + if request.json is None: + log.info("Received invalid JSON") + return Response(status=400) + json = request.json + # Handle roles first + if "roles" in json: + for operation in json["roles"]: + handle_role(dbs, roles, operation) + if "dbs" in json: + for operation in json["dbs"]: + handle_db(dbs, roles, operation) + return Response(status=200) + + +class DdlForwardingContext: + def __init__(self, httpserver: HTTPServer, vanilla_pg: VanillaPostgres, host: str, port: int): + self.server = httpserver + self.pg = vanilla_pg + self.host = host + self.port = port + self.dbs: Dict[str, str] = {} + self.roles: Dict[str, str] = {} + endpoint = "/management/api/v2/roles_and_databases" + ddl_url = f"http://{host}:{port}{endpoint}" + self.pg.configure( + [ + f"neon.console_url={ddl_url}", + "shared_preload_libraries = 'neon'", + ] + ) + log.info(f"Listening on {ddl_url}") + 
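+        # Each PATCH request sent by the extension is replayed into
+        # self.dbs / self.roles by ddl_forward_handler above, so tests can
+        # assert on this mirrored state after wait().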
self.server.expect_request(endpoint, method="PATCH").respond_with_handler( + lambda request: ddl_forward_handler(request, self.dbs, self.roles) + ) + + def __enter__(self): + self.pg.start() + return self + + def __exit__( + self, + exc_type: Optional[Type[BaseException]], + exc: Optional[BaseException], + tb: Optional[TracebackType], + ): + self.pg.stop() + + def send(self, query: str) -> List[Tuple[Any, ...]]: + return self.pg.safe_psql(query) + + def wait(self, timeout=3): + self.server.wait(timeout=timeout) + + def send_and_wait(self, query: str, timeout=3) -> List[Tuple[Any, ...]]: + res = self.send(query) + self.wait(timeout=timeout) + return res + + +@pytest.fixture(scope="function") +def ddl( + httpserver: HTTPServer, vanilla_pg: VanillaPostgres, httpserver_listen_address: tuple[str, int] +): + (host, port) = httpserver_listen_address + with DdlForwardingContext(httpserver, vanilla_pg, host, port) as ddl: + yield ddl + + +def test_ddl_forwarding(ddl: DdlForwardingContext): + curr_user = ddl.send("SELECT current_user")[0][0] + log.info(f"Current user is {curr_user}") + ddl.send_and_wait("CREATE DATABASE bork") + assert ddl.dbs == {"bork": curr_user} + ddl.send_and_wait("CREATE ROLE volk WITH PASSWORD 'nu_zayats'") + ddl.send_and_wait("ALTER DATABASE bork RENAME TO nu_pogodi") + assert ddl.dbs == {"nu_pogodi": curr_user} + ddl.send_and_wait("ALTER DATABASE nu_pogodi OWNER TO volk") + assert ddl.dbs == {"nu_pogodi": "volk"} + ddl.send_and_wait("DROP DATABASE nu_pogodi") + assert ddl.dbs == {} + ddl.send_and_wait("DROP ROLE volk") + assert ddl.roles == {} + + ddl.send_and_wait("CREATE ROLE tarzan WITH PASSWORD 'of_the_apes'") + assert ddl.roles == {"tarzan": "of_the_apes"} + ddl.send_and_wait("DROP ROLE tarzan") + assert ddl.roles == {} + ddl.send_and_wait("CREATE ROLE tarzan WITH PASSWORD 'of_the_apes'") + assert ddl.roles == {"tarzan": "of_the_apes"} + ddl.send_and_wait("ALTER ROLE tarzan WITH PASSWORD 'jungle_man'") + assert ddl.roles == {"tarzan": "jungle_man"} + ddl.send_and_wait("ALTER ROLE tarzan RENAME TO mowgli") + assert ddl.roles == {"mowgli": "jungle_man"} + ddl.send_and_wait("DROP ROLE mowgli") + assert ddl.roles == {} + + conn = ddl.pg.connect() + cur = conn.cursor() + + cur.execute("BEGIN") + cur.execute("CREATE ROLE bork WITH PASSWORD 'cork'") + cur.execute("COMMIT") + ddl.wait() + assert ddl.roles == {"bork": "cork"} + cur.execute("BEGIN") + cur.execute("CREATE ROLE stork WITH PASSWORD 'pork'") + cur.execute("ABORT") + ddl.wait() + assert ("stork", "pork") not in ddl.roles.items() + cur.execute("BEGIN") + cur.execute("ALTER ROLE bork WITH PASSWORD 'pork'") + cur.execute("ALTER ROLE bork RENAME TO stork") + cur.execute("COMMIT") + ddl.wait() + assert ddl.roles == {"stork": "pork"} + cur.execute("BEGIN") + cur.execute("CREATE ROLE dork WITH PASSWORD 'york'") + cur.execute("SAVEPOINT point") + cur.execute("ALTER ROLE dork WITH PASSWORD 'zork'") + cur.execute("ALTER ROLE dork RENAME TO fork") + cur.execute("ROLLBACK TO SAVEPOINT point") + cur.execute("ALTER ROLE dork WITH PASSWORD 'fork'") + cur.execute("ALTER ROLE dork RENAME TO zork") + cur.execute("RELEASE SAVEPOINT point") + cur.execute("COMMIT") + ddl.wait() + assert ddl.roles == {"stork": "pork", "zork": "fork"} + + cur.execute("DROP ROLE stork") + cur.execute("DROP ROLE zork") + ddl.wait() + assert ddl.roles == {} + + cur.execute("CREATE ROLE bork WITH PASSWORD 'dork'") + cur.execute("CREATE ROLE stork WITH PASSWORD 'cork'") + cur.execute("BEGIN") + cur.execute("DROP ROLE bork") + cur.execute("ALTER ROLE stork 
RENAME TO bork") + cur.execute("COMMIT") + ddl.wait() + assert ddl.roles == {"bork": "cork"} + + cur.execute("DROP ROLE bork") + ddl.wait() + assert ddl.roles == {} + + cur.execute("CREATE ROLE bork WITH PASSWORD 'dork'") + cur.execute("CREATE DATABASE stork WITH OWNER=bork") + cur.execute("ALTER ROLE bork RENAME TO cork") + ddl.wait() + assert ddl.dbs == {"stork": "cork"} + + with pytest.raises(psycopg2.InternalError): + global fail + fail = True + cur.execute("CREATE DATABASE failure WITH OWNER=cork") + ddl.wait() + + conn.close() diff --git a/test_runner/regress/test_disk_usage_eviction.py b/test_runner/regress/test_disk_usage_eviction.py index ab67518092..0ec023b9e1 100644 --- a/test_runner/regress/test_disk_usage_eviction.py +++ b/test_runner/regress/test_disk_usage_eviction.py @@ -110,6 +110,12 @@ class EvictionEnv: overrides=( "--pageserver-config-override=disk_usage_based_eviction=" + enc.dump_inline_table(disk_usage_config).replace("\n", " "), + # Disk usage based eviction runs as a background task. + # But pageserver startup delays launch of background tasks for some time, to prioritize initial logical size calculations during startup. + # But, initial logical size calculation may not be triggered if safekeepers don't publish new broker messages. + # But, we only have a 10-second-timeout in this test. + # So, disable the delay for this test. + "--pageserver-config-override=background_task_maximum_delay='0s'", ), ) diff --git a/test_runner/regress/test_metric_collection.py b/test_runner/regress/test_metric_collection.py index 00ea77f2e7..12e695bcbd 100644 --- a/test_runner/regress/test_metric_collection.py +++ b/test_runner/regress/test_metric_collection.py @@ -228,7 +228,6 @@ def proxy_with_metric_collector( @pytest.mark.asyncio async def test_proxy_metric_collection( httpserver: HTTPServer, - httpserver_listen_address, proxy_with_metric_collector: NeonProxy, vanilla_pg: VanillaPostgres, ): diff --git a/test_runner/regress/test_neon_local_cli.py b/test_runner/regress/test_neon_local_cli.py index f6629c54f9..3314e7fbf6 100644 --- a/test_runner/regress/test_neon_local_cli.py +++ b/test_runner/regress/test_neon_local_cli.py @@ -9,11 +9,18 @@ def test_neon_cli_basics(neon_env_builder: NeonEnvBuilder, port_distributor: Por try: env.neon_cli.start() env.neon_cli.create_tenant(tenant_id=env.initial_tenant, set_default=True) - env.neon_cli.endpoint_start(endpoint_id="ep-main", port=port_distributor.get_port()) + + pg_port = port_distributor.get_port() + http_port = port_distributor.get_port() + env.neon_cli.endpoint_start( + endpoint_id="ep-basic-main", pg_port=pg_port, http_port=http_port + ) env.neon_cli.create_branch(new_branch_name="migration_check") + pg_port = port_distributor.get_port() + http_port = port_distributor.get_port() env.neon_cli.endpoint_start( - endpoint_id="ep-migration_check", port=port_distributor.get_port() + endpoint_id="ep-migration_check", pg_port=pg_port, http_port=http_port ) finally: env.neon_cli.stop() diff --git a/test_runner/regress/test_ondemand_download.py b/test_runner/regress/test_ondemand_download.py index 1414b4ed8e..c26ec76172 100644 --- a/test_runner/regress/test_ondemand_download.py +++ b/test_runner/regress/test_ondemand_download.py @@ -58,11 +58,8 @@ def test_ondemand_download_large_rel( ) ##### First start, insert secret data and upload it to the remote storage - env = neon_env_builder.init_start() - - # Override defaults, to create more layers - tenant, _ = env.neon_cli.create_tenant( - conf={ + env = neon_env_builder.init_start( + 
initial_tenant_conf={ # disable background GC "gc_period": "0s", "gc_horizon": f"{10 * 1024 ** 3}", # 10 GB @@ -75,7 +72,6 @@ def test_ondemand_download_large_rel( "compaction_period": "0s", } ) - env.initial_tenant = tenant endpoint = env.endpoints.create_start("main") diff --git a/test_runner/regress/test_pageserver_restarts_under_workload.py b/test_runner/regress/test_pageserver_restarts_under_workload.py index bc3f3f2be4..fc93dcffbb 100644 --- a/test_runner/regress/test_pageserver_restarts_under_workload.py +++ b/test_runner/regress/test_pageserver_restarts_under_workload.py @@ -17,12 +17,6 @@ def test_pageserver_restarts_under_worload(neon_simple_env: NeonEnv, pg_bin: PgB n_restarts = 10 scale = 10 - # the background task may complete the init task delay after finding an - # active tenant, but shutdown starts right before Tenant::gc_iteration - env.pageserver.allowed_errors.append( - r".*Gc failed, retrying in \S+: Cannot run GC iteration on inactive tenant" - ) - def run_pgbench(connstr: str): log.info(f"Start a pgbench workload on pg {connstr}") pg_bin.run_capture(["pgbench", "-i", f"-s{scale}", connstr]) diff --git a/test_runner/regress/test_remote_storage.py b/test_runner/regress/test_remote_storage.py index 02f1aac99c..baef8ecacc 100644 --- a/test_runner/regress/test_remote_storage.py +++ b/test_runner/regress/test_remote_storage.py @@ -140,7 +140,7 @@ def test_remote_storage_backup_and_restore( # This is before the failures injected by test_remote_failures, so it's a permanent error. pageserver_http.configure_failpoints(("storage-sync-list-remote-timelines", "return")) env.pageserver.allowed_errors.append( - ".*error attaching tenant: storage-sync-list-remote-timelines", + ".*attach failed.*: storage-sync-list-remote-timelines", ) # Attach it. This HTTP request will succeed and launch a # background task to load the tenant. 
In that background task, @@ -693,15 +693,15 @@ def test_empty_branch_remote_storage_upload_on_restart( f".*POST.* path=/v1/tenant/{env.initial_tenant}/timeline.* request was dropped before completing" ) - # index upload is now hitting the failpoint, should not block the shutdown - env.pageserver.stop() + # index upload is now hitting the failpoint, it should block the shutdown + env.pageserver.stop(immediate=True) timeline_path = ( Path("tenants") / str(env.initial_tenant) / "timelines" / str(new_branch_timeline_id) ) local_metadata = env.repo_dir / timeline_path / "metadata" - assert local_metadata.is_file(), "timeout cancelled timeline branching, not the upload" + assert local_metadata.is_file() assert isinstance(env.remote_storage, LocalFsStorage) new_branch_on_remote_storage = env.remote_storage.root / timeline_path diff --git a/test_runner/regress/test_sni_router.py b/test_runner/regress/test_sni_router.py index f3aa429c49..9b78e8287e 100644 --- a/test_runner/regress/test_sni_router.py +++ b/test_runner/regress/test_sni_router.py @@ -37,6 +37,7 @@ class PgSniRouter(PgProtocol): destination: str, tls_cert: Path, tls_key: Path, + test_output_dir: Path, ): # Must use a hostname rather than IP here, for SNI to work host = "localhost" @@ -49,6 +50,7 @@ class PgSniRouter(PgProtocol): self.tls_cert = tls_cert self.tls_key = tls_key self._popen: Optional[subprocess.Popen[bytes]] = None + self.test_output_dir = test_output_dir def start(self) -> "PgSniRouter": assert self._popen is None @@ -60,8 +62,12 @@ class PgSniRouter(PgProtocol): *["--destination", self.destination], ] - self._popen = subprocess.Popen(args) + router_log_path = self.test_output_dir / "pg_sni_router.log" + router_log = open(router_log_path, "w") + + self._popen = subprocess.Popen(args, stderr=router_log) self._wait_until_ready() + log.info(f"pg_sni_router started, log file: {router_log_path}") return self @backoff.on_exception(backoff.expo, OSError, max_time=10) @@ -121,6 +127,7 @@ def test_pg_sni_router( destination="localtest.me", tls_cert=test_output_dir / "router.crt", tls_key=test_output_dir / "router.key", + test_output_dir=test_output_dir, ) as router: router.start() diff --git a/test_runner/regress/test_tenant_detach.py b/test_runner/regress/test_tenant_detach.py index 82664cff94..9d0fdcfaf8 100644 --- a/test_runner/regress/test_tenant_detach.py +++ b/test_runner/regress/test_tenant_detach.py @@ -59,6 +59,13 @@ def test_tenant_reattach( # create new nenant tenant_id, timeline_id = env.neon_cli.create_tenant() + # Attempts to connect from compute to pageserver while the tenant is + # temporarily detached produces these errors in the pageserver log. + env.pageserver.allowed_errors.append(f".*Tenant {tenant_id} not found.*") + env.pageserver.allowed_errors.append( + f".*Tenant {tenant_id} will not become active\\. Current state: Stopping.*" + ) + with env.endpoints.create_start("main", tenant_id=tenant_id) as endpoint: with endpoint.cursor() as cur: cur.execute("CREATE TABLE t(key int primary key, value text)") @@ -223,13 +230,6 @@ def test_tenant_reattach_while_busy( ) env = neon_env_builder.init_start() - # Attempts to connect from compute to pageserver while the tenant is - # temporarily detached produces these errors in the pageserver log. - env.pageserver.allowed_errors.append(".*Tenant .* not found.*") - env.pageserver.allowed_errors.append( - ".*Tenant .* will not become active\\. 
Current state: Stopping.*" - ) - pageserver_http = env.pageserver.http_client() # create new nenant @@ -238,6 +238,13 @@ def test_tenant_reattach_while_busy( conf={"checkpoint_distance": "100000"} ) + # Attempts to connect from compute to pageserver while the tenant is + # temporarily detached produces these errors in the pageserver log. + env.pageserver.allowed_errors.append(f".*Tenant {tenant_id} not found.*") + env.pageserver.allowed_errors.append( + f".*Tenant {tenant_id} will not become active\\. Current state: Stopping.*" + ) + endpoint = env.endpoints.create_start("main", tenant_id=tenant_id) cur = endpoint.connect().cursor() @@ -275,6 +282,13 @@ def test_tenant_detach_smoke(neon_env_builder: NeonEnvBuilder): # create new nenant tenant_id, timeline_id = env.neon_cli.create_tenant() + # Attempts to connect from compute to pageserver while the tenant is + # temporarily detached produces these errors in the pageserver log. + env.pageserver.allowed_errors.append(f".*Tenant {tenant_id} not found.*") + env.pageserver.allowed_errors.append( + f".*Tenant {tenant_id} will not become active\\. Current state: Stopping.*" + ) + # assert tenant exists on disk assert (env.repo_dir / "tenants" / str(tenant_id)).exists() @@ -336,6 +350,13 @@ def test_tenant_detach_ignored_tenant(neon_simple_env: NeonEnv): # create a new tenant tenant_id, _ = env.neon_cli.create_tenant() + # Attempts to connect from compute to pageserver while the tenant is + # temporarily detached produces these errors in the pageserver log. + env.pageserver.allowed_errors.append(f".*Tenant {tenant_id} not found.*") + env.pageserver.allowed_errors.append( + f".*Tenant {tenant_id} will not become active\\. Current state: Stopping.*" + ) + # assert tenant exists on disk assert (env.repo_dir / "tenants" / str(tenant_id)).exists() @@ -385,6 +406,13 @@ def test_tenant_detach_regular_tenant(neon_simple_env: NeonEnv): # create a new tenant tenant_id, _ = env.neon_cli.create_tenant() + # Attempts to connect from compute to pageserver while the tenant is + # temporarily detached produces these errors in the pageserver log. + env.pageserver.allowed_errors.append(f".*Tenant {tenant_id} not found.*") + env.pageserver.allowed_errors.append( + f".*Tenant {tenant_id} will not become active\\. Current state: Stopping.*" + ) + # assert tenant exists on disk assert (env.repo_dir / "tenants" / str(tenant_id)).exists() @@ -399,6 +427,7 @@ def test_tenant_detach_regular_tenant(neon_simple_env: NeonEnv): log.info("detaching regular tenant with detach ignored flag") client.tenant_detach(tenant_id, True) + log.info("regular tenant detached without error") # check that nothing is left on disk for deleted tenant @@ -432,6 +461,13 @@ def test_detach_while_attaching( tenant_id = TenantId(endpoint.safe_psql("show neon.tenant_id")[0][0]) timeline_id = TimelineId(endpoint.safe_psql("show neon.timeline_id")[0][0]) + # Attempts to connect from compute to pageserver while the tenant is + # temporarily detached produces these errors in the pageserver log. + env.pageserver.allowed_errors.append(f".*Tenant {tenant_id} not found.*") + env.pageserver.allowed_errors.append( + f".*Tenant {tenant_id} will not become active\\. Current state: Stopping.*" + ) + # Create table, and insert some rows. 
Make it big enough that it doesn't fit in # shared_buffers, otherwise the SELECT after restart will just return answer # from shared_buffers without hitting the page server, which defeats the point @@ -577,6 +613,13 @@ def test_ignored_tenant_download_missing_layers( tenant_id = TenantId(endpoint.safe_psql("show neon.tenant_id")[0][0]) timeline_id = TimelineId(endpoint.safe_psql("show neon.timeline_id")[0][0]) + # Attempts to connect from compute to pageserver while the tenant is + # temporarily detached produces these errors in the pageserver log. + env.pageserver.allowed_errors.append(f".*Tenant {tenant_id} not found.*") + env.pageserver.allowed_errors.append( + f".*Tenant {tenant_id} will not become active\\. Current state: Stopping.*" + ) + data_id = 1 data_secret = "very secret secret" insert_test_data(pageserver_http, tenant_id, timeline_id, data_id, data_secret, endpoint) @@ -636,6 +679,13 @@ def test_ignored_tenant_stays_broken_without_metadata( tenant_id = TenantId(endpoint.safe_psql("show neon.tenant_id")[0][0]) timeline_id = TimelineId(endpoint.safe_psql("show neon.timeline_id")[0][0]) + # Attempts to connect from compute to pageserver while the tenant is + # temporarily detached produces these errors in the pageserver log. + env.pageserver.allowed_errors.append(f".*Tenant {tenant_id} not found.*") + env.pageserver.allowed_errors.append( + f".*Tenant {tenant_id} will not become active\\. Current state: Broken.*" + ) + # ignore the tenant and remove its metadata pageserver_http.tenant_ignore(tenant_id) tenant_timeline_dir = env.repo_dir / "tenants" / str(tenant_id) / "timelines" / str(timeline_id) @@ -647,7 +697,9 @@ def test_ignored_tenant_stays_broken_without_metadata( metadata_removed = True assert metadata_removed, f"Failed to find metadata file in {tenant_timeline_dir}" - env.pageserver.allowed_errors.append(".*could not load tenant .*?: failed to load metadata.*") + env.pageserver.allowed_errors.append( + f".*{tenant_id}.*: load failed.*: failed to load metadata.*" + ) # now, load it from the local files and expect it to be broken due to inability to load tenant files into memory pageserver_http.tenant_load(tenant_id=tenant_id) @@ -670,6 +722,13 @@ def test_load_attach_negatives( tenant_id = TenantId(endpoint.safe_psql("show neon.tenant_id")[0][0]) + # Attempts to connect from compute to pageserver while the tenant is + # temporarily detached produces these errors in the pageserver log. + env.pageserver.allowed_errors.append(f".*Tenant {tenant_id} not found.*") + env.pageserver.allowed_errors.append( + f".*Tenant {tenant_id} will not become active\\. Current state: Stopping.*" + ) + env.pageserver.allowed_errors.append(".*tenant .*? already exists, state:.*") with pytest.raises( expected_exception=PageserverApiException, @@ -712,6 +771,13 @@ def test_ignore_while_attaching( tenant_id = TenantId(endpoint.safe_psql("show neon.tenant_id")[0][0]) timeline_id = TimelineId(endpoint.safe_psql("show neon.timeline_id")[0][0]) + # Attempts to connect from compute to pageserver while the tenant is + # temporarily detached produces these errors in the pageserver log. + env.pageserver.allowed_errors.append(f".*Tenant {tenant_id} not found.*") + env.pageserver.allowed_errors.append( + f".*Tenant {tenant_id} will not become active\\. 
Current state: Stopping.*" + ) + data_id = 1 data_secret = "very secret secret" insert_test_data(pageserver_http, tenant_id, timeline_id, data_id, data_secret, endpoint) diff --git a/test_runner/regress/test_tenant_size.py b/test_runner/regress/test_tenant_size.py index 60ab268882..e9dcd1e5cd 100644 --- a/test_runner/regress/test_tenant_size.py +++ b/test_runner/regress/test_tenant_size.py @@ -318,7 +318,7 @@ def test_only_heads_within_horizon(neon_simple_env: NeonEnv, test_output_dir: Pa def test_single_branch_get_tenant_size_grows( - neon_env_builder: NeonEnvBuilder, test_output_dir: Path + neon_env_builder: NeonEnvBuilder, test_output_dir: Path, pg_version: PgVersion ): """ Operate on single branch reading the tenants size after each transaction. @@ -333,6 +333,13 @@ def test_single_branch_get_tenant_size_grows( # that there next_gc_cutoff could be smaller than initdb_lsn, which will # obviously lead to issues when calculating the size. gc_horizon = 0x38000 + + # it's a bit of a hack, but different versions of postgres have different + # amount of WAL generated for the same amount of data. so we need to + # adjust the gc_horizon accordingly. + if pg_version == PgVersion.V14: + gc_horizon = 0x40000 + neon_env_builder.pageserver_config_override = f"tenant_config={{compaction_period='0s', gc_period='0s', pitr_interval='0sec', gc_horizon={gc_horizon}}}" env = neon_env_builder.init_start() diff --git a/test_runner/regress/test_tenants.py b/test_runner/regress/test_tenants.py index 5642449ce6..aef2df4932 100644 --- a/test_runner/regress/test_tenants.py +++ b/test_runner/regress/test_tenants.py @@ -22,6 +22,7 @@ from fixtures.neon_fixtures import ( available_remote_storages, ) from fixtures.types import Lsn, TenantId, TimelineId +from fixtures.utils import wait_until from prometheus_client.samples import Sample @@ -266,6 +267,7 @@ def test_pageserver_metrics_removed_after_detach( cur.execute("INSERT INTO t SELECT generate_series(1,100000), 'payload'") cur.execute("SELECT sum(key) FROM t") assert cur.fetchone() == (5000050000,) + endpoint.stop() def get_ps_metric_samples_for_tenant(tenant_id: TenantId) -> List[Sample]: ps_metrics = env.pageserver.http_client().get_metrics() @@ -308,27 +310,26 @@ def test_pageserver_with_empty_tenants( env.pageserver.allowed_errors.append( ".*marking .* as locally complete, while it doesnt exist in remote index.*" ) - env.pageserver.allowed_errors.append( - ".*could not load tenant.*Failed to list timelines directory.*" - ) + env.pageserver.allowed_errors.append(".*load failed.*list timelines directory.*") client = env.pageserver.http_client() - tenant_with_empty_timelines_dir = client.tenant_create() - temp_timelines = client.timeline_list(tenant_with_empty_timelines_dir) + tenant_with_empty_timelines = TenantId.generate() + client.tenant_create(tenant_with_empty_timelines) + temp_timelines = client.timeline_list(tenant_with_empty_timelines) for temp_timeline in temp_timelines: client.timeline_delete( - tenant_with_empty_timelines_dir, TimelineId(temp_timeline["timeline_id"]) + tenant_with_empty_timelines, TimelineId(temp_timeline["timeline_id"]) ) files_in_timelines_dir = sum( 1 for _p in Path.iterdir( - Path(env.repo_dir) / "tenants" / str(tenant_with_empty_timelines_dir) / "timelines" + Path(env.repo_dir) / "tenants" / str(tenant_with_empty_timelines) / "timelines" ) ) assert ( files_in_timelines_dir == 0 - ), f"Tenant {tenant_with_empty_timelines_dir} should have an empty timelines/ directory" + ), f"Tenant {tenant_with_empty_timelines} should have an empty 
timelines/ directory" # Trigger timeline re-initialization after pageserver restart env.endpoints.stop_all() @@ -340,9 +341,15 @@ def test_pageserver_with_empty_tenants( env.pageserver.start() client = env.pageserver.http_client() - tenants = client.tenant_list() - assert len(tenants) == 2 + def not_loading(): + tenants = client.tenant_list() + assert len(tenants) == 2 + assert all(t["state"]["slug"] != "Loading" for t in tenants) + + wait_until(10, 0.2, not_loading) + + tenants = client.tenant_list() [broken_tenant] = [t for t in tenants if t["id"] == str(tenant_without_timelines_dir)] assert ( @@ -354,17 +361,17 @@ def test_pageserver_with_empty_tenants( broken_tenant_status["state"]["slug"] == "Broken" ), f"Tenant {tenant_without_timelines_dir} without timelines dir should be broken" - assert env.pageserver.log_contains(".*Setting tenant as Broken state, reason:.*") + assert env.pageserver.log_contains(".*load failed, setting tenant state to Broken:.*") - [loaded_tenant] = [t for t in tenants if t["id"] == str(tenant_with_empty_timelines_dir)] + [loaded_tenant] = [t for t in tenants if t["id"] == str(tenant_with_empty_timelines)] assert ( loaded_tenant["state"]["slug"] == "Active" - ), "Tenant {tenant_with_empty_timelines_dir} with empty timelines dir should be active and ready for timeline creation" + ), "Tenant {tenant_with_empty_timelines} with empty timelines dir should be active and ready for timeline creation" - loaded_tenant_status = client.tenant_status(tenant_with_empty_timelines_dir) + loaded_tenant_status = client.tenant_status(tenant_with_empty_timelines) assert ( loaded_tenant_status["state"]["slug"] == "Active" - ), f"Tenant {tenant_with_empty_timelines_dir} without timelines dir should be active" + ), f"Tenant {tenant_with_empty_timelines} without timelines dir should be active" time.sleep(1) # to allow metrics propagation @@ -374,7 +381,7 @@ def test_pageserver_with_empty_tenants( "state": "Broken", } active_tenants_metric_filter = { - "tenant_id": str(tenant_with_empty_timelines_dir), + "tenant_id": str(tenant_with_empty_timelines), "state": "Active", } @@ -386,7 +393,7 @@ def test_pageserver_with_empty_tenants( assert ( tenant_active_count == 1 - ), f"Tenant {tenant_with_empty_timelines_dir} should have metric as active" + ), f"Tenant {tenant_with_empty_timelines} should have metric as active" tenant_broken_count = int( ps_metrics.query_one( diff --git a/test_runner/regress/test_timeline_delete.py b/test_runner/regress/test_timeline_delete.py index 7135b621cb..be79538843 100644 --- a/test_runner/regress/test_timeline_delete.py +++ b/test_runner/regress/test_timeline_delete.py @@ -271,8 +271,9 @@ def test_timeline_delete_fail_before_local_delete(neon_env_builder: NeonEnvBuild env.pageserver.allowed_errors.append( ".*Ignoring new state, equal to the existing one: Stopping" ) + # this happens, because the stuck timeline is visible to shutdown env.pageserver.allowed_errors.append( - ".*during shutdown: cannot flush frozen layers when flush_loop is not running, state is Exited" + ".*freeze_and_flush_on_shutdown.+: failed to freeze and flush: cannot flush frozen layers when flush_loop is not running, state is Exited" ) ps_http = env.pageserver.http_client() @@ -371,7 +372,7 @@ def test_concurrent_timeline_delete_if_first_stuck_at_index_upload( # make the second call and assert behavior log.info("second call start") - error_msg_re = "another task is already setting the deleted_flag, started at" + error_msg_re = "timeline deletion is already in progress" with 
pytest.raises(PageserverApiException, match=error_msg_re) as second_call_err: ps_http.timeline_delete(env.initial_tenant, child_timeline_id) assert second_call_err.value.status_code == 500 @@ -437,12 +438,22 @@ def test_delete_timeline_client_hangup(neon_env_builder: NeonEnvBuilder): wait_until(50, 0.1, got_hangup_log_message) - # ok, retry without failpoint, it should succeed + # check that the timeline is still present + ps_http.timeline_detail(env.initial_tenant, child_timeline_id) + + # ok, disable the failpoint to let the deletion finish ps_http.configure_failpoints((failpoint_name, "off")) - # this should succeed - ps_http.timeline_delete(env.initial_tenant, child_timeline_id, timeout=2) - # the second call will try to transition the timeline into Stopping state, but it's already in that state - env.pageserver.allowed_errors.append( - f".*{child_timeline_id}.*Ignoring new state, equal to the existing one: Stopping" - ) + def first_request_finished(): + message = f".*DELETE.*{child_timeline_id}.*Cancelled request finished" + assert env.pageserver.log_contains(message) + + wait_until(50, 0.1, first_request_finished) + + # check that the timeline is gone + notfound_message = f"Timeline {env.initial_tenant}/{child_timeline_id} was not found" + env.pageserver.allowed_errors.append(".*" + notfound_message) + with pytest.raises(PageserverApiException, match=notfound_message) as exc: + ps_http.timeline_detail(env.initial_tenant, child_timeline_id) + + assert exc.value.status_code == 404 diff --git a/test_runner/regress/test_wal_acceptor.py b/test_runner/regress/test_wal_acceptor.py index 2a4141ed30..8b595596cb 100644 --- a/test_runner/regress/test_wal_acceptor.py +++ b/test_runner/regress/test_wal_acceptor.py @@ -1001,9 +1001,6 @@ def test_safekeeper_without_pageserver( def test_replace_safekeeper(neon_env_builder: NeonEnvBuilder): - def safekeepers_guc(env: NeonEnv, sk_names: List[int]) -> str: - return ",".join([f"localhost:{sk.port.pg}" for sk in env.safekeepers if sk.id in sk_names]) - def execute_payload(endpoint: Endpoint): with closing(endpoint.connect()) as conn: with conn.cursor() as cur: @@ -1032,9 +1029,8 @@ def test_replace_safekeeper(neon_env_builder: NeonEnvBuilder): log.info("Use only first 3 safekeepers") env.safekeepers[3].stop() - active_safekeepers = [1, 2, 3] endpoint = env.endpoints.create("test_replace_safekeeper") - endpoint.adjust_for_safekeepers(safekeepers_guc(env, active_safekeepers)) + endpoint.active_safekeepers = [1, 2, 3] endpoint.start() # learn neon timeline from compute @@ -1072,9 +1068,8 @@ def test_replace_safekeeper(neon_env_builder: NeonEnvBuilder): log.info("Recreate postgres to replace failed sk1 with new sk4") endpoint.stop_and_destroy().create("test_replace_safekeeper") - active_safekeepers = [2, 3, 4] env.safekeepers[3].start() - endpoint.adjust_for_safekeepers(safekeepers_guc(env, active_safekeepers)) + endpoint.active_safekeepers = [2, 3, 4] endpoint.start() execute_payload(endpoint) @@ -1293,9 +1288,8 @@ def test_pull_timeline(neon_env_builder: NeonEnvBuilder): log.info("Use only first 3 safekeepers") env.safekeepers[3].stop() - active_safekeepers = [1, 2, 3] endpoint = env.endpoints.create("test_pull_timeline") - endpoint.adjust_for_safekeepers(safekeepers_guc(env, active_safekeepers)) + endpoint.active_safekeepers = [1, 2, 3] endpoint.start() # learn neon timeline from compute @@ -1332,10 +1326,8 @@ def test_pull_timeline(neon_env_builder: NeonEnvBuilder): show_statuses(env.safekeepers, tenant_id, timeline_id) log.info("Restarting compute with 
new config to verify that it works") - active_safekeepers = [1, 3, 4] - endpoint.stop_and_destroy().create("test_pull_timeline") - endpoint.adjust_for_safekeepers(safekeepers_guc(env, active_safekeepers)) + endpoint.active_safekeepers = [1, 3, 4] endpoint.start() execute_payload(endpoint) diff --git a/test_runner/regress/test_wal_acceptor_async.py b/test_runner/regress/test_wal_acceptor_async.py index 7debeed140..ce33975a0e 100644 --- a/test_runner/regress/test_wal_acceptor_async.py +++ b/test_runner/regress/test_wal_acceptor_async.py @@ -2,9 +2,11 @@ import asyncio import random import time from dataclasses import dataclass +from pathlib import Path from typing import List, Optional import asyncpg +import toml from fixtures.log_helper import getLogger from fixtures.neon_fixtures import Endpoint, NeonEnv, NeonEnvBuilder, Safekeeper from fixtures.types import Lsn, TenantId, TimelineId @@ -251,7 +253,8 @@ def endpoint_create_start(env: NeonEnv, branch: str, pgdir_name: Optional[str]): endpoint = Endpoint( env, tenant_id=env.initial_tenant, - port=env.port_distributor.get_port(), + pg_port=env.port_distributor.get_port(), + http_port=env.port_distributor.get_port(), # In these tests compute has high probability of terminating on its own # before our stop() due to lost consensus leadership. check_stop_result=False, @@ -536,15 +539,20 @@ def test_race_conditions(neon_env_builder: NeonEnvBuilder): # Check that pageserver can select safekeeper with largest commit_lsn # and switch if LSN is not updated for some time (NoWalTimeout). -async def run_wal_lagging(env: NeonEnv, endpoint: Endpoint): - def safekeepers_guc(env: NeonEnv, active_sk: List[bool]) -> str: - # use ports 10, 11 and 12 to simulate unavailable safekeepers - return ",".join( - [ - f"localhost:{sk.port.pg if active else 10 + i}" - for i, (sk, active) in enumerate(zip(env.safekeepers, active_sk)) - ] - ) +async def run_wal_lagging(env: NeonEnv, endpoint: Endpoint, test_output_dir: Path): + def adjust_safekeepers(env: NeonEnv, active_sk: List[bool]): + # Change the pg ports of the inactive safekeepers in the config file to be + # invalid, to make them unavailable to the endpoint. We use + # ports 10, 11 and 12 to simulate unavailable safekeepers. 
+ config = toml.load(test_output_dir / "repo" / "config") + for i, (sk, active) in enumerate(zip(env.safekeepers, active_sk)): + if active: + config["safekeepers"][i]["pg_port"] = env.safekeepers[i].port.pg + else: + config["safekeepers"][i]["pg_port"] = 10 + i + + with open(test_output_dir / "repo" / "config", "w") as f: + toml.dump(config, f) conn = await endpoint.connect_async() await conn.execute("CREATE TABLE t(key int primary key, value text)") @@ -565,7 +573,7 @@ async def run_wal_lagging(env: NeonEnv, endpoint: Endpoint): it -= 1 continue - endpoint.adjust_for_safekeepers(safekeepers_guc(env, active_sk)) + adjust_safekeepers(env, active_sk) log.info(f"Iteration {it}: {active_sk}") endpoint.start() @@ -579,7 +587,7 @@ async def run_wal_lagging(env: NeonEnv, endpoint: Endpoint): await conn.close() endpoint.stop() - endpoint.adjust_for_safekeepers(safekeepers_guc(env, [True] * len(env.safekeepers))) + adjust_safekeepers(env, [True] * len(env.safekeepers)) endpoint.start() conn = await endpoint.connect_async() @@ -590,11 +598,11 @@ async def run_wal_lagging(env: NeonEnv, endpoint: Endpoint): # do inserts while restarting postgres and messing with safekeeper addresses -def test_wal_lagging(neon_env_builder: NeonEnvBuilder): +def test_wal_lagging(neon_env_builder: NeonEnvBuilder, test_output_dir: Path): neon_env_builder.num_safekeepers = 3 env = neon_env_builder.init_start() env.neon_cli.create_branch("test_wal_lagging") endpoint = env.endpoints.create_start("test_wal_lagging") - asyncio.run(run_wal_lagging(env, endpoint)) + asyncio.run(run_wal_lagging(env, endpoint, test_output_dir)) diff --git a/test_runner/regress/test_wal_receiver.py b/test_runner/regress/test_wal_receiver.py index 8e4e154be1..515d47c079 100644 --- a/test_runner/regress/test_wal_receiver.py +++ b/test_runner/regress/test_wal_receiver.py @@ -77,7 +77,8 @@ def test_pageserver_lsn_wait_error_safekeeper_stop(neon_env_builder: NeonEnvBuil try: trigger_wait_lsn_timeout(env, tenant_id) except Exception as e: - exception_string = str(e) + # Strip out the part before stdout, as it contains full command with the list of all safekeepers + exception_string = str(e).split("stdout", 1)[-1] assert expected_timeout_error in exception_string, "Should time out during waiting for WAL" for safekeeper in env.safekeepers: diff --git a/test_runner/regress/test_walredo_not_left_behind_on_detach.py b/test_runner/regress/test_walredo_not_left_behind_on_detach.py index 7d944bebb3..4a47898935 100644 --- a/test_runner/regress/test_walredo_not_left_behind_on_detach.py +++ b/test_runner/regress/test_walredo_not_left_behind_on_detach.py @@ -83,6 +83,9 @@ def test_walredo_not_left_behind_on_detach(neon_env_builder: NeonEnvBuilder): # XXX this is quite brittle as the lifecycle of the WAL redo process is an implementation detail assert_child_processes(pagserver_pid, wal_redo_present=True, defunct_present=False) + # Stop the compute before detaching, to avoid errors in the log. 
+ endpoint.stop() + last_error = None for i in range(3): try: diff --git a/workspace_hack/Cargo.toml b/workspace_hack/Cargo.toml index 3d40f5dede..677b59f453 100644 --- a/workspace_hack/Cargo.toml +++ b/workspace_hack/Cargo.toml @@ -27,7 +27,6 @@ futures-core = { version = "0.3" } futures-executor = { version = "0.3" } futures-sink = { version = "0.3" } futures-util = { version = "0.3", features = ["channel", "io", "sink"] } -hashbrown = { version = "0.12", features = ["raw"] } itertools = { version = "0.10" } libc = { version = "0.2", features = ["extra_traits"] } log = { version = "0.4", default-features = false, features = ["std"] } @@ -39,7 +38,7 @@ num-traits = { version = "0.2", features = ["i128"] } prost = { version = "0.11" } rand = { version = "0.8", features = ["small_rng"] } regex = { version = "1" } -regex-syntax = { version = "0.6" } +regex-syntax = { version = "0.7" } reqwest = { version = "0.11", default-features = false, features = ["blocking", "json", "multipart", "rustls-tls"] } ring = { version = "0.16", features = ["std"] } rustls = { version = "0.20", features = ["dangerous_configuration"] } @@ -62,7 +61,6 @@ url = { version = "2", features = ["serde"] } anyhow = { version = "1", features = ["backtrace"] } bytes = { version = "1", features = ["serde"] } either = { version = "1" } -hashbrown = { version = "0.12", features = ["raw"] } itertools = { version = "0.10" } libc = { version = "0.2", features = ["extra_traits"] } log = { version = "0.4", default-features = false, features = ["std"] } @@ -70,7 +68,7 @@ memchr = { version = "2" } nom = { version = "7" } prost = { version = "0.11" } regex = { version = "1" } -regex-syntax = { version = "0.6" } +regex-syntax = { version = "0.7" } serde = { version = "1", features = ["alloc", "derive"] } syn-dff4ba8e3ae991db = { package = "syn", version = "1", features = ["extra-traits", "full", "visit", "visit-mut"] } syn-f595c2ba2a3f28df = { package = "syn", version = "2", features = ["extra-traits", "full", "visit-mut"] }