diff --git a/.github/workflows/_benchmarking_preparation.yml b/.github/workflows/_benchmarking_preparation.yml index d60f97320b..5cdc16f248 100644 --- a/.github/workflows/_benchmarking_preparation.yml +++ b/.github/workflows/_benchmarking_preparation.yml @@ -27,7 +27,7 @@ jobs: runs-on: [ self-hosted, us-east-2, x64 ] container: - image: neondatabase/build-tools:pinned + image: neondatabase/build-tools:pinned-bookworm credentials: username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} diff --git a/.github/workflows/_build-and-test-locally.yml b/.github/workflows/_build-and-test-locally.yml index 3aa671fab1..c0f59fbdd5 100644 --- a/.github/workflows/_build-and-test-locally.yml +++ b/.github/workflows/_build-and-test-locally.yml @@ -124,28 +124,28 @@ jobs: uses: actions/cache@v4 with: path: pg_install/v14 - key: v1-${{ runner.os }}-${{ runner.arch }}-${{ inputs.build-type }}-pg-${{ steps.pg_v14_rev.outputs.pg_rev }}-bookworm-${{ hashFiles('Makefile', 'Dockerfile.build-tools') }} + key: v1-${{ runner.os }}-${{ runner.arch }}-${{ inputs.build-type }}-pg-${{ steps.pg_v14_rev.outputs.pg_rev }}-bookworm-${{ hashFiles('Makefile', 'build-tools.Dockerfile') }} - name: Cache postgres v15 build id: cache_pg_15 uses: actions/cache@v4 with: path: pg_install/v15 - key: v1-${{ runner.os }}-${{ runner.arch }}-${{ inputs.build-type }}-pg-${{ steps.pg_v15_rev.outputs.pg_rev }}-bookworm-${{ hashFiles('Makefile', 'Dockerfile.build-tools') }} + key: v1-${{ runner.os }}-${{ runner.arch }}-${{ inputs.build-type }}-pg-${{ steps.pg_v15_rev.outputs.pg_rev }}-bookworm-${{ hashFiles('Makefile', 'build-tools.Dockerfile') }} - name: Cache postgres v16 build id: cache_pg_16 uses: actions/cache@v4 with: path: pg_install/v16 - key: v1-${{ runner.os }}-${{ runner.arch }}-${{ inputs.build-type }}-pg-${{ steps.pg_v16_rev.outputs.pg_rev }}-bookworm-${{ hashFiles('Makefile', 'Dockerfile.build-tools') }} + key: v1-${{ runner.os }}-${{ runner.arch }}-${{ 
inputs.build-type }}-pg-${{ steps.pg_v16_rev.outputs.pg_rev }}-bookworm-${{ hashFiles('Makefile', 'build-tools.Dockerfile') }} - name: Cache postgres v17 build id: cache_pg_17 uses: actions/cache@v4 with: path: pg_install/v17 - key: v1-${{ runner.os }}-${{ runner.arch }}-${{ inputs.build-type }}-pg-${{ steps.pg_v17_rev.outputs.pg_rev }}-bookworm-${{ hashFiles('Makefile', 'Dockerfile.build-tools') }} + key: v1-${{ runner.os }}-${{ runner.arch }}-${{ inputs.build-type }}-pg-${{ steps.pg_v17_rev.outputs.pg_rev }}-bookworm-${{ hashFiles('Makefile', 'build-tools.Dockerfile') }} - name: Build postgres v14 if: steps.cache_pg_14.outputs.cache-hit != 'true' diff --git a/.github/workflows/benchmarking.yml b/.github/workflows/benchmarking.yml index 32806b89ab..5ccfe48684 100644 --- a/.github/workflows/benchmarking.yml +++ b/.github/workflows/benchmarking.yml @@ -83,7 +83,7 @@ jobs: runs-on: ${{ matrix.RUNNER }} container: - image: neondatabase/build-tools:pinned + image: neondatabase/build-tools:pinned-bookworm credentials: username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} @@ -178,7 +178,7 @@ jobs: runs-on: [ self-hosted, us-east-2, x64 ] container: - image: neondatabase/build-tools:pinned + image: neondatabase/build-tools:pinned-bookworm credentials: username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} @@ -280,7 +280,7 @@ jobs: region_id_default=${{ env.DEFAULT_REGION_ID }} runner_default='["self-hosted", "us-east-2", "x64"]' runner_azure='["self-hosted", "eastus2", "x64"]' - image_default="neondatabase/build-tools:pinned" + image_default="neondatabase/build-tools:pinned-bookworm" matrix='{ "pg_version" : [ 16 @@ -299,9 +299,9 @@ jobs: "include": [{ "pg_version": 16, "region_id": "'"$region_id_default"'", "platform": "neonvm-captest-freetier", "db_size": "3gb" ,"runner": '"$runner_default"', "image": "'"$image_default"'" }, { "pg_version": 16, "region_id": "'"$region_id_default"'", 
"platform": "neonvm-captest-new", "db_size": "10gb","runner": '"$runner_default"', "image": "'"$image_default"'" }, { "pg_version": 16, "region_id": "'"$region_id_default"'", "platform": "neonvm-captest-new", "db_size": "50gb","runner": '"$runner_default"', "image": "'"$image_default"'" }, - { "pg_version": 16, "region_id": "azure-eastus2", "platform": "neonvm-azure-captest-freetier", "db_size": "3gb" ,"runner": '"$runner_azure"', "image": "neondatabase/build-tools:pinned" }, - { "pg_version": 16, "region_id": "azure-eastus2", "platform": "neonvm-azure-captest-new", "db_size": "10gb","runner": '"$runner_azure"', "image": "neondatabase/build-tools:pinned" }, - { "pg_version": 16, "region_id": "azure-eastus2", "platform": "neonvm-azure-captest-new", "db_size": "50gb","runner": '"$runner_azure"', "image": "neondatabase/build-tools:pinned" }, + { "pg_version": 16, "region_id": "azure-eastus2", "platform": "neonvm-azure-captest-freetier", "db_size": "3gb" ,"runner": '"$runner_azure"', "image": "neondatabase/build-tools:pinned-bookworm" }, + { "pg_version": 16, "region_id": "azure-eastus2", "platform": "neonvm-azure-captest-new", "db_size": "10gb","runner": '"$runner_azure"', "image": "neondatabase/build-tools:pinned-bookworm" }, + { "pg_version": 16, "region_id": "azure-eastus2", "platform": "neonvm-azure-captest-new", "db_size": "50gb","runner": '"$runner_azure"', "image": "neondatabase/build-tools:pinned-bookworm" }, { "pg_version": 16, "region_id": "'"$region_id_default"'", "platform": "neonvm-captest-sharding-reuse", "db_size": "50gb","runner": '"$runner_default"', "image": "'"$image_default"'" }] }' @@ -665,7 +665,7 @@ jobs: runs-on: [ self-hosted, us-east-2, x64 ] container: - image: neondatabase/build-tools:pinned + image: neondatabase/build-tools:pinned-bookworm credentials: username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} @@ -772,7 +772,7 @@ jobs: runs-on: [ self-hosted, us-east-2, x64 ] container: - image: 
neondatabase/build-tools:pinned + image: neondatabase/build-tools:pinned-bookworm credentials: username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} @@ -877,7 +877,7 @@ jobs: runs-on: [ self-hosted, us-east-2, x64 ] container: - image: neondatabase/build-tools:pinned + image: neondatabase/build-tools:pinned-bookworm credentials: username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} diff --git a/.github/workflows/build-build-tools-image.yml b/.github/workflows/build-build-tools-image.yml index 0f05276579..10750089b2 100644 --- a/.github/workflows/build-build-tools-image.yml +++ b/.github/workflows/build-build-tools-image.yml @@ -82,7 +82,7 @@ jobs: - uses: docker/build-push-action@v6 with: - file: Dockerfile.build-tools + file: build-tools.Dockerfile context: . provenance: false push: true diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index b669eaeb11..1186b9927b 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -683,7 +683,7 @@ jobs: provenance: false push: true pull: true - file: compute/Dockerfile.compute-node + file: compute/compute-node.Dockerfile cache-from: type=registry,ref=cache.neon.build/compute-node-${{ matrix.version.pg }}:cache-${{ matrix.version.debian }}-${{ matrix.arch }} cache-to: ${{ github.ref_name == 'main' && format('type=registry,ref=cache.neon.build/compute-node-{0}:cache-{1}-{2},mode=max', matrix.version.pg, matrix.version.debian, matrix.arch) || '' }} tags: | @@ -703,7 +703,7 @@ jobs: provenance: false push: true pull: true - file: compute/Dockerfile.compute-node + file: compute/compute-node.Dockerfile target: neon-pg-ext-test cache-from: type=registry,ref=cache.neon.build/neon-test-extensions-${{ matrix.version.pg }}:cache-${{ matrix.version.debian }}-${{ matrix.arch }} cache-to: ${{ github.ref_name == 'main' && 
format('type=registry,ref=cache.neon.build/neon-test-extensions-{0}:cache-{1}-{2},mode=max', matrix.version.pg, matrix.version.debian, matrix.arch) || '' }} @@ -728,7 +728,7 @@ jobs: provenance: false push: true pull: true - file: compute/Dockerfile.compute-node + file: compute/compute-node.Dockerfile cache-from: type=registry,ref=cache.neon.build/neon-test-extensions-${{ matrix.version.pg }}:cache-${{ matrix.version.debian }}-${{ matrix.arch }} cache-to: ${{ github.ref_name == 'main' && format('type=registry,ref=cache.neon.build/compute-tools-{0}:cache-{1}-{2},mode=max', matrix.version.pg, matrix.version.debian, matrix.arch) || '' }} tags: | diff --git a/.github/workflows/check-build-tools-image.yml b/.github/workflows/check-build-tools-image.yml index 807a9ef3bd..a7a15ad58b 100644 --- a/.github/workflows/check-build-tools-image.yml +++ b/.github/workflows/check-build-tools-image.yml @@ -31,7 +31,7 @@ jobs: id: get-build-tools-tag env: IMAGE_TAG: | - ${{ hashFiles('Dockerfile.build-tools', + ${{ hashFiles('build-tools.Dockerfile', '.github/workflows/check-build-tools-image.yml', '.github/workflows/build-build-tools-image.yml') }} run: | diff --git a/.github/workflows/cloud-regress.yml b/.github/workflows/cloud-regress.yml index ecafe183f8..19ebf457b8 100644 --- a/.github/workflows/cloud-regress.yml +++ b/.github/workflows/cloud-regress.yml @@ -31,7 +31,7 @@ jobs: runs-on: us-east-2 container: - image: neondatabase/build-tools:pinned + image: neondatabase/build-tools:pinned-bookworm options: --init steps: diff --git a/.github/workflows/trigger-e2e-tests.yml b/.github/workflows/trigger-e2e-tests.yml index 5c5423e252..1e7264c55a 100644 --- a/.github/workflows/trigger-e2e-tests.yml +++ b/.github/workflows/trigger-e2e-tests.yml @@ -112,7 +112,7 @@ jobs: # This isn't exhaustive, just the paths that are most directly compute-related. # For example, compute_ctl also depends on libs/utils, but we don't trigger # an e2e run on that. 
- vendor/*|pgxn/*|compute_tools/*|libs/vm_monitor/*|compute/Dockerfile.compute-node) + vendor/*|pgxn/*|compute_tools/*|libs/vm_monitor/*|compute/compute-node.Dockerfile) platforms=$(echo "${platforms}" | jq --compact-output '. += ["k8s-neonvm"] | unique') ;; *) diff --git a/Cargo.lock b/Cargo.lock index 6b212bac2e..ad29fa4634 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -148,9 +148,9 @@ dependencies = [ [[package]] name = "asn1-rs" -version = "0.5.2" +version = "0.6.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7f6fd5ddaf0351dff5b8da21b2fb4ff8e08ddd02857f0bf69c47639106c0fff0" +checksum = "5493c3bedbacf7fd7382c6346bbd66687d12bbaad3a89a2d2c303ee6cf20b048" dependencies = [ "asn1-rs-derive", "asn1-rs-impl", @@ -164,25 +164,25 @@ dependencies = [ [[package]] name = "asn1-rs-derive" -version = "0.4.0" +version = "0.5.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "726535892e8eae7e70657b4c8ea93d26b8553afb1ce617caee529ef96d7dee6c" +checksum = "965c2d33e53cb6b267e148a4cb0760bc01f4904c1cd4bb4002a085bb016d1490" dependencies = [ "proc-macro2", "quote", - "syn 1.0.109", + "syn 2.0.52", "synstructure", ] [[package]] name = "asn1-rs-impl" -version = "0.1.0" +version = "0.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2777730b2039ac0f95f093556e61b6d26cebed5393ca6f152717777cec3a42ed" +checksum = "7b18050c2cd6fe86c3a76584ef5e0baf286d038cda203eb6223df2cc413565f7" dependencies = [ "proc-macro2", "quote", - "syn 1.0.109", + "syn 2.0.52", ] [[package]] @@ -310,6 +310,33 @@ dependencies = [ "zeroize", ] +[[package]] +name = "aws-lc-rs" +version = "1.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2f95446d919226d587817a7d21379e6eb099b97b45110a7f272a444ca5c54070" +dependencies = [ + "aws-lc-sys", + "mirai-annotations", + "paste", + "zeroize", +] + +[[package]] +name = "aws-lc-sys" +version = "0.21.2" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "b3ddc4a5b231dd6958b140ff3151b6412b3f4321fab354f399eec8f14b06df62" +dependencies = [ + "bindgen 0.69.5", + "cc", + "cmake", + "dunce", + "fs_extra", + "libc", + "paste", +] + [[package]] name = "aws-runtime" version = "1.4.3" @@ -595,7 +622,7 @@ dependencies = [ "once_cell", "pin-project-lite", "pin-utils", - "rustls 0.21.11", + "rustls 0.21.12", "tokio", "tracing", ] @@ -915,6 +942,29 @@ dependencies = [ "serde", ] +[[package]] +name = "bindgen" +version = "0.69.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "271383c67ccabffb7381723dea0672a673f292304fcb45c01cc648c7a8d58088" +dependencies = [ + "bitflags 2.4.1", + "cexpr", + "clang-sys", + "itertools 0.10.5", + "lazy_static", + "lazycell", + "log", + "prettyplease", + "proc-macro2", + "quote", + "regex", + "rustc-hash", + "shlex", + "syn 2.0.52", + "which", +] + [[package]] name = "bindgen" version = "0.70.1" @@ -924,7 +974,7 @@ dependencies = [ "bitflags 2.4.1", "cexpr", "clang-sys", - "itertools 0.12.1", + "itertools 0.10.5", "log", "prettyplease", "proc-macro2", @@ -1038,12 +1088,13 @@ checksum = "37b2a672a2cb129a2e41c10b1224bb368f9f37a2b16b612598138befd7b37eb5" [[package]] name = "cc" -version = "1.0.83" +version = "1.1.30" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f1174fb0b6ec23863f8b971027804a42614e347eafb0a95bf0b12cdae21fc4d0" +checksum = "b16803a61b81d9eabb7eae2588776c4c1e584b738ede45fdbb4c972cec1e9945" dependencies = [ "jobserver", "libc", + "shlex", ] [[package]] @@ -1169,6 +1220,15 @@ version = "0.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2da6da31387c7e4ef160ffab6d5e7f00c42626fe39aea70a7b0f1773f7dd6c1b" +[[package]] +name = "cmake" +version = "0.1.51" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fb1e43aa7fd152b1f968787f7dbcdeb306d1867ff373c69955211876c053f91a" +dependencies = [ + "cc", +] + 
[[package]] name = "colorchoice" version = "1.0.0" @@ -1624,9 +1684,9 @@ dependencies = [ [[package]] name = "der-parser" -version = "8.2.0" +version = "9.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dbd676fbbab537128ef0278adb5576cf363cff6aa22a7b24effe97347cfab61e" +checksum = "5cd0a5c643689626bec213c4d8bd4d96acc8ffdb4ad4bb6bc16abf27d5f4b553" dependencies = [ "asn1-rs", "displaydoc", @@ -1755,6 +1815,12 @@ dependencies = [ "syn 2.0.52", ] +[[package]] +name = "dunce" +version = "1.0.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "92773504d58c093f6de2459af4af33faa518c13451eb8f2b5698ed3d36e7c813" + [[package]] name = "dyn-clone" version = "1.0.14" @@ -2059,6 +2125,12 @@ dependencies = [ "tokio-util", ] +[[package]] +name = "fs_extra" +version = "1.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "42703706b716c37f96a77aea830392ad231f44c9e9a67872fa5548707e11b11c" + [[package]] name = "fsevent-sys" version = "4.1.0" @@ -2412,6 +2484,15 @@ dependencies = [ "digest", ] +[[package]] +name = "home" +version = "0.5.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e3d1354bf6b7235cb4a0576c2619fd4ed18183f689b12b006a0ee7329eeff9a5" +dependencies = [ + "windows-sys 0.52.0", +] + [[package]] name = "hostname" version = "0.4.0" @@ -2581,7 +2662,7 @@ dependencies = [ "http 0.2.9", "hyper 0.14.30", "log", - "rustls 0.21.11", + "rustls 0.21.12", "rustls-native-certs 0.6.2", "tokio", "tokio-rustls 0.24.0", @@ -2801,9 +2882,9 @@ checksum = "49f1f14873335454500d59611f1cf4a4b0f786f9ac11f4312a78e4cf2566695b" [[package]] name = "jobserver" -version = "0.1.26" +version = "0.1.32" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "936cfd212a0155903bcbc060e316fb6cc7cbf2e1907329391ebadc1fe0ce77c2" +checksum = "48d1dbcbbeb6a7fec7e059840aa538bd62aaccf972c7346c4d9d2059312853d0" dependencies = [ "libc", ] @@ -2907,6 +2988,12 @@ 
dependencies = [ "spin", ] +[[package]] +name = "lazycell" +version = "1.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "830d08ce1d1d941e6b30645f1a0eb5643013d835ce3779a5fc208261dbe10f55" + [[package]] name = "libc" version = "0.2.150" @@ -3137,6 +3224,12 @@ dependencies = [ "windows-sys 0.48.0", ] +[[package]] +name = "mirai-annotations" +version = "1.12.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c9be0862c1b3f26a88803c4a49de6889c10e608b3ee9344e6ef5b45fb37ad3d1" + [[package]] name = "multimap" version = "0.8.3" @@ -3356,9 +3449,9 @@ dependencies = [ [[package]] name = "oid-registry" -version = "0.6.1" +version = "0.7.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9bedf36ffb6ba96c2eb7144ef6270557b52e54b20c0a8e1eb2ff99a6c6959bff" +checksum = "a8d8034d9489cdaf79228eb9f6a3b8d7bb32ba00d6645ebd48eef4077ceb5bd9" dependencies = [ "asn1-rs", ] @@ -4053,14 +4146,14 @@ dependencies = [ "bytes", "once_cell", "pq_proto", - "rustls 0.22.4", + "rustls 0.23.7", "rustls-pemfile 2.1.1", "serde", "thiserror", "tokio", "tokio-postgres", "tokio-postgres-rustls", - "tokio-rustls 0.25.0", + "tokio-rustls 0.26.0", "tokio-util", "tracing", ] @@ -4082,7 +4175,7 @@ name = "postgres_ffi" version = "0.1.0" dependencies = [ "anyhow", - "bindgen", + "bindgen 0.70.1", "bytes", "crc32c", "env_logger", @@ -4219,7 +4312,7 @@ checksum = "0c1318b19085f08681016926435853bbf7858f9c082d0999b80550ff5d9abe15" dependencies = [ "bytes", "heck 0.5.0", - "itertools 0.12.1", + "itertools 0.10.5", "log", "multimap", "once_cell", @@ -4239,7 +4332,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e9552f850d5f0964a4e4d0bf306459ac29323ddfbae05e35a7c0d35cb0803cc5" dependencies = [ "anyhow", - "itertools 0.12.1", + "itertools 0.10.5", "proc-macro2", "quote", "syn 2.0.52", @@ -4327,8 +4420,8 @@ dependencies = [ "rsa", "rstest", "rustc-hash", - "rustls 0.22.4", - "rustls-native-certs 
0.7.0", + "rustls 0.23.7", + "rustls-native-certs 0.8.0", "rustls-pemfile 2.1.1", "scopeguard", "serde", @@ -4345,7 +4438,7 @@ dependencies = [ "tokio", "tokio-postgres", "tokio-postgres-rustls", - "tokio-rustls 0.25.0", + "tokio-rustls 0.26.0", "tokio-tungstenite", "tokio-util", "tracing", @@ -4509,12 +4602,13 @@ dependencies = [ [[package]] name = "rcgen" -version = "0.12.1" +version = "0.13.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "48406db8ac1f3cbc7dcdb56ec355343817958a356ff430259bb07baf7607e1e1" +checksum = "54077e1872c46788540de1ea3d7f4ccb1983d12f9aa909b234468676c1a36779" dependencies = [ "pem", "ring", + "rustls-pki-types", "time", "yasna", ] @@ -4693,7 +4787,7 @@ dependencies = [ "once_cell", "percent-encoding", "pin-project-lite", - "rustls 0.21.11", + "rustls 0.21.12", "rustls-pemfile 1.0.2", "serde", "serde_json", @@ -4991,9 +5085,9 @@ dependencies = [ [[package]] name = "rustls" -version = "0.21.11" +version = "0.21.12" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7fecbfb7b1444f477b345853b1fce097a2c6fb637b2bfb87e6bc5db0f043fae4" +checksum = "3f56a14d1f48b391359b22f731fd4bd7e43c97f3c50eee276f3aa09c94784d3e" dependencies = [ "log", "ring", @@ -5021,6 +5115,7 @@ version = "0.23.7" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ebbbdb961df0ad3f2652da8f3fdc4b36122f568f968f45ad3316f26c025c677b" dependencies = [ + "aws-lc-rs", "log", "once_cell", "ring", @@ -5089,9 +5184,9 @@ dependencies = [ [[package]] name = "rustls-pki-types" -version = "1.3.1" +version = "1.10.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5ede67b28608b4c60685c7d54122d4400d90f62b40caee7700e700380a390fa8" +checksum = "16f1201b3c9a7ee8039bcadc17b7e605e2945b27eee7631788c1bd2b0643674b" [[package]] name = "rustls-webpki" @@ -5109,6 +5204,7 @@ version = "0.102.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = 
"faaa0a62740bedb9b2ef5afa303da42764c012f743917351dc9a237ea1663610" dependencies = [ + "aws-lc-rs", "ring", "rustls-pki-types", "untrusted", @@ -5312,7 +5408,7 @@ checksum = "00421ed8fa0c995f07cde48ba6c89e80f2b312f74ff637326f392fbfd23abe02" dependencies = [ "httpdate", "reqwest 0.12.4", - "rustls 0.21.11", + "rustls 0.21.12", "sentry-backtrace", "sentry-contexts", "sentry-core", @@ -5807,8 +5903,8 @@ dependencies = [ "postgres_ffi", "remote_storage", "reqwest 0.12.4", - "rustls 0.22.4", - "rustls-native-certs 0.7.0", + "rustls 0.23.7", + "rustls-native-certs 0.8.0", "serde", "serde_json", "storage_controller_client", @@ -5930,14 +6026,13 @@ checksum = "a7065abeca94b6a8a577f9bd45aa0867a2238b74e8eb67cf10d492bc39351394" [[package]] name = "synstructure" -version = "0.12.6" +version = "0.13.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f36bdaa60a83aca3921b5259d5400cbf5e90fc51931376a9bd4a0eb79aa7210f" +checksum = "c8af7666ab7b6390ab78131fb5b0fce11d6b7a6951602017c35fa82800708971" dependencies = [ "proc-macro2", "quote", - "syn 1.0.109", - "unicode-xid", + "syn 2.0.52", ] [[package]] @@ -6236,16 +6331,15 @@ dependencies = [ [[package]] name = "tokio-postgres-rustls" -version = "0.11.1" +version = "0.12.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0ea13f22eda7127c827983bdaf0d7fff9df21c8817bab02815ac277a21143677" +checksum = "04fb792ccd6bbcd4bba408eb8a292f70fc4a3589e5d793626f45190e6454b6ab" dependencies = [ - "futures", "ring", - "rustls 0.22.4", + "rustls 0.23.7", "tokio", "tokio-postgres", - "tokio-rustls 0.25.0", + "tokio-rustls 0.26.0", "x509-certificate", ] @@ -6255,7 +6349,7 @@ version = "0.24.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e0d409377ff5b1e3ca6437aa86c1eb7d40c134bfec254e44c830defa92669db5" dependencies = [ - "rustls 0.21.11", + "rustls 0.21.12", "tokio", ] @@ -6678,16 +6772,15 @@ checksum = 
"8ecb6da28b8a351d773b68d5825ac39017e680750f980f3a1a85cd8dd28a47c1" [[package]] name = "ureq" -version = "2.9.7" +version = "2.10.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d11a831e3c0b56e438a28308e7c810799e3c118417f342d30ecec080105395cd" +checksum = "b74fc6b57825be3373f7054754755f03ac3a8f5d70015ccad699ba2029956f4a" dependencies = [ "base64 0.22.1", "log", "once_cell", - "rustls 0.22.4", + "rustls 0.23.7", "rustls-pki-types", - "rustls-webpki 0.102.2", "url", "webpki-roots 0.26.1", ] @@ -6876,7 +6969,7 @@ name = "walproposer" version = "0.1.0" dependencies = [ "anyhow", - "bindgen", + "bindgen 0.70.1", "postgres_ffi", "utils", ] @@ -7051,6 +7144,18 @@ dependencies = [ "rustls-pki-types", ] +[[package]] +name = "which" +version = "4.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "87ba24419a2078cd2b0f2ede2691b6c66d8e47836da3b6db8265ebad47afbfc7" +dependencies = [ + "either", + "home", + "once_cell", + "rustix", +] + [[package]] name = "whoami" version = "1.5.1" @@ -7295,7 +7400,6 @@ dependencies = [ "digest", "either", "fail", - "futures", "futures-channel", "futures-executor", "futures-io", @@ -7311,7 +7415,7 @@ dependencies = [ "hyper-util", "indexmap 1.9.3", "indexmap 2.0.1", - "itertools 0.12.1", + "itertools 0.10.5", "lazy_static", "libc", "log", @@ -7332,6 +7436,8 @@ dependencies = [ "regex-automata 0.4.3", "regex-syntax 0.8.2", "reqwest 0.12.4", + "rustls 0.23.7", + "rustls-webpki 0.102.2", "scopeguard", "serde", "serde_json", @@ -7340,7 +7446,6 @@ dependencies = [ "smallvec", "spki 0.7.3", "subtle", - "syn 1.0.109", "syn 2.0.52", "sync_wrapper 0.1.2", "tikv-jemalloc-sys", @@ -7348,6 +7453,7 @@ dependencies = [ "time-macros", "tokio", "tokio-postgres", + "tokio-rustls 0.26.0", "tokio-stream", "tokio-util", "toml_edit", @@ -7383,9 +7489,9 @@ dependencies = [ [[package]] name = "x509-parser" -version = "0.15.0" +version = "0.16.0" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "bab0c2f54ae1d92f4fcb99c0b7ccf0b1e3451cbd395e5f115ccbdbcb18d4f634" +checksum = "fcbc162f30700d6f3f82a24bf7cc62ffe7caea42c0b2cba8bf7f3ae50cf51f69" dependencies = [ "asn1-rs", "data-encoding", diff --git a/Cargo.toml b/Cargo.toml index a1a974b33b..4c6a24ecde 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -142,7 +142,7 @@ reqwest-retry = "0.5" routerify = "3" rpds = "0.13" rustc-hash = "1.1.0" -rustls = "0.22" +rustls = "0.23" rustls-pemfile = "2" scopeguard = "1.1" sysinfo = "0.29.2" @@ -172,8 +172,8 @@ tikv-jemalloc-ctl = "0.5" tokio = { version = "1.17", features = ["macros"] } tokio-epoll-uring = { git = "https://github.com/neondatabase/tokio-epoll-uring.git" , branch = "main" } tokio-io-timeout = "1.2.0" -tokio-postgres-rustls = "0.11.0" -tokio-rustls = "0.25" +tokio-postgres-rustls = "0.12.0" +tokio-rustls = "0.26" tokio-stream = "0.1" tokio-tar = "0.3" tokio-util = { version = "0.7.10", features = ["io", "rt"] } @@ -192,8 +192,8 @@ url = "2.2" urlencoding = "2.1" uuid = { version = "1.6.1", features = ["v4", "v7", "serde"] } walkdir = "2.3.2" -rustls-native-certs = "0.7" -x509-parser = "0.15" +rustls-native-certs = "0.8" +x509-parser = "0.16" whoami = "1.5.1" ## TODO replace this with tracing @@ -244,7 +244,7 @@ workspace_hack = { version = "0.1", path = "./workspace_hack/" } ## Build dependencies criterion = "0.5.1" -rcgen = "0.12" +rcgen = "0.13" rstest = "0.18" camino-tempfile = "1.0.2" tonic-build = "0.12" diff --git a/Makefile b/Makefile index 33cfda2661..8e3b755112 100644 --- a/Makefile +++ b/Makefile @@ -297,7 +297,7 @@ clean: postgres-clean neon-pg-clean-ext # This removes everything .PHONY: distclean distclean: - rm -rf $(POSTGRES_INSTALL_DIR) + $(RM) -r $(POSTGRES_INSTALL_DIR) $(CARGO_CMD_PREFIX) cargo clean .PHONY: fmt @@ -329,7 +329,7 @@ postgres-%-pgindent: postgres-%-pg-bsd-indent postgres-%-typedefs.list $(ROOT_PROJECT_DIR)/vendor/postgres-$*/src/tools/pgindent/pgindent --typedefs 
postgres-$*-typedefs-full.list \ $(ROOT_PROJECT_DIR)/vendor/postgres-$*/src/ \ --excludes $(ROOT_PROJECT_DIR)/vendor/postgres-$*/src/tools/pgindent/exclude_file_patterns - rm -f pg*.BAK + $(RM) pg*.BAK # Indent pxgn/neon. .PHONY: neon-pgindent diff --git a/Dockerfile.build-tools b/build-tools.Dockerfile similarity index 98% rename from Dockerfile.build-tools rename to build-tools.Dockerfile index 7cba1c8635..818cc1b6db 100644 --- a/Dockerfile.build-tools +++ b/build-tools.Dockerfile @@ -72,7 +72,7 @@ RUN curl -sL "https://github.com/peak/s5cmd/releases/download/v${S5CMD_VERSION}/ && mv s5cmd /usr/local/bin/s5cmd # LLVM -ENV LLVM_VERSION=18 +ENV LLVM_VERSION=19 RUN curl -fsSL 'https://apt.llvm.org/llvm-snapshot.gpg.key' | apt-key add - \ && echo "deb http://apt.llvm.org/${DEBIAN_VERSION}/ llvm-toolchain-${DEBIAN_VERSION}-${LLVM_VERSION} main" > /etc/apt/sources.list.d/llvm.stable.list \ && apt update \ @@ -99,7 +99,7 @@ RUN curl "https://awscli.amazonaws.com/awscli-exe-linux-$(uname -m).zip" -o "aws && rm awscliv2.zip # Mold: A Modern Linker -ENV MOLD_VERSION=v2.33.0 +ENV MOLD_VERSION=v2.34.1 RUN set -e \ && git clone https://github.com/rui314/mold.git \ && mkdir mold/build \ @@ -142,7 +142,7 @@ RUN wget -O /tmp/openssl-${OPENSSL_VERSION}.tar.gz https://www.openssl.org/sourc # Use the same version of libicu as the compute nodes so that # clusters created using inidb on pageserver can be used by computes. # -# TODO: at this time, Dockerfile.compute-node uses the debian bullseye libicu +# TODO: at this time, compute-node.Dockerfile uses the debian bullseye libicu # package, which is 67.1. We're duplicating that knowledge here, and also, technically, # Debian has a few patches on top of 67.1 that we're not adding here. 
ENV ICU_VERSION=67.1 @@ -192,7 +192,7 @@ WORKDIR /home/nonroot # Rust # Please keep the version of llvm (installed above) in sync with rust llvm (`rustc --version --verbose | grep LLVM`) -ENV RUSTC_VERSION=1.81.0 +ENV RUSTC_VERSION=1.82.0 ENV RUSTUP_HOME="/home/nonroot/.rustup" ENV PATH="/home/nonroot/.cargo/bin:${PATH}" ARG RUSTFILT_VERSION=0.2.1 diff --git a/compute/Makefile b/compute/Makefile index e4f08a223c..645880ce70 100644 --- a/compute/Makefile +++ b/compute/Makefile @@ -20,19 +20,21 @@ neon_collector_autoscaling.yml: $(jsonnet_files) sql_exporter.yml: $(jsonnet_files) JSONNET_PATH=etc jsonnet \ --output-file etc/$@ \ + --tla-str collector_name=neon_collector \ --tla-str collector_file=neon_collector.yml \ etc/sql_exporter.jsonnet sql_exporter_autoscaling.yml: $(jsonnet_files) JSONNET_PATH=etc jsonnet \ --output-file etc/$@ \ + --tla-str collector_name=neon_collector_autoscaling \ --tla-str collector_file=neon_collector_autoscaling.yml \ --tla-str application_name=sql_exporter_autoscaling \ etc/sql_exporter.jsonnet .PHONY: clean clean: - rm --force \ + $(RM) \ etc/neon_collector.yml \ etc/neon_collector_autoscaling.yml \ etc/sql_exporter.yml \ diff --git a/compute/README.md b/compute/README.md index bb1e42ab53..61e0eee4be 100644 --- a/compute/README.md +++ b/compute/README.md @@ -1,7 +1,7 @@ This directory contains files that are needed to build the compute images, or included in the compute images. -Dockerfile.compute-node +compute-node.Dockerfile To build the compute image vm-image-spec.yaml @@ -14,8 +14,8 @@ etc/ patches/ Some extensions need to be patched to work with Neon. This directory contains such patches. They are applied to the extension - sources in Dockerfile.compute-node + sources in compute-node.Dockerfile In addition to these, postgres itself, the neon postgres extension, and compute_ctl are built and copied into the compute image by -Dockerfile.compute-node. +compute-node.Dockerfile. 
diff --git a/compute/Dockerfile.compute-node b/compute/compute-node.Dockerfile similarity index 99% rename from compute/Dockerfile.compute-node rename to compute/compute-node.Dockerfile index b0ce7c1718..6451e309f0 100644 --- a/compute/Dockerfile.compute-node +++ b/compute/compute-node.Dockerfile @@ -353,13 +353,10 @@ COPY compute/patches/pgvector.patch /pgvector.patch # because we build the images on different machines than where we run them. # Pass OPTFLAGS="" to remove it. # -# v17 is not supported yet because of upstream issue -# https://github.com/pgvector/pgvector/issues/669 -RUN case "${PG_VERSION}" in "v17") \ - echo "v17 extensions are not supported yet. Quit" && exit 0;; \ - esac && \ - wget https://github.com/pgvector/pgvector/archive/refs/tags/v0.7.2.tar.gz -O pgvector.tar.gz && \ - echo "617fba855c9bcb41a2a9bc78a78567fd2e147c72afd5bf9d37b31b9591632b30 pgvector.tar.gz" | sha256sum --check && \ +# vector 0.7.4 supports v17 +# last release v0.7.4 - Aug 5, 2024 +RUN wget https://github.com/pgvector/pgvector/archive/refs/tags/v0.7.4.tar.gz -O pgvector.tar.gz && \ + echo "0341edf89b1924ae0d552f617e14fb7f8867c0194ed775bcc44fa40288642583 pgvector.tar.gz" | sha256sum --check && \ mkdir pgvector-src && cd pgvector-src && tar xzf ../pgvector.tar.gz --strip-components=1 -C . 
&& \ patch -p1 < /pgvector.patch && \ make -j $(getconf _NPROCESSORS_ONLN) OPTFLAGS="" PG_CONFIG=/usr/local/pgsql/bin/pg_config && \ @@ -978,8 +975,8 @@ ARG PG_VERSION RUN case "${PG_VERSION}" in "v17") \ echo "pg_session_jwt does not yet have a release that supports pg17" && exit 0;; \ esac && \ - wget https://github.com/neondatabase/pg_session_jwt/archive/5aee2625af38213650e1a07ae038fdc427250ee4.tar.gz -O pg_session_jwt.tar.gz && \ - echo "5d91b10bc1347d36cffc456cb87bec25047935d6503dc652ca046f04760828e7 pg_session_jwt.tar.gz" | sha256sum --check && \ + wget https://github.com/neondatabase/pg_session_jwt/archive/e1310b08ba51377a19e0559e4d1194883b9b2ba2.tar.gz -O pg_session_jwt.tar.gz && \ + echo "837932a077888d5545fd54b0abcc79e5f8e37017c2769a930afc2f5c94df6f4e pg_session_jwt.tar.gz" | sha256sum --check && \ mkdir pg_session_jwt-src && cd pg_session_jwt-src && tar xzf ../pg_session_jwt.tar.gz --strip-components=1 -C . && \ sed -i 's/pgrx = "=0.11.3"/pgrx = { version = "=0.11.3", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \ cargo pgrx install --release diff --git a/compute/etc/sql_exporter.jsonnet b/compute/etc/sql_exporter.jsonnet index 640e2ac38d..3c36fd4f68 100644 --- a/compute/etc/sql_exporter.jsonnet +++ b/compute/etc/sql_exporter.jsonnet @@ -1,4 +1,4 @@ -function(collector_file, application_name='sql_exporter') { +function(collector_name, collector_file, application_name='sql_exporter') { // Configuration for sql_exporter for autoscaling-agent // Global defaults. global: { @@ -28,7 +28,7 @@ function(collector_file, application_name='sql_exporter') { // Collectors (referenced by name) to execute on the target. // Glob patterns are supported (see for syntax). 
collectors: [ - 'neon_collector', + collector_name, ], }, diff --git a/compute/etc/sql_exporter/checkpoints_timed.libsonnet b/compute/etc/sql_exporter/checkpoints_timed.libsonnet index 0ba0080188..ebe2ddc9f2 100644 --- a/compute/etc/sql_exporter/checkpoints_timed.libsonnet +++ b/compute/etc/sql_exporter/checkpoints_timed.libsonnet @@ -1,7 +1,7 @@ local neon = import 'neon.libsonnet'; -local pg_stat_bgwriter = importstr 'sql_exporter/checkpoints_req.sql'; -local pg_stat_checkpointer = importstr 'sql_exporter/checkpoints_req.17.sql'; +local pg_stat_bgwriter = importstr 'sql_exporter/checkpoints_timed.sql'; +local pg_stat_checkpointer = importstr 'sql_exporter/checkpoints_timed.17.sql'; { metric_name: 'checkpoints_timed', diff --git a/compute/etc/sql_exporter/retained_wal.sql b/compute/etc/sql_exporter/retained_wal.sql index 6c58359461..3e2aadfc28 100644 --- a/compute/etc/sql_exporter/retained_wal.sql +++ b/compute/etc/sql_exporter/retained_wal.sql @@ -1,5 +1,10 @@ SELECT slot_name, - pg_wal_lsn_diff(pg_current_wal_lsn(), restart_lsn)::FLOAT8 AS retained_wal + pg_wal_lsn_diff( + CASE + WHEN pg_is_in_recovery() THEN pg_last_wal_replay_lsn() + ELSE pg_current_wal_lsn() + END, + restart_lsn)::FLOAT8 AS retained_wal FROM pg_replication_slots WHERE active = false; diff --git a/compute_tools/src/compute.rs b/compute_tools/src/compute.rs index 285be56264..c9dd4dcfc5 100644 --- a/compute_tools/src/compute.rs +++ b/compute_tools/src/compute.rs @@ -15,6 +15,7 @@ use std::time::Instant; use anyhow::{Context, Result}; use chrono::{DateTime, Utc}; +use compute_api::spec::PgIdent; use futures::future::join_all; use futures::stream::FuturesUnordered; use futures::StreamExt; @@ -25,8 +26,9 @@ use tracing::{debug, error, info, instrument, warn}; use utils::id::{TenantId, TimelineId}; use utils::lsn::Lsn; +use compute_api::privilege::Privilege; use compute_api::responses::{ComputeMetrics, ComputeStatus}; -use compute_api::spec::{ComputeFeature, ComputeMode, ComputeSpec}; +use 
compute_api::spec::{ComputeFeature, ComputeMode, ComputeSpec, ExtVersion}; use utils::measured_stream::MeasuredReader; use nix::sys::signal::{kill, Signal}; @@ -34,6 +36,7 @@ use nix::sys::signal::{kill, Signal}; use remote_storage::{DownloadError, RemotePath}; use crate::checker::create_availability_check_data; +use crate::installed_extensions::get_installed_extensions_sync; use crate::local_proxy; use crate::logger::inlinify; use crate::pg_helpers::*; @@ -1121,6 +1124,11 @@ impl ComputeNode { self.pg_reload_conf()?; } self.post_apply_config()?; + + let connstr = self.connstr.clone(); + thread::spawn(move || { + get_installed_extensions_sync(connstr).context("get_installed_extensions") + }); } let startup_end_time = Utc::now(); @@ -1367,6 +1375,97 @@ LIMIT 100", download_size } + pub async fn set_role_grants( + &self, + db_name: &PgIdent, + schema_name: &PgIdent, + privileges: &[Privilege], + role_name: &PgIdent, + ) -> Result<()> { + use tokio_postgres::config::Config; + use tokio_postgres::NoTls; + + let mut conf = Config::from_str(self.connstr.as_str()).unwrap(); + conf.dbname(db_name); + + let (db_client, conn) = conf + .connect(NoTls) + .await + .context("Failed to connect to the database")?; + tokio::spawn(conn); + + // TODO: support other types of grants apart from schemas? + let query = format!( + "GRANT {} ON SCHEMA {} TO {}", + privileges + .iter() + // should not be quoted as it's part of the command. + // is already sanitized so it's ok + .map(|p| p.as_str()) + .collect::>() + .join(", "), + // quote the schema and role name as identifiers to sanitize them. 
+ schema_name.pg_quote(), + role_name.pg_quote(), + ); + db_client + .simple_query(&query) + .await + .with_context(|| format!("Failed to execute query: {}", query))?; + + Ok(()) + } + + pub async fn install_extension( + &self, + ext_name: &PgIdent, + db_name: &PgIdent, + ext_version: ExtVersion, + ) -> Result { + use tokio_postgres::config::Config; + use tokio_postgres::NoTls; + + let mut conf = Config::from_str(self.connstr.as_str()).unwrap(); + conf.dbname(db_name); + + let (db_client, conn) = conf + .connect(NoTls) + .await + .context("Failed to connect to the database")?; + tokio::spawn(conn); + + let version_query = "SELECT extversion FROM pg_extension WHERE extname = $1"; + let version: Option = db_client + .query_opt(version_query, &[&ext_name]) + .await + .with_context(|| format!("Failed to execute query: {}", version_query))? + .map(|row| row.get(0)); + + // sanitize the inputs as postgres idents. + let ext_name: String = ext_name.pg_quote(); + let quoted_version: String = ext_version.pg_quote(); + + if let Some(installed_version) = version { + if installed_version == ext_version { + return Ok(installed_version); + } + let query = format!("ALTER EXTENSION {ext_name} UPDATE TO {quoted_version}"); + db_client + .simple_query(&query) + .await + .with_context(|| format!("Failed to execute query: {}", query))?; + } else { + let query = + format!("CREATE EXTENSION IF NOT EXISTS {ext_name} WITH VERSION {quoted_version}"); + db_client + .simple_query(&query) + .await + .with_context(|| format!("Failed to execute query: {}", query))?; + } + + Ok(ext_version) + } + #[tokio::main] pub async fn prepare_preload_libraries( &self, @@ -1484,28 +1583,6 @@ LIMIT 100", info!("Pageserver config changed"); } } - - // Gather info about installed extensions - pub fn get_installed_extensions(&self) -> Result<()> { - let connstr = self.connstr.clone(); - - let rt = tokio::runtime::Builder::new_current_thread() - .enable_all() - .build() - .expect("failed to create runtime"); - 
let result = rt - .block_on(crate::installed_extensions::get_installed_extensions( - connstr, - )) - .expect("failed to get installed extensions"); - - info!( - "{}", - serde_json::to_string(&result).expect("failed to serialize extensions list") - ); - - Ok(()) - } } pub fn forward_termination_signal() { diff --git a/compute_tools/src/extension_server.rs b/compute_tools/src/extension_server.rs index 6ef7e0837f..da2d107b54 100644 --- a/compute_tools/src/extension_server.rs +++ b/compute_tools/src/extension_server.rs @@ -107,7 +107,7 @@ pub fn get_pg_version(pgbin: &str) -> String { // pg_config --version returns a (platform specific) human readable string // such as "PostgreSQL 15.4". We parse this to v14/v15/v16 etc. let human_version = get_pg_config("--version", pgbin); - return parse_pg_version(&human_version).to_string(); + parse_pg_version(&human_version).to_string() } fn parse_pg_version(human_version: &str) -> &str { diff --git a/compute_tools/src/http/api.rs b/compute_tools/src/http/api.rs index 79e6158081..af35f71bf2 100644 --- a/compute_tools/src/http/api.rs +++ b/compute_tools/src/http/api.rs @@ -9,8 +9,11 @@ use crate::catalog::SchemaDumpError; use crate::catalog::{get_database_schema, get_dbs_and_roles}; use crate::compute::forward_termination_signal; use crate::compute::{ComputeNode, ComputeState, ParsedSpec}; -use compute_api::requests::ConfigurationRequest; -use compute_api::responses::{ComputeStatus, ComputeStatusResponse, GenericAPIError}; +use compute_api::requests::{ConfigurationRequest, ExtensionInstallRequest, SetRoleGrantsRequest}; +use compute_api::responses::{ + ComputeStatus, ComputeStatusResponse, ExtensionInstallResult, GenericAPIError, + SetRoleGrantsResponse, +}; use anyhow::Result; use hyper::header::CONTENT_TYPE; @@ -98,6 +101,38 @@ async fn routes(req: Request, compute: &Arc) -> Response { + info!("serving /extensions POST request"); + let status = compute.get_status(); + if status != ComputeStatus::Running { + let msg = format!( + 
"invalid compute status for extensions request: {:?}", + status + ); + error!(msg); + return render_json_error(&msg, StatusCode::PRECONDITION_FAILED); + } + + let request = hyper::body::to_bytes(req.into_body()).await.unwrap(); + let request = serde_json::from_slice::(&request).unwrap(); + let res = compute + .install_extension(&request.extension, &request.database, request.version) + .await; + match res { + Ok(version) => render_json(Body::from( + serde_json::to_string(&ExtensionInstallResult { + extension: request.extension, + version, + }) + .unwrap(), + )), + Err(e) => { + error!("install_extension failed: {}", e); + render_json_error(&e.to_string(), StatusCode::INTERNAL_SERVER_ERROR) + } + } + } + (&Method::GET, "/info") => { let num_cpus = num_cpus::get_physical(); info!("serving /info GET request. num_cpus: {}", num_cpus); @@ -165,6 +200,48 @@ async fn routes(req: Request, compute: &Arc) -> Response { + info!("serving /grants POST request"); + let status = compute.get_status(); + if status != ComputeStatus::Running { + let msg = format!( + "invalid compute status for set_role_grants request: {:?}", + status + ); + error!(msg); + return render_json_error(&msg, StatusCode::PRECONDITION_FAILED); + } + + let request = hyper::body::to_bytes(req.into_body()).await.unwrap(); + let request = serde_json::from_slice::(&request).unwrap(); + + let res = compute + .set_role_grants( + &request.database, + &request.schema, + &request.privileges, + &request.role, + ) + .await; + match res { + Ok(()) => render_json(Body::from( + serde_json::to_string(&SetRoleGrantsResponse { + database: request.database, + schema: request.schema, + role: request.role, + privileges: request.privileges, + }) + .unwrap(), + )), + Err(e) => render_json_error( + &format!("could not grant role privileges to the schema: {e}"), + // TODO: can we filter on role/schema not found errors + // and return appropriate error code? 
+ StatusCode::INTERNAL_SERVER_ERROR, + ), + } + } + // get the list of installed extensions // currently only used in python tests // TODO: call it from cplane diff --git a/compute_tools/src/http/openapi_spec.yaml b/compute_tools/src/http/openapi_spec.yaml index e9fa66b323..11eee6ccfd 100644 --- a/compute_tools/src/http/openapi_spec.yaml +++ b/compute_tools/src/http/openapi_spec.yaml @@ -127,6 +127,41 @@ paths: schema: $ref: "#/components/schemas/GenericError" + /grants: + post: + tags: + - Grants + summary: Apply grants to the database. + description: "" + operationId: setRoleGrants + requestBody: + description: Grants request. + required: true + content: + application/json: + schema: + $ref: "#/components/schemas/SetRoleGrantsRequest" + responses: + 200: + description: Grants applied. + content: + application/json: + schema: + $ref: "#/components/schemas/SetRoleGrantsResponse" + 412: + description: | + Compute is not in the right state for processing the request. + content: + application/json: + schema: + $ref: "#/components/schemas/GenericError" + 500: + description: Error occurred during grants application. + content: + application/json: + schema: + $ref: "#/components/schemas/GenericError" + /check_writability: post: tags: @@ -144,6 +179,41 @@ paths: description: Error text or 'true' if check passed. example: "true" + /extensions: + post: + tags: + - Extensions + summary: Install extension if possible. + description: "" + operationId: installExtension + requestBody: + description: Extension name and database to install it to. + required: true + content: + application/json: + schema: + $ref: "#/components/schemas/ExtensionInstallRequest" + responses: + 200: + description: Result from extension installation + content: + application/json: + schema: + $ref: "#/components/schemas/ExtensionInstallResult" + 412: + description: | + Compute is in the wrong state for processing the request. 
+ content: + application/json: + schema: + $ref: "#/components/schemas/GenericError" + 500: + description: Error during extension installation. + content: + application/json: + schema: + $ref: "#/components/schemas/GenericError" + /configure: post: tags: @@ -369,7 +439,7 @@ components: moment, when spec was received. example: "2022-10-12T07:20:50.52Z" status: - $ref: '#/components/schemas/ComputeStatus' + $ref: "#/components/schemas/ComputeStatus" last_active: type: string description: | @@ -409,6 +479,38 @@ components: - configuration example: running + ExtensionInstallRequest: + type: object + required: + - extension + - database + - version + properties: + extension: + type: string + description: Extension name. + example: "pg_session_jwt" + version: + type: string + description: Version of the extension. + example: "1.0.0" + database: + type: string + description: Database name. + example: "neondb" + + ExtensionInstallResult: + type: object + properties: + extension: + description: Name of the extension. + type: string + example: "pg_session_jwt" + version: + description: Version of the extension. + type: string + example: "1.0.0" + InstalledExtensions: type: object properties: @@ -427,6 +529,60 @@ components: n_databases: type: integer + SetRoleGrantsRequest: + type: object + required: + - database + - schema + - privileges + - role + properties: + database: + type: string + description: Database name. + example: "neondb" + schema: + type: string + description: Schema name. + example: "public" + privileges: + type: array + items: + type: string + description: List of privileges to set. + example: ["SELECT", "INSERT"] + role: + type: string + description: Role name. + example: "neon" + + SetRoleGrantsResponse: + type: object + required: + - database + - schema + - privileges + - role + properties: + database: + type: string + description: Database name. + example: "neondb" + schema: + type: string + description: Schema name. 
+ example: "public" + privileges: + type: array + items: + type: string + description: List of privileges set. + example: ["SELECT", "INSERT"] + role: + type: string + description: Role name. + example: "neon" + # # Errors # diff --git a/compute_tools/src/installed_extensions.rs b/compute_tools/src/installed_extensions.rs index 72578b1f34..877f99bff7 100644 --- a/compute_tools/src/installed_extensions.rs +++ b/compute_tools/src/installed_extensions.rs @@ -1,6 +1,7 @@ use compute_api::responses::{InstalledExtension, InstalledExtensions}; use std::collections::HashMap; use std::collections::HashSet; +use tracing::info; use url::Url; use anyhow::Result; @@ -79,3 +80,23 @@ pub async fn get_installed_extensions(connstr: Url) -> Result Result<()> { + let rt = tokio::runtime::Builder::new_current_thread() + .enable_all() + .build() + .expect("failed to create runtime"); + let result = rt + .block_on(crate::installed_extensions::get_installed_extensions( + connstr, + )) + .expect("failed to get installed extensions"); + + info!( + "[NEON_EXT_STAT] {}", + serde_json::to_string(&result).expect("failed to serialize extensions list") + ); + + Ok(()) +} diff --git a/control_plane/src/storage_controller.rs b/control_plane/src/storage_controller.rs index 43c63e7ef4..b70bd2e1b5 100644 --- a/control_plane/src/storage_controller.rs +++ b/control_plane/src/storage_controller.rs @@ -20,7 +20,16 @@ use pageserver_client::mgmt_api::ResponseErrorMessageExt; use postgres_backend::AuthType; use reqwest::Method; use serde::{de::DeserializeOwned, Deserialize, Serialize}; -use std::{fs, net::SocketAddr, path::PathBuf, str::FromStr, sync::OnceLock}; +use std::{ + ffi::OsStr, + fs, + net::SocketAddr, + path::PathBuf, + process::ExitStatus, + str::FromStr, + sync::OnceLock, + time::{Duration, Instant}, +}; use tokio::process::Command; use tracing::instrument; use url::Url; @@ -168,16 +177,6 @@ impl StorageController { .expect("non-Unicode path") } - /// PIDFile for the postgres instance used to 
store storage controller state - fn postgres_pid_file(&self) -> Utf8PathBuf { - Utf8PathBuf::from_path_buf( - self.env - .base_data_dir - .join("storage_controller_postgres.pid"), - ) - .expect("non-Unicode path") - } - /// Find the directory containing postgres subdirectories, such `bin` and `lib` /// /// This usually uses STORAGE_CONTROLLER_POSTGRES_VERSION of postgres, but will fall back @@ -296,6 +295,31 @@ impl StorageController { .map_err(anyhow::Error::new) } + /// Wrapper for the pg_ctl binary, which we spawn as a short-lived subprocess when starting and stopping postgres + async fn pg_ctl(&self, args: I) -> ExitStatus + where + I: IntoIterator, + S: AsRef, + { + let pg_bin_dir = self.get_pg_bin_dir().await.unwrap(); + let bin_path = pg_bin_dir.join("pg_ctl"); + + let pg_lib_dir = self.get_pg_lib_dir().await.unwrap(); + let envs = [ + ("LD_LIBRARY_PATH".to_owned(), pg_lib_dir.to_string()), + ("DYLD_LIBRARY_PATH".to_owned(), pg_lib_dir.to_string()), + ]; + + Command::new(bin_path) + .args(args) + .envs(envs) + .spawn() + .expect("Failed to spawn pg_ctl, binary_missing?") + .wait() + .await + .expect("Failed to wait for pg_ctl termination") + } + pub async fn start(&self, start_args: NeonStorageControllerStartArgs) -> anyhow::Result<()> { let instance_dir = self.storage_controller_instance_dir(start_args.instance_id); if let Err(err) = tokio::fs::create_dir(&instance_dir).await { @@ -404,20 +428,34 @@ impl StorageController { db_start_args ); - background_process::start_process( - "storage_controller_db", - &self.env.base_data_dir, - pg_bin_dir.join("pg_ctl").as_std_path(), - db_start_args, - vec![ - ("LD_LIBRARY_PATH".to_owned(), pg_lib_dir.to_string()), - ("DYLD_LIBRARY_PATH".to_owned(), pg_lib_dir.to_string()), - ], - background_process::InitialPidFile::Create(self.postgres_pid_file()), - &start_args.start_timeout, - || self.pg_isready(&pg_bin_dir, postgres_port), - ) - .await?; + let db_start_status = self.pg_ctl(db_start_args).await; + let start_timeout: 
Duration = start_args.start_timeout.into(); + let db_start_deadline = Instant::now() + start_timeout; + if !db_start_status.success() { + return Err(anyhow::anyhow!( + "Failed to start postgres {}", + db_start_status.code().unwrap() + )); + } + + loop { + if Instant::now() > db_start_deadline { + return Err(anyhow::anyhow!("Timed out waiting for postgres to start")); + } + + match self.pg_isready(&pg_bin_dir, postgres_port).await { + Ok(true) => { + tracing::info!("storage controller postgres is now ready"); + break; + } + Ok(false) => { + tokio::time::sleep(Duration::from_millis(100)).await; + } + Err(e) => { + tracing::warn!("Failed to check postgres status: {e}") + } + } + } self.setup_database(postgres_port).await?; } @@ -583,15 +621,10 @@ impl StorageController { } let pg_data_path = self.env.base_data_dir.join("storage_controller_db"); - let pg_bin_dir = self.get_pg_bin_dir().await?; println!("Stopping storage controller database..."); let pg_stop_args = ["-D", &pg_data_path.to_string_lossy(), "stop"]; - let stop_status = Command::new(pg_bin_dir.join("pg_ctl")) - .args(pg_stop_args) - .spawn()? - .wait() - .await?; + let stop_status = self.pg_ctl(pg_stop_args).await; if !stop_status.success() { match self.is_postgres_running().await { Ok(false) => { @@ -612,14 +645,9 @@ impl StorageController { async fn is_postgres_running(&self) -> anyhow::Result { let pg_data_path = self.env.base_data_dir.join("storage_controller_db"); - let pg_bin_dir = self.get_pg_bin_dir().await?; let pg_status_args = ["-D", &pg_data_path.to_string_lossy(), "status"]; - let status_exitcode = Command::new(pg_bin_dir.join("pg_ctl")) - .args(pg_status_args) - .spawn()? - .wait() - .await?; + let status_exitcode = self.pg_ctl(pg_status_args).await; // pg_ctl status returns this exit code if postgres is not running: in this case it is // fine that stop failed. Otherwise it is an error that stop failed. 
diff --git a/docs/docker.md b/docs/docker.md index d16311c27b..0914a00082 100644 --- a/docs/docker.md +++ b/docs/docker.md @@ -5,7 +5,7 @@ Currently we build two main images: - [neondatabase/neon](https://hub.docker.com/repository/docker/neondatabase/neon) — image with pre-built `pageserver`, `safekeeper` and `proxy` binaries and all the required runtime dependencies. Built from [/Dockerfile](/Dockerfile). -- [neondatabase/compute-node-v16](https://hub.docker.com/repository/docker/neondatabase/compute-node-v16) — compute node image with pre-built Postgres binaries from [neondatabase/postgres](https://github.com/neondatabase/postgres). Similar images exist for v15 and v14. Built from [/compute-node/Dockerfile](/compute/Dockerfile.compute-node). +- [neondatabase/compute-node-v16](https://hub.docker.com/repository/docker/neondatabase/compute-node-v16) — compute node image with pre-built Postgres binaries from [neondatabase/postgres](https://github.com/neondatabase/postgres). Similar images exist for v15 and v14. Built from [/compute-node/Dockerfile](/compute/compute-node.Dockerfile). And additional intermediate image: @@ -56,7 +56,7 @@ CREATE TABLE postgres=# insert into t values(1, 1); INSERT 0 1 postgres=# select * from t; - key | value + key | value -----+------- 1 | 1 (1 row) @@ -84,4 +84,4 @@ Access http://localhost:9001 and sign in. - Username: `minio` - Password: `password` -You can see durable pages and WAL data in `neon` bucket. \ No newline at end of file +You can see durable pages and WAL data in `neon` bucket. 
diff --git a/libs/compute_api/src/lib.rs b/libs/compute_api/src/lib.rs index 210a52d089..f4f3d92fc6 100644 --- a/libs/compute_api/src/lib.rs +++ b/libs/compute_api/src/lib.rs @@ -1,5 +1,6 @@ #![deny(unsafe_code)] #![deny(clippy::undocumented_unsafe_blocks)] +pub mod privilege; pub mod requests; pub mod responses; pub mod spec; diff --git a/libs/compute_api/src/privilege.rs b/libs/compute_api/src/privilege.rs new file mode 100644 index 0000000000..dc0d870946 --- /dev/null +++ b/libs/compute_api/src/privilege.rs @@ -0,0 +1,35 @@ +#[derive(Debug, Clone, serde::Deserialize, serde::Serialize)] +#[serde(rename_all = "UPPERCASE")] +pub enum Privilege { + Select, + Insert, + Update, + Delete, + Truncate, + References, + Trigger, + Usage, + Create, + Connect, + Temporary, + Execute, +} + +impl Privilege { + pub fn as_str(&self) -> &'static str { + match self { + Privilege::Select => "SELECT", + Privilege::Insert => "INSERT", + Privilege::Update => "UPDATE", + Privilege::Delete => "DELETE", + Privilege::Truncate => "TRUNCATE", + Privilege::References => "REFERENCES", + Privilege::Trigger => "TRIGGER", + Privilege::Usage => "USAGE", + Privilege::Create => "CREATE", + Privilege::Connect => "CONNECT", + Privilege::Temporary => "TEMPORARY", + Privilege::Execute => "EXECUTE", + } + } +} diff --git a/libs/compute_api/src/requests.rs b/libs/compute_api/src/requests.rs index 5896c7dc65..fc3757d981 100644 --- a/libs/compute_api/src/requests.rs +++ b/libs/compute_api/src/requests.rs @@ -1,6 +1,8 @@ //! Structs representing the JSON formats used in the compute_ctl's HTTP API. 
- -use crate::spec::ComputeSpec; +use crate::{ + privilege::Privilege, + spec::{ComputeSpec, ExtVersion, PgIdent}, +}; use serde::Deserialize; /// Request of the /configure API @@ -12,3 +14,18 @@ use serde::Deserialize; pub struct ConfigurationRequest { pub spec: ComputeSpec, } + +#[derive(Deserialize, Debug)] +pub struct ExtensionInstallRequest { + pub extension: PgIdent, + pub database: PgIdent, + pub version: ExtVersion, +} + +#[derive(Deserialize, Debug)] +pub struct SetRoleGrantsRequest { + pub database: PgIdent, + pub schema: PgIdent, + pub privileges: Vec, + pub role: PgIdent, +} diff --git a/libs/compute_api/src/responses.rs b/libs/compute_api/src/responses.rs index 5023fce003..79234be720 100644 --- a/libs/compute_api/src/responses.rs +++ b/libs/compute_api/src/responses.rs @@ -6,7 +6,10 @@ use std::fmt::Display; use chrono::{DateTime, Utc}; use serde::{Deserialize, Serialize, Serializer}; -use crate::spec::{ComputeSpec, Database, Role}; +use crate::{ + privilege::Privilege, + spec::{ComputeSpec, Database, ExtVersion, PgIdent, Role}, +}; #[derive(Serialize, Debug, Deserialize)] pub struct GenericAPIError { @@ -168,3 +171,16 @@ pub struct InstalledExtension { pub struct InstalledExtensions { pub extensions: Vec, } + +#[derive(Clone, Debug, Default, Serialize)] +pub struct ExtensionInstallResult { + pub extension: PgIdent, + pub version: ExtVersion, +} +#[derive(Clone, Debug, Default, Serialize)] +pub struct SetRoleGrantsResponse { + pub database: PgIdent, + pub schema: PgIdent, + pub privileges: Vec, + pub role: PgIdent, +} diff --git a/libs/compute_api/src/spec.rs b/libs/compute_api/src/spec.rs index 5903db7055..8a447563dc 100644 --- a/libs/compute_api/src/spec.rs +++ b/libs/compute_api/src/spec.rs @@ -16,6 +16,9 @@ use remote_storage::RemotePath; /// intended to be used for DB / role names. 
pub type PgIdent = String; +/// String type alias representing Postgres extension version +pub type ExtVersion = String; + /// Cluster spec or configuration represented as an optional number of /// delta operations + final cluster state description. #[derive(Clone, Debug, Default, Deserialize, Serialize)] diff --git a/libs/pageserver_api/src/config.rs b/libs/pageserver_api/src/config.rs index 24474d4840..896a5d8069 100644 --- a/libs/pageserver_api/src/config.rs +++ b/libs/pageserver_api/src/config.rs @@ -102,6 +102,7 @@ pub struct ConfigToml { pub ingest_batch_size: u64, pub max_vectored_read_bytes: MaxVectoredReadBytes, pub image_compression: ImageCompressionAlgorithm, + pub timeline_offloading: bool, pub ephemeral_bytes_per_memory_kb: usize, pub l0_flush: Option, pub virtual_file_io_mode: Option, @@ -385,6 +386,7 @@ impl Default for ConfigToml { NonZeroUsize::new(DEFAULT_MAX_VECTORED_READ_BYTES).unwrap(), )), image_compression: (DEFAULT_IMAGE_COMPRESSION), + timeline_offloading: false, ephemeral_bytes_per_memory_kb: (DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB), l0_flush: None, virtual_file_io_mode: None, diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs index 3ec9cac2c3..d0ee4b64d1 100644 --- a/libs/pageserver_api/src/models.rs +++ b/libs/pageserver_api/src/models.rs @@ -684,6 +684,25 @@ pub struct TimelineArchivalConfigRequest { pub state: TimelineArchivalState, } +#[derive(Debug, Serialize, Deserialize, Clone)] +pub struct TimelinesInfoAndOffloaded { + pub timelines: Vec, + pub offloaded: Vec, +} + +/// Analog of [`TimelineInfo`] for offloaded timelines. 
+#[derive(Debug, Serialize, Deserialize, Clone)] +pub struct OffloadedTimelineInfo { + pub tenant_id: TenantShardId, + pub timeline_id: TimelineId, + /// Whether the timeline has a parent it has been branched off from or not + pub ancestor_timeline_id: Option, + /// Whether to retain the branch lsn at the ancestor or not + pub ancestor_retain_lsn: Option, + /// The time point when the timeline was archived + pub archived_at: chrono::DateTime, +} + /// This represents the output of the "timeline_detail" and "timeline_list" API calls. #[derive(Debug, Serialize, Deserialize, Clone)] pub struct TimelineInfo { @@ -743,8 +762,6 @@ pub struct TimelineInfo { // Forward compatibility: a previous version of the pageserver will receive a JSON. serde::Deserialize does // not deny unknown fields by default so it's safe to set the field to some value, though it won't be // read. - /// The last aux file policy being used on this timeline - pub last_aux_file_policy: Option, pub is_archived: Option, } diff --git a/libs/pageserver_api/src/models/partitioning.rs b/libs/pageserver_api/src/models/partitioning.rs index f6644be635..69832b9a0d 100644 --- a/libs/pageserver_api/src/models/partitioning.rs +++ b/libs/pageserver_api/src/models/partitioning.rs @@ -16,7 +16,7 @@ impl serde::Serialize for Partitioning { { pub struct KeySpace<'a>(&'a crate::keyspace::KeySpace); - impl<'a> serde::Serialize for KeySpace<'a> { + impl serde::Serialize for KeySpace<'_> { fn serialize(&self, serializer: S) -> std::result::Result where S: serde::Serializer, @@ -44,7 +44,7 @@ impl serde::Serialize for Partitioning { pub struct WithDisplay<'a, T>(&'a T); -impl<'a, T: std::fmt::Display> serde::Serialize for WithDisplay<'a, T> { +impl serde::Serialize for WithDisplay<'_, T> { fn serialize(&self, serializer: S) -> std::result::Result where S: serde::Serializer, @@ -55,7 +55,7 @@ impl<'a, T: std::fmt::Display> serde::Serialize for WithDisplay<'a, T> { pub struct KeyRange<'a>(&'a std::ops::Range); -impl<'a> 
serde::Serialize for KeyRange<'a> { +impl serde::Serialize for KeyRange<'_> { fn serialize(&self, serializer: S) -> Result where S: serde::Serializer, diff --git a/libs/postgres_backend/src/lib.rs b/libs/postgres_backend/src/lib.rs index 085540e7b9..7419798a60 100644 --- a/libs/postgres_backend/src/lib.rs +++ b/libs/postgres_backend/src/lib.rs @@ -738,6 +738,20 @@ impl PostgresBackend { QueryError::SimulatedConnectionError => { return Err(QueryError::SimulatedConnectionError) } + err @ QueryError::Reconnect => { + // Instruct the client to reconnect, stop processing messages + // from this libpq connection and, finally, disconnect from the + // server side (returning an Err achieves the later). + // + // Note the flushing is done by the caller. + let reconnect_error = short_error(&err); + self.write_message_noflush(&BeMessage::ErrorResponse( + &reconnect_error, + Some(err.pg_error_code()), + ))?; + + return Err(err); + } e => { log_query_error(query_string, &e); let short_error = short_error(&e); @@ -921,12 +935,11 @@ impl PostgresBackendReader { /// A futures::AsyncWrite implementation that wraps all data written to it in CopyData /// messages. 
/// - pub struct CopyDataWriter<'a, IO> { pgb: &'a mut PostgresBackend, } -impl<'a, IO: AsyncRead + AsyncWrite + Unpin> AsyncWrite for CopyDataWriter<'a, IO> { +impl AsyncWrite for CopyDataWriter<'_, IO> { fn poll_write( self: Pin<&mut Self>, cx: &mut std::task::Context<'_>, diff --git a/libs/postgres_backend/tests/simple_select.rs b/libs/postgres_backend/tests/simple_select.rs index 900083ea7f..9d3031d699 100644 --- a/libs/postgres_backend/tests/simple_select.rs +++ b/libs/postgres_backend/tests/simple_select.rs @@ -2,6 +2,7 @@ use once_cell::sync::Lazy; use postgres_backend::{AuthType, Handler, PostgresBackend, QueryError}; use pq_proto::{BeMessage, RowDescriptor}; +use rustls::crypto::aws_lc_rs; use std::io::Cursor; use std::sync::Arc; use tokio::io::{AsyncRead, AsyncWrite}; @@ -92,10 +93,13 @@ static CERT: Lazy> = Lazy::new(|| { async fn simple_select_ssl() { let (client_sock, server_sock) = make_tcp_pair().await; - let server_cfg = rustls::ServerConfig::builder() - .with_no_client_auth() - .with_single_cert(vec![CERT.clone()], KEY.clone_key()) - .unwrap(); + let server_cfg = + rustls::ServerConfig::builder_with_provider(Arc::new(aws_lc_rs::default_provider())) + .with_safe_default_protocol_versions() + .expect("aws_lc_rs should support the default protocol versions") + .with_no_client_auth() + .with_single_cert(vec![CERT.clone()], KEY.clone_key()) + .unwrap(); let tls_config = Some(Arc::new(server_cfg)); let pgbackend = PostgresBackend::new(server_sock, AuthType::Trust, tls_config).expect("pgbackend creation"); @@ -105,13 +109,16 @@ async fn simple_select_ssl() { pgbackend.run(&mut handler, &CancellationToken::new()).await }); - let client_cfg = rustls::ClientConfig::builder() - .with_root_certificates({ - let mut store = rustls::RootCertStore::empty(); - store.add(CERT.clone()).unwrap(); - store - }) - .with_no_client_auth(); + let client_cfg = + rustls::ClientConfig::builder_with_provider(Arc::new(aws_lc_rs::default_provider())) + 
.with_safe_default_protocol_versions() + .expect("aws_lc_rs should support the default protocol versions") + .with_root_certificates({ + let mut store = rustls::RootCertStore::empty(); + store.add(CERT.clone()).unwrap(); + store + }) + .with_no_client_auth(); let mut make_tls_connect = tokio_postgres_rustls::MakeRustlsConnect::new(client_cfg); let tls_connect = >::make_tls_connect( &mut make_tls_connect, diff --git a/libs/pq_proto/src/lib.rs b/libs/pq_proto/src/lib.rs index a01191bd5d..9ffaaba584 100644 --- a/libs/pq_proto/src/lib.rs +++ b/libs/pq_proto/src/lib.rs @@ -727,7 +727,7 @@ pub const SQLSTATE_INTERNAL_ERROR: &[u8; 5] = b"XX000"; pub const SQLSTATE_ADMIN_SHUTDOWN: &[u8; 5] = b"57P01"; pub const SQLSTATE_SUCCESSFUL_COMPLETION: &[u8; 5] = b"00000"; -impl<'a> BeMessage<'a> { +impl BeMessage<'_> { /// Serialize `message` to the given `buf`. /// Apart from smart memory managemet, BytesMut is good here as msg len /// precedes its body and it is handy to write it down first and then fill diff --git a/libs/remote_storage/src/lib.rs b/libs/remote_storage/src/lib.rs index c6466237bf..719608dd5f 100644 --- a/libs/remote_storage/src/lib.rs +++ b/libs/remote_storage/src/lib.rs @@ -19,7 +19,12 @@ mod simulate_failures; mod support; use std::{ - collections::HashMap, fmt::Debug, num::NonZeroU32, ops::Bound, pin::Pin, sync::Arc, + collections::HashMap, + fmt::Debug, + num::NonZeroU32, + ops::Bound, + pin::{pin, Pin}, + sync::Arc, time::SystemTime, }; @@ -28,6 +33,7 @@ use camino::{Utf8Path, Utf8PathBuf}; use bytes::Bytes; use futures::{stream::Stream, StreamExt}; +use itertools::Itertools as _; use serde::{Deserialize, Serialize}; use tokio::sync::Semaphore; use tokio_util::sync::CancellationToken; @@ -261,7 +267,7 @@ pub trait RemoteStorage: Send + Sync + 'static { max_keys: Option, cancel: &CancellationToken, ) -> Result { - let mut stream = std::pin::pin!(self.list_streaming(prefix, mode, max_keys, cancel)); + let mut stream = pin!(self.list_streaming(prefix, mode, 
max_keys, cancel)); let mut combined = stream.next().await.expect("At least one item required")?; while let Some(list) = stream.next().await { let list = list?; @@ -324,6 +330,35 @@ pub trait RemoteStorage: Send + Sync + 'static { cancel: &CancellationToken, ) -> anyhow::Result<()>; + /// Deletes all objects matching the given prefix. + /// + /// NB: this uses NoDelimiter and will match partial prefixes. For example, the prefix /a/b will + /// delete /a/b, /a/b/*, /a/bc, /a/bc/*, etc. + /// + /// If the operation fails because of timeout or cancellation, the root cause of the error will + /// be set to `TimeoutOrCancel`. In such situation it is unknown which deletions, if any, went + /// through. + async fn delete_prefix( + &self, + prefix: &RemotePath, + cancel: &CancellationToken, + ) -> anyhow::Result<()> { + let mut stream = + pin!(self.list_streaming(Some(prefix), ListingMode::NoDelimiter, None, cancel)); + while let Some(result) = stream.next().await { + let keys = match result { + Ok(listing) if listing.keys.is_empty() => continue, + Ok(listing) => listing.keys.into_iter().map(|o| o.key).collect_vec(), + Err(DownloadError::Cancelled) => return Err(TimeoutOrCancel::Cancel.into()), + Err(DownloadError::Timeout) => return Err(TimeoutOrCancel::Timeout.into()), + Err(err) => return Err(err.into()), + }; + tracing::info!("Deleting {} keys from remote storage", keys.len()); + self.delete_objects(&keys, cancel).await?; + } + Ok(()) + } + /// Copy a remote object inside a bucket from one path to another. 
async fn copy( &self, @@ -488,6 +523,20 @@ impl GenericRemoteStorage> { } } + /// See [`RemoteStorage::delete_prefix`] + pub async fn delete_prefix( + &self, + prefix: &RemotePath, + cancel: &CancellationToken, + ) -> anyhow::Result<()> { + match self { + Self::LocalFs(s) => s.delete_prefix(prefix, cancel).await, + Self::AwsS3(s) => s.delete_prefix(prefix, cancel).await, + Self::AzureBlob(s) => s.delete_prefix(prefix, cancel).await, + Self::Unreliable(s) => s.delete_prefix(prefix, cancel).await, + } + } + /// See [`RemoteStorage::copy`] pub async fn copy_object( &self, diff --git a/libs/remote_storage/tests/common/tests.rs b/libs/remote_storage/tests/common/tests.rs index e6f33fc3f8..d5da1d48e9 100644 --- a/libs/remote_storage/tests/common/tests.rs +++ b/libs/remote_storage/tests/common/tests.rs @@ -199,6 +199,138 @@ async fn list_no_delimiter_works( Ok(()) } +/// Tests that giving a partial prefix returns all matches (e.g. "/foo" yields "/foobar/baz"), +/// but only with NoDelimiter. +#[test_context(MaybeEnabledStorageWithSimpleTestBlobs)] +#[tokio::test] +async fn list_partial_prefix( + ctx: &mut MaybeEnabledStorageWithSimpleTestBlobs, +) -> anyhow::Result<()> { + let ctx = match ctx { + MaybeEnabledStorageWithSimpleTestBlobs::Enabled(ctx) => ctx, + MaybeEnabledStorageWithSimpleTestBlobs::Disabled => return Ok(()), + MaybeEnabledStorageWithSimpleTestBlobs::UploadsFailed(e, _) => { + anyhow::bail!("S3 init failed: {e:?}") + } + }; + + let cancel = CancellationToken::new(); + let test_client = Arc::clone(&ctx.enabled.client); + + // Prefix "fold" should match all "folder{i}" directories with NoDelimiter. + let objects: HashSet<_> = test_client + .list( + Some(&RemotePath::from_string("fold")?), + ListingMode::NoDelimiter, + None, + &cancel, + ) + .await? + .keys + .into_iter() + .map(|o| o.key) + .collect(); + assert_eq!(&objects, &ctx.remote_blobs); + + // Prefix "fold" matches nothing with WithDelimiter. 
+ let objects: HashSet<_> = test_client + .list( + Some(&RemotePath::from_string("fold")?), + ListingMode::WithDelimiter, + None, + &cancel, + ) + .await? + .keys + .into_iter() + .map(|o| o.key) + .collect(); + assert!(objects.is_empty()); + + // Prefix "" matches everything. + let objects: HashSet<_> = test_client + .list( + Some(&RemotePath::from_string("")?), + ListingMode::NoDelimiter, + None, + &cancel, + ) + .await? + .keys + .into_iter() + .map(|o| o.key) + .collect(); + assert_eq!(&objects, &ctx.remote_blobs); + + // Prefix "" matches nothing with WithDelimiter. + let objects: HashSet<_> = test_client + .list( + Some(&RemotePath::from_string("")?), + ListingMode::WithDelimiter, + None, + &cancel, + ) + .await? + .keys + .into_iter() + .map(|o| o.key) + .collect(); + assert!(objects.is_empty()); + + // Prefix "foo" matches nothing. + let objects: HashSet<_> = test_client + .list( + Some(&RemotePath::from_string("foo")?), + ListingMode::NoDelimiter, + None, + &cancel, + ) + .await? + .keys + .into_iter() + .map(|o| o.key) + .collect(); + assert!(objects.is_empty()); + + // Prefix "folder2/blob" matches. + let objects: HashSet<_> = test_client + .list( + Some(&RemotePath::from_string("folder2/blob")?), + ListingMode::NoDelimiter, + None, + &cancel, + ) + .await? + .keys + .into_iter() + .map(|o| o.key) + .collect(); + let expect: HashSet<_> = ctx + .remote_blobs + .iter() + .filter(|o| o.get_path().starts_with("folder2")) + .cloned() + .collect(); + assert_eq!(&objects, &expect); + + // Prefix "folder2/foo" matches nothing. + let objects: HashSet<_> = test_client + .list( + Some(&RemotePath::from_string("folder2/foo")?), + ListingMode::NoDelimiter, + None, + &cancel, + ) + .await? 
+ .keys + .into_iter() + .map(|o| o.key) + .collect(); + assert!(objects.is_empty()); + + Ok(()) +} + #[test_context(MaybeEnabledStorage)] #[tokio::test] async fn delete_non_exising_works(ctx: &mut MaybeEnabledStorage) -> anyhow::Result<()> { @@ -265,6 +397,80 @@ async fn delete_objects_works(ctx: &mut MaybeEnabledStorage) -> anyhow::Result<( Ok(()) } +/// Tests that delete_prefix() will delete all objects matching a prefix, including +/// partial prefixes (i.e. "/foo" matches "/foobar"). +#[test_context(MaybeEnabledStorageWithSimpleTestBlobs)] +#[tokio::test] +async fn delete_prefix(ctx: &mut MaybeEnabledStorageWithSimpleTestBlobs) -> anyhow::Result<()> { + let ctx = match ctx { + MaybeEnabledStorageWithSimpleTestBlobs::Enabled(ctx) => ctx, + MaybeEnabledStorageWithSimpleTestBlobs::Disabled => return Ok(()), + MaybeEnabledStorageWithSimpleTestBlobs::UploadsFailed(e, _) => { + anyhow::bail!("S3 init failed: {e:?}") + } + }; + + let cancel = CancellationToken::new(); + let test_client = Arc::clone(&ctx.enabled.client); + + /// Asserts that the S3 listing matches the given paths. + macro_rules! assert_list { + ($expect:expr) => {{ + let listing = test_client + .list(None, ListingMode::NoDelimiter, None, &cancel) + .await? + .keys + .into_iter() + .map(|o| o.key) + .collect(); + assert_eq!($expect, listing); + }}; + } + + // We start with the full set of uploaded files. + let mut expect = ctx.remote_blobs.clone(); + + // Deleting a non-existing prefix should do nothing. + test_client + .delete_prefix(&RemotePath::from_string("xyz")?, &cancel) + .await?; + assert_list!(expect); + + // Prefixes are case-sensitive. + test_client + .delete_prefix(&RemotePath::from_string("Folder")?, &cancel) + .await?; + assert_list!(expect); + + // Deleting a path which overlaps with an existing object should do nothing. We pick the first + // path in the set as our common prefix. 
+ let path = expect.iter().next().expect("empty set").clone().join("xyz"); + test_client.delete_prefix(&path, &cancel).await?; + assert_list!(expect); + + // Deleting an exact path should work. We pick the first path in the set. + let path = expect.iter().next().expect("empty set").clone(); + test_client.delete_prefix(&path, &cancel).await?; + expect.remove(&path); + assert_list!(expect); + + // Deleting a prefix should delete all matching objects. + test_client + .delete_prefix(&RemotePath::from_string("folder0/blob_")?, &cancel) + .await?; + expect.retain(|p| !p.get_path().as_str().starts_with("folder0/")); + assert_list!(expect); + + // Deleting a common prefix should delete all objects. + test_client + .delete_prefix(&RemotePath::from_string("fold")?, &cancel) + .await?; + expect.clear(); + assert_list!(expect); + + Ok(()) +} + #[test_context(MaybeEnabledStorage)] #[tokio::test] async fn upload_download_works(ctx: &mut MaybeEnabledStorage) -> anyhow::Result<()> { diff --git a/libs/tenant_size_model/src/svg.rs b/libs/tenant_size_model/src/svg.rs index 0de2890bb4..25ebb1c3d8 100644 --- a/libs/tenant_size_model/src/svg.rs +++ b/libs/tenant_size_model/src/svg.rs @@ -97,7 +97,7 @@ pub fn draw_svg( Ok(result) } -impl<'a> SvgDraw<'a> { +impl SvgDraw<'_> { fn calculate_svg_layout(&mut self) { // Find x scale let segments = &self.storage.segments; diff --git a/libs/tracing-utils/src/http.rs b/libs/tracing-utils/src/http.rs index e6fdf9be45..2168beee88 100644 --- a/libs/tracing-utils/src/http.rs +++ b/libs/tracing-utils/src/http.rs @@ -82,7 +82,7 @@ where fn extract_remote_context(headers: &HeaderMap) -> opentelemetry::Context { struct HeaderExtractor<'a>(&'a HeaderMap); - impl<'a> opentelemetry::propagation::Extractor for HeaderExtractor<'a> { + impl opentelemetry::propagation::Extractor for HeaderExtractor<'_> { fn get(&self, key: &str) -> Option<&str> { self.0.get(key).and_then(|value| value.to_str().ok()) } diff --git a/libs/utils/src/lsn.rs b/libs/utils/src/lsn.rs 
index 06d5c27ebf..3ec2c130bd 100644 --- a/libs/utils/src/lsn.rs +++ b/libs/utils/src/lsn.rs @@ -37,7 +37,7 @@ impl<'de> Deserialize<'de> for Lsn { is_human_readable_deserializer: bool, } - impl<'de> Visitor<'de> for LsnVisitor { + impl Visitor<'_> for LsnVisitor { type Value = Lsn; fn expecting(&self, formatter: &mut fmt::Formatter) -> fmt::Result { diff --git a/libs/utils/src/poison.rs b/libs/utils/src/poison.rs index c3e2fba20c..ab9ebb3c5a 100644 --- a/libs/utils/src/poison.rs +++ b/libs/utils/src/poison.rs @@ -73,7 +73,7 @@ impl Poison { /// and subsequent calls to [`Poison::check_and_arm`] will fail with an error. pub struct Guard<'a, T>(&'a mut Poison); -impl<'a, T> Guard<'a, T> { +impl Guard<'_, T> { pub fn data(&self) -> &T { &self.0.data } @@ -94,7 +94,7 @@ impl<'a, T> Guard<'a, T> { } } -impl<'a, T> Drop for Guard<'a, T> { +impl Drop for Guard<'_, T> { fn drop(&mut self) { match self.0.state { State::Clean => { diff --git a/libs/utils/src/shard.rs b/libs/utils/src/shard.rs index d146010b41..782cddc599 100644 --- a/libs/utils/src/shard.rs +++ b/libs/utils/src/shard.rs @@ -164,7 +164,7 @@ impl TenantShardId { } } -impl<'a> std::fmt::Display for ShardSlug<'a> { +impl std::fmt::Display for ShardSlug<'_> { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { write!( f, diff --git a/libs/utils/src/simple_rcu.rs b/libs/utils/src/simple_rcu.rs index 01750b2aef..6700f86e4a 100644 --- a/libs/utils/src/simple_rcu.rs +++ b/libs/utils/src/simple_rcu.rs @@ -152,7 +152,7 @@ pub struct RcuWriteGuard<'a, V> { inner: RwLockWriteGuard<'a, RcuInner>, } -impl<'a, V> Deref for RcuWriteGuard<'a, V> { +impl Deref for RcuWriteGuard<'_, V> { type Target = V; fn deref(&self) -> &V { @@ -160,7 +160,7 @@ impl<'a, V> Deref for RcuWriteGuard<'a, V> { } } -impl<'a, V> RcuWriteGuard<'a, V> { +impl RcuWriteGuard<'_, V> { /// /// Store a new value. 
The new value will be written to the Rcu immediately, /// and will be immediately seen by any `read` calls that start afterwards. diff --git a/libs/utils/src/sync/heavier_once_cell.rs b/libs/utils/src/sync/heavier_once_cell.rs index dc711fb028..66c2065554 100644 --- a/libs/utils/src/sync/heavier_once_cell.rs +++ b/libs/utils/src/sync/heavier_once_cell.rs @@ -219,7 +219,7 @@ impl<'a, T> CountWaitingInitializers<'a, T> { } } -impl<'a, T> Drop for CountWaitingInitializers<'a, T> { +impl Drop for CountWaitingInitializers<'_, T> { fn drop(&mut self) { self.0.initializers.fetch_sub(1, Ordering::Relaxed); } @@ -250,7 +250,7 @@ impl std::ops::DerefMut for Guard<'_, T> { } } -impl<'a, T> Guard<'a, T> { +impl Guard<'_, T> { /// Take the current value, and a new permit for it's deinitialization. /// /// The permit will be on a semaphore part of the new internal value, and any following diff --git a/libs/utils/src/tracing_span_assert.rs b/libs/utils/src/tracing_span_assert.rs index d24c81ad0b..add2fa7920 100644 --- a/libs/utils/src/tracing_span_assert.rs +++ b/libs/utils/src/tracing_span_assert.rs @@ -184,23 +184,23 @@ mod tests { struct MemoryIdentity<'a>(&'a dyn Extractor); - impl<'a> MemoryIdentity<'a> { + impl MemoryIdentity<'_> { fn as_ptr(&self) -> *const () { self.0 as *const _ as *const () } } - impl<'a> PartialEq for MemoryIdentity<'a> { + impl PartialEq for MemoryIdentity<'_> { fn eq(&self, other: &Self) -> bool { self.as_ptr() == other.as_ptr() } } - impl<'a> Eq for MemoryIdentity<'a> {} - impl<'a> Hash for MemoryIdentity<'a> { + impl Eq for MemoryIdentity<'_> {} + impl Hash for MemoryIdentity<'_> { fn hash(&self, state: &mut H) { self.as_ptr().hash(state); } } - impl<'a> fmt::Debug for MemoryIdentity<'a> { + impl fmt::Debug for MemoryIdentity<'_> { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> std::fmt::Result { write!(f, "{:p}: {}", self.as_ptr(), self.0.id()) } diff --git a/pageserver/benches/bench_ingest.rs b/pageserver/benches/bench_ingest.rs index 
821c8008a9..d98b23acce 100644 --- a/pageserver/benches/bench_ingest.rs +++ b/pageserver/benches/bench_ingest.rs @@ -164,7 +164,11 @@ fn criterion_benchmark(c: &mut Criterion) { let conf: &'static PageServerConf = Box::leak(Box::new( pageserver::config::PageServerConf::dummy_conf(temp_dir.path().to_path_buf()), )); - virtual_file::init(16384, virtual_file::io_engine_for_bench()); + virtual_file::init( + 16384, + virtual_file::io_engine_for_bench(), + conf.virtual_file_io_mode, + ); page_cache::init(conf.page_cache_size); { diff --git a/pageserver/compaction/src/helpers.rs b/pageserver/compaction/src/helpers.rs index 8ed1d16082..9dbb6ecedf 100644 --- a/pageserver/compaction/src/helpers.rs +++ b/pageserver/compaction/src/helpers.rs @@ -133,7 +133,7 @@ enum LazyLoadLayer<'a, E: CompactionJobExecutor> { Loaded(VecDeque<>::DeltaEntry<'a>>), Unloaded(&'a E::DeltaLayer), } -impl<'a, E: CompactionJobExecutor> LazyLoadLayer<'a, E> { +impl LazyLoadLayer<'_, E> { fn min_key(&self) -> E::Key { match self { Self::Loaded(entries) => entries.front().unwrap().key(), @@ -147,23 +147,23 @@ impl<'a, E: CompactionJobExecutor> LazyLoadLayer<'a, E> { } } } -impl<'a, E: CompactionJobExecutor> PartialOrd for LazyLoadLayer<'a, E> { +impl PartialOrd for LazyLoadLayer<'_, E> { fn partial_cmp(&self, other: &Self) -> Option { Some(self.cmp(other)) } } -impl<'a, E: CompactionJobExecutor> Ord for LazyLoadLayer<'a, E> { +impl Ord for LazyLoadLayer<'_, E> { fn cmp(&self, other: &Self) -> std::cmp::Ordering { // reverse order so that we get a min-heap (other.min_key(), other.min_lsn()).cmp(&(self.min_key(), self.min_lsn())) } } -impl<'a, E: CompactionJobExecutor> PartialEq for LazyLoadLayer<'a, E> { +impl PartialEq for LazyLoadLayer<'_, E> { fn eq(&self, other: &Self) -> bool { self.cmp(other) == std::cmp::Ordering::Equal } } -impl<'a, E: CompactionJobExecutor> Eq for LazyLoadLayer<'a, E> {} +impl Eq for LazyLoadLayer<'_, E> {} type LoadFuture<'a, E> = BoxFuture<'a, anyhow::Result>>; diff --git 
a/pageserver/ctl/src/index_part.rs b/pageserver/ctl/src/index_part.rs index 20018846f8..6cce2844c7 100644 --- a/pageserver/ctl/src/index_part.rs +++ b/pageserver/ctl/src/index_part.rs @@ -11,7 +11,7 @@ pub(crate) async fn main(cmd: &IndexPartCmd) -> anyhow::Result<()> { match cmd { IndexPartCmd::Dump { path } => { let bytes = tokio::fs::read(path).await.context("read file")?; - let des: IndexPart = IndexPart::from_s3_bytes(&bytes).context("deserialize")?; + let des: IndexPart = IndexPart::from_json_bytes(&bytes).context("deserialize")?; let output = serde_json::to_string_pretty(&des).context("serialize output")?; println!("{output}"); Ok(()) diff --git a/pageserver/ctl/src/layer_map_analyzer.rs b/pageserver/ctl/src/layer_map_analyzer.rs index 151b94cf62..7dd2a5d05c 100644 --- a/pageserver/ctl/src/layer_map_analyzer.rs +++ b/pageserver/ctl/src/layer_map_analyzer.rs @@ -7,6 +7,7 @@ use camino::{Utf8Path, Utf8PathBuf}; use pageserver::context::{DownloadBehavior, RequestContext}; use pageserver::task_mgr::TaskKind; use pageserver::tenant::{TENANTS_SEGMENT_NAME, TIMELINES_SEGMENT_NAME}; +use pageserver::virtual_file::api::IoMode; use std::cmp::Ordering; use std::collections::BinaryHeap; use std::ops::Range; @@ -152,7 +153,11 @@ pub(crate) async fn main(cmd: &AnalyzeLayerMapCmd) -> Result<()> { let ctx = RequestContext::new(TaskKind::DebugTool, DownloadBehavior::Error); // Initialize virtual_file (file desriptor cache) and page cache which are needed to access layer persistent B-Tree. 
- pageserver::virtual_file::init(10, virtual_file::api::IoEngineKind::StdFs); + pageserver::virtual_file::init( + 10, + virtual_file::api::IoEngineKind::StdFs, + IoMode::preferred(), + ); pageserver::page_cache::init(100); let mut total_delta_layers = 0usize; diff --git a/pageserver/ctl/src/layers.rs b/pageserver/ctl/src/layers.rs index fd948bf2ef..c0b2b6ae89 100644 --- a/pageserver/ctl/src/layers.rs +++ b/pageserver/ctl/src/layers.rs @@ -11,6 +11,7 @@ use pageserver::tenant::storage_layer::delta_layer::{BlobRef, Summary}; use pageserver::tenant::storage_layer::{delta_layer, image_layer}; use pageserver::tenant::storage_layer::{DeltaLayer, ImageLayer}; use pageserver::tenant::{TENANTS_SEGMENT_NAME, TIMELINES_SEGMENT_NAME}; +use pageserver::virtual_file::api::IoMode; use pageserver::{page_cache, virtual_file}; use pageserver::{ repository::{Key, KEY_SIZE}, @@ -59,7 +60,11 @@ pub(crate) enum LayerCmd { async fn read_delta_file(path: impl AsRef, ctx: &RequestContext) -> Result<()> { let path = Utf8Path::from_path(path.as_ref()).expect("non-Unicode path"); - virtual_file::init(10, virtual_file::api::IoEngineKind::StdFs); + virtual_file::init( + 10, + virtual_file::api::IoEngineKind::StdFs, + IoMode::preferred(), + ); page_cache::init(100); let file = VirtualFile::open(path, ctx).await?; let file_id = page_cache::next_file_id(); @@ -190,7 +195,11 @@ pub(crate) async fn main(cmd: &LayerCmd) -> Result<()> { new_tenant_id, new_timeline_id, } => { - pageserver::virtual_file::init(10, virtual_file::api::IoEngineKind::StdFs); + pageserver::virtual_file::init( + 10, + virtual_file::api::IoEngineKind::StdFs, + IoMode::preferred(), + ); pageserver::page_cache::init(100); let ctx = RequestContext::new(TaskKind::DebugTool, DownloadBehavior::Error); diff --git a/pageserver/ctl/src/main.rs b/pageserver/ctl/src/main.rs index c96664d346..f506caec5b 100644 --- a/pageserver/ctl/src/main.rs +++ b/pageserver/ctl/src/main.rs @@ -24,7 +24,7 @@ use pageserver::{ page_cache, 
task_mgr::TaskKind, tenant::{dump_layerfile_from_path, metadata::TimelineMetadata}, - virtual_file, + virtual_file::{self, api::IoMode}, }; use pageserver_api::shard::TenantShardId; use postgres_ffi::ControlFileData; @@ -205,7 +205,11 @@ fn read_pg_control_file(control_file_path: &Utf8Path) -> anyhow::Result<()> { async fn print_layerfile(path: &Utf8Path) -> anyhow::Result<()> { // Basic initialization of things that don't change after startup - virtual_file::init(10, virtual_file::api::IoEngineKind::StdFs); + virtual_file::init( + 10, + virtual_file::api::IoEngineKind::StdFs, + IoMode::preferred(), + ); page_cache::init(100); let ctx = RequestContext::new(TaskKind::DebugTool, DownloadBehavior::Error); dump_layerfile_from_path(path, true, &ctx).await diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs index f71a3d2653..c6659345f9 100644 --- a/pageserver/src/bin/pageserver.rs +++ b/pageserver/src/bin/pageserver.rs @@ -167,7 +167,11 @@ fn main() -> anyhow::Result<()> { let scenario = failpoint_support::init(); // Basic initialization of things that don't change after startup - virtual_file::init(conf.max_file_descriptors, conf.virtual_file_io_engine); + virtual_file::init( + conf.max_file_descriptors, + conf.virtual_file_io_engine, + conf.virtual_file_io_mode, + ); page_cache::init(conf.page_cache_size); start_pageserver(launch_ts, conf).context("Failed to start pageserver")?; diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs index 8db78285e4..06d4326459 100644 --- a/pageserver/src/config.rs +++ b/pageserver/src/config.rs @@ -164,6 +164,9 @@ pub struct PageServerConf { pub image_compression: ImageCompressionAlgorithm, + /// Whether to offload archived timelines automatically + pub timeline_offloading: bool, + /// How many bytes of ephemeral layer content will we allow per kilobyte of RAM. When this /// is exceeded, we start proactively closing ephemeral layers to limit the total amount /// of ephemeral data. 
@@ -321,6 +324,7 @@ impl PageServerConf { ingest_batch_size, max_vectored_read_bytes, image_compression, + timeline_offloading, ephemeral_bytes_per_memory_kb, l0_flush, virtual_file_io_mode, @@ -364,6 +368,7 @@ impl PageServerConf { ingest_batch_size, max_vectored_read_bytes, image_compression, + timeline_offloading, ephemeral_bytes_per_memory_kb, // ------------------------------------------------------------ diff --git a/pageserver/src/consumption_metrics/upload.rs b/pageserver/src/consumption_metrics/upload.rs index 0325ee403a..1eb25d337b 100644 --- a/pageserver/src/consumption_metrics/upload.rs +++ b/pageserver/src/consumption_metrics/upload.rs @@ -198,7 +198,7 @@ fn serialize_in_chunks<'a>( } } - impl<'a> ExactSizeIterator for Iter<'a> {} + impl ExactSizeIterator for Iter<'_> {} let buffer = bytes::BytesMut::new(); let inner = input.chunks(chunk_size); diff --git a/pageserver/src/disk_usage_eviction_task.rs b/pageserver/src/disk_usage_eviction_task.rs index a58fa2c0b1..ca44fbe6ae 100644 --- a/pageserver/src/disk_usage_eviction_task.rs +++ b/pageserver/src/disk_usage_eviction_task.rs @@ -654,7 +654,7 @@ impl std::fmt::Debug for EvictionCandidate { let ts = chrono::DateTime::::from(self.last_activity_ts); let ts = ts.to_rfc3339_opts(chrono::SecondsFormat::Nanos, true); struct DisplayIsDebug<'a, T>(&'a T); - impl<'a, T: std::fmt::Display> std::fmt::Debug for DisplayIsDebug<'a, T> { + impl std::fmt::Debug for DisplayIsDebug<'_, T> { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { write!(f, "{}", self.0) } @@ -1218,16 +1218,7 @@ mod filesystem_level_usage { let stat = Statvfs::get(tenants_dir, mock_config) .context("statvfs failed, presumably directory got unlinked")?; - // https://unix.stackexchange.com/a/703650 - let blocksize = if stat.fragment_size() > 0 { - stat.fragment_size() - } else { - stat.block_size() - }; - - // use blocks_available (b_avail) since, pageserver runs as unprivileged user - let avail_bytes = stat.blocks_available() * 
blocksize; - let total_bytes = stat.blocks() * blocksize; + let (avail_bytes, total_bytes) = stat.get_avail_total_bytes(); Ok(Usage { config, diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index 36a6ed427b..2490bf5f20 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -18,7 +18,6 @@ use hyper::StatusCode; use hyper::{Body, Request, Response, Uri}; use metrics::launch_timestamp::LaunchTimestamp; use pageserver_api::models::virtual_file::IoMode; -use pageserver_api::models::AuxFilePolicy; use pageserver_api::models::DownloadRemoteLayersTaskSpawnRequest; use pageserver_api::models::IngestAuxFilesRequest; use pageserver_api::models::ListAuxFilesRequest; @@ -27,6 +26,7 @@ use pageserver_api::models::LocationConfigListResponse; use pageserver_api::models::LocationConfigMode; use pageserver_api::models::LsnLease; use pageserver_api::models::LsnLeaseRequest; +use pageserver_api::models::OffloadedTimelineInfo; use pageserver_api::models::ShardParameters; use pageserver_api::models::TenantDetails; use pageserver_api::models::TenantLocationConfigRequest; @@ -38,6 +38,7 @@ use pageserver_api::models::TenantShardSplitRequest; use pageserver_api::models::TenantShardSplitResponse; use pageserver_api::models::TenantSorting; use pageserver_api::models::TimelineArchivalConfigRequest; +use pageserver_api::models::TimelinesInfoAndOffloaded; use pageserver_api::models::TopTenantShardItem; use pageserver_api::models::TopTenantShardsRequest; use pageserver_api::models::TopTenantShardsResponse; @@ -82,6 +83,7 @@ use crate::tenant::timeline::CompactFlags; use crate::tenant::timeline::CompactionError; use crate::tenant::timeline::Timeline; use crate::tenant::GetTimelineError; +use crate::tenant::OffloadedTimeline; use crate::tenant::{LogicalSizeCalculationCause, PageReconstructError}; use crate::{disk_usage_eviction_task, tenant}; use pageserver_api::models::{ @@ -474,12 +476,28 @@ async fn build_timeline_info_common( is_archived: 
Some(is_archived), walreceiver_status, - - last_aux_file_policy: timeline.last_aux_file_policy.load(), }; Ok(info) } +fn build_timeline_offloaded_info(offloaded: &Arc) -> OffloadedTimelineInfo { + let &OffloadedTimeline { + tenant_shard_id, + timeline_id, + ancestor_retain_lsn, + ancestor_timeline_id, + archived_at, + .. + } = offloaded.as_ref(); + OffloadedTimelineInfo { + tenant_id: tenant_shard_id, + timeline_id, + ancestor_retain_lsn, + ancestor_timeline_id, + archived_at: archived_at.and_utc(), + } +} + // healthcheck handler async fn status_handler( request: Request, @@ -646,7 +664,7 @@ async fn timeline_list_handler( ) .instrument(info_span!("build_timeline_info", timeline_id = %timeline.timeline_id)) .await - .context("Failed to convert tenant timeline {timeline_id} into the local one: {e:?}") + .context("Failed to build timeline info") .map_err(ApiError::InternalServerError)?; response_data.push(timeline_info); @@ -661,6 +679,62 @@ async fn timeline_list_handler( json_response(StatusCode::OK, response_data) } +async fn timeline_and_offloaded_list_handler( + request: Request, + _cancel: CancellationToken, +) -> Result, ApiError> { + let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?; + let include_non_incremental_logical_size: Option = + parse_query_param(&request, "include-non-incremental-logical-size")?; + let force_await_initial_logical_size: Option = + parse_query_param(&request, "force-await-initial-logical-size")?; + check_permission(&request, Some(tenant_shard_id.tenant_id))?; + + let state = get_state(&request); + let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download); + + let response_data = async { + let tenant = state + .tenant_manager + .get_attached_tenant_shard(tenant_shard_id)?; + + tenant.wait_to_become_active(ACTIVE_TENANT_TIMEOUT).await?; + + let (timelines, offloadeds) = tenant.list_timelines_and_offloaded(); + + let mut timeline_infos = Vec::with_capacity(timelines.len()); + 
for timeline in timelines { + let timeline_info = build_timeline_info( + &timeline, + include_non_incremental_logical_size.unwrap_or(false), + force_await_initial_logical_size.unwrap_or(false), + &ctx, + ) + .instrument(info_span!("build_timeline_info", timeline_id = %timeline.timeline_id)) + .await + .context("Failed to build timeline info") + .map_err(ApiError::InternalServerError)?; + + timeline_infos.push(timeline_info); + } + let offloaded_infos = offloadeds + .into_iter() + .map(|offloaded| build_timeline_offloaded_info(&offloaded)) + .collect::>(); + let res = TimelinesInfoAndOffloaded { + timelines: timeline_infos, + offloaded: offloaded_infos, + }; + Ok::(res) + } + .instrument(info_span!("timeline_and_offloaded_list", + tenant_id = %tenant_shard_id.tenant_id, + shard_id = %tenant_shard_id.shard_slug())) + .await?; + + json_response(StatusCode::OK, response_data) +} + async fn timeline_preserve_initdb_handler( request: Request, _cancel: CancellationToken, @@ -2254,7 +2328,7 @@ async fn tenant_scan_remote_handler( %timeline_id)) .await { - Ok((index_part, index_generation)) => { + Ok((index_part, index_generation, _index_mtime)) => { tracing::info!("Found timeline {tenant_shard_id}/{timeline_id} metadata (gen {index_generation:?}, {} layers, {} consistent LSN)", index_part.layer_metadata.len(), index_part.metadata.disk_consistent_lsn()); generation = std::cmp::max(generation, index_generation); @@ -2399,31 +2473,6 @@ async fn post_tracing_event_handler( json_response(StatusCode::OK, ()) } -async fn force_aux_policy_switch_handler( - mut r: Request, - _cancel: CancellationToken, -) -> Result, ApiError> { - check_permission(&r, None)?; - let tenant_shard_id: TenantShardId = parse_request_param(&r, "tenant_shard_id")?; - let timeline_id: TimelineId = parse_request_param(&r, "timeline_id")?; - let policy: AuxFilePolicy = json_request(&mut r).await?; - - let state = get_state(&r); - - let tenant = state - .tenant_manager - 
.get_attached_tenant_shard(tenant_shard_id)?; - tenant.wait_to_become_active(ACTIVE_TENANT_TIMEOUT).await?; - let timeline = - active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id) - .await?; - timeline - .do_switch_aux_policy(policy) - .map_err(ApiError::InternalServerError)?; - - json_response(StatusCode::OK, ()) -} - async fn put_io_engine_handler( mut r: Request, _cancel: CancellationToken, @@ -3021,6 +3070,9 @@ pub fn make_router( .get("/v1/tenant/:tenant_shard_id/timeline", |r| { api_handler(r, timeline_list_handler) }) + .get("/v1/tenant/:tenant_shard_id/timeline_and_offloaded", |r| { + api_handler(r, timeline_and_offloaded_list_handler) + }) .post("/v1/tenant/:tenant_shard_id/timeline", |r| { api_handler(r, timeline_create_handler) }) @@ -3136,10 +3188,6 @@ pub fn make_router( ) .put("/v1/io_engine", |r| api_handler(r, put_io_engine_handler)) .put("/v1/io_mode", |r| api_handler(r, put_io_mode_handler)) - .put( - "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/force_aux_policy_switch", - |r| api_handler(r, force_aux_policy_switch_handler), - ) .get("/v1/utilization", |r| api_handler(r, get_utilization)) .post( "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/ingest_aux_files", diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs index b76efa5b48..8f697558d6 100644 --- a/pageserver/src/metrics.rs +++ b/pageserver/src/metrics.rs @@ -1189,7 +1189,7 @@ struct GlobalAndPerTimelineHistogramTimer<'a, 'c> { op: SmgrQueryType, } -impl<'a, 'c> Drop for GlobalAndPerTimelineHistogramTimer<'a, 'c> { +impl Drop for GlobalAndPerTimelineHistogramTimer<'_, '_> { fn drop(&mut self) { let elapsed = self.start.elapsed(); let ex_throttled = self @@ -1560,7 +1560,7 @@ impl BasebackupQueryTime { } } -impl<'a, 'c> BasebackupQueryTimeOngoingRecording<'a, 'c> { +impl BasebackupQueryTimeOngoingRecording<'_, '_> { pub(crate) fn observe(self, res: &Result) { let elapsed = self.start.elapsed(); let ex_throttled = self @@ -2092,6 +2092,7 
@@ pub(crate) struct WalIngestMetrics { pub(crate) records_received: IntCounter, pub(crate) records_committed: IntCounter, pub(crate) records_filtered: IntCounter, + pub(crate) gap_blocks_zeroed_on_rel_extend: IntCounter, } pub(crate) static WAL_INGEST: Lazy = Lazy::new(|| WalIngestMetrics { @@ -2115,6 +2116,11 @@ pub(crate) static WAL_INGEST: Lazy = Lazy::new(|| WalIngestMet "Number of WAL records filtered out due to sharding" ) .expect("failed to define a metric"), + gap_blocks_zeroed_on_rel_extend: register_int_counter!( + "pageserver_gap_blocks_zeroed_on_rel_extend", + "Total number of zero gap blocks written on relation extends" + ) + .expect("failed to define a metric"), }); pub(crate) static WAL_REDO_TIME: Lazy = Lazy::new(|| { diff --git a/pageserver/src/page_cache.rs b/pageserver/src/page_cache.rs index f386c825b8..45bf02362a 100644 --- a/pageserver/src/page_cache.rs +++ b/pageserver/src/page_cache.rs @@ -82,6 +82,7 @@ use once_cell::sync::OnceCell; use crate::{ context::RequestContext, metrics::{page_cache_eviction_metrics, PageCacheSizeMetrics}, + virtual_file::{IoBufferMut, IoPageSlice}, }; static PAGE_CACHE: OnceCell = OnceCell::new(); @@ -144,7 +145,7 @@ struct SlotInner { key: Option, // for `coalesce_readers_permit` permit: std::sync::Mutex>, - buf: &'static mut [u8; PAGE_SZ], + buf: IoPageSlice<'static>, } impl Slot { @@ -234,13 +235,13 @@ impl std::ops::Deref for PageReadGuard<'_> { type Target = [u8; PAGE_SZ]; fn deref(&self) -> &Self::Target { - self.slot_guard.buf + self.slot_guard.buf.deref() } } impl AsRef<[u8; PAGE_SZ]> for PageReadGuard<'_> { fn as_ref(&self) -> &[u8; PAGE_SZ] { - self.slot_guard.buf + self.slot_guard.buf.as_ref() } } @@ -266,7 +267,7 @@ enum PageWriteGuardState<'i> { impl std::ops::DerefMut for PageWriteGuard<'_> { fn deref_mut(&mut self) -> &mut Self::Target { match &mut self.state { - PageWriteGuardState::Invalid { inner, _permit } => inner.buf, + PageWriteGuardState::Invalid { inner, _permit } => inner.buf.deref_mut(), 
PageWriteGuardState::Downgraded => unreachable!(), } } @@ -277,7 +278,7 @@ impl std::ops::Deref for PageWriteGuard<'_> { fn deref(&self) -> &Self::Target { match &self.state { - PageWriteGuardState::Invalid { inner, _permit } => inner.buf, + PageWriteGuardState::Invalid { inner, _permit } => inner.buf.deref(), PageWriteGuardState::Downgraded => unreachable!(), } } @@ -643,7 +644,7 @@ impl PageCache { // We could use Vec::leak here, but that potentially also leaks // uninitialized reserved capacity. With into_boxed_slice and Box::leak // this is avoided. - let page_buffer = Box::leak(vec![0u8; num_pages * PAGE_SZ].into_boxed_slice()); + let page_buffer = IoBufferMut::with_capacity_zeroed(num_pages * PAGE_SZ).leak(); let size_metrics = &crate::metrics::PAGE_CACHE_SIZE; size_metrics.max_bytes.set_page_sz(num_pages); @@ -652,7 +653,8 @@ impl PageCache { let slots = page_buffer .chunks_exact_mut(PAGE_SZ) .map(|chunk| { - let buf: &mut [u8; PAGE_SZ] = chunk.try_into().unwrap(); + // SAFETY: Each chunk has `PAGE_SZ` (8192) bytes, greater than 512, still aligned. 
+ let buf = unsafe { IoPageSlice::new_unchecked(chunk.try_into().unwrap()) }; Slot { inner: tokio::sync::RwLock::new(SlotInner { diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index afb2f92ff8..62b14cb83e 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -1326,22 +1326,22 @@ where .for_command(ComputeCommandKind::Basebackup) .inc(); - let lsn = if let Some(lsn_str) = params.get(2) { - Some( - Lsn::from_str(lsn_str) - .with_context(|| format!("Failed to parse Lsn from {lsn_str}"))?, - ) - } else { - None - }; - - let gzip = match params.get(3) { - Some(&"--gzip") => true, - None => false, - Some(third_param) => { - return Err(QueryError::Other(anyhow::anyhow!( - "Parameter in position 3 unknown {third_param}", - ))) + let (lsn, gzip) = match (params.get(2), params.get(3)) { + (None, _) => (None, false), + (Some(&"--gzip"), _) => (None, true), + (Some(lsn_str), gzip_str_opt) => { + let lsn = Lsn::from_str(lsn_str) + .with_context(|| format!("Failed to parse Lsn from {lsn_str}"))?; + let gzip = match gzip_str_opt { + Some(&"--gzip") => true, + None => false, + Some(third_param) => { + return Err(QueryError::Other(anyhow::anyhow!( + "Parameter in position 3 unknown {third_param}", + ))) + } + }; + (Some(lsn), gzip) } }; diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs index 900da5beab..f2a11e65c1 100644 --- a/pageserver/src/pgdatadir_mapping.rs +++ b/pageserver/src/pgdatadir_mapping.rs @@ -22,7 +22,6 @@ use pageserver_api::key::{ CompactKey, AUX_FILES_KEY, CHECKPOINT_KEY, CONTROLFILE_KEY, DBDIR_KEY, TWOPHASEDIR_KEY, }; use pageserver_api::keyspace::SparseKeySpace; -use pageserver_api::models::AuxFilePolicy; use pageserver_api::reltag::{BlockNumber, RelTag, SlruKind}; use postgres_ffi::relfile_utils::{FSM_FORKNUM, VISIBILITYMAP_FORKNUM}; use postgres_ffi::BLCKSZ; @@ -33,7 +32,7 @@ use std::ops::ControlFlow; use std::ops::Range; use strum::IntoEnumIterator; use 
tokio_util::sync::CancellationToken; -use tracing::{debug, info, trace, warn}; +use tracing::{debug, trace, warn}; use utils::bin_ser::DeserializeError; use utils::pausable_failpoint; use utils::{bin_ser::BeSer, lsn::Lsn}; @@ -677,21 +676,6 @@ impl Timeline { self.get(CHECKPOINT_KEY, lsn, ctx).await } - async fn list_aux_files_v1( - &self, - lsn: Lsn, - ctx: &RequestContext, - ) -> Result, PageReconstructError> { - match self.get(AUX_FILES_KEY, lsn, ctx).await { - Ok(buf) => Ok(AuxFilesDirectory::des(&buf)?.files), - Err(e) => { - // This is expected: historical databases do not have the key. - debug!("Failed to get info about AUX files: {}", e); - Ok(HashMap::new()) - } - } - } - async fn list_aux_files_v2( &self, lsn: Lsn, @@ -722,10 +706,7 @@ impl Timeline { lsn: Lsn, ctx: &RequestContext, ) -> Result<(), PageReconstructError> { - let current_policy = self.last_aux_file_policy.load(); - if let Some(AuxFilePolicy::V2) | Some(AuxFilePolicy::CrossValidation) = current_policy { - self.list_aux_files_v2(lsn, ctx).await?; - } + self.list_aux_files_v2(lsn, ctx).await?; Ok(()) } @@ -734,51 +715,7 @@ impl Timeline { lsn: Lsn, ctx: &RequestContext, ) -> Result, PageReconstructError> { - let current_policy = self.last_aux_file_policy.load(); - match current_policy { - Some(AuxFilePolicy::V1) => { - let res = self.list_aux_files_v1(lsn, ctx).await?; - let empty_str = if res.is_empty() { ", empty" } else { "" }; - warn!( - "this timeline is using deprecated aux file policy V1 (policy=v1{empty_str})" - ); - Ok(res) - } - None => { - let res = self.list_aux_files_v1(lsn, ctx).await?; - if !res.is_empty() { - warn!("this timeline is using deprecated aux file policy V1 (policy=None)"); - } - Ok(res) - } - Some(AuxFilePolicy::V2) => self.list_aux_files_v2(lsn, ctx).await, - Some(AuxFilePolicy::CrossValidation) => { - let v1_result = self.list_aux_files_v1(lsn, ctx).await; - let v2_result = self.list_aux_files_v2(lsn, ctx).await; - match (v1_result, v2_result) { - (Ok(v1), Ok(v2)) 
=> { - if v1 != v2 { - tracing::error!( - "unmatched aux file v1 v2 result:\nv1 {v1:?}\nv2 {v2:?}" - ); - return Err(PageReconstructError::Other(anyhow::anyhow!( - "unmatched aux file v1 v2 result" - ))); - } - Ok(v1) - } - (Ok(_), Err(v2)) => { - tracing::error!("aux file v1 returns Ok while aux file v2 returns an err"); - Err(v2) - } - (Err(v1), Ok(_)) => { - tracing::error!("aux file v2 returns Ok while aux file v1 returns an err"); - Err(v1) - } - (Err(_), Err(v2)) => Err(v2), - } - } - } + self.list_aux_files_v2(lsn, ctx).await } pub(crate) async fn get_replorigins( @@ -954,9 +891,6 @@ impl Timeline { result.add_key(CONTROLFILE_KEY); result.add_key(CHECKPOINT_KEY); - if self.get(AUX_FILES_KEY, lsn, ctx).await.is_ok() { - result.add_key(AUX_FILES_KEY); - } // Add extra keyspaces in the test cases. Some test cases write keys into the storage without // creating directory keys. These test cases will add such keyspaces into `extra_test_dense_keyspace` @@ -1166,9 +1100,6 @@ impl<'a> DatadirModification<'a> { self.pending_directory_entries.push((DirectoryKind::Db, 0)); self.put(DBDIR_KEY, Value::Image(buf.into())); - // Create AuxFilesDirectory - self.init_aux_dir()?; - let buf = if self.tline.pg_version >= 17 { TwoPhaseDirectoryV17::ser(&TwoPhaseDirectoryV17 { xids: HashSet::new(), @@ -1347,9 +1278,6 @@ impl<'a> DatadirModification<'a> { // 'true', now write the updated 'dbdirs' map back. 
let buf = DbDirectory::ser(&dbdir)?; self.put(DBDIR_KEY, Value::Image(buf.into())); - - // Create AuxFilesDirectory as well - self.init_aux_dir()?; } if r.is_none() { // Create RelDirectory @@ -1726,200 +1654,60 @@ impl<'a> DatadirModification<'a> { Ok(()) } - pub fn init_aux_dir(&mut self) -> anyhow::Result<()> { - if let AuxFilePolicy::V2 = self.tline.get_switch_aux_file_policy() { - return Ok(()); - } - let buf = AuxFilesDirectory::ser(&AuxFilesDirectory { - files: HashMap::new(), - })?; - self.pending_directory_entries - .push((DirectoryKind::AuxFiles, 0)); - self.put(AUX_FILES_KEY, Value::Image(Bytes::from(buf))); - Ok(()) - } - pub async fn put_file( &mut self, path: &str, content: &[u8], ctx: &RequestContext, ) -> anyhow::Result<()> { - let switch_policy = self.tline.get_switch_aux_file_policy(); - - let policy = { - let current_policy = self.tline.last_aux_file_policy.load(); - // Allowed switch path: - // * no aux files -> v1/v2/cross-validation - // * cross-validation->v2 - - let current_policy = if current_policy.is_none() { - // This path will only be hit once per tenant: we will decide the final policy in this code block. - // The next call to `put_file` will always have `last_aux_file_policy != None`. - let lsn = Lsn::max(self.tline.get_last_record_lsn(), self.lsn); - let aux_files_key_v1 = self.tline.list_aux_files_v1(lsn, ctx).await?; - if aux_files_key_v1.is_empty() { - None - } else { - warn!("this timeline is using deprecated aux file policy V1 (detected existing v1 files)"); - self.tline.do_switch_aux_policy(AuxFilePolicy::V1)?; - Some(AuxFilePolicy::V1) - } - } else { - current_policy - }; - - if AuxFilePolicy::is_valid_migration_path(current_policy, switch_policy) { - self.tline.do_switch_aux_policy(switch_policy)?; - info!(current=?current_policy, next=?switch_policy, "switching aux file policy"); - switch_policy - } else { - // This branch handles non-valid migration path, and the case that switch_policy == current_policy. 
- // And actually, because the migration path always allow unspecified -> *, this unwrap_or will never be hit. - current_policy.unwrap_or(AuxFilePolicy::default_tenant_config()) - } + let key = aux_file::encode_aux_file_key(path); + // retrieve the key from the engine + let old_val = match self.get(key, ctx).await { + Ok(val) => Some(val), + Err(PageReconstructError::MissingKey(_)) => None, + Err(e) => return Err(e.into()), }; - - if let AuxFilePolicy::V2 | AuxFilePolicy::CrossValidation = policy { - let key = aux_file::encode_aux_file_key(path); - // retrieve the key from the engine - let old_val = match self.get(key, ctx).await { - Ok(val) => Some(val), - Err(PageReconstructError::MissingKey(_)) => None, - Err(e) => return Err(e.into()), - }; - let files: Vec<(&str, &[u8])> = if let Some(ref old_val) = old_val { - aux_file::decode_file_value(old_val)? + let files: Vec<(&str, &[u8])> = if let Some(ref old_val) = old_val { + aux_file::decode_file_value(old_val)? + } else { + Vec::new() + }; + let mut other_files = Vec::with_capacity(files.len()); + let mut modifying_file = None; + for file @ (p, content) in files { + if path == p { + assert!( + modifying_file.is_none(), + "duplicated entries found for {}", + path + ); + modifying_file = Some(content); } else { - Vec::new() - }; - let mut other_files = Vec::with_capacity(files.len()); - let mut modifying_file = None; - for file @ (p, content) in files { - if path == p { - assert!( - modifying_file.is_none(), - "duplicated entries found for {}", - path - ); - modifying_file = Some(content); - } else { - other_files.push(file); - } + other_files.push(file); } - let mut new_files = other_files; - match (modifying_file, content.is_empty()) { - (Some(old_content), false) => { - self.tline - .aux_file_size_estimator - .on_update(old_content.len(), content.len()); - new_files.push((path, content)); - } - (Some(old_content), true) => { - self.tline - .aux_file_size_estimator - .on_remove(old_content.len()); - // not adding 
the file key to the final `new_files` vec. - } - (None, false) => { - self.tline.aux_file_size_estimator.on_add(content.len()); - new_files.push((path, content)); - } - (None, true) => warn!("removing non-existing aux file: {}", path), - } - let new_val = aux_file::encode_file_value(&new_files)?; - self.put(key, Value::Image(new_val.into())); } - - if let AuxFilePolicy::V1 | AuxFilePolicy::CrossValidation = policy { - let file_path = path.to_string(); - let content = if content.is_empty() { - None - } else { - Some(Bytes::copy_from_slice(content)) - }; - - let n_files; - let mut aux_files = self.tline.aux_files.lock().await; - if let Some(mut dir) = aux_files.dir.take() { - // We already updated aux files in `self`: emit a delta and update our latest value. - dir.upsert(file_path.clone(), content.clone()); - n_files = dir.files.len(); - if aux_files.n_deltas == MAX_AUX_FILE_DELTAS { - self.put( - AUX_FILES_KEY, - Value::Image(Bytes::from( - AuxFilesDirectory::ser(&dir).context("serialize")?, - )), - ); - aux_files.n_deltas = 0; - } else { - self.put( - AUX_FILES_KEY, - Value::WalRecord(NeonWalRecord::AuxFile { file_path, content }), - ); - aux_files.n_deltas += 1; - } - aux_files.dir = Some(dir); - } else { - // Check if the AUX_FILES_KEY is initialized - match self.get(AUX_FILES_KEY, ctx).await { - Ok(dir_bytes) => { - let mut dir = AuxFilesDirectory::des(&dir_bytes)?; - // Key is already set, we may append a delta - self.put( - AUX_FILES_KEY, - Value::WalRecord(NeonWalRecord::AuxFile { - file_path: file_path.clone(), - content: content.clone(), - }), - ); - dir.upsert(file_path, content); - n_files = dir.files.len(); - aux_files.dir = Some(dir); - } - Err( - e @ (PageReconstructError::Cancelled - | PageReconstructError::AncestorLsnTimeout(_)), - ) => { - // Important that we do not interpret a shutdown error as "not found" and thereby - // reset the map. 
- return Err(e.into()); - } - // Note: we added missing key error variant in https://github.com/neondatabase/neon/pull/7393 but - // the original code assumes all other errors are missing keys. Therefore, we keep the code path - // the same for now, though in theory, we should only match the `MissingKey` variant. - Err( - e @ (PageReconstructError::Other(_) - | PageReconstructError::WalRedo(_) - | PageReconstructError::MissingKey(_)), - ) => { - // Key is missing, we must insert an image as the basis for subsequent deltas. - - if !matches!(e, PageReconstructError::MissingKey(_)) { - let e = utils::error::report_compact_sources(&e); - tracing::warn!("treating error as if it was a missing key: {}", e); - } - - let mut dir = AuxFilesDirectory { - files: HashMap::new(), - }; - dir.upsert(file_path, content); - self.put( - AUX_FILES_KEY, - Value::Image(Bytes::from( - AuxFilesDirectory::ser(&dir).context("serialize")?, - )), - ); - n_files = 1; - aux_files.dir = Some(dir); - } - } + let mut new_files = other_files; + match (modifying_file, content.is_empty()) { + (Some(old_content), false) => { + self.tline + .aux_file_size_estimator + .on_update(old_content.len(), content.len()); + new_files.push((path, content)); } - - self.pending_directory_entries - .push((DirectoryKind::AuxFiles, n_files)); + (Some(old_content), true) => { + self.tline + .aux_file_size_estimator + .on_remove(old_content.len()); + // not adding the file key to the final `new_files` vec. + } + (None, false) => { + self.tline.aux_file_size_estimator.on_add(content.len()); + new_files.push((path, content)); + } + (None, true) => warn!("removing non-existing aux file: {}", path), } + let new_val = aux_file::encode_file_value(&new_files)?; + self.put(key, Value::Image(new_val.into())); Ok(()) } @@ -2089,12 +1877,6 @@ impl<'a> DatadirModification<'a> { self.tline.get(key, lsn, ctx).await } - /// Only used during unit tests, force putting a key into the modification. 
- #[cfg(test)] - pub(crate) fn put_for_test(&mut self, key: Key, val: Value) { - self.put(key, val); - } - fn put(&mut self, key: Key, val: Value) { if Self::is_data_key(&key) { self.put_data(key.to_compact(), val) @@ -2212,21 +1994,6 @@ struct RelDirectory { rels: HashSet<(Oid, u8)>, } -#[derive(Debug, Serialize, Deserialize, Default, PartialEq)] -pub(crate) struct AuxFilesDirectory { - pub(crate) files: HashMap, -} - -impl AuxFilesDirectory { - pub(crate) fn upsert(&mut self, key: String, value: Option) { - if let Some(value) = value { - self.files.insert(key, value); - } else { - self.files.remove(&key); - } - } -} - #[derive(Debug, Serialize, Deserialize)] struct RelSizeEntry { nblocks: u32, diff --git a/pageserver/src/statvfs.rs b/pageserver/src/statvfs.rs index 5a6f6e5176..4e8be58d58 100644 --- a/pageserver/src/statvfs.rs +++ b/pageserver/src/statvfs.rs @@ -53,6 +53,22 @@ impl Statvfs { Statvfs::Mock(stat) => stat.block_size, } } + + /// Get the available and total bytes on the filesystem. 
+ pub fn get_avail_total_bytes(&self) -> (u64, u64) { + // https://unix.stackexchange.com/a/703650 + let blocksize = if self.fragment_size() > 0 { + self.fragment_size() + } else { + self.block_size() + }; + + // use blocks_available (b_avail) since, pageserver runs as unprivileged user + let avail_bytes = self.blocks_available() * blocksize; + let total_bytes = self.blocks() * blocksize; + + (avail_bytes, total_bytes) + } } pub mod mock { @@ -74,7 +90,7 @@ pub mod mock { let used_bytes = walk_dir_disk_usage(tenants_dir, name_filter.as_deref()).unwrap(); // round it up to the nearest block multiple - let used_blocks = (used_bytes + (blocksize - 1)) / blocksize; + let used_blocks = used_bytes.div_ceil(*blocksize); if used_blocks > *total_blocks { panic!( diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 689982ddd4..7a3305797c 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -16,11 +16,11 @@ use anyhow::{bail, Context}; use arc_swap::ArcSwap; use camino::Utf8Path; use camino::Utf8PathBuf; +use chrono::NaiveDateTime; use enumset::EnumSet; use futures::stream::FuturesUnordered; use futures::StreamExt; use pageserver_api::models; -use pageserver_api::models::AuxFilePolicy; use pageserver_api::models::LsnLease; use pageserver_api::models::TimelineArchivalState; use pageserver_api::models::TimelineState; @@ -32,6 +32,10 @@ use pageserver_api::shard::TenantShardId; use remote_storage::DownloadError; use remote_storage::GenericRemoteStorage; use remote_storage::TimeoutOrCancel; +use remote_timeline_client::manifest::{ + OffloadedTimelineManifest, TenantManifest, LATEST_TENANT_MANIFEST_VERSION, +}; +use remote_timeline_client::UploadQueueNotReadyError; use std::collections::BTreeMap; use std::fmt; use std::future::Future; @@ -66,13 +70,14 @@ use self::config::TenantConf; use self::metadata::TimelineMetadata; use self::mgr::GetActiveTenantError; use self::mgr::GetTenantError; -use 
self::remote_timeline_client::upload::upload_index_part; +use self::remote_timeline_client::upload::{upload_index_part, upload_tenant_manifest}; use self::remote_timeline_client::{RemoteTimelineClient, WaitCompletionError}; use self::timeline::uninit::TimelineCreateGuard; use self::timeline::uninit::TimelineExclusionError; use self::timeline::uninit::UninitializedTimeline; use self::timeline::EvictionTaskTenantState; use self::timeline::GcCutoffs; +use self::timeline::TimelineDeleteProgress; use self::timeline::TimelineResources; use self::timeline::WaitLsnError; use crate::config::PageServerConf; @@ -241,6 +246,7 @@ struct TimelinePreload { } pub(crate) struct TenantPreload { + tenant_manifest: TenantManifest, timelines: HashMap, } @@ -489,6 +495,12 @@ impl WalRedoManager { } } +/// A very lightweight memory representation of an offloaded timeline. +/// +/// We need to store the list of offloaded timelines so that we can perform operations on them, +/// like unoffloading them, or (at a later date), decide to perform flattening. +/// This type has a much smaller memory impact than [`Timeline`], and thus we can store many +/// more offloaded timelines than we can manage ones that aren't. pub struct OffloadedTimeline { pub tenant_shard_id: TenantShardId, pub timeline_id: TimelineId, @@ -496,27 +508,78 @@ pub struct OffloadedTimeline { /// Whether to retain the branch lsn at the ancestor or not pub ancestor_retain_lsn: Option, - // TODO: once we persist offloaded state, make this lazily constructed - pub remote_client: Arc, + /// When the timeline was archived. + /// + /// Present for future flattening deliberations. + pub archived_at: NaiveDateTime, + + /// Lazily constructed remote client for the timeline + /// + /// If we offload a timeline, we keep around the remote client + /// for the duration of the process. If we find it through the + /// manifest, we don't construct it up until it's needed (deletion). 
+ pub remote_client: Option>, /// Prevent two tasks from deleting the timeline at the same time. If held, the /// timeline is being deleted. If 'true', the timeline has already been deleted. - pub delete_progress: Arc>, + pub delete_progress: TimelineDeleteProgress, } impl OffloadedTimeline { - fn from_timeline(timeline: &Timeline) -> Self { + /// Obtains an offloaded timeline from a given timeline object. + /// + /// Returns `None` if the `archived_at` flag couldn't be obtained, i.e. + /// the timeline is not in a stopped state. + /// Panics if the timeline is not archived. + fn from_timeline(timeline: &Timeline) -> Result { let ancestor_retain_lsn = timeline .get_ancestor_timeline_id() .map(|_timeline_id| timeline.get_ancestor_lsn()); - Self { + let archived_at = timeline + .remote_client + .archived_at_stopped_queue()? + .expect("must be called on an archived timeline"); + Ok(Self { tenant_shard_id: timeline.tenant_shard_id, timeline_id: timeline.timeline_id, ancestor_timeline_id: timeline.get_ancestor_timeline_id(), ancestor_retain_lsn, + archived_at, - remote_client: timeline.remote_client.clone(), + remote_client: Some(timeline.remote_client.clone()), delete_progress: timeline.delete_progress.clone(), + }) + } + fn from_manifest(tenant_shard_id: TenantShardId, manifest: &OffloadedTimelineManifest) -> Self { + let OffloadedTimelineManifest { + timeline_id, + ancestor_timeline_id, + ancestor_retain_lsn, + archived_at, + } = *manifest; + Self { + tenant_shard_id, + timeline_id, + ancestor_timeline_id, + ancestor_retain_lsn, + archived_at, + remote_client: None, + delete_progress: TimelineDeleteProgress::default(), + } + } + fn manifest(&self) -> OffloadedTimelineManifest { + let Self { + timeline_id, + ancestor_timeline_id, + ancestor_retain_lsn, + archived_at, + .. 
+ } = self; + OffloadedTimelineManifest { + timeline_id: *timeline_id, + ancestor_timeline_id: *ancestor_timeline_id, + ancestor_retain_lsn: *ancestor_retain_lsn, + archived_at: *archived_at, } } } @@ -552,10 +615,19 @@ impl TimelineOrOffloaded { TimelineOrOffloaded::Offloaded(offloaded) => &offloaded.delete_progress, } } - pub fn remote_client(&self) -> &Arc { + pub fn remote_client_maybe_construct(&self, tenant: &Tenant) -> Arc { match self { - TimelineOrOffloaded::Timeline(timeline) => &timeline.remote_client, - TimelineOrOffloaded::Offloaded(offloaded) => &offloaded.remote_client, + TimelineOrOffloaded::Timeline(timeline) => timeline.remote_client.clone(), + TimelineOrOffloaded::Offloaded(offloaded) => match offloaded.remote_client.clone() { + Some(remote_client) => remote_client, + None => { + let remote_client = tenant.build_timeline_client( + offloaded.timeline_id, + tenant.remote_storage.clone(), + ); + Arc::new(remote_client) + } + }, } } } @@ -800,7 +872,6 @@ impl Tenant { index_part: Option, metadata: TimelineMetadata, ancestor: Option>, - last_aux_file_policy: Option, _ctx: &RequestContext, ) -> anyhow::Result<()> { let tenant_id = self.tenant_shard_id; @@ -811,10 +882,6 @@ impl Tenant { ancestor.clone(), resources, CreateTimelineCause::Load, - // This could be derived from ancestor branch + index part. Though the only caller of `timeline_init_and_sync` is `load_remote_timeline`, - // there will potentially be other caller of this function in the future, and we don't know whether `index_part` or `ancestor` takes precedence. - // Therefore, we pass this field explicitly for now, and remove it once we fully migrate to aux file v2. 
- last_aux_file_policy, )?; let disk_consistent_lsn = timeline.get_disk_consistent_lsn(); anyhow::ensure!( @@ -829,10 +896,6 @@ impl Tenant { if let Some(index_part) = index_part.as_ref() { timeline.remote_client.init_upload_queue(index_part)?; - - timeline - .last_aux_file_policy - .store(index_part.last_aux_file_policy()); } else { // No data on the remote storage, but we have local metadata file. We can end up // here with timeline_create being interrupted before finishing index part upload. @@ -1141,14 +1204,35 @@ impl Tenant { cancel.clone(), ) .await?; + let (offloaded_add, tenant_manifest) = + match remote_timeline_client::do_download_tenant_manifest( + remote_storage, + &self.tenant_shard_id, + &cancel, + ) + .await + { + Ok((tenant_manifest, _generation)) => ( + format!("{} offloaded", tenant_manifest.offloaded_timelines.len()), + tenant_manifest, + ), + Err(DownloadError::NotFound) => { + ("no manifest".to_string(), TenantManifest::empty()) + } + Err(e) => Err(e)?, + }; - info!("found {} timelines", remote_timeline_ids.len(),); + info!( + "found {} timelines, and {offloaded_add}", + remote_timeline_ids.len() + ); for k in other_keys { warn!("Unexpected non timeline key {k}"); } Ok(TenantPreload { + tenant_manifest, timelines: self .load_timelines_metadata(remote_timeline_ids, remote_storage, cancel) .await?, @@ -1173,12 +1257,26 @@ impl Tenant { anyhow::bail!("local-only deployment is no longer supported, https://github.com/neondatabase/neon/issues/5624"); }; + let mut offloaded_timeline_ids = HashSet::new(); + let mut offloaded_timelines_list = Vec::new(); + for timeline_manifest in preload.tenant_manifest.offloaded_timelines.iter() { + let timeline_id = timeline_manifest.timeline_id; + let offloaded_timeline = + OffloadedTimeline::from_manifest(self.tenant_shard_id, timeline_manifest); + offloaded_timelines_list.push((timeline_id, Arc::new(offloaded_timeline))); + offloaded_timeline_ids.insert(timeline_id); + } + let mut timelines_to_resume_deletions = 
vec![]; let mut remote_index_and_client = HashMap::new(); let mut timeline_ancestors = HashMap::new(); let mut existent_timelines = HashSet::new(); for (timeline_id, preload) in preload.timelines { + if offloaded_timeline_ids.remove(&timeline_id) { + // The timeline is offloaded, skip loading it. + continue; + } let index_part = match preload.index_part { Ok(i) => { debug!("remote index part exists for timeline {timeline_id}"); @@ -1282,6 +1380,43 @@ impl Tenant { .context("resume_deletion") .map_err(LoadLocalTimelineError::ResumeDeletion)?; } + // Complete deletions for offloaded timeline id's. + offloaded_timelines_list + .retain(|(offloaded_id, _offloaded)| { + // At this point, offloaded_timeline_ids has the list of all offloaded timelines + // without a prefix in S3, so they are inexistent. + // In the end, existence of a timeline is finally determined by the existence of an index-part.json in remote storage. + // If there is a dangling reference in another location, they need to be cleaned up. 
+ let delete = offloaded_timeline_ids.contains(offloaded_id); + if delete { + tracing::info!("Removing offloaded timeline {offloaded_id} from manifest as no remote prefix was found"); + } + !delete + }); + if !offloaded_timelines_list.is_empty() { + tracing::info!( + "Tenant has {} offloaded timelines", + offloaded_timelines_list.len() + ); + } + { + let mut offloaded_timelines_accessor = self.timelines_offloaded.lock().unwrap(); + offloaded_timelines_accessor.extend(offloaded_timelines_list.into_iter()); + } + if !offloaded_timeline_ids.is_empty() { + let manifest = self.tenant_manifest(); + // TODO: generation support + let generation = remote_timeline_client::TENANT_MANIFEST_GENERATION; + upload_tenant_manifest( + &self.remote_storage, + &self.tenant_shard_id, + generation, + &manifest, + &self.cancel, + ) + .await + .map_err(TimelineArchivalError::Other)?; + } // The local filesystem contents are a cache of what's in the remote IndexPart; // IndexPart is the source of truth. @@ -1403,15 +1538,12 @@ impl Tenant { None }; - let last_aux_file_policy = index_part.last_aux_file_policy(); - self.timeline_init_and_sync( timeline_id, resources, Some(index_part), remote_metadata, ancestor, - last_aux_file_policy, ctx, ) .await @@ -1456,20 +1588,28 @@ impl Tenant { Ok(timeline_preloads) } - fn load_timeline_metadata( - self: &Arc, + fn build_timeline_client( + &self, timeline_id: TimelineId, remote_storage: GenericRemoteStorage, - cancel: CancellationToken, - ) -> impl Future { - let client = RemoteTimelineClient::new( + ) -> RemoteTimelineClient { + RemoteTimelineClient::new( remote_storage.clone(), self.deletion_queue_client.clone(), self.conf, self.tenant_shard_id, timeline_id, self.generation, - ); + ) + } + + fn load_timeline_metadata( + self: &Arc, + timeline_id: TimelineId, + remote_storage: GenericRemoteStorage, + cancel: CancellationToken, + ) -> impl Future { + let client = self.build_timeline_client(timeline_id, remote_storage); async move { 
debug_assert_current_span_has_tenant_and_timeline_id(); debug!("starting index part download"); @@ -1560,7 +1700,7 @@ impl Tenant { info!("unoffloading timeline"); let cancel = self.cancel.clone(); let timeline_preload = self - .load_timeline_metadata(timeline_id, self.remote_storage.clone(), cancel) + .load_timeline_metadata(timeline_id, self.remote_storage.clone(), cancel.clone()) .await; let index_part = match timeline_preload.index_part { @@ -1605,17 +1745,37 @@ impl Tenant { ) }) .map_err(TimelineArchivalError::Other)?; - let timelines = self.timelines.lock().unwrap(); - let Some(timeline) = timelines.get(&timeline_id) else { - warn!("timeline not available directly after attach"); - return Err(TimelineArchivalError::Other(anyhow::anyhow!( - "timeline not available directly after attach" - ))); + + let timeline = { + let timelines = self.timelines.lock().unwrap(); + let Some(timeline) = timelines.get(&timeline_id) else { + warn!("timeline not available directly after attach"); + // This is not a panic because no locks are held between `load_remote_timeline` + // which puts the timeline into timelines, and our look into the timeline map. 
+ return Err(TimelineArchivalError::Other(anyhow::anyhow!( + "timeline not available directly after attach" + ))); + }; + let mut offloaded_timelines = self.timelines_offloaded.lock().unwrap(); + if offloaded_timelines.remove(&timeline_id).is_none() { + warn!("timeline already removed from offloaded timelines"); + } + Arc::clone(timeline) }; - let mut offloaded_timelines = self.timelines_offloaded.lock().unwrap(); - if offloaded_timelines.remove(&timeline_id).is_none() { - warn!("timeline already removed from offloaded timelines"); - } + + // Upload new list of offloaded timelines to S3 + let manifest = self.tenant_manifest(); + // TODO: generation support + let generation = remote_timeline_client::TENANT_MANIFEST_GENERATION; + upload_tenant_manifest( + &self.remote_storage, + &self.tenant_shard_id, + generation, + &manifest, + &cancel, + ) + .await + .map_err(TimelineArchivalError::Other)?; // Activate the timeline (if it makes sense) if !(timeline.is_broken() || timeline.is_stopping()) { @@ -1629,7 +1789,7 @@ impl Tenant { } info!("timeline unoffloading complete"); - Ok(Arc::clone(timeline)) + Ok(timeline) } pub(crate) async fn apply_timeline_archival_config( @@ -1768,7 +1928,7 @@ impl Tenant { } /// Lists timelines the tenant contains. - /// Up to tenant's implementation to omit certain timelines that ar not considered ready for use. + /// It's up to callers to omit certain timelines that are not considered ready for use. pub fn list_timelines(&self) -> Vec> { self.timelines .lock() @@ -1778,6 +1938,29 @@ impl Tenant { .collect() } + /// Lists timelines the tenant manages, including offloaded ones. + /// + /// It's up to callers to omit certain timelines that are not considered ready for use. 
+ pub fn list_timelines_and_offloaded( + &self, + ) -> (Vec>, Vec>) { + let timelines = self + .timelines + .lock() + .unwrap() + .values() + .map(Arc::clone) + .collect(); + let offloaded = self + .timelines_offloaded + .lock() + .unwrap() + .values() + .map(Arc::clone) + .collect(); + (timelines, offloaded) + } + pub fn list_timeline_ids(&self) -> Vec { self.timelines.lock().unwrap().keys().cloned().collect() } @@ -1824,7 +2007,6 @@ impl Tenant { create_guard, initdb_lsn, None, - None, ) .await } @@ -2187,7 +2369,8 @@ impl Tenant { .iter() .any(|(_id, tl)| tl.get_ancestor_timeline_id() == Some(*timeline_id)) }; - let can_offload = can_offload && has_no_unoffloaded_children; + let can_offload = + can_offload && has_no_unoffloaded_children && self.conf.timeline_offloading; if (is_active, can_offload) == (false, false) { None } else { @@ -2783,6 +2966,26 @@ impl Tenant { } } + // TODO: also copy index files of offloaded timelines + + let tenant_manifest = self.tenant_manifest(); + // TODO: generation support + let generation = remote_timeline_client::TENANT_MANIFEST_GENERATION; + for child_shard in child_shards { + tracing::info!( + "Uploading tenant manifest for child {}", + child_shard.to_index() + ); + upload_tenant_manifest( + &self.remote_storage, + child_shard, + generation, + &tenant_manifest, + &self.cancel, + ) + .await?; + } + Ok(()) } @@ -2960,6 +3163,22 @@ impl Tenant { .unwrap_or(self.conf.default_tenant_conf.lsn_lease_length) } + pub(crate) fn tenant_manifest(&self) -> TenantManifest { + let timelines_offloaded = self.timelines_offloaded.lock().unwrap(); + + let mut timeline_manifests = timelines_offloaded + .iter() + .map(|(_timeline_id, offloaded)| offloaded.manifest()) + .collect::>(); + // Sort the manifests so that our output is deterministic + timeline_manifests.sort_by_key(|timeline_manifest| timeline_manifest.timeline_id); + + TenantManifest { + version: LATEST_TENANT_MANIFEST_VERSION, + offloaded_timelines: timeline_manifests, + } + } + pub fn 
set_new_tenant_config(&self, new_tenant_conf: TenantConfOpt) { // Use read-copy-update in order to avoid overwriting the location config // state if this races with [`Tenant::set_new_location_config`]. Note that @@ -3031,7 +3250,6 @@ impl Tenant { ancestor: Option>, resources: TimelineResources, cause: CreateTimelineCause, - last_aux_file_policy: Option, ) -> anyhow::Result> { let state = match cause { CreateTimelineCause::Load => { @@ -3060,7 +3278,6 @@ impl Tenant { resources, pg_version, state, - last_aux_file_policy, self.attach_wal_lag_cooldown.clone(), self.cancel.child_token(), ); @@ -3719,7 +3936,6 @@ impl Tenant { timeline_create_guard, start_lsn + 1, Some(Arc::clone(src_timeline)), - src_timeline.last_aux_file_policy.load(), ) .await?; @@ -3913,7 +4129,6 @@ impl Tenant { timeline_create_guard, pgdata_lsn, None, - None, ) .await?; @@ -3956,18 +4171,21 @@ impl Tenant { Ok(timeline) } - /// Call this before constructing a timeline, to build its required structures - fn build_timeline_resources(&self, timeline_id: TimelineId) -> TimelineResources { - let remote_client = RemoteTimelineClient::new( + fn build_timeline_remote_client(&self, timeline_id: TimelineId) -> RemoteTimelineClient { + RemoteTimelineClient::new( self.remote_storage.clone(), self.deletion_queue_client.clone(), self.conf, self.tenant_shard_id, timeline_id, self.generation, - ); + ) + } + + /// Call this before constructing a timeline, to build its required structures + fn build_timeline_resources(&self, timeline_id: TimelineId) -> TimelineResources { TimelineResources { - remote_client, + remote_client: self.build_timeline_remote_client(timeline_id), timeline_get_throttle: self.timeline_get_throttle.clone(), l0_flush_global_state: self.l0_flush_global_state.clone(), } @@ -3985,7 +4203,6 @@ impl Tenant { create_guard: TimelineCreateGuard<'a>, start_lsn: Lsn, ancestor: Option>, - last_aux_file_policy: Option, ) -> anyhow::Result> { let tenant_shard_id = self.tenant_shard_id; @@ -4001,7 +4218,6 
@@ impl Tenant { ancestor, resources, CreateTimelineCause::Load, - last_aux_file_policy, ) .context("Failed to create timeline data structure")?; @@ -4599,7 +4815,6 @@ mod tests { use super::*; use crate::keyspace::KeySpaceAccum; - use crate::pgdatadir_mapping::AuxFilesDirectory; use crate::repository::{Key, Value}; use crate::tenant::harness::*; use crate::tenant::timeline::CompactFlags; @@ -4608,7 +4823,7 @@ mod tests { use bytes::{Bytes, BytesMut}; use hex_literal::hex; use itertools::Itertools; - use pageserver_api::key::{AUX_FILES_KEY, AUX_KEY_PREFIX, NON_INHERITED_RANGE}; + use pageserver_api::key::{AUX_KEY_PREFIX, NON_INHERITED_RANGE}; use pageserver_api::keyspace::KeySpace; use pageserver_api::models::{CompactionAlgorithm, CompactionAlgorithmSettings}; use rand::{thread_rng, Rng}; @@ -4617,7 +4832,6 @@ mod tests { use tests::timeline::{GetVectoredError, ShutdownMode}; use timeline::compaction::{KeyHistoryRetention, KeyLogAtLsn}; use timeline::{DeltaLayerTestDesc, GcInfo}; - use utils::bin_ser::BeSer; use utils::id::TenantId; static TEST_KEY: Lazy = @@ -6421,16 +6635,9 @@ mod tests { } #[tokio::test] - async fn test_branch_copies_dirty_aux_file_flag() { - let harness = TenantHarness::create("test_branch_copies_dirty_aux_file_flag") - .await - .unwrap(); + async fn test_aux_file_e2e() { + let harness = TenantHarness::create("test_aux_file_e2e").await.unwrap(); - // the default aux file policy to switch is v2 if not set by the admins - assert_eq!( - harness.tenant_conf.switch_aux_file_policy, - AuxFilePolicy::default_tenant_config() - ); let (tenant, ctx) = harness.load().await; let mut lsn = Lsn(0x08); @@ -6440,9 +6647,6 @@ mod tests { .await .unwrap(); - // no aux file is written at this point, so the persistent flag should be unset - assert_eq!(tline.last_aux_file_policy.load(), None); - { lsn += 8; let mut modification = tline.begin_modification(lsn); @@ -6453,30 +6657,6 @@ mod tests { modification.commit(&ctx).await.unwrap(); } - // there is no tenant 
manager to pass the configuration through, so lets mimic it - tenant.set_new_location_config( - AttachedTenantConf::try_from(LocationConf::attached_single( - TenantConfOpt { - switch_aux_file_policy: Some(AuxFilePolicy::V2), - ..Default::default() - }, - tenant.generation, - &pageserver_api::models::ShardParameters::default(), - )) - .unwrap(), - ); - - assert_eq!( - tline.get_switch_aux_file_policy(), - AuxFilePolicy::V2, - "wanted state has been updated" - ); - assert_eq!( - tline.last_aux_file_policy.load(), - Some(AuxFilePolicy::V2), - "aux file is written with switch_aux_file_policy unset (which is v2), so we should use v2 there" - ); - // we can read everything from the storage let files = tline.list_aux_files(lsn, &ctx).await.unwrap(); assert_eq!( @@ -6494,12 +6674,6 @@ mod tests { modification.commit(&ctx).await.unwrap(); } - assert_eq!( - tline.last_aux_file_policy.load(), - Some(AuxFilePolicy::V2), - "keep v2 storage format when new files are written" - ); - let files = tline.list_aux_files(lsn, &ctx).await.unwrap(); assert_eq!( files.get("pg_logical/mappings/test2"), @@ -6511,321 +6685,9 @@ mod tests { .await .unwrap(); - // child copies the last flag even if that is not on remote storage yet - assert_eq!(child.get_switch_aux_file_policy(), AuxFilePolicy::V2); - assert_eq!(child.last_aux_file_policy.load(), Some(AuxFilePolicy::V2)); - let files = child.list_aux_files(lsn, &ctx).await.unwrap(); assert_eq!(files.get("pg_logical/mappings/test1"), None); assert_eq!(files.get("pg_logical/mappings/test2"), None); - - // even if we crash here without flushing parent timeline with it's new - // last_aux_file_policy we are safe, because child was never meant to access ancestor's - // files. the ancestor can even switch back to V1 because of a migration safely. 
- } - - #[tokio::test] - async fn aux_file_policy_switch() { - let mut harness = TenantHarness::create("aux_file_policy_switch") - .await - .unwrap(); - harness.tenant_conf.switch_aux_file_policy = AuxFilePolicy::CrossValidation; // set to cross-validation mode - let (tenant, ctx) = harness.load().await; - - let mut lsn = Lsn(0x08); - - let tline: Arc = tenant - .create_test_timeline(TIMELINE_ID, lsn, DEFAULT_PG_VERSION, &ctx) - .await - .unwrap(); - - assert_eq!( - tline.last_aux_file_policy.load(), - None, - "no aux file is written so it should be unset" - ); - - { - lsn += 8; - let mut modification = tline.begin_modification(lsn); - modification - .put_file("pg_logical/mappings/test1", b"first", &ctx) - .await - .unwrap(); - modification.commit(&ctx).await.unwrap(); - } - - // there is no tenant manager to pass the configuration through, so lets mimic it - tenant.set_new_location_config( - AttachedTenantConf::try_from(LocationConf::attached_single( - TenantConfOpt { - switch_aux_file_policy: Some(AuxFilePolicy::V2), - ..Default::default() - }, - tenant.generation, - &pageserver_api::models::ShardParameters::default(), - )) - .unwrap(), - ); - - assert_eq!( - tline.get_switch_aux_file_policy(), - AuxFilePolicy::V2, - "wanted state has been updated" - ); - assert_eq!( - tline.last_aux_file_policy.load(), - Some(AuxFilePolicy::CrossValidation), - "dirty index_part.json reflected state is yet to be updated" - ); - - // we can still read the auxfile v1 before we ingest anything new - let files = tline.list_aux_files(lsn, &ctx).await.unwrap(); - assert_eq!( - files.get("pg_logical/mappings/test1"), - Some(&bytes::Bytes::from_static(b"first")) - ); - - { - lsn += 8; - let mut modification = tline.begin_modification(lsn); - modification - .put_file("pg_logical/mappings/test2", b"second", &ctx) - .await - .unwrap(); - modification.commit(&ctx).await.unwrap(); - } - - assert_eq!( - tline.last_aux_file_policy.load(), - Some(AuxFilePolicy::V2), - "ingesting a file should 
apply the wanted switch state when applicable" - ); - - let files = tline.list_aux_files(lsn, &ctx).await.unwrap(); - assert_eq!( - files.get("pg_logical/mappings/test1"), - Some(&bytes::Bytes::from_static(b"first")), - "cross validation writes to both v1 and v2 so this should be available in v2" - ); - assert_eq!( - files.get("pg_logical/mappings/test2"), - Some(&bytes::Bytes::from_static(b"second")) - ); - - // mimic again by trying to flip it from V2 to V1 (not switched to while ingesting a file) - tenant.set_new_location_config( - AttachedTenantConf::try_from(LocationConf::attached_single( - TenantConfOpt { - switch_aux_file_policy: Some(AuxFilePolicy::V1), - ..Default::default() - }, - tenant.generation, - &pageserver_api::models::ShardParameters::default(), - )) - .unwrap(), - ); - - { - lsn += 8; - let mut modification = tline.begin_modification(lsn); - modification - .put_file("pg_logical/mappings/test2", b"third", &ctx) - .await - .unwrap(); - modification.commit(&ctx).await.unwrap(); - } - - assert_eq!( - tline.get_switch_aux_file_policy(), - AuxFilePolicy::V1, - "wanted state has been updated again, even if invalid request" - ); - - assert_eq!( - tline.last_aux_file_policy.load(), - Some(AuxFilePolicy::V2), - "ingesting a file should apply the wanted switch state when applicable" - ); - - let files = tline.list_aux_files(lsn, &ctx).await.unwrap(); - assert_eq!( - files.get("pg_logical/mappings/test1"), - Some(&bytes::Bytes::from_static(b"first")) - ); - assert_eq!( - files.get("pg_logical/mappings/test2"), - Some(&bytes::Bytes::from_static(b"third")) - ); - - // mimic again by trying to flip it from from V1 to V2 (not switched to while ingesting a file) - tenant.set_new_location_config( - AttachedTenantConf::try_from(LocationConf::attached_single( - TenantConfOpt { - switch_aux_file_policy: Some(AuxFilePolicy::V2), - ..Default::default() - }, - tenant.generation, - &pageserver_api::models::ShardParameters::default(), - )) - .unwrap(), - ); - - { - lsn += 
8; - let mut modification = tline.begin_modification(lsn); - modification - .put_file("pg_logical/mappings/test3", b"last", &ctx) - .await - .unwrap(); - modification.commit(&ctx).await.unwrap(); - } - - assert_eq!(tline.get_switch_aux_file_policy(), AuxFilePolicy::V2); - - assert_eq!(tline.last_aux_file_policy.load(), Some(AuxFilePolicy::V2)); - - let files = tline.list_aux_files(lsn, &ctx).await.unwrap(); - assert_eq!( - files.get("pg_logical/mappings/test1"), - Some(&bytes::Bytes::from_static(b"first")) - ); - assert_eq!( - files.get("pg_logical/mappings/test2"), - Some(&bytes::Bytes::from_static(b"third")) - ); - assert_eq!( - files.get("pg_logical/mappings/test3"), - Some(&bytes::Bytes::from_static(b"last")) - ); - } - - #[tokio::test] - async fn aux_file_policy_force_switch() { - let mut harness = TenantHarness::create("aux_file_policy_force_switch") - .await - .unwrap(); - harness.tenant_conf.switch_aux_file_policy = AuxFilePolicy::V1; - let (tenant, ctx) = harness.load().await; - - let mut lsn = Lsn(0x08); - - let tline: Arc = tenant - .create_test_timeline(TIMELINE_ID, lsn, DEFAULT_PG_VERSION, &ctx) - .await - .unwrap(); - - assert_eq!( - tline.last_aux_file_policy.load(), - None, - "no aux file is written so it should be unset" - ); - - { - lsn += 8; - let mut modification = tline.begin_modification(lsn); - modification - .put_file("pg_logical/mappings/test1", b"first", &ctx) - .await - .unwrap(); - modification.commit(&ctx).await.unwrap(); - } - - tline.do_switch_aux_policy(AuxFilePolicy::V2).unwrap(); - - assert_eq!( - tline.last_aux_file_policy.load(), - Some(AuxFilePolicy::V2), - "dirty index_part.json reflected state is yet to be updated" - ); - - // lose all data from v1 - let files = tline.list_aux_files(lsn, &ctx).await.unwrap(); - assert_eq!(files.get("pg_logical/mappings/test1"), None); - - { - lsn += 8; - let mut modification = tline.begin_modification(lsn); - modification - .put_file("pg_logical/mappings/test2", b"second", &ctx) - .await - 
.unwrap(); - modification.commit(&ctx).await.unwrap(); - } - - // read data ingested in v2 - let files = tline.list_aux_files(lsn, &ctx).await.unwrap(); - assert_eq!( - files.get("pg_logical/mappings/test2"), - Some(&bytes::Bytes::from_static(b"second")) - ); - // lose all data from v1 - assert_eq!(files.get("pg_logical/mappings/test1"), None); - } - - #[tokio::test] - async fn aux_file_policy_auto_detect() { - let mut harness = TenantHarness::create("aux_file_policy_auto_detect") - .await - .unwrap(); - harness.tenant_conf.switch_aux_file_policy = AuxFilePolicy::V2; // set to cross-validation mode - let (tenant, ctx) = harness.load().await; - - let mut lsn = Lsn(0x08); - - let tline: Arc = tenant - .create_test_timeline(TIMELINE_ID, lsn, DEFAULT_PG_VERSION, &ctx) - .await - .unwrap(); - - assert_eq!( - tline.last_aux_file_policy.load(), - None, - "no aux file is written so it should be unset" - ); - - { - lsn += 8; - let mut modification = tline.begin_modification(lsn); - let buf = AuxFilesDirectory::ser(&AuxFilesDirectory { - files: vec![( - "test_file".to_string(), - Bytes::copy_from_slice(b"test_file"), - )] - .into_iter() - .collect(), - }) - .unwrap(); - modification.put_for_test(AUX_FILES_KEY, Value::Image(Bytes::from(buf))); - modification.commit(&ctx).await.unwrap(); - } - - { - lsn += 8; - let mut modification = tline.begin_modification(lsn); - modification - .put_file("pg_logical/mappings/test1", b"first", &ctx) - .await - .unwrap(); - modification.commit(&ctx).await.unwrap(); - } - - assert_eq!( - tline.last_aux_file_policy.load(), - Some(AuxFilePolicy::V1), - "keep using v1 because there are aux files writting with v1" - ); - - // we can still read the auxfile v1 - let files = tline.list_aux_files(lsn, &ctx).await.unwrap(); - assert_eq!( - files.get("pg_logical/mappings/test1"), - Some(&bytes::Bytes::from_static(b"first")) - ); - assert_eq!( - files.get("test_file"), - Some(&bytes::Bytes::from_static(b"test_file")) - ); } #[tokio::test] diff --git 
a/pageserver/src/tenant/block_io.rs b/pageserver/src/tenant/block_io.rs index 3afa3a86b9..2bd7f2d619 100644 --- a/pageserver/src/tenant/block_io.rs +++ b/pageserver/src/tenant/block_io.rs @@ -5,6 +5,8 @@ use super::storage_layer::delta_layer::{Adapter, DeltaLayerInner}; use crate::context::RequestContext; use crate::page_cache::{self, FileId, PageReadGuard, PageWriteGuard, ReadBufResult, PAGE_SZ}; +#[cfg(test)] +use crate::virtual_file::IoBufferMut; use crate::virtual_file::VirtualFile; use bytes::Bytes; use std::ops::Deref; @@ -40,7 +42,7 @@ pub enum BlockLease<'a> { #[cfg(test)] Arc(std::sync::Arc<[u8; PAGE_SZ]>), #[cfg(test)] - Vec(Vec), + IoBufferMut(IoBufferMut), } impl From> for BlockLease<'static> { @@ -50,13 +52,13 @@ impl From> for BlockLease<'static> { } #[cfg(test)] -impl<'a> From> for BlockLease<'a> { +impl From> for BlockLease<'_> { fn from(value: std::sync::Arc<[u8; PAGE_SZ]>) -> Self { BlockLease::Arc(value) } } -impl<'a> Deref for BlockLease<'a> { +impl Deref for BlockLease<'_> { type Target = [u8; PAGE_SZ]; fn deref(&self) -> &Self::Target { @@ -67,7 +69,7 @@ impl<'a> Deref for BlockLease<'a> { #[cfg(test)] BlockLease::Arc(v) => v.deref(), #[cfg(test)] - BlockLease::Vec(v) => { + BlockLease::IoBufferMut(v) => { TryFrom::try_from(&v[..]).expect("caller must ensure that v has PAGE_SZ") } } diff --git a/pageserver/src/tenant/disk_btree.rs b/pageserver/src/tenant/disk_btree.rs index 0107b0ac7e..b302cbc975 100644 --- a/pageserver/src/tenant/disk_btree.rs +++ b/pageserver/src/tenant/disk_btree.rs @@ -131,7 +131,7 @@ struct OnDiskNode<'a, const L: usize> { values: &'a [u8], } -impl<'a, const L: usize> OnDiskNode<'a, L> { +impl OnDiskNode<'_, L> { /// /// Interpret a PAGE_SZ page as a node. 
/// diff --git a/pageserver/src/tenant/ephemeral_file.rs b/pageserver/src/tenant/ephemeral_file.rs index a62a47f9a7..de0abab4c0 100644 --- a/pageserver/src/tenant/ephemeral_file.rs +++ b/pageserver/src/tenant/ephemeral_file.rs @@ -6,10 +6,11 @@ use crate::config::PageServerConf; use crate::context::RequestContext; use crate::page_cache; use crate::tenant::storage_layer::inmemory_layer::vectored_dio_read::File; +use crate::virtual_file::owned_buffers_io::io_buf_aligned::IoBufAlignedMut; use crate::virtual_file::owned_buffers_io::slice::SliceMutExt; use crate::virtual_file::owned_buffers_io::util::size_tracking_writer; use crate::virtual_file::owned_buffers_io::write::Buffer; -use crate::virtual_file::{self, owned_buffers_io, VirtualFile}; +use crate::virtual_file::{self, owned_buffers_io, IoBufferMut, VirtualFile}; use bytes::BytesMut; use camino::Utf8PathBuf; use num_traits::Num; @@ -107,15 +108,18 @@ impl EphemeralFile { self.page_cache_file_id } - pub(crate) async fn load_to_vec(&self, ctx: &RequestContext) -> Result, io::Error> { + pub(crate) async fn load_to_io_buf( + &self, + ctx: &RequestContext, + ) -> Result { let size = self.len().into_usize(); - let vec = Vec::with_capacity(size); - let (slice, nread) = self.read_exact_at_eof_ok(0, vec.slice_full(), ctx).await?; + let buf = IoBufferMut::with_capacity(size); + let (slice, nread) = self.read_exact_at_eof_ok(0, buf.slice_full(), ctx).await?; assert_eq!(nread, size); - let vec = slice.into_inner(); - assert_eq!(vec.len(), nread); - assert_eq!(vec.capacity(), size, "we shouldn't be reallocating"); - Ok(vec) + let buf = slice.into_inner(); + assert_eq!(buf.len(), nread); + assert_eq!(buf.capacity(), size, "we shouldn't be reallocating"); + Ok(buf) } /// Returns the offset at which the first byte of the input was written, for use @@ -158,7 +162,7 @@ impl EphemeralFile { } impl super::storage_layer::inmemory_layer::vectored_dio_read::File for EphemeralFile { - async fn read_exact_at_eof_ok<'a, 'b, B: 
tokio_epoll_uring::IoBufMut + Send>( + async fn read_exact_at_eof_ok<'a, 'b, B: IoBufAlignedMut + Send>( &'b self, start: u64, dst: tokio_epoll_uring::Slice, @@ -345,7 +349,7 @@ mod tests { assert!(file.len() as usize == write_nbytes); for i in 0..write_nbytes { assert_eq!(value_offsets[i], i.into_u64()); - let buf = Vec::with_capacity(1); + let buf = IoBufferMut::with_capacity(1); let (buf_slice, nread) = file .read_exact_at_eof_ok(i.into_u64(), buf.slice_full(), &ctx) .await @@ -385,7 +389,7 @@ mod tests { // assert the state is as this test expects it to be assert_eq!( - &file.load_to_vec(&ctx).await.unwrap(), + &file.load_to_io_buf(&ctx).await.unwrap(), &content[0..cap + cap / 2] ); let md = file @@ -440,7 +444,7 @@ mod tests { let (buf, nread) = file .read_exact_at_eof_ok( start.into_u64(), - Vec::with_capacity(len).slice_full(), + IoBufferMut::with_capacity(len).slice_full(), ctx, ) .await diff --git a/pageserver/src/tenant/mgr.rs b/pageserver/src/tenant/mgr.rs index 9d9852c525..0567f8f3a7 100644 --- a/pageserver/src/tenant/mgr.rs +++ b/pageserver/src/tenant/mgr.rs @@ -11,6 +11,7 @@ use pageserver_api::shard::{ }; use pageserver_api::upcall_api::ReAttachResponseTenant; use rand::{distributions::Alphanumeric, Rng}; +use remote_storage::TimeoutOrCancel; use std::borrow::Cow; use std::cmp::Ordering; use std::collections::{BTreeMap, HashMap, HashSet}; @@ -1350,47 +1351,17 @@ impl TenantManager { } } - async fn delete_tenant_remote( - &self, - tenant_shard_id: TenantShardId, - ) -> Result<(), DeleteTenantError> { - let remote_path = remote_tenant_path(&tenant_shard_id); - let mut keys_stream = self.resources.remote_storage.list_streaming( - Some(&remote_path), - remote_storage::ListingMode::NoDelimiter, - None, - &self.cancel, - ); - while let Some(chunk) = keys_stream.next().await { - let keys = match chunk { - Ok(listing) => listing.keys, - Err(remote_storage::DownloadError::Cancelled) => { - return Err(DeleteTenantError::Cancelled) - } - 
Err(remote_storage::DownloadError::NotFound) => return Ok(()), - Err(other) => return Err(DeleteTenantError::Other(anyhow::anyhow!(other))), - }; - - if keys.is_empty() { - tracing::info!("Remote storage already deleted"); - } else { - tracing::info!("Deleting {} keys from remote storage", keys.len()); - let keys = keys.into_iter().map(|o| o.key).collect::>(); - self.resources - .remote_storage - .delete_objects(&keys, &self.cancel) - .await?; - } - } - - Ok(()) - } - /// If a tenant is attached, detach it. Then remove its data from remote storage. /// /// A tenant is considered deleted once it is gone from remote storage. It is the caller's /// responsibility to avoid trying to attach the tenant again or use it any way once deletion /// has started: this operation is not atomic, and must be retried until it succeeds. + /// + /// As a special case, if an unsharded tenant ID is given for a sharded tenant, it will remove + /// all tenant shards in remote storage (removing all paths with the tenant prefix). The storage + /// controller uses this to purge all remote tenant data, including any stale parent shards that + /// may remain after splits. Ideally, this special case would be handled elsewhere. See: + /// . pub(crate) async fn delete_tenant( &self, tenant_shard_id: TenantShardId, @@ -1442,25 +1413,29 @@ impl TenantManager { // in 500 responses to delete requests. // - We keep the `SlotGuard` during this I/O, so that if a concurrent delete request comes in, it will // 503/retry, rather than kicking off a wasteful concurrent deletion. - match backoff::retry( - || async move { self.delete_tenant_remote(tenant_shard_id).await }, - |e| match e { - DeleteTenantError::Cancelled => true, - DeleteTenantError::SlotError(_) => { - unreachable!("Remote deletion doesn't touch slots") - } - _ => false, + // NB: this also deletes partial prefixes, i.e. a path will delete all + // _/* objects. See method comment for why. 
+ backoff::retry( + || async move { + self.resources + .remote_storage + .delete_prefix(&remote_tenant_path(&tenant_shard_id), &self.cancel) + .await }, + |_| false, // backoff::retry handles cancellation 1, 3, &format!("delete_tenant[tenant_shard_id={tenant_shard_id}]"), &self.cancel, ) .await - { - Some(r) => r, - None => Err(DeleteTenantError::Cancelled), - } + .unwrap_or(Err(TimeoutOrCancel::Cancel.into())) + .map_err(|err| { + if TimeoutOrCancel::caused_by_cancel(&err) { + return DeleteTenantError::Cancelled; + } + DeleteTenantError::Other(err) + }) } #[instrument(skip_all, fields(tenant_id=%tenant.get_tenant_shard_id().tenant_id, shard_id=%tenant.get_tenant_shard_id().shard_slug(), new_shard_count=%new_shard_count.literal()))] diff --git a/pageserver/src/tenant/remote_timeline_client.rs b/pageserver/src/tenant/remote_timeline_client.rs index 1f9ae40af5..066fd12a9a 100644 --- a/pageserver/src/tenant/remote_timeline_client.rs +++ b/pageserver/src/tenant/remote_timeline_client.rs @@ -180,6 +180,7 @@ pub(crate) mod download; pub mod index; +pub mod manifest; pub(crate) mod upload; use anyhow::Context; @@ -187,11 +188,10 @@ use camino::Utf8Path; use chrono::{NaiveDateTime, Utc}; pub(crate) use download::download_initdb_tar_zst; -use pageserver_api::models::{AuxFilePolicy, TimelineArchivalState}; +use pageserver_api::models::TimelineArchivalState; use pageserver_api::shard::{ShardIndex, TenantShardId}; use scopeguard::ScopeGuard; use tokio_util::sync::CancellationToken; -pub(crate) use upload::upload_initdb_dir; use utils::backoff::{ self, exponential_backoff, DEFAULT_BASE_BACKOFF_SECONDS, DEFAULT_MAX_BACKOFF_SECONDS, }; @@ -245,9 +245,11 @@ use super::upload_queue::{NotInitialized, SetDeletedFlagProgress}; use super::Generation; pub(crate) use download::{ - download_index_part, is_temp_download_file, list_remote_tenant_shards, list_remote_timelines, + do_download_tenant_manifest, download_index_part, is_temp_download_file, + list_remote_tenant_shards, 
list_remote_timelines, }; pub(crate) use index::LayerFileMetadata; +pub(crate) use upload::{upload_initdb_dir, upload_tenant_manifest}; // Occasional network issues and such can cause remote operations to fail, and // that's expected. If a download fails, we log it at info-level, and retry. @@ -272,6 +274,12 @@ pub(crate) const BUFFER_SIZE: usize = 32 * 1024; /// which we warn and skip. const DELETION_QUEUE_FLUSH_TIMEOUT: Duration = Duration::from_secs(10); +/// Hardcode a generation for the tenant manifest for now so that we don't +/// need to deal with generation-less manifests in the future. +/// +/// TODO: add proper generation support to all the places that use this. +pub(crate) const TENANT_MANIFEST_GENERATION: Generation = Generation::new(1); + pub enum MaybeDeletedIndexPart { IndexPart(IndexPart), Deleted(IndexPart), @@ -295,6 +303,10 @@ pub enum WaitCompletionError { UploadQueueShutDownOrStopped, } +#[derive(Debug, thiserror::Error)] +#[error("Upload queue either in unexpected state or hasn't downloaded manifest yet")] +pub struct UploadQueueNotReadyError; + /// A client for accessing a timeline's data in remote storage. /// /// This takes care of managing the number of connections, and balancing them @@ -468,6 +480,20 @@ impl RemoteTimelineClient { .ok() } + /// Returns `Ok(Some(timestamp))` if the timeline has been archived, `Ok(None)` if the timeline hasn't been archived. + /// + /// Return Err(_) if the remote index_part hasn't been downloaded yet, or the timeline hasn't been stopped yet. 
+ pub(crate) fn archived_at_stopped_queue( + &self, + ) -> Result, UploadQueueNotReadyError> { + self.upload_queue + .lock() + .unwrap() + .stopped_mut() + .map(|q| q.upload_queue_for_deletion.clean.0.archived_at) + .map_err(|_| UploadQueueNotReadyError) + } + fn update_remote_physical_size_gauge(&self, current_remote_index_part: Option<&IndexPart>) { let size: u64 = if let Some(current_remote_index_part) = current_remote_index_part { current_remote_index_part @@ -505,7 +531,7 @@ impl RemoteTimelineClient { }, ); - let (index_part, _index_generation) = download::download_index_part( + let (index_part, index_generation, index_last_modified) = download::download_index_part( &self.storage_impl, &self.tenant_shard_id, &self.timeline_id, @@ -519,6 +545,49 @@ impl RemoteTimelineClient { ) .await?; + // Defense in depth: monotonicity of generation numbers is an important correctness guarantee, so when we see a very + // old index, we do extra checks in case this is the result of backward time-travel of the generation number (e.g. + // in case of a bug in the service that issues generation numbers). Indices are allowed to be old, but we expect that + // when we load an old index we are loading the _latest_ index: if we are asked to load an old index and there is + // also a newer index available, that is surprising. + const INDEX_AGE_CHECKS_THRESHOLD: Duration = Duration::from_secs(14 * 24 * 3600); + let index_age = index_last_modified.elapsed().unwrap_or_else(|e| { + if e.duration() > Duration::from_secs(5) { + // We only warn if the S3 clock and our local clock are >5s out: because this is a low resolution + // timestamp, it is common to be out by at least 1 second. + tracing::warn!("Index has modification time in the future: {e}"); + } + Duration::ZERO + }); + if index_age > INDEX_AGE_CHECKS_THRESHOLD { + tracing::info!( + ?index_generation, + age = index_age.as_secs_f64(), + "Loaded an old index, checking for other indices..." 
+ ); + + // Find the highest-generation index + let (_latest_index_part, latest_index_generation, latest_index_mtime) = + download::download_index_part( + &self.storage_impl, + &self.tenant_shard_id, + &self.timeline_id, + Generation::MAX, + cancel, + ) + .await?; + + if latest_index_generation > index_generation { + // Unexpected! Why are we loading such an old index if a more recent one exists? + tracing::warn!( + ?index_generation, + ?latest_index_generation, + ?latest_index_mtime, + "Found a newer index while loading an old one" + ); + } + } + if index_part.deleted_at.is_some() { Ok(MaybeDeletedIndexPart::Deleted(index_part)) } else { @@ -628,18 +697,6 @@ impl RemoteTimelineClient { Ok(()) } - /// Launch an index-file upload operation in the background, with only the `aux_file_policy` flag updated. - pub(crate) fn schedule_index_upload_for_aux_file_policy_update( - self: &Arc, - last_aux_file_policy: Option, - ) -> anyhow::Result<()> { - let mut guard = self.upload_queue.lock().unwrap(); - let upload_queue = guard.initialized_mut()?; - upload_queue.dirty.last_aux_file_policy = last_aux_file_policy; - self.schedule_index_upload(upload_queue)?; - Ok(()) - } - /// Launch an index-file upload operation in the background, with only the `archived_at` field updated. 
/// /// Returns whether it is required to wait for the queue to be empty to ensure that the change is uploaded, @@ -2151,7 +2208,7 @@ pub(crate) struct UploadQueueAccessor<'a> { inner: std::sync::MutexGuard<'a, UploadQueue>, } -impl<'a> UploadQueueAccessor<'a> { +impl UploadQueueAccessor<'_> { pub(crate) fn latest_uploaded_index_part(&self) -> &IndexPart { match &*self.inner { UploadQueue::Initialized(x) => &x.clean.0, @@ -2167,6 +2224,17 @@ pub fn remote_tenant_path(tenant_shard_id: &TenantShardId) -> RemotePath { RemotePath::from_string(&path).expect("Failed to construct path") } +pub fn remote_tenant_manifest_path( + tenant_shard_id: &TenantShardId, + generation: Generation, +) -> RemotePath { + let path = format!( + "tenants/{tenant_shard_id}/tenant-manifest{}.json", + generation.get_suffix() + ); + RemotePath::from_string(&path).expect("Failed to construct path") +} + pub fn remote_timelines_path(tenant_shard_id: &TenantShardId) -> RemotePath { let path = format!("tenants/{tenant_shard_id}/{TIMELINES_SEGMENT_NAME}"); RemotePath::from_string(&path).expect("Failed to construct path") diff --git a/pageserver/src/tenant/remote_timeline_client/download.rs b/pageserver/src/tenant/remote_timeline_client/download.rs index 692e4d3096..95f8f026d4 100644 --- a/pageserver/src/tenant/remote_timeline_client/download.rs +++ b/pageserver/src/tenant/remote_timeline_client/download.rs @@ -6,6 +6,7 @@ use std::collections::HashSet; use std::future::Future; use std::str::FromStr; +use std::time::SystemTime; use anyhow::{anyhow, Context}; use camino::{Utf8Path, Utf8PathBuf}; @@ -33,10 +34,11 @@ use utils::id::{TenantId, TimelineId}; use utils::pausable_failpoint; use super::index::{IndexPart, LayerFileMetadata}; +use super::manifest::TenantManifest; use super::{ parse_remote_index_path, remote_index_path, remote_initdb_archive_path, - remote_initdb_preserved_archive_path, remote_tenant_path, FAILED_DOWNLOAD_WARN_THRESHOLD, - FAILED_REMOTE_OP_RETRIES, INITDB_PATH, + 
remote_initdb_preserved_archive_path, remote_tenant_manifest_path, remote_tenant_path, + FAILED_DOWNLOAD_WARN_THRESHOLD, FAILED_REMOTE_OP_RETRIES, INITDB_PATH, }; /// @@ -337,19 +339,15 @@ pub async fn list_remote_timelines( list_identifiers::(storage, remote_path, cancel).await } -async fn do_download_index_part( +async fn do_download_remote_path_retry_forever( storage: &GenericRemoteStorage, - tenant_shard_id: &TenantShardId, - timeline_id: &TimelineId, - index_generation: Generation, + remote_path: &RemotePath, cancel: &CancellationToken, -) -> Result<(IndexPart, Generation), DownloadError> { - let remote_path = remote_index_path(tenant_shard_id, timeline_id, index_generation); - - let index_part_bytes = download_retry_forever( +) -> Result<(Vec, SystemTime), DownloadError> { + download_retry_forever( || async { let download = storage - .download(&remote_path, &DownloadOpts::default(), cancel) + .download(remote_path, &DownloadOpts::default(), cancel) .await?; let mut bytes = Vec::new(); @@ -359,18 +357,50 @@ async fn do_download_index_part( tokio::io::copy_buf(&mut stream, &mut bytes).await?; - Ok(bytes) + Ok((bytes, download.last_modified)) }, &format!("download {remote_path:?}"), cancel, ) - .await?; + .await +} + +pub async fn do_download_tenant_manifest( + storage: &GenericRemoteStorage, + tenant_shard_id: &TenantShardId, + cancel: &CancellationToken, +) -> Result<(TenantManifest, Generation), DownloadError> { + // TODO: generation support + let generation = super::TENANT_MANIFEST_GENERATION; + let remote_path = remote_tenant_manifest_path(tenant_shard_id, generation); + + let (manifest_bytes, _manifest_bytes_mtime) = + do_download_remote_path_retry_forever(storage, &remote_path, cancel).await?; + + let tenant_manifest = TenantManifest::from_json_bytes(&manifest_bytes) + .with_context(|| format!("deserialize tenant manifest file at {remote_path:?}")) + .map_err(DownloadError::Other)?; + + Ok((tenant_manifest, generation)) +} + +async fn 
do_download_index_part( + storage: &GenericRemoteStorage, + tenant_shard_id: &TenantShardId, + timeline_id: &TimelineId, + index_generation: Generation, + cancel: &CancellationToken, +) -> Result<(IndexPart, Generation, SystemTime), DownloadError> { + let remote_path = remote_index_path(tenant_shard_id, timeline_id, index_generation); + + let (index_part_bytes, index_part_mtime) = + do_download_remote_path_retry_forever(storage, &remote_path, cancel).await?; let index_part: IndexPart = serde_json::from_slice(&index_part_bytes) .with_context(|| format!("deserialize index part file at {remote_path:?}")) .map_err(DownloadError::Other)?; - Ok((index_part, index_generation)) + Ok((index_part, index_generation, index_part_mtime)) } /// index_part.json objects are suffixed with a generation number, so we cannot @@ -385,7 +415,7 @@ pub(crate) async fn download_index_part( timeline_id: &TimelineId, my_generation: Generation, cancel: &CancellationToken, -) -> Result<(IndexPart, Generation), DownloadError> { +) -> Result<(IndexPart, Generation, SystemTime), DownloadError> { debug_assert_current_span_has_tenant_and_timeline_id(); if my_generation.is_none() { diff --git a/pageserver/src/tenant/remote_timeline_client/index.rs b/pageserver/src/tenant/remote_timeline_client/index.rs index c51ff54919..d8a881a2c4 100644 --- a/pageserver/src/tenant/remote_timeline_client/index.rs +++ b/pageserver/src/tenant/remote_timeline_client/index.rs @@ -121,11 +121,11 @@ impl IndexPart { self.disk_consistent_lsn } - pub fn from_s3_bytes(bytes: &[u8]) -> Result { + pub fn from_json_bytes(bytes: &[u8]) -> Result { serde_json::from_slice::(bytes) } - pub fn to_s3_bytes(&self) -> serde_json::Result> { + pub fn to_json_bytes(&self) -> serde_json::Result> { serde_json::to_vec(self) } @@ -133,10 +133,6 @@ impl IndexPart { pub(crate) fn example() -> Self { Self::empty(TimelineMetadata::example()) } - - pub(crate) fn last_aux_file_policy(&self) -> Option { - self.last_aux_file_policy - } } /// Metadata 
gathered for each of the layer files. @@ -387,7 +383,7 @@ mod tests { last_aux_file_policy: None, }; - let part = IndexPart::from_s3_bytes(example.as_bytes()).unwrap(); + let part = IndexPart::from_json_bytes(example.as_bytes()).unwrap(); assert_eq!(part, expected); } @@ -431,7 +427,7 @@ mod tests { last_aux_file_policy: None, }; - let part = IndexPart::from_s3_bytes(example.as_bytes()).unwrap(); + let part = IndexPart::from_json_bytes(example.as_bytes()).unwrap(); assert_eq!(part, expected); } @@ -476,7 +472,7 @@ mod tests { last_aux_file_policy: None, }; - let part = IndexPart::from_s3_bytes(example.as_bytes()).unwrap(); + let part = IndexPart::from_json_bytes(example.as_bytes()).unwrap(); assert_eq!(part, expected); } @@ -524,7 +520,7 @@ mod tests { last_aux_file_policy: None, }; - let empty_layers_parsed = IndexPart::from_s3_bytes(empty_layers_json.as_bytes()).unwrap(); + let empty_layers_parsed = IndexPart::from_json_bytes(empty_layers_json.as_bytes()).unwrap(); assert_eq!(empty_layers_parsed, expected); } @@ -567,7 +563,7 @@ mod tests { last_aux_file_policy: None, }; - let part = IndexPart::from_s3_bytes(example.as_bytes()).unwrap(); + let part = IndexPart::from_json_bytes(example.as_bytes()).unwrap(); assert_eq!(part, expected); } @@ -613,7 +609,7 @@ mod tests { last_aux_file_policy: None, }; - let part = IndexPart::from_s3_bytes(example.as_bytes()).unwrap(); + let part = IndexPart::from_json_bytes(example.as_bytes()).unwrap(); assert_eq!(part, expected); } @@ -664,7 +660,7 @@ mod tests { last_aux_file_policy: Some(AuxFilePolicy::V2), }; - let part = IndexPart::from_s3_bytes(example.as_bytes()).unwrap(); + let part = IndexPart::from_json_bytes(example.as_bytes()).unwrap(); assert_eq!(part, expected); } @@ -720,7 +716,7 @@ mod tests { last_aux_file_policy: Default::default(), }; - let part = IndexPart::from_s3_bytes(example.as_bytes()).unwrap(); + let part = IndexPart::from_json_bytes(example.as_bytes()).unwrap(); assert_eq!(part, expected); } @@ -777,7 
+773,7 @@ mod tests { last_aux_file_policy: Default::default(), }; - let part = IndexPart::from_s3_bytes(example.as_bytes()).unwrap(); + let part = IndexPart::from_json_bytes(example.as_bytes()).unwrap(); assert_eq!(part, expected); } @@ -839,7 +835,7 @@ mod tests { archived_at: None, }; - let part = IndexPart::from_s3_bytes(example.as_bytes()).unwrap(); + let part = IndexPart::from_json_bytes(example.as_bytes()).unwrap(); assert_eq!(part, expected); } diff --git a/pageserver/src/tenant/remote_timeline_client/manifest.rs b/pageserver/src/tenant/remote_timeline_client/manifest.rs new file mode 100644 index 0000000000..7d92d45146 --- /dev/null +++ b/pageserver/src/tenant/remote_timeline_client/manifest.rs @@ -0,0 +1,53 @@ +use chrono::NaiveDateTime; +use serde::{Deserialize, Serialize}; +use utils::{id::TimelineId, lsn::Lsn}; + +/// Tenant-shard scoped manifest +#[derive(Clone, Serialize, Deserialize)] +pub struct TenantManifest { + /// Debugging aid describing the version of this manifest. + /// Can also be used for distinguishing breaking changes later on. + pub version: usize, + + /// The list of offloaded timelines together with enough information + /// to not have to actually load them. + /// + /// Note: the timelines mentioned in this list might be deleted, i.e. + /// we don't hold an invariant that the references aren't dangling. + /// Existence of index-part.json is the actual indicator of timeline existence. + pub offloaded_timelines: Vec, +} + +/// The remote level representation of an offloaded timeline. +/// +/// Very similar to [`pageserver_api::models::OffloadedTimelineInfo`], +/// but the two datastructures serve different needs, this is for a persistent disk format +/// that must be backwards compatible, while the other is only for informative purposes. 
+#[derive(Clone, Serialize, Deserialize, Copy)] +pub struct OffloadedTimelineManifest { + pub timeline_id: TimelineId, + /// Whether the timeline has a parent it has been branched off from or not + pub ancestor_timeline_id: Option, + /// Whether to retain the branch lsn at the ancestor or not + pub ancestor_retain_lsn: Option, + /// The time point when the timeline was archived + pub archived_at: NaiveDateTime, +} + +pub const LATEST_TENANT_MANIFEST_VERSION: usize = 1; + +impl TenantManifest { + pub(crate) fn empty() -> Self { + Self { + version: LATEST_TENANT_MANIFEST_VERSION, + offloaded_timelines: vec![], + } + } + pub(crate) fn from_json_bytes(bytes: &[u8]) -> Result { + serde_json::from_slice::(bytes) + } + + pub(crate) fn to_json_bytes(&self) -> serde_json::Result> { + serde_json::to_vec(self) + } +} diff --git a/pageserver/src/tenant/remote_timeline_client/upload.rs b/pageserver/src/tenant/remote_timeline_client/upload.rs index c4dd184610..0cd5d05aa2 100644 --- a/pageserver/src/tenant/remote_timeline_client/upload.rs +++ b/pageserver/src/tenant/remote_timeline_client/upload.rs @@ -13,9 +13,11 @@ use tokio_util::sync::CancellationToken; use utils::{backoff, pausable_failpoint}; use super::index::IndexPart; +use super::manifest::TenantManifest; use super::Generation; use crate::tenant::remote_timeline_client::{ remote_index_path, remote_initdb_archive_path, remote_initdb_preserved_archive_path, + remote_tenant_manifest_path, }; use remote_storage::{GenericRemoteStorage, RemotePath, TimeTravelError}; use utils::id::{TenantId, TimelineId}; @@ -39,7 +41,7 @@ pub(crate) async fn upload_index_part<'a>( pausable_failpoint!("before-upload-index-pausable"); // FIXME: this error comes too late - let serialized = index_part.to_s3_bytes()?; + let serialized = index_part.to_json_bytes()?; let serialized = Bytes::from(serialized); let index_part_size = serialized.len(); @@ -55,6 +57,37 @@ pub(crate) async fn upload_index_part<'a>( .await .with_context(|| format!("upload 
index part for '{tenant_shard_id} / {timeline_id}'")) } +/// Serializes and uploads the given tenant manifest data to the remote storage. +pub(crate) async fn upload_tenant_manifest( + storage: &GenericRemoteStorage, + tenant_shard_id: &TenantShardId, + generation: Generation, + tenant_manifest: &TenantManifest, + cancel: &CancellationToken, +) -> anyhow::Result<()> { + tracing::trace!("uploading new tenant manifest"); + + fail_point!("before-upload-manifest", |_| { + bail!("failpoint before-upload-manifest") + }); + pausable_failpoint!("before-upload-manifest-pausable"); + + let serialized = tenant_manifest.to_json_bytes()?; + let serialized = Bytes::from(serialized); + + let tenant_manifest_site = serialized.len(); + + let remote_path = remote_tenant_manifest_path(tenant_shard_id, generation); + storage + .upload_storage_object( + futures::stream::once(futures::future::ready(Ok(serialized))), + tenant_manifest_site, + &remote_path, + cancel, + ) + .await + .with_context(|| format!("upload tenant manifest for '{tenant_shard_id}'")) +} /// Attempts to upload given layer files. /// No extra checks for overlapping files is made and any files that are already present remotely will be overwritten, if submitted during the upload. diff --git a/pageserver/src/tenant/secondary/heatmap_uploader.rs b/pageserver/src/tenant/secondary/heatmap_uploader.rs index 0aad5bf392..e680fd705b 100644 --- a/pageserver/src/tenant/secondary/heatmap_uploader.rs +++ b/pageserver/src/tenant/secondary/heatmap_uploader.rs @@ -108,7 +108,6 @@ impl scheduler::Completion for WriteComplete { /// when we last did a write. We only populate this after doing at least one /// write for a tenant -- this avoids holding state for tenants that have /// uploads disabled. - struct UploaderTenantState { // This Weak only exists to enable culling idle instances of this type // when the Tenant has been deallocated. 
diff --git a/pageserver/src/tenant/storage_layer.rs b/pageserver/src/tenant/storage_layer.rs index 99bd0ece57..a229b59560 100644 --- a/pageserver/src/tenant/storage_layer.rs +++ b/pageserver/src/tenant/storage_layer.rs @@ -705,7 +705,7 @@ pub mod tests { /// Useful with `Key`, which has too verbose `{:?}` for printing multiple layers. struct RangeDisplayDebug<'a, T: std::fmt::Display>(&'a Range); -impl<'a, T: std::fmt::Display> std::fmt::Debug for RangeDisplayDebug<'a, T> { +impl std::fmt::Debug for RangeDisplayDebug<'_, T> { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { write!(f, "{}..{}", self.0.start, self.0.end) } diff --git a/pageserver/src/tenant/storage_layer/delta_layer.rs b/pageserver/src/tenant/storage_layer/delta_layer.rs index 8be7d7876f..ceae1d4b1a 100644 --- a/pageserver/src/tenant/storage_layer/delta_layer.rs +++ b/pageserver/src/tenant/storage_layer/delta_layer.rs @@ -44,11 +44,11 @@ use crate::tenant::vectored_blob_io::{ }; use crate::tenant::PageReconstructError; use crate::virtual_file::owned_buffers_io::io_buf_ext::{FullSlice, IoBufExt}; +use crate::virtual_file::IoBufferMut; use crate::virtual_file::{self, MaybeFatalIo, VirtualFile}; use crate::{walrecord, TEMP_FILE_SUFFIX}; use crate::{DELTA_FILE_MAGIC, STORAGE_FORMAT_VERSION}; use anyhow::{anyhow, bail, ensure, Context, Result}; -use bytes::BytesMut; use camino::{Utf8Path, Utf8PathBuf}; use futures::StreamExt; use itertools::Itertools; @@ -515,8 +515,8 @@ impl DeltaLayerWriterInner { ) -> anyhow::Result<(PersistentLayerDesc, Utf8PathBuf)> { let temp_path = self.path.clone(); let result = self.finish0(key_end, ctx).await; - if result.is_err() { - tracing::info!(%temp_path, "cleaning up temporary file after error during writing"); + if let Err(ref e) = result { + tracing::info!(%temp_path, "cleaning up temporary file after error during writing: {e}"); if let Err(e) = std::fs::remove_file(&temp_path) { tracing::warn!(error=%e, %temp_path, "error cleaning up temporary layer 
file after error during writing"); } @@ -529,8 +529,7 @@ impl DeltaLayerWriterInner { key_end: Key, ctx: &RequestContext, ) -> anyhow::Result<(PersistentLayerDesc, Utf8PathBuf)> { - let index_start_blk = - ((self.blob_writer.size() + PAGE_SZ as u64 - 1) / PAGE_SZ as u64) as u32; + let index_start_blk = self.blob_writer.size().div_ceil(PAGE_SZ as u64) as u32; let mut file = self.blob_writer.into_inner(ctx).await?; @@ -1003,7 +1002,7 @@ impl DeltaLayerInner { .0 .into(); let buf_size = Self::get_min_read_buffer_size(&reads, max_vectored_read_bytes); - let mut buf = Some(BytesMut::with_capacity(buf_size)); + let mut buf = Some(IoBufferMut::with_capacity(buf_size)); // Note that reads are processed in reverse order (from highest key+lsn). // This is the order that `ReconstructState` requires such that it can @@ -1030,7 +1029,7 @@ impl DeltaLayerInner { // We have "lost" the buffer since the lower level IO api // doesn't return the buffer on error. Allocate a new one. - buf = Some(BytesMut::with_capacity(buf_size)); + buf = Some(IoBufferMut::with_capacity(buf_size)); continue; } @@ -1204,7 +1203,7 @@ impl DeltaLayerInner { .map(|x| x.0.get()) .unwrap_or(8192); - let mut buffer = Some(BytesMut::with_capacity(max_read_size)); + let mut buffer = Some(IoBufferMut::with_capacity(max_read_size)); // FIXME: buffering of DeltaLayerWriter let mut per_blob_copy = Vec::new(); @@ -1562,12 +1561,11 @@ impl<'a> DeltaLayerIterator<'a> { let vectored_blob_reader = VectoredBlobReader::new(&self.delta_layer.file); let mut next_batch = std::collections::VecDeque::new(); let buf_size = plan.size(); - let buf = BytesMut::with_capacity(buf_size); + let buf = IoBufferMut::with_capacity(buf_size); let blobs_buf = vectored_blob_reader .read_blobs(&plan, buf, self.ctx) .await?; - let frozen_buf = blobs_buf.buf.freeze(); - let view = BufView::new_bytes(frozen_buf); + let view = BufView::new_slice(&blobs_buf.buf); for meta in blobs_buf.blobs.iter() { let blob_read = meta.read(&view).await?; let 
value = Value::des(&blob_read)?; @@ -1942,7 +1940,7 @@ pub(crate) mod test { &vectored_reads, constants::MAX_VECTORED_READ_BYTES, ); - let mut buf = Some(BytesMut::with_capacity(buf_size)); + let mut buf = Some(IoBufferMut::with_capacity(buf_size)); for read in vectored_reads { let blobs_buf = vectored_blob_reader diff --git a/pageserver/src/tenant/storage_layer/image_layer.rs b/pageserver/src/tenant/storage_layer/image_layer.rs index de8155f455..fa058833d4 100644 --- a/pageserver/src/tenant/storage_layer/image_layer.rs +++ b/pageserver/src/tenant/storage_layer/image_layer.rs @@ -41,10 +41,11 @@ use crate::tenant::vectored_blob_io::{ }; use crate::tenant::PageReconstructError; use crate::virtual_file::owned_buffers_io::io_buf_ext::IoBufExt; +use crate::virtual_file::IoBufferMut; use crate::virtual_file::{self, MaybeFatalIo, VirtualFile}; use crate::{IMAGE_FILE_MAGIC, STORAGE_FORMAT_VERSION, TEMP_FILE_SUFFIX}; use anyhow::{anyhow, bail, ensure, Context, Result}; -use bytes::{Bytes, BytesMut}; +use bytes::Bytes; use camino::{Utf8Path, Utf8PathBuf}; use hex; use itertools::Itertools; @@ -547,10 +548,10 @@ impl ImageLayerInner { for read in plan.into_iter() { let buf_size = read.size(); - let buf = BytesMut::with_capacity(buf_size); + let buf = IoBufferMut::with_capacity(buf_size); let blobs_buf = vectored_blob_reader.read_blobs(&read, buf, ctx).await?; - let frozen_buf = blobs_buf.buf.freeze(); - let view = BufView::new_bytes(frozen_buf); + + let view = BufView::new_slice(&blobs_buf.buf); for meta in blobs_buf.blobs.iter() { let img_buf = meta.read(&view).await?; @@ -609,13 +610,12 @@ impl ImageLayerInner { } } - let buf = BytesMut::with_capacity(buf_size); + let buf = IoBufferMut::with_capacity(buf_size); let res = vectored_blob_reader.read_blobs(&read, buf, ctx).await; match res { Ok(blobs_buf) => { - let frozen_buf = blobs_buf.buf.freeze(); - let view = BufView::new_bytes(frozen_buf); + let view = BufView::new_slice(&blobs_buf.buf); for meta in 
blobs_buf.blobs.iter() { let img_buf = meta.read(&view).await; @@ -828,8 +828,26 @@ impl ImageLayerWriterInner { ctx: &RequestContext, end_key: Option, ) -> anyhow::Result<(PersistentLayerDesc, Utf8PathBuf)> { - let index_start_blk = - ((self.blob_writer.size() + PAGE_SZ as u64 - 1) / PAGE_SZ as u64) as u32; + let temp_path = self.path.clone(); + let result = self.finish0(ctx, end_key).await; + if let Err(ref e) = result { + tracing::info!(%temp_path, "cleaning up temporary file after error during writing: {e}"); + if let Err(e) = std::fs::remove_file(&temp_path) { + tracing::warn!(error=%e, %temp_path, "error cleaning up temporary layer file after error during writing"); + } + } + result + } + + /// + /// Finish writing the image layer. + /// + async fn finish0( + self, + ctx: &RequestContext, + end_key: Option, + ) -> anyhow::Result<(PersistentLayerDesc, Utf8PathBuf)> { + let index_start_blk = self.blob_writer.size().div_ceil(PAGE_SZ as u64) as u32; // Calculate compression ratio let compressed_size = self.blob_writer.size() - PAGE_SZ as u64; // Subtract PAGE_SZ for header @@ -1051,12 +1069,11 @@ impl<'a> ImageLayerIterator<'a> { let vectored_blob_reader = VectoredBlobReader::new(&self.image_layer.file); let mut next_batch = std::collections::VecDeque::new(); let buf_size = plan.size(); - let buf = BytesMut::with_capacity(buf_size); + let buf = IoBufferMut::with_capacity(buf_size); let blobs_buf = vectored_blob_reader .read_blobs(&plan, buf, self.ctx) .await?; - let frozen_buf = blobs_buf.buf.freeze(); - let view = BufView::new_bytes(frozen_buf); + let view = BufView::new_slice(&blobs_buf.buf); for meta in blobs_buf.blobs.iter() { let img_buf = meta.read(&view).await?; next_batch.push_back(( diff --git a/pageserver/src/tenant/storage_layer/inmemory_layer.rs b/pageserver/src/tenant/storage_layer/inmemory_layer.rs index e487bee1f2..7573ddb5cc 100644 --- a/pageserver/src/tenant/storage_layer/inmemory_layer.rs +++ 
b/pageserver/src/tenant/storage_layer/inmemory_layer.rs @@ -14,7 +14,6 @@ use crate::tenant::PageReconstructError; use crate::virtual_file::owned_buffers_io::io_buf_ext::IoBufExt; use crate::{l0_flush, page_cache}; use anyhow::{anyhow, Context, Result}; -use bytes::Bytes; use camino::Utf8PathBuf; use pageserver_api::key::CompactKey; use pageserver_api::keyspace::KeySpace; @@ -809,9 +808,8 @@ impl InMemoryLayer { match l0_flush_global_state { l0_flush::Inner::Direct { .. } => { - let file_contents: Vec = inner.file.load_to_vec(ctx).await?; - - let file_contents = Bytes::from(file_contents); + let file_contents = inner.file.load_to_io_buf(ctx).await?; + let file_contents = file_contents.freeze(); for (key, vec_map) in inner.index.iter() { // Write all page versions @@ -825,7 +823,7 @@ impl InMemoryLayer { len, will_init, } = entry; - let buf = Bytes::slice(&file_contents, pos as usize..(pos + len) as usize); + let buf = file_contents.slice(pos as usize..(pos + len) as usize); let (_buf, res) = delta_layer_writer .put_value_bytes( Key::from_compact(*key), diff --git a/pageserver/src/tenant/storage_layer/inmemory_layer/vectored_dio_read.rs b/pageserver/src/tenant/storage_layer/inmemory_layer/vectored_dio_read.rs index 0683e15659..a4bb3a6bfc 100644 --- a/pageserver/src/tenant/storage_layer/inmemory_layer/vectored_dio_read.rs +++ b/pageserver/src/tenant/storage_layer/inmemory_layer/vectored_dio_read.rs @@ -9,6 +9,7 @@ use tokio_epoll_uring::{BoundedBuf, IoBufMut, Slice}; use crate::{ assert_u64_eq_usize::{U64IsUsize, UsizeIsU64}, context::RequestContext, + virtual_file::{owned_buffers_io::io_buf_aligned::IoBufAlignedMut, IoBufferMut}, }; /// The file interface we require. At runtime, this is a [`crate::tenant::ephemeral_file::EphemeralFile`]. @@ -24,7 +25,7 @@ pub trait File: Send { /// [`std::io::ErrorKind::UnexpectedEof`] error if the file is shorter than `start+dst.len()`. /// /// No guarantees are made about the remaining bytes in `dst` in case of a short read. 
- async fn read_exact_at_eof_ok<'a, 'b, B: IoBufMut + Send>( + async fn read_exact_at_eof_ok<'a, 'b, B: IoBufAlignedMut + Send>( &'b self, start: u64, dst: Slice, @@ -227,7 +228,7 @@ where // Execute physical reads and fill the logical read buffers // TODO: pipelined reads; prefetch; - let get_io_buffer = |nchunks| Vec::with_capacity(nchunks * DIO_CHUNK_SIZE); + let get_io_buffer = |nchunks| IoBufferMut::with_capacity(nchunks * DIO_CHUNK_SIZE); for PhysicalRead { start_chunk_no, nchunks, @@ -459,7 +460,7 @@ mod tests { let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error); let file = InMemoryFile::new_random(10); let test_read = |pos, len| { - let buf = vec![0; len]; + let buf = IoBufferMut::with_capacity_zeroed(len); let fut = file.read_exact_at_eof_ok(pos, buf.slice_full(), &ctx); use futures::FutureExt; let (slice, nread) = fut @@ -470,9 +471,9 @@ mod tests { buf.truncate(nread); buf }; - assert_eq!(test_read(0, 1), &file.content[0..1]); - assert_eq!(test_read(1, 2), &file.content[1..3]); - assert_eq!(test_read(9, 2), &file.content[9..]); + assert_eq!(&test_read(0, 1), &file.content[0..1]); + assert_eq!(&test_read(1, 2), &file.content[1..3]); + assert_eq!(&test_read(9, 2), &file.content[9..]); assert!(test_read(10, 2).is_empty()); assert!(test_read(11, 2).is_empty()); } @@ -609,7 +610,7 @@ mod tests { } impl<'x> File for RecorderFile<'x> { - async fn read_exact_at_eof_ok<'a, 'b, B: IoBufMut + Send>( + async fn read_exact_at_eof_ok<'a, 'b, B: IoBufAlignedMut + Send>( &'b self, start: u64, dst: Slice, @@ -782,7 +783,7 @@ mod tests { 2048, 1024 => Err("foo".to_owned()), }; - let buf = Vec::with_capacity(512); + let buf = IoBufferMut::with_capacity(512); let (buf, nread) = mock_file .read_exact_at_eof_ok(0, buf.slice_full(), &ctx) .await @@ -790,7 +791,7 @@ mod tests { assert_eq!(nread, 512); assert_eq!(&buf.into_inner()[..nread], &[0; 512]); - let buf = Vec::with_capacity(512); + let buf = IoBufferMut::with_capacity(512); let (buf, nread) = 
mock_file .read_exact_at_eof_ok(512, buf.slice_full(), &ctx) .await @@ -798,7 +799,7 @@ mod tests { assert_eq!(nread, 512); assert_eq!(&buf.into_inner()[..nread], &[1; 512]); - let buf = Vec::with_capacity(512); + let buf = IoBufferMut::with_capacity(512); let (buf, nread) = mock_file .read_exact_at_eof_ok(1024, buf.slice_full(), &ctx) .await @@ -806,7 +807,7 @@ mod tests { assert_eq!(nread, 10); assert_eq!(&buf.into_inner()[..nread], &[2; 10]); - let buf = Vec::with_capacity(1024); + let buf = IoBufferMut::with_capacity(1024); let err = mock_file .read_exact_at_eof_ok(2048, buf.slice_full(), &ctx) .await diff --git a/pageserver/src/tenant/storage_layer/layer.rs b/pageserver/src/tenant/storage_layer/layer.rs index bbb21b180e..38a7cd09af 100644 --- a/pageserver/src/tenant/storage_layer/layer.rs +++ b/pageserver/src/tenant/storage_layer/layer.rs @@ -341,6 +341,10 @@ impl Layer { Ok(()) } + pub(crate) async fn needs_download(&self) -> Result, std::io::Error> { + self.0.needs_download().await + } + /// Assuming the layer is already downloaded, returns a guard which will prohibit eviction /// while the guard exists. 
/// @@ -974,7 +978,7 @@ impl LayerInner { let timeline = self .timeline .upgrade() - .ok_or_else(|| DownloadError::TimelineShutdown)?; + .ok_or(DownloadError::TimelineShutdown)?; // count cancellations, which currently remain largely unexpected let init_cancelled = scopeguard::guard((), |_| LAYER_IMPL_METRICS.inc_init_cancelled()); diff --git a/pageserver/src/tenant/storage_layer/layer_name.rs b/pageserver/src/tenant/storage_layer/layer_name.rs index ffe7ca5f3e..8e750e1187 100644 --- a/pageserver/src/tenant/storage_layer/layer_name.rs +++ b/pageserver/src/tenant/storage_layer/layer_name.rs @@ -339,7 +339,7 @@ impl<'de> serde::Deserialize<'de> for LayerName { struct LayerNameVisitor; -impl<'de> serde::de::Visitor<'de> for LayerNameVisitor { +impl serde::de::Visitor<'_> for LayerNameVisitor { type Value = LayerName; fn expecting(&self, formatter: &mut fmt::Formatter) -> fmt::Result { diff --git a/pageserver/src/tenant/storage_layer/merge_iterator.rs b/pageserver/src/tenant/storage_layer/merge_iterator.rs index 0831fd9530..f91e27241d 100644 --- a/pageserver/src/tenant/storage_layer/merge_iterator.rs +++ b/pageserver/src/tenant/storage_layer/merge_iterator.rs @@ -99,21 +99,21 @@ impl<'a> PeekableLayerIterRef<'a> { } } -impl<'a> std::cmp::PartialEq for IteratorWrapper<'a> { +impl std::cmp::PartialEq for IteratorWrapper<'_> { fn eq(&self, other: &Self) -> bool { self.cmp(other) == Ordering::Equal } } -impl<'a> std::cmp::Eq for IteratorWrapper<'a> {} +impl std::cmp::Eq for IteratorWrapper<'_> {} -impl<'a> std::cmp::PartialOrd for IteratorWrapper<'a> { +impl std::cmp::PartialOrd for IteratorWrapper<'_> { fn partial_cmp(&self, other: &Self) -> Option { Some(self.cmp(other)) } } -impl<'a> std::cmp::Ord for IteratorWrapper<'a> { +impl std::cmp::Ord for IteratorWrapper<'_> { fn cmp(&self, other: &Self) -> std::cmp::Ordering { use std::cmp::Ordering; let a = self.peek_next_key_lsn_value(); diff --git a/pageserver/src/tenant/storage_layer/split_writer.rs 
b/pageserver/src/tenant/storage_layer/split_writer.rs index b499a0eef4..45ac0c6668 100644 --- a/pageserver/src/tenant/storage_layer/split_writer.rs +++ b/pageserver/src/tenant/storage_layer/split_writer.rs @@ -42,7 +42,7 @@ impl SplitWriterResult { pub struct SplitImageLayerWriter { inner: ImageLayerWriter, target_layer_size: u64, - generated_layers: Vec, + generated_layer_writers: Vec<(ImageLayerWriter, PersistentLayerKey)>, conf: &'static PageServerConf, timeline_id: TimelineId, tenant_shard_id: TenantShardId, @@ -71,7 +71,7 @@ impl SplitImageLayerWriter { ctx, ) .await?, - generated_layers: Vec::new(), + generated_layer_writers: Vec::new(), conf, timeline_id, tenant_shard_id, @@ -80,18 +80,12 @@ impl SplitImageLayerWriter { }) } - pub async fn put_image_with_discard_fn( + pub async fn put_image( &mut self, key: Key, img: Bytes, - tline: &Arc, ctx: &RequestContext, - discard: D, - ) -> anyhow::Result<()> - where - D: FnOnce(&PersistentLayerKey) -> F, - F: Future, - { + ) -> anyhow::Result<()> { // The current estimation is an upper bound of the space that the key/image could take // because we did not consider compression in this estimation. The resulting image layer // could be smaller than the target size. 
@@ -108,72 +102,83 @@ impl SplitImageLayerWriter { ctx, ) .await?; - let prev_image_writer = std::mem::replace(&mut self.inner, next_image_writer); let layer_key = PersistentLayerKey { key_range: self.start_key..key, lsn_range: PersistentLayerDesc::image_layer_lsn_range(self.lsn), is_delta: false, }; + let prev_image_writer = std::mem::replace(&mut self.inner, next_image_writer); self.start_key = key; - if discard(&layer_key).await { - drop(prev_image_writer); - self.generated_layers - .push(SplitWriterResult::Discarded(layer_key)); - } else { - let (desc, path) = prev_image_writer.finish_with_end_key(key, ctx).await?; - - let layer = Layer::finish_creating(self.conf, tline, desc, &path)?; - self.generated_layers - .push(SplitWriterResult::Produced(layer)); - } + self.generated_layer_writers + .push((prev_image_writer, layer_key)); } self.inner.put_image(key, img, ctx).await } - #[cfg(test)] - pub async fn put_image( - &mut self, - key: Key, - img: Bytes, - tline: &Arc, - ctx: &RequestContext, - ) -> anyhow::Result<()> { - self.put_image_with_discard_fn(key, img, tline, ctx, |_| async { false }) - .await - } - pub(crate) async fn finish_with_discard_fn( self, tline: &Arc, ctx: &RequestContext, end_key: Key, - discard: D, + discard_fn: D, ) -> anyhow::Result> where - D: FnOnce(&PersistentLayerKey) -> F, + D: Fn(&PersistentLayerKey) -> F, F: Future, { let Self { - mut generated_layers, + mut generated_layer_writers, inner, .. 
} = self; - if inner.num_keys() == 0 { - return Ok(generated_layers); + if inner.num_keys() != 0 { + let layer_key = PersistentLayerKey { + key_range: self.start_key..end_key, + lsn_range: PersistentLayerDesc::image_layer_lsn_range(self.lsn), + is_delta: false, + }; + generated_layer_writers.push((inner, layer_key)); } - let layer_key = PersistentLayerKey { - key_range: self.start_key..end_key, - lsn_range: PersistentLayerDesc::image_layer_lsn_range(self.lsn), - is_delta: false, + let clean_up_layers = |generated_layers: Vec| { + for produced_layer in generated_layers { + if let SplitWriterResult::Produced(image_layer) = produced_layer { + let layer: Layer = image_layer.into(); + layer.delete_on_drop(); + } + } }; - if discard(&layer_key).await { - generated_layers.push(SplitWriterResult::Discarded(layer_key)); - } else { - let (desc, path) = inner.finish_with_end_key(end_key, ctx).await?; - let layer = Layer::finish_creating(self.conf, tline, desc, &path)?; - generated_layers.push(SplitWriterResult::Produced(layer)); + // BEGIN: catch every error and do the recovery in the below section + let mut generated_layers = Vec::new(); + for (inner, layer_key) in generated_layer_writers { + if discard_fn(&layer_key).await { + generated_layers.push(SplitWriterResult::Discarded(layer_key)); + } else { + let layer = match inner + .finish_with_end_key(layer_key.key_range.end, ctx) + .await + { + Ok((desc, path)) => { + match Layer::finish_creating(self.conf, tline, desc, &path) { + Ok(layer) => layer, + Err(e) => { + tokio::fs::remove_file(&path).await.ok(); + clean_up_layers(generated_layers); + return Err(e); + } + } + } + Err(e) => { + // ImageLayerWriter::finish will clean up the temporary layer if anything goes wrong, + // so we don't need to remove the layer we just failed to create by ourselves. 
+ clean_up_layers(generated_layers); + return Err(e); + } + }; + generated_layers.push(SplitWriterResult::Produced(layer)); + } } + // END: catch every error and do the recovery in the above section Ok(generated_layers) } @@ -187,11 +192,6 @@ impl SplitImageLayerWriter { self.finish_with_discard_fn(tline, ctx, end_key, |_| async { false }) .await } - - /// This function will be deprecated with #8841. - pub(crate) fn take(self) -> anyhow::Result<(Vec, ImageLayerWriter)> { - Ok((self.generated_layers, self.inner)) - } } /// A delta writer that takes key-lsn-values and produces multiple delta layers. @@ -206,7 +206,7 @@ impl SplitImageLayerWriter { pub struct SplitDeltaLayerWriter { inner: Option<(Key, DeltaLayerWriter)>, target_layer_size: u64, - generated_layers: Vec, + generated_layer_writers: Vec<(DeltaLayerWriter, PersistentLayerKey)>, conf: &'static PageServerConf, timeline_id: TimelineId, tenant_shard_id: TenantShardId, @@ -225,7 +225,7 @@ impl SplitDeltaLayerWriter { Ok(Self { target_layer_size, inner: None, - generated_layers: Vec::new(), + generated_layer_writers: Vec::new(), conf, timeline_id, tenant_shard_id, @@ -234,20 +234,13 @@ impl SplitDeltaLayerWriter { }) } - /// Put value into the layer writer. In the case the writer decides to produce a layer, and the discard fn returns true, no layer will be written in the end. - pub async fn put_value_with_discard_fn( + pub async fn put_value( &mut self, key: Key, lsn: Lsn, val: Value, - tline: &Arc, ctx: &RequestContext, - discard: D, - ) -> anyhow::Result<()> - where - D: FnOnce(&PersistentLayerKey) -> F, - F: Future, - { + ) -> anyhow::Result<()> { // The current estimation is key size plus LSN size plus value size estimation. This is not an accurate // number, and therefore the final layer size could be a little bit larger or smaller than the target. 
// @@ -291,16 +284,8 @@ impl SplitDeltaLayerWriter { lsn_range: self.lsn_range.clone(), is_delta: true, }; - if discard(&layer_key).await { - drop(prev_delta_writer); - self.generated_layers - .push(SplitWriterResult::Discarded(layer_key)); - } else { - let (desc, path) = prev_delta_writer.finish(key, ctx).await?; - let delta_layer = Layer::finish_creating(self.conf, tline, desc, &path)?; - self.generated_layers - .push(SplitWriterResult::Produced(delta_layer)); - } + self.generated_layer_writers + .push((prev_delta_writer, layer_key)); } else if inner.estimated_size() >= S3_UPLOAD_LIMIT { // We have to produce a very large file b/c a key is updated too often. anyhow::bail!( @@ -315,52 +300,68 @@ impl SplitDeltaLayerWriter { inner.put_value(key, lsn, val, ctx).await } - pub async fn put_value( - &mut self, - key: Key, - lsn: Lsn, - val: Value, - tline: &Arc, - ctx: &RequestContext, - ) -> anyhow::Result<()> { - self.put_value_with_discard_fn(key, lsn, val, tline, ctx, |_| async { false }) - .await - } - pub(crate) async fn finish_with_discard_fn( self, tline: &Arc, ctx: &RequestContext, - discard: D, + discard_fn: D, ) -> anyhow::Result> where - D: FnOnce(&PersistentLayerKey) -> F, + D: Fn(&PersistentLayerKey) -> F, F: Future, { let Self { - mut generated_layers, + mut generated_layer_writers, inner, .. 
} = self; - let Some((start_key, inner)) = inner else { - return Ok(generated_layers); - }; - if inner.num_keys() == 0 { - return Ok(generated_layers); + if let Some((start_key, writer)) = inner { + if writer.num_keys() != 0 { + let end_key = self.last_key_written.next(); + let layer_key = PersistentLayerKey { + key_range: start_key..end_key, + lsn_range: self.lsn_range.clone(), + is_delta: true, + }; + generated_layer_writers.push((writer, layer_key)); + } } - let end_key = self.last_key_written.next(); - let layer_key = PersistentLayerKey { - key_range: start_key..end_key, - lsn_range: self.lsn_range.clone(), - is_delta: true, + let clean_up_layers = |generated_layers: Vec| { + for produced_layer in generated_layers { + if let SplitWriterResult::Produced(delta_layer) = produced_layer { + let layer: Layer = delta_layer.into(); + layer.delete_on_drop(); + } + } }; - if discard(&layer_key).await { - generated_layers.push(SplitWriterResult::Discarded(layer_key)); - } else { - let (desc, path) = inner.finish(end_key, ctx).await?; - let delta_layer = Layer::finish_creating(self.conf, tline, desc, &path)?; - generated_layers.push(SplitWriterResult::Produced(delta_layer)); + // BEGIN: catch every error and do the recovery in the below section + let mut generated_layers = Vec::new(); + for (inner, layer_key) in generated_layer_writers { + if discard_fn(&layer_key).await { + generated_layers.push(SplitWriterResult::Discarded(layer_key)); + } else { + let layer = match inner.finish(layer_key.key_range.end, ctx).await { + Ok((desc, path)) => { + match Layer::finish_creating(self.conf, tline, desc, &path) { + Ok(layer) => layer, + Err(e) => { + tokio::fs::remove_file(&path).await.ok(); + clean_up_layers(generated_layers); + return Err(e); + } + } + } + Err(e) => { + // DeltaLayerWriter::finish will clean up the temporary layer if anything goes wrong, + // so we don't need to remove the layer we just failed to create by ourselves. 
+ clean_up_layers(generated_layers); + return Err(e); + } + }; + generated_layers.push(SplitWriterResult::Produced(layer)); + } } + // END: catch every error and do the recovery in the above section Ok(generated_layers) } @@ -373,11 +374,6 @@ impl SplitDeltaLayerWriter { self.finish_with_discard_fn(tline, ctx, |_| async { false }) .await } - - /// This function will be deprecated with #8841. - pub(crate) fn take(self) -> anyhow::Result<(Vec, Option)> { - Ok((self.generated_layers, self.inner.map(|x| x.1))) - } } #[cfg(test)] @@ -447,7 +443,7 @@ mod tests { .unwrap(); image_writer - .put_image(get_key(0), get_img(0), &tline, &ctx) + .put_image(get_key(0), get_img(0), &ctx) .await .unwrap(); let layers = image_writer @@ -457,13 +453,7 @@ mod tests { assert_eq!(layers.len(), 1); delta_writer - .put_value( - get_key(0), - Lsn(0x18), - Value::Image(get_img(0)), - &tline, - &ctx, - ) + .put_value(get_key(0), Lsn(0x18), Value::Image(get_img(0)), &ctx) .await .unwrap(); let layers = delta_writer.finish(&tline, &ctx).await.unwrap(); @@ -486,14 +476,18 @@ mod tests { #[tokio::test] async fn write_split() { + // Test the split writer with retaining all the layers we have produced (discard=false) write_split_helper("split_writer_write_split", false).await; } #[tokio::test] async fn write_split_discard() { - write_split_helper("split_writer_write_split_discard", false).await; + // Test the split writer with discarding all the layers we have produced (discard=true) + write_split_helper("split_writer_write_split_discard", true).await; } + /// Test the image+delta writer by writing a large number of images and deltas. If discard is + /// set to true, all layers will be discarded. 
async fn write_split_helper(harness_name: &'static str, discard: bool) { let harness = TenantHarness::create(harness_name).await.unwrap(); let (tenant, ctx) = harness.load().await; @@ -527,69 +521,63 @@ mod tests { for i in 0..N { let i = i as u32; image_writer - .put_image_with_discard_fn(get_key(i), get_large_img(), &tline, &ctx, |_| async { - discard - }) + .put_image(get_key(i), get_large_img(), &ctx) .await .unwrap(); delta_writer - .put_value_with_discard_fn( - get_key(i), - Lsn(0x20), - Value::Image(get_large_img()), - &tline, - &ctx, - |_| async { discard }, - ) + .put_value(get_key(i), Lsn(0x20), Value::Image(get_large_img()), &ctx) .await .unwrap(); } let image_layers = image_writer - .finish(&tline, &ctx, get_key(N as u32)) + .finish_with_discard_fn(&tline, &ctx, get_key(N as u32), |_| async { discard }) .await .unwrap(); - let delta_layers = delta_writer.finish(&tline, &ctx).await.unwrap(); - if discard { - for layer in image_layers { - layer.into_discarded_layer(); - } - for layer in delta_layers { - layer.into_discarded_layer(); - } - } else { - let image_layers = image_layers - .into_iter() - .map(|x| x.into_resident_layer()) - .collect_vec(); - let delta_layers = delta_layers - .into_iter() - .map(|x| x.into_resident_layer()) - .collect_vec(); - assert_eq!(image_layers.len(), N / 512 + 1); - assert_eq!(delta_layers.len(), N / 512 + 1); - assert_eq!( - delta_layers.first().unwrap().layer_desc().key_range.start, - get_key(0) - ); - assert_eq!( - delta_layers.last().unwrap().layer_desc().key_range.end, - get_key(N as u32) - ); - for idx in 0..image_layers.len() { - assert_ne!(image_layers[idx].layer_desc().key_range.start, Key::MIN); - assert_ne!(image_layers[idx].layer_desc().key_range.end, Key::MAX); - assert_ne!(delta_layers[idx].layer_desc().key_range.start, Key::MIN); - assert_ne!(delta_layers[idx].layer_desc().key_range.end, Key::MAX); - if idx > 0 { - assert_eq!( - image_layers[idx - 1].layer_desc().key_range.end, - 
image_layers[idx].layer_desc().key_range.start - ); - assert_eq!( - delta_layers[idx - 1].layer_desc().key_range.end, - delta_layers[idx].layer_desc().key_range.start - ); + let delta_layers = delta_writer + .finish_with_discard_fn(&tline, &ctx, |_| async { discard }) + .await + .unwrap(); + let image_layers = image_layers + .into_iter() + .map(|x| { + if discard { + x.into_discarded_layer() + } else { + x.into_resident_layer().layer_desc().key() } + }) + .collect_vec(); + let delta_layers = delta_layers + .into_iter() + .map(|x| { + if discard { + x.into_discarded_layer() + } else { + x.into_resident_layer().layer_desc().key() + } + }) + .collect_vec(); + assert_eq!(image_layers.len(), N / 512 + 1); + assert_eq!(delta_layers.len(), N / 512 + 1); + assert_eq!(delta_layers.first().unwrap().key_range.start, get_key(0)); + assert_eq!( + delta_layers.last().unwrap().key_range.end, + get_key(N as u32) + ); + for idx in 0..image_layers.len() { + assert_ne!(image_layers[idx].key_range.start, Key::MIN); + assert_ne!(image_layers[idx].key_range.end, Key::MAX); + assert_ne!(delta_layers[idx].key_range.start, Key::MIN); + assert_ne!(delta_layers[idx].key_range.end, Key::MAX); + if idx > 0 { + assert_eq!( + image_layers[idx - 1].key_range.end, + image_layers[idx].key_range.start + ); + assert_eq!( + delta_layers[idx - 1].key_range.end, + delta_layers[idx].key_range.start + ); } } } @@ -629,11 +617,11 @@ mod tests { .unwrap(); image_writer - .put_image(get_key(0), get_img(0), &tline, &ctx) + .put_image(get_key(0), get_img(0), &ctx) .await .unwrap(); image_writer - .put_image(get_key(1), get_large_img(), &tline, &ctx) + .put_image(get_key(1), get_large_img(), &ctx) .await .unwrap(); let layers = image_writer @@ -643,23 +631,11 @@ mod tests { assert_eq!(layers.len(), 2); delta_writer - .put_value( - get_key(0), - Lsn(0x18), - Value::Image(get_img(0)), - &tline, - &ctx, - ) + .put_value(get_key(0), Lsn(0x18), Value::Image(get_img(0)), &ctx) .await .unwrap(); delta_writer - 
.put_value( - get_key(1), - Lsn(0x1A), - Value::Image(get_large_img()), - &tline, - &ctx, - ) + .put_value(get_key(1), Lsn(0x1A), Value::Image(get_large_img()), &ctx) .await .unwrap(); let layers = delta_writer.finish(&tline, &ctx).await.unwrap(); @@ -723,7 +699,6 @@ mod tests { get_key(0), Lsn(i as u64 * 16 + 0x10), Value::Image(get_large_img()), - &tline, &ctx, ) .await diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 1992dee930..d5ceec663b 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -28,9 +28,9 @@ use pageserver_api::{ }, keyspace::{KeySpaceAccum, KeySpaceRandomAccum, SparseKeyPartitioning}, models::{ - AtomicAuxFilePolicy, AuxFilePolicy, CompactionAlgorithm, CompactionAlgorithmSettings, - DownloadRemoteLayersTaskInfo, DownloadRemoteLayersTaskSpawnRequest, EvictionPolicy, - InMemoryLayerInfo, LayerMapInfo, LsnLease, TimelineState, + CompactionAlgorithm, CompactionAlgorithmSettings, DownloadRemoteLayersTaskInfo, + DownloadRemoteLayersTaskSpawnRequest, EvictionPolicy, InMemoryLayerInfo, LayerMapInfo, + LsnLease, TimelineState, }, reltag::BlockNumber, shard::{ShardIdentity, ShardNumber, TenantShardId}, @@ -98,12 +98,12 @@ use crate::{ use crate::{ metrics::ScanLatencyOngoingRecording, tenant::timeline::logical_size::CurrentLogicalSize, }; -use crate::{pgdatadir_mapping::LsnForTimestamp, tenant::tasks::BackgroundLoopKind}; -use crate::{pgdatadir_mapping::MAX_AUX_FILE_V2_DELTAS, tenant::storage_layer::PersistentLayerKey}; use crate::{ - pgdatadir_mapping::{AuxFilesDirectory, DirectoryKind}, + pgdatadir_mapping::DirectoryKind, virtual_file::{MaybeFatalIo, VirtualFile}, }; +use crate::{pgdatadir_mapping::LsnForTimestamp, tenant::tasks::BackgroundLoopKind}; +use crate::{pgdatadir_mapping::MAX_AUX_FILE_V2_DELTAS, tenant::storage_layer::PersistentLayerKey}; use pageserver_api::config::tenant_conf_defaults::DEFAULT_PITR_INTERVAL; use crate::config::PageServerConf; @@ -206,11 +206,6 @@ pub 
struct TimelineResources { pub l0_flush_global_state: l0_flush::L0FlushGlobalState, } -pub(crate) struct AuxFilesState { - pub(crate) dir: Option, - pub(crate) n_deltas: usize, -} - /// The relation size cache caches relation sizes at the end of the timeline. It speeds up WAL /// ingestion considerably, because WAL ingestion needs to check on most records if the record /// implicitly extends the relation. At startup, `complete_as_of` is initialized to the current end @@ -376,7 +371,7 @@ pub struct Timeline { /// Prevent two tasks from deleting the timeline at the same time. If held, the /// timeline is being deleted. If 'true', the timeline has already been deleted. - pub delete_progress: Arc>, + pub delete_progress: TimelineDeleteProgress, eviction_task_timeline_state: tokio::sync::Mutex, @@ -413,15 +408,9 @@ pub struct Timeline { timeline_get_throttle: Arc>, - /// Keep aux directory cache to avoid it's reconstruction on each update - pub(crate) aux_files: tokio::sync::Mutex, - /// Size estimator for aux file v2 pub(crate) aux_file_size_estimator: AuxFileSizeEstimator, - /// Indicate whether aux file v2 storage is enabled. - pub(crate) last_aux_file_policy: AtomicAuxFilePolicy, - /// Some test cases directly place keys into the timeline without actually modifying the directory /// keys (i.e., DB_DIR). The test cases creating such keys will put the keyspaces here, so that /// these keys won't get garbage-collected during compaction/GC. This field only modifies the dense @@ -437,6 +426,8 @@ pub struct Timeline { pub(crate) attach_wal_lag_cooldown: Arc>, } +pub type TimelineDeleteProgress = Arc>; + pub struct WalReceiverInfo { pub wal_source_connconf: PgConnectionConfig, pub last_received_msg_lsn: Lsn, @@ -1565,6 +1556,7 @@ impl Timeline { } /// Checks if the internal state of the timeline is consistent with it being able to be offloaded. 
+ /// /// This is neccessary but not sufficient for offloading of the timeline as it might have /// child timelines that are not offloaded yet. pub(crate) fn can_offload(&self) -> bool { @@ -2011,14 +2003,6 @@ impl Timeline { .unwrap_or(self.conf.default_tenant_conf.lsn_lease_length_for_ts) } - pub(crate) fn get_switch_aux_file_policy(&self) -> AuxFilePolicy { - let tenant_conf = self.tenant_conf.load(); - tenant_conf - .tenant_conf - .switch_aux_file_policy - .unwrap_or(self.conf.default_tenant_conf.switch_aux_file_policy) - } - pub(crate) fn get_lazy_slru_download(&self) -> bool { let tenant_conf = self.tenant_conf.load(); tenant_conf @@ -2151,7 +2135,6 @@ impl Timeline { resources: TimelineResources, pg_version: u32, state: TimelineState, - aux_file_policy: Option, attach_wal_lag_cooldown: Arc>, cancel: CancellationToken, ) -> Arc { @@ -2269,7 +2252,7 @@ impl Timeline { eviction_task_timeline_state: tokio::sync::Mutex::new( EvictionTaskTimelineState::default(), ), - delete_progress: Arc::new(tokio::sync::Mutex::new(DeleteTimelineFlow::default())), + delete_progress: TimelineDeleteProgress::default(), cancel, gate: Gate::default(), @@ -2281,15 +2264,8 @@ impl Timeline { timeline_get_throttle: resources.timeline_get_throttle, - aux_files: tokio::sync::Mutex::new(AuxFilesState { - dir: None, - n_deltas: 0, - }), - aux_file_size_estimator: AuxFileSizeEstimator::new(aux_file_metrics), - last_aux_file_policy: AtomicAuxFilePolicy::new(aux_file_policy), - #[cfg(test)] extra_test_dense_keyspace: ArcSwap::new(Arc::new(KeySpace::default())), @@ -2300,10 +2276,6 @@ impl Timeline { attach_wal_lag_cooldown, }; - if aux_file_policy == Some(AuxFilePolicy::V1) { - warn!("this timeline is using deprecated aux file policy V1 (when loading the timeline)"); - } - result.repartition_threshold = result.get_checkpoint_distance() / REPARTITION_FREQ_IN_CHECKPOINT_DISTANCE; @@ -4478,14 +4450,6 @@ impl Timeline { ) -> Result<(), detach_ancestor::Error> { detach_ancestor::complete(self, 
tenant, attempt, ctx).await } - - /// Switch aux file policy and schedule upload to the index part. - pub(crate) fn do_switch_aux_policy(&self, policy: AuxFilePolicy) -> anyhow::Result<()> { - self.last_aux_file_policy.store(Some(policy)); - self.remote_client - .schedule_index_upload_for_aux_file_policy_update(Some(policy))?; - Ok(()) - } } impl Drop for Timeline { diff --git a/pageserver/src/tenant/timeline/compaction.rs b/pageserver/src/tenant/timeline/compaction.rs index 8b9ace1e5b..37d907ddcb 100644 --- a/pageserver/src/tenant/timeline/compaction.rs +++ b/pageserver/src/tenant/timeline/compaction.rs @@ -29,6 +29,7 @@ use utils::id::TimelineId; use crate::context::{AccessStatsBehavior, RequestContext, RequestContextBuilder}; use crate::page_cache; +use crate::statvfs::Statvfs; use crate::tenant::checks::check_valid_layermap; use crate::tenant::remote_timeline_client::WaitCompletionError; use crate::tenant::storage_layer::filter_iterator::FilterIterator; @@ -120,18 +121,12 @@ impl KeyHistoryRetention { async fn pipe_to( self, key: Key, - tline: &Arc, delta_writer: &mut SplitDeltaLayerWriter, mut image_writer: Option<&mut SplitImageLayerWriter>, stat: &mut CompactionStatistics, - dry_run: bool, ctx: &RequestContext, ) -> anyhow::Result<()> { let mut first_batch = true; - let discard = |key: &PersistentLayerKey| { - let key = key.clone(); - async move { Self::discard_key(&key, tline, dry_run).await } - }; for (cutoff_lsn, KeyLogAtLsn(logs)) in self.below_horizon { if first_batch { if logs.len() == 1 && logs[0].1.is_image() { @@ -140,45 +135,30 @@ impl KeyHistoryRetention { }; stat.produce_image_key(img); if let Some(image_writer) = image_writer.as_mut() { - image_writer - .put_image_with_discard_fn(key, img.clone(), tline, ctx, discard) - .await?; + image_writer.put_image(key, img.clone(), ctx).await?; } else { delta_writer - .put_value_with_discard_fn( - key, - cutoff_lsn, - Value::Image(img.clone()), - tline, - ctx, - discard, - ) + .put_value(key, cutoff_lsn, 
Value::Image(img.clone()), ctx) .await?; } } else { for (lsn, val) in logs { stat.produce_key(&val); - delta_writer - .put_value_with_discard_fn(key, lsn, val, tline, ctx, discard) - .await?; + delta_writer.put_value(key, lsn, val, ctx).await?; } } first_batch = false; } else { for (lsn, val) in logs { stat.produce_key(&val); - delta_writer - .put_value_with_discard_fn(key, lsn, val, tline, ctx, discard) - .await?; + delta_writer.put_value(key, lsn, val, ctx).await?; } } } let KeyLogAtLsn(above_horizon_logs) = self.above_horizon; for (lsn, val) in above_horizon_logs { stat.produce_key(&val); - delta_writer - .put_value_with_discard_fn(key, lsn, val, tline, ctx, discard) - .await?; + delta_writer.put_value(key, lsn, val, ctx).await?; } Ok(()) } @@ -1691,6 +1671,45 @@ impl Timeline { unreachable!("key retention is empty") } + /// Check how much space is left on the disk + async fn check_available_space(self: &Arc) -> anyhow::Result { + let tenants_dir = self.conf.tenants_path(); + + let stat = Statvfs::get(&tenants_dir, None) + .context("statvfs failed, presumably directory got unlinked")?; + + let (avail_bytes, _) = stat.get_avail_total_bytes(); + + Ok(avail_bytes) + } + + /// Check if the compaction can proceed safely without running out of space. We assume the size + /// upper bound of the produced files of a compaction job is the same as all layers involved in + /// the compaction. Therefore, we need `2 * layers_to_be_compacted_size` at least to do a + /// compaction. 
+ async fn check_compaction_space( + self: &Arc, + layer_selection: &[Layer], + ) -> anyhow::Result<()> { + let available_space = self.check_available_space().await?; + let mut remote_layer_size = 0; + let mut all_layer_size = 0; + for layer in layer_selection { + let needs_download = layer.needs_download().await?; + if needs_download.is_some() { + remote_layer_size += layer.layer_desc().file_size; + } + all_layer_size += layer.layer_desc().file_size; + } + let allocated_space = (available_space as f64 * 0.8) as u64; /* reserve 20% space for other tasks */ + if all_layer_size /* space needed for newly-generated file */ + remote_layer_size /* space for downloading layers */ > allocated_space + { + return Err(anyhow!("not enough space for compaction: available_space={}, allocated_space={}, all_layer_size={}, remote_layer_size={}, required_space={}", + available_space, allocated_space, all_layer_size, remote_layer_size, all_layer_size + remote_layer_size)); + } + Ok(()) + } + /// An experimental compaction building block that combines compaction with garbage collection. /// /// The current implementation picks all delta + image layers that are below or intersecting with @@ -1806,6 +1825,8 @@ impl Timeline { lowest_retain_lsn ); + self.check_compaction_space(&layer_selection).await?; + // Step 1: (In the future) construct a k-merge iterator over all layers. For now, simply collect all keys + LSNs. // Also, verify if the layer map can be split by drawing a horizontal line at every LSN start/end split point. let mut lsn_split_point = BTreeSet::new(); // TODO: use a better data structure (range tree / range set?) 
@@ -1948,11 +1969,9 @@ impl Timeline { retention .pipe_to( *last_key, - self, &mut delta_layer_writer, image_layer_writer.as_mut(), &mut stat, - dry_run, ctx, ) .await?; @@ -1979,11 +1998,9 @@ impl Timeline { retention .pipe_to( last_key, - self, &mut delta_layer_writer, image_layer_writer.as_mut(), &mut stat, - dry_run, ctx, ) .await?; @@ -1999,8 +2016,7 @@ impl Timeline { .finish_with_discard_fn(self, ctx, Key::MAX, discard) .await? } else { - let (layers, _) = writer.take()?; - assert!(layers.is_empty(), "image layers produced in dry run mode?"); + drop(writer); Vec::new() } } else { @@ -2012,8 +2028,7 @@ impl Timeline { .finish_with_discard_fn(self, ctx, discard) .await? } else { - let (layers, _) = delta_layer_writer.take()?; - assert!(layers.is_empty(), "delta layers produced in dry run mode?"); + drop(delta_layer_writer); Vec::new() }; diff --git a/pageserver/src/tenant/timeline/delete.rs b/pageserver/src/tenant/timeline/delete.rs index 305c5758cc..4799aab436 100644 --- a/pageserver/src/tenant/timeline/delete.rs +++ b/pageserver/src/tenant/timeline/delete.rs @@ -14,7 +14,9 @@ use crate::{ task_mgr::{self, TaskKind}, tenant::{ metadata::TimelineMetadata, - remote_timeline_client::{PersistIndexPartWithDeletedFlagError, RemoteTimelineClient}, + remote_timeline_client::{ + self, PersistIndexPartWithDeletedFlagError, RemoteTimelineClient, + }, CreateTimelineCause, DeleteTimelineError, Tenant, TimelineOrOffloaded, }, }; @@ -25,12 +27,9 @@ use super::{Timeline, TimelineResources}; /// during attach or pageserver restart. /// See comment in persist_index_part_with_deleted_flag. 
async fn set_deleted_in_remote_index( - timeline: &TimelineOrOffloaded, + remote_client: &Arc, ) -> Result<(), DeleteTimelineError> { - let res = timeline - .remote_client() - .persist_index_part_with_deleted_flag() - .await; + let res = remote_client.persist_index_part_with_deleted_flag().await; match res { // If we (now, or already) marked it successfully as deleted, we can proceed Ok(()) | Err(PersistIndexPartWithDeletedFlagError::AlreadyDeleted(_)) => (), @@ -129,12 +128,10 @@ pub(super) async fn delete_local_timeline_directory( } /// Removes remote layers and an index file after them. -async fn delete_remote_layers_and_index(timeline: &TimelineOrOffloaded) -> anyhow::Result<()> { - timeline - .remote_client() - .delete_all() - .await - .context("delete_all") +async fn delete_remote_layers_and_index( + remote_client: &Arc, +) -> anyhow::Result<()> { + remote_client.delete_all().await.context("delete_all") } /// It is important that this gets called when DeletionGuard is being held. @@ -179,6 +176,32 @@ async fn remove_maybe_offloaded_timeline_from_tenant( Ok(()) } +/// It is important that this gets called when DeletionGuard is being held. +/// For more context see comments in [`DeleteTimelineFlow::prepare`] +async fn upload_new_tenant_manifest( + tenant: &Tenant, + _: &DeletionGuard, // using it as a witness +) -> anyhow::Result<()> { + // This is susceptible to race conditions, i.e. we won't continue deletions if there is a crash + // between the deletion of the index-part.json and reaching of this code. + // So indeed, the tenant manifest might refer to an offloaded timeline which has already been deleted. + // However, we handle this case in tenant loading code so the next time we attach, the issue is + // resolved. 
+ let manifest = tenant.tenant_manifest(); + // TODO: generation support + let generation = remote_timeline_client::TENANT_MANIFEST_GENERATION; + remote_timeline_client::upload_tenant_manifest( + &tenant.remote_storage, + &tenant.tenant_shard_id, + generation, + &manifest, + &tenant.cancel, + ) + .await?; + + Ok(()) +} + /// Orchestrates timeline shut down of all timeline tasks, removes its in-memory structures, /// and deletes its data from both disk and s3. /// The sequence of steps: @@ -235,7 +258,8 @@ impl DeleteTimelineFlow { ))? }); - set_deleted_in_remote_index(&timeline).await?; + let remote_client = timeline.remote_client_maybe_construct(tenant); + set_deleted_in_remote_index(&remote_client).await?; fail::fail_point!("timeline-delete-before-schedule", |_| { Err(anyhow::anyhow!( @@ -243,7 +267,13 @@ impl DeleteTimelineFlow { ))? }); - Self::schedule_background(guard, tenant.conf, Arc::clone(tenant), timeline); + Self::schedule_background( + guard, + tenant.conf, + Arc::clone(tenant), + timeline, + remote_client, + ); Ok(()) } @@ -283,8 +313,6 @@ impl DeleteTimelineFlow { // Important. We dont pass ancestor above because it can be missing. // Thus we need to skip the validation here. 
CreateTimelineCause::Delete, - // Aux file policy is not needed for deletion, assuming deletion does not read aux keyspace - None, ) .context("create_timeline_struct")?; @@ -303,8 +331,9 @@ impl DeleteTimelineFlow { guard.mark_in_progress()?; + let remote_client = timeline.remote_client.clone(); let timeline = TimelineOrOffloaded::Timeline(timeline); - Self::schedule_background(guard, tenant.conf, tenant, timeline); + Self::schedule_background(guard, tenant.conf, tenant, timeline, remote_client); Ok(()) } @@ -382,6 +411,7 @@ impl DeleteTimelineFlow { conf: &'static PageServerConf, tenant: Arc, timeline: TimelineOrOffloaded, + remote_client: Arc, ) { let tenant_shard_id = timeline.tenant_shard_id(); let timeline_id = timeline.timeline_id(); @@ -393,7 +423,7 @@ impl DeleteTimelineFlow { Some(timeline_id), "timeline_delete", async move { - if let Err(err) = Self::background(guard, conf, &tenant, &timeline).await { + if let Err(err) = Self::background(guard, conf, &tenant, &timeline, remote_client).await { error!("Error: {err:#}"); if let TimelineOrOffloaded::Timeline(timeline) = timeline { timeline.set_broken(format!("{err:#}")) @@ -410,6 +440,7 @@ impl DeleteTimelineFlow { conf: &PageServerConf, tenant: &Tenant, timeline: &TimelineOrOffloaded, + remote_client: Arc, ) -> Result<(), DeleteTimelineError> { // Offloaded timelines have no local state // TODO: once we persist offloaded information, delete the timeline from there, too @@ -417,12 +448,14 @@ impl DeleteTimelineFlow { delete_local_timeline_directory(conf, tenant.tenant_shard_id, timeline).await?; } - delete_remote_layers_and_index(timeline).await?; + delete_remote_layers_and_index(&remote_client).await?; pausable_failpoint!("in_progress_delete"); remove_maybe_offloaded_timeline_from_tenant(tenant, timeline, &guard).await?; + upload_new_tenant_manifest(tenant, &guard).await?; + *guard = Self::Finished; Ok(()) diff --git a/pageserver/src/tenant/timeline/offload.rs b/pageserver/src/tenant/timeline/offload.rs 
index 7e6084baaf..8e6eceb084 100644 --- a/pageserver/src/tenant/timeline/offload.rs +++ b/pageserver/src/tenant/timeline/offload.rs @@ -1,17 +1,17 @@ use std::sync::Arc; -use crate::tenant::{OffloadedTimeline, Tenant, TimelineOrOffloaded}; - -use super::{ - delete::{delete_local_timeline_directory, DeleteTimelineFlow, DeletionGuard}, - Timeline, -}; +use super::delete::{delete_local_timeline_directory, DeleteTimelineFlow, DeletionGuard}; +use super::Timeline; +use crate::span::debug_assert_current_span_has_tenant_and_timeline_id; +use crate::tenant::{remote_timeline_client, OffloadedTimeline, Tenant, TimelineOrOffloaded}; pub(crate) async fn offload_timeline( tenant: &Tenant, timeline: &Arc, ) -> anyhow::Result<()> { + debug_assert_current_span_has_tenant_and_timeline_id(); tracing::info!("offloading archived timeline"); + let (timeline, guard) = DeleteTimelineFlow::prepare(tenant, timeline.timeline_id)?; let TimelineOrOffloaded::Timeline(timeline) = timeline else { @@ -19,14 +19,28 @@ pub(crate) async fn offload_timeline( return Ok(()); }; + let is_archived = timeline.is_archived(); + match is_archived { + Some(true) => (), + Some(false) => { + tracing::warn!(?is_archived, "tried offloading a non-archived timeline"); + anyhow::bail!("timeline isn't archived"); + } + None => { + tracing::warn!( + ?is_archived, + "tried offloading a timeline where manifest is not yet available" + ); + anyhow::bail!("timeline manifest hasn't been loaded yet"); + } + } + // Now that the Timeline is in Stopping state, request all the related tasks to shut down. 
timeline.shutdown(super::ShutdownMode::Hard).await; // TODO extend guard mechanism above with method // to make deletions possible while offloading is in progress - // TODO mark timeline as offloaded in S3 - let conf = &tenant.conf; delete_local_timeline_directory(conf, tenant.tenant_shard_id, &timeline).await?; @@ -36,10 +50,31 @@ pub(crate) async fn offload_timeline( let mut offloaded_timelines = tenant.timelines_offloaded.lock().unwrap(); offloaded_timelines.insert( timeline.timeline_id, - Arc::new(OffloadedTimeline::from_timeline(&timeline)), + Arc::new( + OffloadedTimeline::from_timeline(&timeline) + .expect("we checked above that timeline was ready"), + ), ); } + // Last step: mark timeline as offloaded in S3 + // TODO: maybe move this step above, right above deletion of the local timeline directory, + // then there is no potential race condition where we partially offload a timeline, and + // at the next restart attach it again. + // For that to happen, we'd need to make the manifest reflect our *intended* state, + // not our actual state of offloaded timelines. 
+ let manifest = tenant.tenant_manifest(); + // TODO: generation support + let generation = remote_timeline_client::TENANT_MANIFEST_GENERATION; + remote_timeline_client::upload_tenant_manifest( + &tenant.remote_storage, + &tenant.tenant_shard_id, + generation, + &manifest, + &tenant.cancel, + ) + .await?; + Ok(()) } diff --git a/pageserver/src/tenant/vectored_blob_io.rs b/pageserver/src/tenant/vectored_blob_io.rs index 792c769b4f..dfe2352310 100644 --- a/pageserver/src/tenant/vectored_blob_io.rs +++ b/pageserver/src/tenant/vectored_blob_io.rs @@ -18,7 +18,7 @@ use std::collections::BTreeMap; use std::ops::Deref; -use bytes::{Bytes, BytesMut}; +use bytes::Bytes; use pageserver_api::key::Key; use tokio::io::AsyncWriteExt; use tokio_epoll_uring::BoundedBuf; @@ -27,6 +27,7 @@ use utils::vec_map::VecMap; use crate::context::RequestContext; use crate::tenant::blob_io::{BYTE_UNCOMPRESSED, BYTE_ZSTD, LEN_COMPRESSION_BIT_MASK}; +use crate::virtual_file::IoBufferMut; use crate::virtual_file::{self, VirtualFile}; /// Metadata bundled with the start and end offset of a blob. @@ -73,7 +74,7 @@ impl<'a> BufView<'a> { } } -impl<'a> Deref for BufView<'a> { +impl Deref for BufView<'_> { type Target = [u8]; fn deref(&self) -> &Self::Target { @@ -84,7 +85,7 @@ impl<'a> Deref for BufView<'a> { } } -impl<'a> AsRef<[u8]> for BufView<'a> { +impl AsRef<[u8]> for BufView<'_> { fn as_ref(&self) -> &[u8] { match self { BufView::Slice(slice) => slice, @@ -158,7 +159,7 @@ impl std::fmt::Display for VectoredBlob { /// Return type of [`VectoredBlobReader::read_blobs`] pub struct VectoredBlobsBuf { /// Buffer for all blobs in this read - pub buf: BytesMut, + pub buf: IoBufferMut, /// Offsets into the buffer and metadata for all blobs in this read pub blobs: Vec, } @@ -196,11 +197,6 @@ pub(crate) struct ChunkedVectoredReadBuilder { max_read_size: Option, } -/// Computes x / d rounded up. 
-fn div_round_up(x: usize, d: usize) -> usize { - (x + (d - 1)) / d -} - impl ChunkedVectoredReadBuilder { const CHUNK_SIZE: usize = virtual_file::get_io_buffer_alignment(); /// Start building a new vectored read. @@ -220,7 +216,7 @@ impl ChunkedVectoredReadBuilder { .expect("First insertion always succeeds"); let start_blk_no = start_offset as usize / Self::CHUNK_SIZE; - let end_blk_no = div_round_up(end_offset as usize, Self::CHUNK_SIZE); + let end_blk_no = (end_offset as usize).div_ceil(Self::CHUNK_SIZE); Self { start_blk_no, end_blk_no, @@ -248,7 +244,7 @@ impl ChunkedVectoredReadBuilder { pub(crate) fn extend(&mut self, start: u64, end: u64, meta: BlobMeta) -> VectoredReadExtended { tracing::trace!(start, end, "trying to extend"); let start_blk_no = start as usize / Self::CHUNK_SIZE; - let end_blk_no = div_round_up(end as usize, Self::CHUNK_SIZE); + let end_blk_no = (end as usize).div_ceil(Self::CHUNK_SIZE); let not_limited_by_max_read_size = { if let Some(max_read_size) = self.max_read_size { @@ -446,7 +442,7 @@ impl<'a> VectoredBlobReader<'a> { pub async fn read_blobs( &self, read: &VectoredRead, - buf: BytesMut, + buf: IoBufferMut, ctx: &RequestContext, ) -> Result { assert!(read.size() > 0); @@ -921,7 +917,7 @@ mod tests { // Multiply by two (compressed data might need more space), and add a few bytes for the header let reserved_bytes = blobs.iter().map(|bl| bl.len()).max().unwrap() * 2 + 16; - let mut buf = BytesMut::with_capacity(reserved_bytes); + let mut buf = IoBufferMut::with_capacity(reserved_bytes); let vectored_blob_reader = VectoredBlobReader::new(&file); let meta = BlobMeta { @@ -975,12 +971,4 @@ mod tests { round_trip_test_compressed(&blobs, true).await?; Ok(()) } - - #[test] - fn test_div_round_up() { - const CHUNK_SIZE: usize = 512; - assert_eq!(1, div_round_up(200, CHUNK_SIZE)); - assert_eq!(1, div_round_up(CHUNK_SIZE, CHUNK_SIZE)); - assert_eq!(2, div_round_up(CHUNK_SIZE + 1, CHUNK_SIZE)); - } } diff --git a/pageserver/src/virtual_file.rs 
b/pageserver/src/virtual_file.rs index d260116b38..daa8b99ab0 100644 --- a/pageserver/src/virtual_file.rs +++ b/pageserver/src/virtual_file.rs @@ -18,6 +18,9 @@ use crate::page_cache::{PageWriteGuard, PAGE_SZ}; use crate::tenant::TENANTS_SEGMENT_NAME; use camino::{Utf8Path, Utf8PathBuf}; use once_cell::sync::OnceCell; +use owned_buffers_io::aligned_buffer::buffer::AlignedBuffer; +use owned_buffers_io::aligned_buffer::{AlignedBufferMut, AlignedSlice, ConstAlign}; +use owned_buffers_io::io_buf_aligned::IoBufAlignedMut; use owned_buffers_io::io_buf_ext::FullSlice; use pageserver_api::config::defaults::DEFAULT_IO_BUFFER_ALIGNMENT; use pageserver_api::shard::TenantShardId; @@ -55,6 +58,8 @@ pub(crate) mod owned_buffers_io { //! but for the time being we're proving out the primitives in the neon.git repo //! for faster iteration. + pub(crate) mod aligned_buffer; + pub(crate) mod io_buf_aligned; pub(crate) mod io_buf_ext; pub(crate) mod slice; pub(crate) mod write; @@ -196,7 +201,7 @@ impl VirtualFile { ctx: &RequestContext, ) -> Result, Error> where - Buf: IoBufMut + Send, + Buf: IoBufAlignedMut + Send, { self.inner.read_exact_at(slice, offset, ctx).await } @@ -724,9 +729,9 @@ impl VirtualFileInner { *handle_guard = handle; - return Ok(FileGuard { + Ok(FileGuard { slot_guard: slot_guard.downgrade(), - }); + }) } pub fn remove(self) { @@ -771,7 +776,7 @@ impl VirtualFileInner { ctx: &RequestContext, ) -> Result, Error> where - Buf: IoBufMut + Send, + Buf: IoBufAlignedMut + Send, { let assert_we_return_original_bounds = if cfg!(debug_assertions) { Some((slice.stable_ptr() as usize, slice.bytes_total())) @@ -1222,12 +1227,14 @@ impl VirtualFileInner { ctx: &RequestContext, ) -> Result, std::io::Error> { use crate::page_cache::PAGE_SZ; - let slice = Vec::with_capacity(PAGE_SZ).slice_full(); + let slice = IoBufferMut::with_capacity(PAGE_SZ).slice_full(); assert_eq!(slice.bytes_total(), PAGE_SZ); let slice = self .read_exact_at(slice, blknum as u64 * (PAGE_SZ as u64), ctx) 
.await?; - Ok(crate::tenant::block_io::BlockLease::Vec(slice.into_inner())) + Ok(crate::tenant::block_io::BlockLease::IoBufferMut( + slice.into_inner(), + )) } async fn read_to_end(&mut self, buf: &mut Vec, ctx: &RequestContext) -> Result<(), Error> { @@ -1325,10 +1332,11 @@ impl OpenFiles { /// server startup. /// #[cfg(not(test))] -pub fn init(num_slots: usize, engine: IoEngineKind) { +pub fn init(num_slots: usize, engine: IoEngineKind, mode: IoMode) { if OPEN_FILES.set(OpenFiles::new(num_slots)).is_err() { panic!("virtual_file::init called twice"); } + set_io_mode(mode); io_engine::init(engine); crate::metrics::virtual_file_descriptor_cache::SIZE_MAX.set(num_slots as u64); } @@ -1357,6 +1365,11 @@ pub(crate) const fn get_io_buffer_alignment() -> usize { DEFAULT_IO_BUFFER_ALIGNMENT } +pub(crate) type IoBufferMut = AlignedBufferMut>; +pub(crate) type IoBuffer = AlignedBuffer>; +pub(crate) type IoPageSlice<'a> = + AlignedSlice<'a, PAGE_SZ, ConstAlign<{ get_io_buffer_alignment() }>>; + static IO_MODE: AtomicU8 = AtomicU8::new(IoMode::preferred() as u8); pub(crate) fn set_io_mode(mode: IoMode) { @@ -1395,10 +1408,10 @@ mod tests { impl MaybeVirtualFile { async fn read_exact_at( &self, - mut slice: tokio_epoll_uring::Slice>, + mut slice: tokio_epoll_uring::Slice, offset: u64, ctx: &RequestContext, - ) -> Result>, Error> { + ) -> Result, Error> { match self { MaybeVirtualFile::VirtualFile(file) => file.read_exact_at(slice, offset, ctx).await, MaybeVirtualFile::File(file) => { @@ -1466,12 +1479,13 @@ mod tests { len: usize, ctx: &RequestContext, ) -> Result { - let slice = Vec::with_capacity(len).slice_full(); + let slice = IoBufferMut::with_capacity(len).slice_full(); assert_eq!(slice.bytes_total(), len); let slice = self.read_exact_at(slice, pos, ctx).await?; - let vec = slice.into_inner(); - assert_eq!(vec.len(), len); - Ok(String::from_utf8(vec).unwrap()) + let buf = slice.into_inner(); + assert_eq!(buf.len(), len); + + Ok(String::from_utf8(buf.to_vec()).unwrap()) } 
} @@ -1695,7 +1709,7 @@ mod tests { let files = files.clone(); let ctx = ctx.detached_child(TaskKind::UnitTest, DownloadBehavior::Error); let hdl = rt.spawn(async move { - let mut buf = vec![0u8; SIZE]; + let mut buf = IoBufferMut::with_capacity_zeroed(SIZE); let mut rng = rand::rngs::OsRng; for _ in 1..1000 { let f = &files[rng.gen_range(0..files.len())]; @@ -1704,7 +1718,7 @@ mod tests { .await .unwrap() .into_inner(); - assert!(buf == SAMPLE); + assert!(buf[..] == SAMPLE); } }); hdls.push(hdl); diff --git a/pageserver/src/virtual_file/owned_buffers_io/aligned_buffer.rs b/pageserver/src/virtual_file/owned_buffers_io/aligned_buffer.rs new file mode 100644 index 0000000000..8ffc29b93d --- /dev/null +++ b/pageserver/src/virtual_file/owned_buffers_io/aligned_buffer.rs @@ -0,0 +1,9 @@ +pub mod alignment; +pub mod buffer; +pub mod buffer_mut; +pub mod raw; +pub mod slice; + +pub use alignment::*; +pub use buffer_mut::AlignedBufferMut; +pub use slice::AlignedSlice; diff --git a/pageserver/src/virtual_file/owned_buffers_io/aligned_buffer/alignment.rs b/pageserver/src/virtual_file/owned_buffers_io/aligned_buffer/alignment.rs new file mode 100644 index 0000000000..933b78a13b --- /dev/null +++ b/pageserver/src/virtual_file/owned_buffers_io/aligned_buffer/alignment.rs @@ -0,0 +1,26 @@ +pub trait Alignment: std::marker::Unpin + 'static { + /// Returns the required alignments. + fn align(&self) -> usize; +} + +/// Alignment at compile time. +#[derive(Debug)] +pub struct ConstAlign; + +impl Alignment for ConstAlign { + fn align(&self) -> usize { + A + } +} + +/// Alignment at run time. 
+#[derive(Debug)] +pub struct RuntimeAlign { + align: usize, +} + +impl Alignment for RuntimeAlign { + fn align(&self) -> usize { + self.align + } +} diff --git a/pageserver/src/virtual_file/owned_buffers_io/aligned_buffer/buffer.rs b/pageserver/src/virtual_file/owned_buffers_io/aligned_buffer/buffer.rs new file mode 100644 index 0000000000..2fba6d699b --- /dev/null +++ b/pageserver/src/virtual_file/owned_buffers_io/aligned_buffer/buffer.rs @@ -0,0 +1,124 @@ +use std::{ + ops::{Deref, Range, RangeBounds}, + sync::Arc, +}; + +use super::{alignment::Alignment, raw::RawAlignedBuffer}; + +/// An shared, immutable aligned buffer type. +pub struct AlignedBuffer { + /// Shared raw buffer. + raw: Arc>, + /// Range that specifies the current slice. + range: Range, +} + +impl AlignedBuffer { + /// Creates an immutable `IoBuffer` from the raw buffer + pub(super) fn from_raw(raw: RawAlignedBuffer, range: Range) -> Self { + AlignedBuffer { + raw: Arc::new(raw), + range, + } + } + + /// Returns the number of bytes in the buffer, also referred to as its 'length'. + #[inline] + pub fn len(&self) -> usize { + self.range.len() + } + + /// Returns the alignment of the buffer. + #[inline] + pub fn align(&self) -> usize { + self.raw.align() + } + + #[inline] + fn as_ptr(&self) -> *const u8 { + // SAFETY: `self.range.start` is guaranteed to be within [0, self.len()). + unsafe { self.raw.as_ptr().add(self.range.start) } + } + + /// Extracts a slice containing the entire buffer. + /// + /// Equivalent to `&s[..]`. + #[inline] + fn as_slice(&self) -> &[u8] { + &self.raw.as_slice()[self.range.start..self.range.end] + } + + /// Returns a slice of self for the index range `[begin..end)`. 
+ pub fn slice(&self, range: impl RangeBounds) -> Self { + use core::ops::Bound; + let len = self.len(); + + let begin = match range.start_bound() { + Bound::Included(&n) => n, + Bound::Excluded(&n) => n.checked_add(1).expect("out of range"), + Bound::Unbounded => 0, + }; + + let end = match range.end_bound() { + Bound::Included(&n) => n.checked_add(1).expect("out of range"), + Bound::Excluded(&n) => n, + Bound::Unbounded => len, + }; + + assert!( + begin <= end, + "range start must not be greater than end: {:?} <= {:?}", + begin, + end, + ); + assert!( + end <= len, + "range end out of bounds: {:?} <= {:?}", + end, + len, + ); + + let begin = self.range.start + begin; + let end = self.range.start + end; + + AlignedBuffer { + raw: Arc::clone(&self.raw), + range: begin..end, + } + } +} + +impl Deref for AlignedBuffer { + type Target = [u8]; + + fn deref(&self) -> &Self::Target { + self.as_slice() + } +} + +impl AsRef<[u8]> for AlignedBuffer { + fn as_ref(&self) -> &[u8] { + self.as_slice() + } +} + +impl PartialEq<[u8]> for AlignedBuffer { + fn eq(&self, other: &[u8]) -> bool { + self.as_slice().eq(other) + } +} + +/// SAFETY: the underlying buffer references a stable memory region. +unsafe impl tokio_epoll_uring::IoBuf for AlignedBuffer { + fn stable_ptr(&self) -> *const u8 { + self.as_ptr() + } + + fn bytes_init(&self) -> usize { + self.len() + } + + fn bytes_total(&self) -> usize { + self.len() + } +} diff --git a/pageserver/src/virtual_file/owned_buffers_io/aligned_buffer/buffer_mut.rs b/pageserver/src/virtual_file/owned_buffers_io/aligned_buffer/buffer_mut.rs new file mode 100644 index 0000000000..b3675d1aea --- /dev/null +++ b/pageserver/src/virtual_file/owned_buffers_io/aligned_buffer/buffer_mut.rs @@ -0,0 +1,347 @@ +use std::ops::{Deref, DerefMut}; + +use super::{ + alignment::{Alignment, ConstAlign}, + buffer::AlignedBuffer, + raw::RawAlignedBuffer, +}; + +/// A mutable aligned buffer type. 
+#[derive(Debug)] +pub struct AlignedBufferMut { + raw: RawAlignedBuffer, +} + +impl AlignedBufferMut> { + /// Constructs a new, empty `IoBufferMut` with at least the specified capacity and alignment. + /// + /// The buffer will be able to hold at most `capacity` elements and will never resize. + /// + /// + /// # Panics + /// + /// Panics if the new capacity exceeds `isize::MAX` _bytes_, or if the following alignment requirement is not met: + /// * `align` must not be zero, + /// + /// * `align` must be a power of two, + /// + /// * `capacity`, when rounded up to the nearest multiple of `align`, + /// must not overflow isize (i.e., the rounded value must be + /// less than or equal to `isize::MAX`). + pub fn with_capacity(capacity: usize) -> Self { + AlignedBufferMut { + raw: RawAlignedBuffer::with_capacity(capacity), + } + } + + /// Constructs a new `IoBufferMut` with at least the specified capacity and alignment, filled with zeros. + pub fn with_capacity_zeroed(capacity: usize) -> Self { + use bytes::BufMut; + let mut buf = Self::with_capacity(capacity); + buf.put_bytes(0, capacity); + // SAFETY: `put_bytes` filled the entire buffer. + unsafe { buf.set_len(capacity) }; + buf + } +} + +impl AlignedBufferMut { + /// Returns the total number of bytes the buffer can hold. + #[inline] + pub fn capacity(&self) -> usize { + self.raw.capacity() + } + + /// Returns the alignment of the buffer. + #[inline] + pub fn align(&self) -> usize { + self.raw.align() + } + + /// Returns the number of bytes in the buffer, also referred to as its 'length'. + #[inline] + pub fn len(&self) -> usize { + self.raw.len() + } + + /// Force the length of the buffer to `new_len`. + #[inline] + unsafe fn set_len(&mut self, new_len: usize) { + self.raw.set_len(new_len) + } + + #[inline] + fn as_ptr(&self) -> *const u8 { + self.raw.as_ptr() + } + + #[inline] + fn as_mut_ptr(&mut self) -> *mut u8 { + self.raw.as_mut_ptr() + } + + /// Extracts a slice containing the entire buffer. 
+ /// + /// Equivalent to `&s[..]`. + #[inline] + fn as_slice(&self) -> &[u8] { + self.raw.as_slice() + } + + /// Extracts a mutable slice of the entire buffer. + /// + /// Equivalent to `&mut s[..]`. + fn as_mut_slice(&mut self) -> &mut [u8] { + self.raw.as_mut_slice() + } + + /// Drops the all the contents of the buffer, setting its length to `0`. + #[inline] + pub fn clear(&mut self) { + self.raw.clear() + } + + /// Reserves capacity for at least `additional` more bytes to be inserted + /// in the given `IoBufferMut`. The collection may reserve more space to + /// speculatively avoid frequent reallocations. After calling `reserve`, + /// capacity will be greater than or equal to `self.len() + additional`. + /// Does nothing if capacity is already sufficient. + /// + /// # Panics + /// + /// Panics if the new capacity exceeds `isize::MAX` _bytes_. + pub fn reserve(&mut self, additional: usize) { + self.raw.reserve(additional); + } + + /// Shortens the buffer, keeping the first len bytes. + pub fn truncate(&mut self, len: usize) { + self.raw.truncate(len); + } + + /// Consumes and leaks the `IoBufferMut`, returning a mutable reference to the contents, &'a mut [u8]. 
+ pub fn leak<'a>(self) -> &'a mut [u8] { + self.raw.leak() + } + + pub fn freeze(self) -> AlignedBuffer { + let len = self.len(); + AlignedBuffer::from_raw(self.raw, 0..len) + } +} + +impl Deref for AlignedBufferMut { + type Target = [u8]; + + fn deref(&self) -> &Self::Target { + self.as_slice() + } +} + +impl DerefMut for AlignedBufferMut { + fn deref_mut(&mut self) -> &mut Self::Target { + self.as_mut_slice() + } +} + +impl AsRef<[u8]> for AlignedBufferMut { + fn as_ref(&self) -> &[u8] { + self.as_slice() + } +} + +impl AsMut<[u8]> for AlignedBufferMut { + fn as_mut(&mut self) -> &mut [u8] { + self.as_mut_slice() + } +} + +impl PartialEq<[u8]> for AlignedBufferMut { + fn eq(&self, other: &[u8]) -> bool { + self.as_slice().eq(other) + } +} + +/// SAFETY: When advancing the internal cursor, the caller needs to make sure the bytes advcanced past have been initialized. +unsafe impl bytes::BufMut for AlignedBufferMut { + #[inline] + fn remaining_mut(&self) -> usize { + // Although a `Vec` can have at most isize::MAX bytes, we never want to grow `IoBufferMut`. + // Thus, it can have at most `self.capacity` bytes. + self.capacity() - self.len() + } + + // SAFETY: Caller needs to make sure the bytes being advanced past have been initialized. + #[inline] + unsafe fn advance_mut(&mut self, cnt: usize) { + let len = self.len(); + let remaining = self.remaining_mut(); + + if remaining < cnt { + panic_advance(cnt, remaining); + } + + // Addition will not overflow since the sum is at most the capacity. + self.set_len(len + cnt); + } + + #[inline] + fn chunk_mut(&mut self) -> &mut bytes::buf::UninitSlice { + let cap = self.capacity(); + let len = self.len(); + + // SAFETY: Since `self.ptr` is valid for `cap` bytes, `self.ptr.add(len)` must be + // valid for `cap - len` bytes. The subtraction will not underflow since + // `len <= cap`. 
+ unsafe { + bytes::buf::UninitSlice::from_raw_parts_mut(self.as_mut_ptr().add(len), cap - len) + } + } +} + +/// Panic with a nice error message. +#[cold] +fn panic_advance(idx: usize, len: usize) -> ! { + panic!( + "advance out of bounds: the len is {} but advancing by {}", + len, idx + ); +} + +/// Safety: [`AlignedBufferMut`] has exclusive ownership of the io buffer, +/// and the underlying pointer remains stable while io-uring is owning the buffer. +/// The tokio-epoll-uring crate itself will not resize the buffer and will respect +/// [`tokio_epoll_uring::IoBuf::bytes_total`]. +unsafe impl tokio_epoll_uring::IoBuf for AlignedBufferMut { + fn stable_ptr(&self) -> *const u8 { + self.as_ptr() + } + + fn bytes_init(&self) -> usize { + self.len() + } + + fn bytes_total(&self) -> usize { + self.capacity() + } +} + +// SAFETY: See above. +unsafe impl tokio_epoll_uring::IoBufMut for AlignedBufferMut { + fn stable_mut_ptr(&mut self) -> *mut u8 { + self.as_mut_ptr() + } + + unsafe fn set_init(&mut self, init_len: usize) { + if self.len() < init_len { + self.set_len(init_len); + } + } +} + +#[cfg(test)] +mod tests { + + use super::*; + + const ALIGN: usize = 4 * 1024; + type TestIoBufferMut = AlignedBufferMut>; + + #[test] + fn test_with_capacity() { + let v = TestIoBufferMut::with_capacity(ALIGN * 4); + assert_eq!(v.len(), 0); + assert_eq!(v.capacity(), ALIGN * 4); + assert_eq!(v.align(), ALIGN); + assert_eq!(v.as_ptr().align_offset(ALIGN), 0); + + let v = TestIoBufferMut::with_capacity(ALIGN / 2); + assert_eq!(v.len(), 0); + assert_eq!(v.capacity(), ALIGN / 2); + assert_eq!(v.align(), ALIGN); + assert_eq!(v.as_ptr().align_offset(ALIGN), 0); + } + + #[test] + fn test_with_capacity_zeroed() { + let v = TestIoBufferMut::with_capacity_zeroed(ALIGN); + assert_eq!(v.len(), ALIGN); + assert_eq!(v.capacity(), ALIGN); + assert_eq!(v.align(), ALIGN); + assert_eq!(v.as_ptr().align_offset(ALIGN), 0); + assert_eq!(&v[..], &[0; ALIGN]) + } + + #[test] + fn test_reserve() { + use 
bytes::BufMut; + let mut v = TestIoBufferMut::with_capacity(ALIGN); + let capacity = v.capacity(); + v.reserve(capacity); + assert_eq!(v.capacity(), capacity); + let data = [b'a'; ALIGN]; + v.put(&data[..]); + v.reserve(capacity); + assert!(v.capacity() >= capacity * 2); + assert_eq!(&v[..], &data[..]); + let capacity = v.capacity(); + v.clear(); + v.reserve(capacity); + assert_eq!(capacity, v.capacity()); + } + + #[test] + fn test_bytes_put() { + use bytes::BufMut; + let mut v = TestIoBufferMut::with_capacity(ALIGN * 4); + let x = [b'a'; ALIGN]; + + for _ in 0..2 { + for _ in 0..4 { + v.put(&x[..]); + } + assert_eq!(v.len(), ALIGN * 4); + assert_eq!(v.capacity(), ALIGN * 4); + assert_eq!(v.align(), ALIGN); + assert_eq!(v.as_ptr().align_offset(ALIGN), 0); + v.clear() + } + assert_eq!(v.len(), 0); + assert_eq!(v.capacity(), ALIGN * 4); + assert_eq!(v.align(), ALIGN); + assert_eq!(v.as_ptr().align_offset(ALIGN), 0); + } + + #[test] + #[should_panic] + fn test_bytes_put_panic() { + use bytes::BufMut; + const ALIGN: usize = 4 * 1024; + let mut v = TestIoBufferMut::with_capacity(ALIGN * 4); + let x = [b'a'; ALIGN]; + for _ in 0..5 { + v.put_slice(&x[..]); + } + } + + #[test] + fn test_io_buf_put_slice() { + use tokio_epoll_uring::BoundedBufMut; + const ALIGN: usize = 4 * 1024; + let mut v = TestIoBufferMut::with_capacity(ALIGN); + let x = [b'a'; ALIGN]; + + for _ in 0..2 { + v.put_slice(&x[..]); + assert_eq!(v.len(), ALIGN); + assert_eq!(v.capacity(), ALIGN); + assert_eq!(v.align(), ALIGN); + assert_eq!(v.as_ptr().align_offset(ALIGN), 0); + v.clear() + } + assert_eq!(v.len(), 0); + assert_eq!(v.capacity(), ALIGN); + assert_eq!(v.align(), ALIGN); + assert_eq!(v.as_ptr().align_offset(ALIGN), 0); + } +} diff --git a/pageserver/src/virtual_file/owned_buffers_io/aligned_buffer/raw.rs b/pageserver/src/virtual_file/owned_buffers_io/aligned_buffer/raw.rs new file mode 100644 index 0000000000..6c26dec0db --- /dev/null +++ 
b/pageserver/src/virtual_file/owned_buffers_io/aligned_buffer/raw.rs @@ -0,0 +1,216 @@ +use core::slice; +use std::{ + alloc::{self, Layout}, + cmp, + mem::ManuallyDrop, +}; + +use super::alignment::{Alignment, ConstAlign}; + +#[derive(Debug)] +struct AlignedBufferPtr(*mut u8); + +// SAFETY: We gurantees no one besides `IoBufferPtr` itself has the raw pointer. +unsafe impl Send for AlignedBufferPtr {} + +// SAFETY: We gurantees no one besides `IoBufferPtr` itself has the raw pointer. +unsafe impl Sync for AlignedBufferPtr {} + +/// An aligned buffer type. +#[derive(Debug)] +pub struct RawAlignedBuffer { + ptr: AlignedBufferPtr, + capacity: usize, + len: usize, + align: A, +} + +impl RawAlignedBuffer> { + /// Constructs a new, empty `IoBufferMut` with at least the specified capacity and alignment. + /// + /// The buffer will be able to hold at most `capacity` elements and will never resize. + /// + /// + /// # Panics + /// + /// Panics if the new capacity exceeds `isize::MAX` _bytes_, or if the following alignment requirement is not met: + /// * `align` must not be zero, + /// + /// * `align` must be a power of two, + /// + /// * `capacity`, when rounded up to the nearest multiple of `align`, + /// must not overflow isize (i.e., the rounded value must be + /// less than or equal to `isize::MAX`). + pub fn with_capacity(capacity: usize) -> Self { + let align = ConstAlign::; + let layout = Layout::from_size_align(capacity, align.align()).expect("Invalid layout"); + + // SAFETY: Making an allocation with a sized and aligned layout. The memory is manually freed with the same layout. + let ptr = unsafe { + let ptr = alloc::alloc(layout); + if ptr.is_null() { + alloc::handle_alloc_error(layout); + } + AlignedBufferPtr(ptr) + }; + + RawAlignedBuffer { + ptr, + capacity, + len: 0, + align, + } + } +} + +impl RawAlignedBuffer { + /// Returns the total number of bytes the buffer can hold. 
+ #[inline] + pub fn capacity(&self) -> usize { + self.capacity + } + + /// Returns the alignment of the buffer. + #[inline] + pub fn align(&self) -> usize { + self.align.align() + } + + /// Returns the number of bytes in the buffer, also referred to as its 'length'. + #[inline] + pub fn len(&self) -> usize { + self.len + } + + /// Force the length of the buffer to `new_len`. + #[inline] + pub unsafe fn set_len(&mut self, new_len: usize) { + debug_assert!(new_len <= self.capacity()); + self.len = new_len; + } + + #[inline] + pub fn as_ptr(&self) -> *const u8 { + self.ptr.0 + } + + #[inline] + pub fn as_mut_ptr(&mut self) -> *mut u8 { + self.ptr.0 + } + + /// Extracts a slice containing the entire buffer. + /// + /// Equivalent to `&s[..]`. + #[inline] + pub fn as_slice(&self) -> &[u8] { + // SAFETY: The pointer is valid and `len` bytes are initialized. + unsafe { slice::from_raw_parts(self.as_ptr(), self.len) } + } + + /// Extracts a mutable slice of the entire buffer. + /// + /// Equivalent to `&mut s[..]`. + pub fn as_mut_slice(&mut self) -> &mut [u8] { + // SAFETY: The pointer is valid and `len` bytes are initialized. + unsafe { slice::from_raw_parts_mut(self.as_mut_ptr(), self.len) } + } + + /// Drops the all the contents of the buffer, setting its length to `0`. + #[inline] + pub fn clear(&mut self) { + self.len = 0; + } + + /// Reserves capacity for at least `additional` more bytes to be inserted + /// in the given `IoBufferMut`. The collection may reserve more space to + /// speculatively avoid frequent reallocations. After calling `reserve`, + /// capacity will be greater than or equal to `self.len() + additional`. + /// Does nothing if capacity is already sufficient. + /// + /// # Panics + /// + /// Panics if the new capacity exceeds `isize::MAX` _bytes_. 
+ pub fn reserve(&mut self, additional: usize) { + if additional > self.capacity() - self.len() { + self.reserve_inner(additional); + } + } + + fn reserve_inner(&mut self, additional: usize) { + let Some(required_cap) = self.len().checked_add(additional) else { + capacity_overflow() + }; + + let old_capacity = self.capacity(); + let align = self.align(); + // This guarantees exponential growth. The doubling cannot overflow + // because `cap <= isize::MAX` and the type of `cap` is `usize`. + let cap = cmp::max(old_capacity * 2, required_cap); + + if !is_valid_alloc(cap) { + capacity_overflow() + } + let new_layout = Layout::from_size_align(cap, self.align()).expect("Invalid layout"); + + let old_ptr = self.as_mut_ptr(); + + // SAFETY: old allocation was allocated with std::alloc::alloc with the same layout, + // and we panics on null pointer. + let (ptr, cap) = unsafe { + let old_layout = Layout::from_size_align_unchecked(old_capacity, align); + let ptr = alloc::realloc(old_ptr, old_layout, new_layout.size()); + if ptr.is_null() { + alloc::handle_alloc_error(new_layout); + } + (AlignedBufferPtr(ptr), cap) + }; + + self.ptr = ptr; + self.capacity = cap; + } + + /// Shortens the buffer, keeping the first len bytes. + pub fn truncate(&mut self, len: usize) { + if len > self.len { + return; + } + self.len = len; + } + + /// Consumes and leaks the `IoBufferMut`, returning a mutable reference to the contents, &'a mut [u8]. + pub fn leak<'a>(self) -> &'a mut [u8] { + let mut buf = ManuallyDrop::new(self); + // SAFETY: leaking the buffer as intended. + unsafe { slice::from_raw_parts_mut(buf.as_mut_ptr(), buf.len) } + } +} + +fn capacity_overflow() -> ! { + panic!("capacity overflow") +} + +// We need to guarantee the following: +// * We don't ever allocate `> isize::MAX` byte-size objects. +// * We don't overflow `usize::MAX` and actually allocate too little. 
+// +// On 64-bit we just need to check for overflow since trying to allocate +// `> isize::MAX` bytes will surely fail. On 32-bit and 16-bit we need to add +// an extra guard for this in case we're running on a platform which can use +// all 4GB in user-space, e.g., PAE or x32. +#[inline] +fn is_valid_alloc(alloc_size: usize) -> bool { + !(usize::BITS < 64 && alloc_size > isize::MAX as usize) +} + +impl Drop for RawAlignedBuffer { + fn drop(&mut self) { + // SAFETY: memory was allocated with std::alloc::alloc with the same layout. + unsafe { + alloc::dealloc( + self.as_mut_ptr(), + Layout::from_size_align_unchecked(self.capacity, self.align.align()), + ) + } + } +} diff --git a/pageserver/src/virtual_file/owned_buffers_io/aligned_buffer/slice.rs b/pageserver/src/virtual_file/owned_buffers_io/aligned_buffer/slice.rs new file mode 100644 index 0000000000..6cecf34c1c --- /dev/null +++ b/pageserver/src/virtual_file/owned_buffers_io/aligned_buffer/slice.rs @@ -0,0 +1,40 @@ +use std::ops::{Deref, DerefMut}; + +use super::alignment::{Alignment, ConstAlign}; + +/// Newtype for an aligned slice. +pub struct AlignedSlice<'a, const N: usize, A: Alignment> { + /// underlying byte slice + buf: &'a mut [u8; N], + /// alignment marker + _align: A, +} + +impl<'a, const N: usize, const A: usize> AlignedSlice<'a, N, ConstAlign> { + /// Create a new aligned slice from a mutable byte slice. The input must already satisify the alignment. 
+ pub unsafe fn new_unchecked(buf: &'a mut [u8; N]) -> Self { + let _align = ConstAlign::; + assert_eq!(buf.as_ptr().align_offset(_align.align()), 0); + AlignedSlice { buf, _align } + } +} + +impl<'a, const N: usize, A: Alignment> Deref for AlignedSlice<'a, N, A> { + type Target = [u8; N]; + + fn deref(&self) -> &Self::Target { + self.buf + } +} + +impl<'a, const N: usize, A: Alignment> DerefMut for AlignedSlice<'a, N, A> { + fn deref_mut(&mut self) -> &mut Self::Target { + self.buf + } +} + +impl<'a, const N: usize, A: Alignment> AsRef<[u8; N]> for AlignedSlice<'a, N, A> { + fn as_ref(&self) -> &[u8; N] { + self.buf + } +} diff --git a/pageserver/src/virtual_file/owned_buffers_io/io_buf_aligned.rs b/pageserver/src/virtual_file/owned_buffers_io/io_buf_aligned.rs new file mode 100644 index 0000000000..dba695196e --- /dev/null +++ b/pageserver/src/virtual_file/owned_buffers_io/io_buf_aligned.rs @@ -0,0 +1,9 @@ +use tokio_epoll_uring::IoBufMut; + +use crate::virtual_file::{IoBufferMut, PageWriteGuardBuf}; + +pub trait IoBufAlignedMut: IoBufMut {} + +impl IoBufAlignedMut for IoBufferMut {} + +impl IoBufAlignedMut for PageWriteGuardBuf {} diff --git a/pageserver/src/virtual_file/owned_buffers_io/io_buf_ext.rs b/pageserver/src/virtual_file/owned_buffers_io/io_buf_ext.rs index 7c773b6b21..c3940cf6ce 100644 --- a/pageserver/src/virtual_file/owned_buffers_io/io_buf_ext.rs +++ b/pageserver/src/virtual_file/owned_buffers_io/io_buf_ext.rs @@ -1,5 +1,6 @@ //! See [`FullSlice`]. +use crate::virtual_file::{IoBuffer, IoBufferMut}; use bytes::{Bytes, BytesMut}; use std::ops::{Deref, Range}; use tokio_epoll_uring::{BoundedBuf, IoBuf, Slice}; @@ -76,3 +77,5 @@ macro_rules! 
impl_io_buf_ext { impl_io_buf_ext!(Bytes); impl_io_buf_ext!(BytesMut); impl_io_buf_ext!(Vec); +impl_io_buf_ext!(IoBufferMut); +impl_io_buf_ext!(IoBuffer); diff --git a/pageserver/src/walingest.rs b/pageserver/src/walingest.rs index 95d1f76920..d3e8bf59f2 100644 --- a/pageserver/src/walingest.rs +++ b/pageserver/src/walingest.rs @@ -1915,7 +1915,9 @@ impl WalIngest { modification.put_rel_extend(rel, new_nblocks, ctx).await?; let mut key = rel_block_to_key(rel, blknum); + // fill the gap with zeros + let mut gap_blocks_filled: u64 = 0; for gap_blknum in old_nblocks..blknum { key.field6 = gap_blknum; @@ -1924,7 +1926,12 @@ impl WalIngest { } modification.put_rel_page_image_zero(rel, gap_blknum)?; + gap_blocks_filled += 1; } + + WAL_INGEST + .gap_blocks_zeroed_on_rel_extend + .inc_by(gap_blocks_filled); } Ok(()) } diff --git a/pageserver/src/walredo/apply_neon.rs b/pageserver/src/walredo/apply_neon.rs index facf01004c..c067787f97 100644 --- a/pageserver/src/walredo/apply_neon.rs +++ b/pageserver/src/walredo/apply_neon.rs @@ -1,8 +1,7 @@ -use crate::pgdatadir_mapping::AuxFilesDirectory; use crate::walrecord::NeonWalRecord; use anyhow::Context; use byteorder::{ByteOrder, LittleEndian}; -use bytes::{BufMut, BytesMut}; +use bytes::BytesMut; use pageserver_api::key::Key; use pageserver_api::reltag::SlruKind; use postgres_ffi::pg_constants; @@ -13,7 +12,6 @@ use postgres_ffi::v14::nonrelfile_utils::{ }; use postgres_ffi::BLCKSZ; use tracing::*; -use utils::bin_ser::BeSer; use utils::lsn::Lsn; /// Can this request be served by neon redo functions @@ -236,13 +234,9 @@ pub(crate) fn apply_in_neon( LittleEndian::write_u32(&mut page[memberoff..memberoff + 4], member.xid); } } - NeonWalRecord::AuxFile { file_path, content } => { - let mut dir = AuxFilesDirectory::des(page)?; - dir.upsert(file_path.clone(), content.clone()); - - page.clear(); - let mut writer = page.writer(); - dir.ser_into(&mut writer)?; + NeonWalRecord::AuxFile { .. 
} => { + // No-op: this record will never be created in aux v2. + warn!("AuxFile record should not be created in aux v2"); } #[cfg(test)] NeonWalRecord::Test { @@ -250,6 +244,7 @@ pub(crate) fn apply_in_neon( clear, will_init, } => { + use bytes::BufMut; if *will_init { assert!(*clear, "init record must be clear to ensure correctness"); } @@ -261,59 +256,3 @@ pub(crate) fn apply_in_neon( } Ok(()) } - -#[cfg(test)] -mod test { - use bytes::Bytes; - use pageserver_api::key::AUX_FILES_KEY; - - use super::*; - use std::collections::HashMap; - - /// Test [`apply_in_neon`]'s handling of NeonWalRecord::AuxFile - #[test] - fn apply_aux_file_deltas() -> anyhow::Result<()> { - let base_dir = AuxFilesDirectory { - files: HashMap::from([ - ("two".to_string(), Bytes::from_static(b"content0")), - ("three".to_string(), Bytes::from_static(b"contentX")), - ]), - }; - let base_image = AuxFilesDirectory::ser(&base_dir)?; - - let deltas = vec![ - // Insert - NeonWalRecord::AuxFile { - file_path: "one".to_string(), - content: Some(Bytes::from_static(b"content1")), - }, - // Update - NeonWalRecord::AuxFile { - file_path: "two".to_string(), - content: Some(Bytes::from_static(b"content99")), - }, - // Delete - NeonWalRecord::AuxFile { - file_path: "three".to_string(), - content: None, - }, - ]; - - let file_path = AUX_FILES_KEY; - let mut page = BytesMut::from_iter(base_image); - - for record in deltas { - apply_in_neon(&record, Lsn(8), file_path, &mut page)?; - } - - let reconstructed = AuxFilesDirectory::des(&page)?; - let expect = HashMap::from([ - ("one".to_string(), Bytes::from_static(b"content1")), - ("two".to_string(), Bytes::from_static(b"content99")), - ]); - - assert_eq!(reconstructed.files, expect); - - Ok(()) - } -} diff --git a/pgxn/neon/Makefile b/pgxn/neon/Makefile index f1229b2d73..1503b856f7 100644 --- a/pgxn/neon/Makefile +++ b/pgxn/neon/Makefile @@ -54,7 +54,7 @@ walproposer-lib: libwalproposer.a; .PHONY: libwalproposer.a libwalproposer.a: $(WALPROP_OBJS) - rm -f $@ + 
$(RM) $@ $(AR) $(AROPT) $@ $^ # needs vars: diff --git a/pgxn/neon/control_plane_connector.c b/pgxn/neon/control_plane_connector.c index 0730c305cb..4713103909 100644 --- a/pgxn/neon/control_plane_connector.c +++ b/pgxn/neon/control_plane_connector.c @@ -767,7 +767,7 @@ HandleDropRole(DropRoleStmt *stmt) entry->type = Op_Delete; entry->password = NULL; if (!found) - memset(entry->old_name, 0, sizeof(entry)); + memset(entry->old_name, 0, sizeof(entry->old_name)); } } diff --git a/pgxn/neon/walproposer.c b/pgxn/neon/walproposer.c index a3f33cb261..d2a6104c74 100644 --- a/pgxn/neon/walproposer.c +++ b/pgxn/neon/walproposer.c @@ -841,6 +841,23 @@ HandleElectedProposer(WalProposer *wp) wp_log(FATAL, "failed to download WAL for logical replicaiton"); } + /* + * Zero propEpochStartLsn means majority of safekeepers doesn't have any + * WAL, timeline was just created. Compute bumps it to basebackup LSN, + * otherwise we must be sync-safekeepers and we have nothing to do then. + * + * Proceeding is not only pointless but harmful, because we'd give + * safekeepers term history starting with 0/0. These hacks will go away once + * we disable implicit timeline creation on safekeepers and create it with + * non zero LSN from the start. + */ + if (wp->propEpochStartLsn == InvalidXLogRecPtr) + { + Assert(wp->config->syncSafekeepers); + wp_log(LOG, "elected with zero propEpochStartLsn in sync-safekeepers, exiting"); + wp->api.finish_sync_safekeepers(wp, wp->propEpochStartLsn); + } + if (wp->truncateLsn == wp->propEpochStartLsn && wp->config->syncSafekeepers) { /* Sync is not needed: just exit */ diff --git a/poetry.lock b/poetry.lock index 00fe2505c9..e307b873f3 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1,4 +1,4 @@ -# This file is automatically @generated by Poetry 1.8.3 and should not be changed by hand. +# This file is automatically @generated by Poetry 1.8.4 and should not be changed by hand. 
[[package]] name = "aiohappyeyeballs" @@ -1758,85 +1758,101 @@ tests = ["pytest (>=4.6)"] [[package]] name = "multidict" -version = "6.0.4" +version = "6.0.5" description = "multidict implementation" optional = false python-versions = ">=3.7" files = [ - {file = "multidict-6.0.4-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:0b1a97283e0c85772d613878028fec909f003993e1007eafa715b24b377cb9b8"}, - {file = "multidict-6.0.4-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:eeb6dcc05e911516ae3d1f207d4b0520d07f54484c49dfc294d6e7d63b734171"}, - {file = "multidict-6.0.4-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:d6d635d5209b82a3492508cf5b365f3446afb65ae7ebd755e70e18f287b0adf7"}, - {file = "multidict-6.0.4-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c048099e4c9e9d615545e2001d3d8a4380bd403e1a0578734e0d31703d1b0c0b"}, - {file = "multidict-6.0.4-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:ea20853c6dbbb53ed34cb4d080382169b6f4554d394015f1bef35e881bf83547"}, - {file = "multidict-6.0.4-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:16d232d4e5396c2efbbf4f6d4df89bfa905eb0d4dc5b3549d872ab898451f569"}, - {file = "multidict-6.0.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:36c63aaa167f6c6b04ef2c85704e93af16c11d20de1d133e39de6a0e84582a93"}, - {file = "multidict-6.0.4-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:64bdf1086b6043bf519869678f5f2757f473dee970d7abf6da91ec00acb9cb98"}, - {file = "multidict-6.0.4-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:43644e38f42e3af682690876cff722d301ac585c5b9e1eacc013b7a3f7b696a0"}, - {file = "multidict-6.0.4-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:7582a1d1030e15422262de9f58711774e02fa80df0d1578995c76214f6954988"}, - {file = "multidict-6.0.4-cp310-cp310-musllinux_1_1_ppc64le.whl", hash = 
"sha256:ddff9c4e225a63a5afab9dd15590432c22e8057e1a9a13d28ed128ecf047bbdc"}, - {file = "multidict-6.0.4-cp310-cp310-musllinux_1_1_s390x.whl", hash = "sha256:ee2a1ece51b9b9e7752e742cfb661d2a29e7bcdba2d27e66e28a99f1890e4fa0"}, - {file = "multidict-6.0.4-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:a2e4369eb3d47d2034032a26c7a80fcb21a2cb22e1173d761a162f11e562caa5"}, - {file = "multidict-6.0.4-cp310-cp310-win32.whl", hash = "sha256:574b7eae1ab267e5f8285f0fe881f17efe4b98c39a40858247720935b893bba8"}, - {file = "multidict-6.0.4-cp310-cp310-win_amd64.whl", hash = "sha256:4dcbb0906e38440fa3e325df2359ac6cb043df8e58c965bb45f4e406ecb162cc"}, - {file = "multidict-6.0.4-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:0dfad7a5a1e39c53ed00d2dd0c2e36aed4650936dc18fd9a1826a5ae1cad6f03"}, - {file = "multidict-6.0.4-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:64da238a09d6039e3bd39bb3aee9c21a5e34f28bfa5aa22518581f910ff94af3"}, - {file = "multidict-6.0.4-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:ff959bee35038c4624250473988b24f846cbeb2c6639de3602c073f10410ceba"}, - {file = "multidict-6.0.4-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:01a3a55bd90018c9c080fbb0b9f4891db37d148a0a18722b42f94694f8b6d4c9"}, - {file = "multidict-6.0.4-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:c5cb09abb18c1ea940fb99360ea0396f34d46566f157122c92dfa069d3e0e982"}, - {file = "multidict-6.0.4-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:666daae833559deb2d609afa4490b85830ab0dfca811a98b70a205621a6109fe"}, - {file = "multidict-6.0.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:11bdf3f5e1518b24530b8241529d2050014c884cf18b6fc69c0c2b30ca248710"}, - {file = "multidict-6.0.4-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:7d18748f2d30f94f498e852c67d61261c643b349b9d2a581131725595c45ec6c"}, - {file = 
"multidict-6.0.4-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:458f37be2d9e4c95e2d8866a851663cbc76e865b78395090786f6cd9b3bbf4f4"}, - {file = "multidict-6.0.4-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:b1a2eeedcead3a41694130495593a559a668f382eee0727352b9a41e1c45759a"}, - {file = "multidict-6.0.4-cp311-cp311-musllinux_1_1_ppc64le.whl", hash = "sha256:7d6ae9d593ef8641544d6263c7fa6408cc90370c8cb2bbb65f8d43e5b0351d9c"}, - {file = "multidict-6.0.4-cp311-cp311-musllinux_1_1_s390x.whl", hash = "sha256:5979b5632c3e3534e42ca6ff856bb24b2e3071b37861c2c727ce220d80eee9ed"}, - {file = "multidict-6.0.4-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:dcfe792765fab89c365123c81046ad4103fcabbc4f56d1c1997e6715e8015461"}, - {file = "multidict-6.0.4-cp311-cp311-win32.whl", hash = "sha256:3601a3cece3819534b11d4efc1eb76047488fddd0c85a3948099d5da4d504636"}, - {file = "multidict-6.0.4-cp311-cp311-win_amd64.whl", hash = "sha256:81a4f0b34bd92df3da93315c6a59034df95866014ac08535fc819f043bfd51f0"}, - {file = "multidict-6.0.4-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:67040058f37a2a51ed8ea8f6b0e6ee5bd78ca67f169ce6122f3e2ec80dfe9b78"}, - {file = "multidict-6.0.4-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:853888594621e6604c978ce2a0444a1e6e70c8d253ab65ba11657659dcc9100f"}, - {file = "multidict-6.0.4-cp37-cp37m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:39ff62e7d0f26c248b15e364517a72932a611a9b75f35b45be078d81bdb86603"}, - {file = "multidict-6.0.4-cp37-cp37m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:af048912e045a2dc732847d33821a9d84ba553f5c5f028adbd364dd4765092ac"}, - {file = "multidict-6.0.4-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b1e8b901e607795ec06c9e42530788c45ac21ef3aaa11dbd0c69de543bfb79a9"}, - {file = "multidict-6.0.4-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = 
"sha256:62501642008a8b9871ddfccbf83e4222cf8ac0d5aeedf73da36153ef2ec222d2"}, - {file = "multidict-6.0.4-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:99b76c052e9f1bc0721f7541e5e8c05db3941eb9ebe7b8553c625ef88d6eefde"}, - {file = "multidict-6.0.4-cp37-cp37m-musllinux_1_1_i686.whl", hash = "sha256:509eac6cf09c794aa27bcacfd4d62c885cce62bef7b2c3e8b2e49d365b5003fe"}, - {file = "multidict-6.0.4-cp37-cp37m-musllinux_1_1_ppc64le.whl", hash = "sha256:21a12c4eb6ddc9952c415f24eef97e3e55ba3af61f67c7bc388dcdec1404a067"}, - {file = "multidict-6.0.4-cp37-cp37m-musllinux_1_1_s390x.whl", hash = "sha256:5cad9430ab3e2e4fa4a2ef4450f548768400a2ac635841bc2a56a2052cdbeb87"}, - {file = "multidict-6.0.4-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:ab55edc2e84460694295f401215f4a58597f8f7c9466faec545093045476327d"}, - {file = "multidict-6.0.4-cp37-cp37m-win32.whl", hash = "sha256:5a4dcf02b908c3b8b17a45fb0f15b695bf117a67b76b7ad18b73cf8e92608775"}, - {file = "multidict-6.0.4-cp37-cp37m-win_amd64.whl", hash = "sha256:6ed5f161328b7df384d71b07317f4d8656434e34591f20552c7bcef27b0ab88e"}, - {file = "multidict-6.0.4-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:5fc1b16f586f049820c5c5b17bb4ee7583092fa0d1c4e28b5239181ff9532e0c"}, - {file = "multidict-6.0.4-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:1502e24330eb681bdaa3eb70d6358e818e8e8f908a22a1851dfd4e15bc2f8161"}, - {file = "multidict-6.0.4-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:b692f419760c0e65d060959df05f2a531945af31fda0c8a3b3195d4efd06de11"}, - {file = "multidict-6.0.4-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:45e1ecb0379bfaab5eef059f50115b54571acfbe422a14f668fc8c27ba410e7e"}, - {file = "multidict-6.0.4-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:ddd3915998d93fbcd2566ddf9cf62cdb35c9e093075f862935573d265cf8f65d"}, - {file = "multidict-6.0.4-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = 
"sha256:59d43b61c59d82f2effb39a93c48b845efe23a3852d201ed2d24ba830d0b4cf2"}, - {file = "multidict-6.0.4-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:cc8e1d0c705233c5dd0c5e6460fbad7827d5d36f310a0fadfd45cc3029762258"}, - {file = "multidict-6.0.4-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:d6aa0418fcc838522256761b3415822626f866758ee0bc6632c9486b179d0b52"}, - {file = "multidict-6.0.4-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:6748717bb10339c4760c1e63da040f5f29f5ed6e59d76daee30305894069a660"}, - {file = "multidict-6.0.4-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:4d1a3d7ef5e96b1c9e92f973e43aa5e5b96c659c9bc3124acbbd81b0b9c8a951"}, - {file = "multidict-6.0.4-cp38-cp38-musllinux_1_1_ppc64le.whl", hash = "sha256:4372381634485bec7e46718edc71528024fcdc6f835baefe517b34a33c731d60"}, - {file = "multidict-6.0.4-cp38-cp38-musllinux_1_1_s390x.whl", hash = "sha256:fc35cb4676846ef752816d5be2193a1e8367b4c1397b74a565a9d0389c433a1d"}, - {file = "multidict-6.0.4-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:4b9d9e4e2b37daddb5c23ea33a3417901fa7c7b3dee2d855f63ee67a0b21e5b1"}, - {file = "multidict-6.0.4-cp38-cp38-win32.whl", hash = "sha256:e41b7e2b59679edfa309e8db64fdf22399eec4b0b24694e1b2104fb789207779"}, - {file = "multidict-6.0.4-cp38-cp38-win_amd64.whl", hash = "sha256:d6c254ba6e45d8e72739281ebc46ea5eb5f101234f3ce171f0e9f5cc86991480"}, - {file = "multidict-6.0.4-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:16ab77bbeb596e14212e7bab8429f24c1579234a3a462105cda4a66904998664"}, - {file = "multidict-6.0.4-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:bc779e9e6f7fda81b3f9aa58e3a6091d49ad528b11ed19f6621408806204ad35"}, - {file = "multidict-6.0.4-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:4ceef517eca3e03c1cceb22030a3e39cb399ac86bff4e426d4fc6ae49052cc60"}, - {file = "multidict-6.0.4-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = 
"sha256:281af09f488903fde97923c7744bb001a9b23b039a909460d0f14edc7bf59706"}, - {file = "multidict-6.0.4-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:52f2dffc8acaba9a2f27174c41c9e57f60b907bb9f096b36b1a1f3be71c6284d"}, - {file = "multidict-6.0.4-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:b41156839806aecb3641f3208c0dafd3ac7775b9c4c422d82ee2a45c34ba81ca"}, - {file = "multidict-6.0.4-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d5e3fc56f88cc98ef8139255cf8cd63eb2c586531e43310ff859d6bb3a6b51f1"}, - {file = "multidict-6.0.4-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:8316a77808c501004802f9beebde51c9f857054a0c871bd6da8280e718444449"}, - {file = "multidict-6.0.4-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:f70b98cd94886b49d91170ef23ec5c0e8ebb6f242d734ed7ed677b24d50c82cf"}, - {file = "multidict-6.0.4-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:bf6774e60d67a9efe02b3616fee22441d86fab4c6d335f9d2051d19d90a40063"}, - {file = "multidict-6.0.4-cp39-cp39-musllinux_1_1_ppc64le.whl", hash = "sha256:e69924bfcdda39b722ef4d9aa762b2dd38e4632b3641b1d9a57ca9cd18f2f83a"}, - {file = "multidict-6.0.4-cp39-cp39-musllinux_1_1_s390x.whl", hash = "sha256:6b181d8c23da913d4ff585afd1155a0e1194c0b50c54fcfe286f70cdaf2b7176"}, - {file = "multidict-6.0.4-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:52509b5be062d9eafc8170e53026fbc54cf3b32759a23d07fd935fb04fc22d95"}, - {file = "multidict-6.0.4-cp39-cp39-win32.whl", hash = "sha256:27c523fbfbdfd19c6867af7346332b62b586eed663887392cff78d614f9ec313"}, - {file = "multidict-6.0.4-cp39-cp39-win_amd64.whl", hash = "sha256:33029f5734336aa0d4c0384525da0387ef89148dc7191aae00ca5fb23d7aafc2"}, - {file = "multidict-6.0.4.tar.gz", hash = "sha256:3666906492efb76453c0e7b97f2cf459b0682e7402c0489a95484965dbc1da49"}, + {file = "multidict-6.0.5-cp310-cp310-macosx_10_9_universal2.whl", hash = 
"sha256:228b644ae063c10e7f324ab1ab6b548bdf6f8b47f3ec234fef1093bc2735e5f9"}, + {file = "multidict-6.0.5-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:896ebdcf62683551312c30e20614305f53125750803b614e9e6ce74a96232604"}, + {file = "multidict-6.0.5-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:411bf8515f3be9813d06004cac41ccf7d1cd46dfe233705933dd163b60e37600"}, + {file = "multidict-6.0.5-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1d147090048129ce3c453f0292e7697d333db95e52616b3793922945804a433c"}, + {file = "multidict-6.0.5-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:215ed703caf15f578dca76ee6f6b21b7603791ae090fbf1ef9d865571039ade5"}, + {file = "multidict-6.0.5-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:7c6390cf87ff6234643428991b7359b5f59cc15155695deb4eda5c777d2b880f"}, + {file = "multidict-6.0.5-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:21fd81c4ebdb4f214161be351eb5bcf385426bf023041da2fd9e60681f3cebae"}, + {file = "multidict-6.0.5-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:3cc2ad10255f903656017363cd59436f2111443a76f996584d1077e43ee51182"}, + {file = "multidict-6.0.5-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:6939c95381e003f54cd4c5516740faba40cf5ad3eeff460c3ad1d3e0ea2549bf"}, + {file = "multidict-6.0.5-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:220dd781e3f7af2c2c1053da9fa96d9cf3072ca58f057f4c5adaaa1cab8fc442"}, + {file = "multidict-6.0.5-cp310-cp310-musllinux_1_1_ppc64le.whl", hash = "sha256:766c8f7511df26d9f11cd3a8be623e59cca73d44643abab3f8c8c07620524e4a"}, + {file = "multidict-6.0.5-cp310-cp310-musllinux_1_1_s390x.whl", hash = "sha256:fe5d7785250541f7f5019ab9cba2c71169dc7d74d0f45253f8313f436458a4ef"}, + {file = "multidict-6.0.5-cp310-cp310-musllinux_1_1_x86_64.whl", hash = 
"sha256:c1c1496e73051918fcd4f58ff2e0f2f3066d1c76a0c6aeffd9b45d53243702cc"}, + {file = "multidict-6.0.5-cp310-cp310-win32.whl", hash = "sha256:7afcdd1fc07befad18ec4523a782cde4e93e0a2bf71239894b8d61ee578c1319"}, + {file = "multidict-6.0.5-cp310-cp310-win_amd64.whl", hash = "sha256:99f60d34c048c5c2fabc766108c103612344c46e35d4ed9ae0673d33c8fb26e8"}, + {file = "multidict-6.0.5-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:f285e862d2f153a70586579c15c44656f888806ed0e5b56b64489afe4a2dbfba"}, + {file = "multidict-6.0.5-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:53689bb4e102200a4fafa9de9c7c3c212ab40a7ab2c8e474491914d2305f187e"}, + {file = "multidict-6.0.5-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:612d1156111ae11d14afaf3a0669ebf6c170dbb735e510a7438ffe2369a847fd"}, + {file = "multidict-6.0.5-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:7be7047bd08accdb7487737631d25735c9a04327911de89ff1b26b81745bd4e3"}, + {file = "multidict-6.0.5-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:de170c7b4fe6859beb8926e84f7d7d6c693dfe8e27372ce3b76f01c46e489fcf"}, + {file = "multidict-6.0.5-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:04bde7a7b3de05732a4eb39c94574db1ec99abb56162d6c520ad26f83267de29"}, + {file = "multidict-6.0.5-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:85f67aed7bb647f93e7520633d8f51d3cbc6ab96957c71272b286b2f30dc70ed"}, + {file = "multidict-6.0.5-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:425bf820055005bfc8aa9a0b99ccb52cc2f4070153e34b701acc98d201693733"}, + {file = "multidict-6.0.5-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:d3eb1ceec286eba8220c26f3b0096cf189aea7057b6e7b7a2e60ed36b373b77f"}, + {file = "multidict-6.0.5-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:7901c05ead4b3fb75113fb1dd33eb1253c6d3ee37ce93305acd9d38e0b5f21a4"}, + {file = 
"multidict-6.0.5-cp311-cp311-musllinux_1_1_ppc64le.whl", hash = "sha256:e0e79d91e71b9867c73323a3444724d496c037e578a0e1755ae159ba14f4f3d1"}, + {file = "multidict-6.0.5-cp311-cp311-musllinux_1_1_s390x.whl", hash = "sha256:29bfeb0dff5cb5fdab2023a7a9947b3b4af63e9c47cae2a10ad58394b517fddc"}, + {file = "multidict-6.0.5-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:e030047e85cbcedbfc073f71836d62dd5dadfbe7531cae27789ff66bc551bd5e"}, + {file = "multidict-6.0.5-cp311-cp311-win32.whl", hash = "sha256:2f4848aa3baa109e6ab81fe2006c77ed4d3cd1e0ac2c1fbddb7b1277c168788c"}, + {file = "multidict-6.0.5-cp311-cp311-win_amd64.whl", hash = "sha256:2faa5ae9376faba05f630d7e5e6be05be22913782b927b19d12b8145968a85ea"}, + {file = "multidict-6.0.5-cp312-cp312-macosx_10_9_universal2.whl", hash = "sha256:51d035609b86722963404f711db441cf7134f1889107fb171a970c9701f92e1e"}, + {file = "multidict-6.0.5-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:cbebcd5bcaf1eaf302617c114aa67569dd3f090dd0ce8ba9e35e9985b41ac35b"}, + {file = "multidict-6.0.5-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:2ffc42c922dbfddb4a4c3b438eb056828719f07608af27d163191cb3e3aa6cc5"}, + {file = "multidict-6.0.5-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ceb3b7e6a0135e092de86110c5a74e46bda4bd4fbfeeb3a3bcec79c0f861e450"}, + {file = "multidict-6.0.5-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:79660376075cfd4b2c80f295528aa6beb2058fd289f4c9252f986751a4cd0496"}, + {file = "multidict-6.0.5-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:e4428b29611e989719874670fd152b6625500ad6c686d464e99f5aaeeaca175a"}, + {file = "multidict-6.0.5-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d84a5c3a5f7ce6db1f999fb9438f686bc2e09d38143f2d93d8406ed2dd6b9226"}, + {file = "multidict-6.0.5-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = 
"sha256:76c0de87358b192de7ea9649beb392f107dcad9ad27276324c24c91774ca5271"}, + {file = "multidict-6.0.5-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:79a6d2ba910adb2cbafc95dad936f8b9386e77c84c35bc0add315b856d7c3abb"}, + {file = "multidict-6.0.5-cp312-cp312-musllinux_1_1_i686.whl", hash = "sha256:92d16a3e275e38293623ebf639c471d3e03bb20b8ebb845237e0d3664914caef"}, + {file = "multidict-6.0.5-cp312-cp312-musllinux_1_1_ppc64le.whl", hash = "sha256:fb616be3538599e797a2017cccca78e354c767165e8858ab5116813146041a24"}, + {file = "multidict-6.0.5-cp312-cp312-musllinux_1_1_s390x.whl", hash = "sha256:14c2976aa9038c2629efa2c148022ed5eb4cb939e15ec7aace7ca932f48f9ba6"}, + {file = "multidict-6.0.5-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:435a0984199d81ca178b9ae2c26ec3d49692d20ee29bc4c11a2a8d4514c67eda"}, + {file = "multidict-6.0.5-cp312-cp312-win32.whl", hash = "sha256:9fe7b0653ba3d9d65cbe7698cca585bf0f8c83dbbcc710db9c90f478e175f2d5"}, + {file = "multidict-6.0.5-cp312-cp312-win_amd64.whl", hash = "sha256:01265f5e40f5a17f8241d52656ed27192be03bfa8764d88e8220141d1e4b3556"}, + {file = "multidict-6.0.5-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:19fe01cea168585ba0f678cad6f58133db2aa14eccaf22f88e4a6dccadfad8b3"}, + {file = "multidict-6.0.5-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6bf7a982604375a8d49b6cc1b781c1747f243d91b81035a9b43a2126c04766f5"}, + {file = "multidict-6.0.5-cp37-cp37m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:107c0cdefe028703fb5dafe640a409cb146d44a6ae201e55b35a4af8e95457dd"}, + {file = "multidict-6.0.5-cp37-cp37m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:403c0911cd5d5791605808b942c88a8155c2592e05332d2bf78f18697a5fa15e"}, + {file = "multidict-6.0.5-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:aeaf541ddbad8311a87dd695ed9642401131ea39ad7bc8cf3ef3967fd093b626"}, + {file = 
"multidict-6.0.5-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:e4972624066095e52b569e02b5ca97dbd7a7ddd4294bf4e7247d52635630dd83"}, + {file = "multidict-6.0.5-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:d946b0a9eb8aaa590df1fe082cee553ceab173e6cb5b03239716338629c50c7a"}, + {file = "multidict-6.0.5-cp37-cp37m-musllinux_1_1_i686.whl", hash = "sha256:b55358304d7a73d7bdf5de62494aaf70bd33015831ffd98bc498b433dfe5b10c"}, + {file = "multidict-6.0.5-cp37-cp37m-musllinux_1_1_ppc64le.whl", hash = "sha256:a3145cb08d8625b2d3fee1b2d596a8766352979c9bffe5d7833e0503d0f0b5e5"}, + {file = "multidict-6.0.5-cp37-cp37m-musllinux_1_1_s390x.whl", hash = "sha256:d65f25da8e248202bd47445cec78e0025c0fe7582b23ec69c3b27a640dd7a8e3"}, + {file = "multidict-6.0.5-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:c9bf56195c6bbd293340ea82eafd0071cb3d450c703d2c93afb89f93b8386ccc"}, + {file = "multidict-6.0.5-cp37-cp37m-win32.whl", hash = "sha256:69db76c09796b313331bb7048229e3bee7928eb62bab5e071e9f7fcc4879caee"}, + {file = "multidict-6.0.5-cp37-cp37m-win_amd64.whl", hash = "sha256:fce28b3c8a81b6b36dfac9feb1de115bab619b3c13905b419ec71d03a3fc1423"}, + {file = "multidict-6.0.5-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:76f067f5121dcecf0d63a67f29080b26c43c71a98b10c701b0677e4a065fbd54"}, + {file = "multidict-6.0.5-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:b82cc8ace10ab5bd93235dfaab2021c70637005e1ac787031f4d1da63d493c1d"}, + {file = "multidict-6.0.5-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:5cb241881eefd96b46f89b1a056187ea8e9ba14ab88ba632e68d7a2ecb7aadf7"}, + {file = "multidict-6.0.5-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e8e94e6912639a02ce173341ff62cc1201232ab86b8a8fcc05572741a5dc7d93"}, + {file = "multidict-6.0.5-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:09a892e4a9fb47331da06948690ae38eaa2426de97b4ccbfafbdcbe5c8f37ff8"}, + {file = 
"multidict-6.0.5-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:55205d03e8a598cfc688c71ca8ea5f66447164efff8869517f175ea632c7cb7b"}, + {file = "multidict-6.0.5-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:37b15024f864916b4951adb95d3a80c9431299080341ab9544ed148091b53f50"}, + {file = "multidict-6.0.5-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:f2a1dee728b52b33eebff5072817176c172050d44d67befd681609b4746e1c2e"}, + {file = "multidict-6.0.5-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:edd08e6f2f1a390bf137080507e44ccc086353c8e98c657e666c017718561b89"}, + {file = "multidict-6.0.5-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:60d698e8179a42ec85172d12f50b1668254628425a6bd611aba022257cac1386"}, + {file = "multidict-6.0.5-cp38-cp38-musllinux_1_1_ppc64le.whl", hash = "sha256:3d25f19500588cbc47dc19081d78131c32637c25804df8414463ec908631e453"}, + {file = "multidict-6.0.5-cp38-cp38-musllinux_1_1_s390x.whl", hash = "sha256:4cc0ef8b962ac7a5e62b9e826bd0cd5040e7d401bc45a6835910ed699037a461"}, + {file = "multidict-6.0.5-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:eca2e9d0cc5a889850e9bbd68e98314ada174ff6ccd1129500103df7a94a7a44"}, + {file = "multidict-6.0.5-cp38-cp38-win32.whl", hash = "sha256:4a6a4f196f08c58c59e0b8ef8ec441d12aee4125a7d4f4fef000ccb22f8d7241"}, + {file = "multidict-6.0.5-cp38-cp38-win_amd64.whl", hash = "sha256:0275e35209c27a3f7951e1ce7aaf93ce0d163b28948444bec61dd7badc6d3f8c"}, + {file = "multidict-6.0.5-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:e7be68734bd8c9a513f2b0cfd508802d6609da068f40dc57d4e3494cefc92929"}, + {file = "multidict-6.0.5-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:1d9ea7a7e779d7a3561aade7d596649fbecfa5c08a7674b11b423783217933f9"}, + {file = "multidict-6.0.5-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:ea1456df2a27c73ce51120fa2f519f1bea2f4a03a917f4a43c8707cf4cbbae1a"}, + {file = 
"multidict-6.0.5-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:cf590b134eb70629e350691ecca88eac3e3b8b3c86992042fb82e3cb1830d5e1"}, + {file = "multidict-6.0.5-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:5c0631926c4f58e9a5ccce555ad7747d9a9f8b10619621f22f9635f069f6233e"}, + {file = "multidict-6.0.5-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:dce1c6912ab9ff5f179eaf6efe7365c1f425ed690b03341911bf4939ef2f3046"}, + {file = "multidict-6.0.5-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c0868d64af83169e4d4152ec612637a543f7a336e4a307b119e98042e852ad9c"}, + {file = "multidict-6.0.5-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:141b43360bfd3bdd75f15ed811850763555a251e38b2405967f8e25fb43f7d40"}, + {file = "multidict-6.0.5-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:7df704ca8cf4a073334e0427ae2345323613e4df18cc224f647f251e5e75a527"}, + {file = "multidict-6.0.5-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:6214c5a5571802c33f80e6c84713b2c79e024995b9c5897f794b43e714daeec9"}, + {file = "multidict-6.0.5-cp39-cp39-musllinux_1_1_ppc64le.whl", hash = "sha256:cd6c8fca38178e12c00418de737aef1261576bd1b6e8c6134d3e729a4e858b38"}, + {file = "multidict-6.0.5-cp39-cp39-musllinux_1_1_s390x.whl", hash = "sha256:e02021f87a5b6932fa6ce916ca004c4d441509d33bbdbeca70d05dff5e9d2479"}, + {file = "multidict-6.0.5-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:ebd8d160f91a764652d3e51ce0d2956b38efe37c9231cd82cfc0bed2e40b581c"}, + {file = "multidict-6.0.5-cp39-cp39-win32.whl", hash = "sha256:04da1bb8c8dbadf2a18a452639771951c662c5ad03aefe4884775454be322c9b"}, + {file = "multidict-6.0.5-cp39-cp39-win_amd64.whl", hash = "sha256:d6f6d4f185481c9669b9447bf9d9cf3b95a0e9df9d169bbc17e363b7d5487755"}, + {file = "multidict-6.0.5-py3-none-any.whl", hash = "sha256:0d63c74e3d7ab26de115c49bffc92cc77ed23395303d496eae515d4204a625e7"}, 
+ {file = "multidict-6.0.5.tar.gz", hash = "sha256:f7e301075edaf50500f0b341543c41194d8df3ae5caf4702f2095f3ca73dd8da"}, ] [[package]] @@ -2766,28 +2782,29 @@ six = "*" [[package]] name = "ruff" -version = "0.2.2" +version = "0.7.0" description = "An extremely fast Python linter and code formatter, written in Rust." optional = false python-versions = ">=3.7" files = [ - {file = "ruff-0.2.2-py3-none-macosx_10_12_x86_64.macosx_11_0_arm64.macosx_10_12_universal2.whl", hash = "sha256:0a9efb032855ffb3c21f6405751d5e147b0c6b631e3ca3f6b20f917572b97eb6"}, - {file = "ruff-0.2.2-py3-none-macosx_10_12_x86_64.whl", hash = "sha256:d450b7fbff85913f866a5384d8912710936e2b96da74541c82c1b458472ddb39"}, - {file = "ruff-0.2.2-py3-none-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ecd46e3106850a5c26aee114e562c329f9a1fbe9e4821b008c4404f64ff9ce73"}, - {file = "ruff-0.2.2-py3-none-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:5e22676a5b875bd72acd3d11d5fa9075d3a5f53b877fe7b4793e4673499318ba"}, - {file = "ruff-0.2.2-py3-none-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:1695700d1e25a99d28f7a1636d85bafcc5030bba9d0578c0781ba1790dbcf51c"}, - {file = "ruff-0.2.2-py3-none-manylinux_2_17_ppc64.manylinux2014_ppc64.whl", hash = "sha256:b0c232af3d0bd8f521806223723456ffebf8e323bd1e4e82b0befb20ba18388e"}, - {file = "ruff-0.2.2-py3-none-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:f63d96494eeec2fc70d909393bcd76c69f35334cdbd9e20d089fb3f0640216ca"}, - {file = "ruff-0.2.2-py3-none-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:6a61ea0ff048e06de273b2e45bd72629f470f5da8f71daf09fe481278b175001"}, - {file = "ruff-0.2.2-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5e1439c8f407e4f356470e54cdecdca1bd5439a0673792dbe34a2b0a551a2fe3"}, - {file = "ruff-0.2.2-py3-none-musllinux_1_2_aarch64.whl", hash = "sha256:940de32dc8853eba0f67f7198b3e79bc6ba95c2edbfdfac2144c8235114d6726"}, - {file = 
"ruff-0.2.2-py3-none-musllinux_1_2_armv7l.whl", hash = "sha256:0c126da55c38dd917621552ab430213bdb3273bb10ddb67bc4b761989210eb6e"}, - {file = "ruff-0.2.2-py3-none-musllinux_1_2_i686.whl", hash = "sha256:3b65494f7e4bed2e74110dac1f0d17dc8e1f42faaa784e7c58a98e335ec83d7e"}, - {file = "ruff-0.2.2-py3-none-musllinux_1_2_x86_64.whl", hash = "sha256:1ec49be4fe6ddac0503833f3ed8930528e26d1e60ad35c2446da372d16651ce9"}, - {file = "ruff-0.2.2-py3-none-win32.whl", hash = "sha256:d920499b576f6c68295bc04e7b17b6544d9d05f196bb3aac4358792ef6f34325"}, - {file = "ruff-0.2.2-py3-none-win_amd64.whl", hash = "sha256:cc9a91ae137d687f43a44c900e5d95e9617cb37d4c989e462980ba27039d239d"}, - {file = "ruff-0.2.2-py3-none-win_arm64.whl", hash = "sha256:c9d15fc41e6054bfc7200478720570078f0b41c9ae4f010bcc16bd6f4d1aacdd"}, - {file = "ruff-0.2.2.tar.gz", hash = "sha256:e62ed7f36b3068a30ba39193a14274cd706bc486fad521276458022f7bccb31d"}, + {file = "ruff-0.7.0-py3-none-linux_armv6l.whl", hash = "sha256:0cdf20c2b6ff98e37df47b2b0bd3a34aaa155f59a11182c1303cce79be715628"}, + {file = "ruff-0.7.0-py3-none-macosx_10_12_x86_64.whl", hash = "sha256:496494d350c7fdeb36ca4ef1c9f21d80d182423718782222c29b3e72b3512737"}, + {file = "ruff-0.7.0-py3-none-macosx_11_0_arm64.whl", hash = "sha256:214b88498684e20b6b2b8852c01d50f0651f3cc6118dfa113b4def9f14faaf06"}, + {file = "ruff-0.7.0-py3-none-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:630fce3fefe9844e91ea5bbf7ceadab4f9981f42b704fae011bb8efcaf5d84be"}, + {file = "ruff-0.7.0-py3-none-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:211d877674e9373d4bb0f1c80f97a0201c61bcd1e9d045b6e9726adc42c156aa"}, + {file = "ruff-0.7.0-py3-none-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:194d6c46c98c73949a106425ed40a576f52291c12bc21399eb8f13a0f7073495"}, + {file = "ruff-0.7.0-py3-none-manylinux_2_17_ppc64.manylinux2014_ppc64.whl", hash = "sha256:82c2579b82b9973a110fab281860403b397c08c403de92de19568f32f7178598"}, + {file = 
"ruff-0.7.0-py3-none-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:9af971fe85dcd5eaed8f585ddbc6bdbe8c217fb8fcf510ea6bca5bdfff56040e"}, + {file = "ruff-0.7.0-py3-none-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:b641c7f16939b7d24b7bfc0be4102c56562a18281f84f635604e8a6989948914"}, + {file = "ruff-0.7.0-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d71672336e46b34e0c90a790afeac8a31954fd42872c1f6adaea1dff76fd44f9"}, + {file = "ruff-0.7.0-py3-none-musllinux_1_2_aarch64.whl", hash = "sha256:ab7d98c7eed355166f367597e513a6c82408df4181a937628dbec79abb2a1fe4"}, + {file = "ruff-0.7.0-py3-none-musllinux_1_2_armv7l.whl", hash = "sha256:1eb54986f770f49edb14f71d33312d79e00e629a57387382200b1ef12d6a4ef9"}, + {file = "ruff-0.7.0-py3-none-musllinux_1_2_i686.whl", hash = "sha256:dc452ba6f2bb9cf8726a84aa877061a2462afe9ae0ea1d411c53d226661c601d"}, + {file = "ruff-0.7.0-py3-none-musllinux_1_2_x86_64.whl", hash = "sha256:4b406c2dce5be9bad59f2de26139a86017a517e6bcd2688da515481c05a2cb11"}, + {file = "ruff-0.7.0-py3-none-win32.whl", hash = "sha256:f6c968509f767776f524a8430426539587d5ec5c662f6addb6aa25bc2e8195ec"}, + {file = "ruff-0.7.0-py3-none-win_amd64.whl", hash = "sha256:ff4aabfbaaba880e85d394603b9e75d32b0693152e16fa659a3064a85df7fce2"}, + {file = "ruff-0.7.0-py3-none-win_arm64.whl", hash = "sha256:10842f69c245e78d6adec7e1db0a7d9ddc2fff0621d730e61657b64fa36f207e"}, + {file = "ruff-0.7.0.tar.gz", hash = "sha256:47a86360cf62d9cd53ebfb0b5eb0e882193fc191c6d717e8bef4462bc3b9ea2b"}, ] [[package]] @@ -3389,4 +3406,4 @@ cffi = ["cffi (>=1.11)"] [metadata] lock-version = "2.0" python-versions = "^3.9" -content-hash = "9055b73352f1534f664cd8af6ebf8d93cf3bf857f115756f312ff2e3ae1bbbc1" +content-hash = "f52632571e34b0e51b059c280c35d6ff6f69f6a8c9586caca78282baf635be91" diff --git a/proxy/src/auth/backend/jwt.rs b/proxy/src/auth/backend/jwt.rs index 3f53ee24c3..2185677159 100644 --- a/proxy/src/auth/backend/jwt.rs +++ 
b/proxy/src/auth/backend/jwt.rs @@ -16,7 +16,7 @@ use crate::context::RequestMonitoring; use crate::control_plane::errors::GetEndpointJwksError; use crate::http::parse_json_body_with_limit; use crate::intern::RoleNameInt; -use crate::{EndpointId, RoleName}; +use crate::types::{EndpointId, RoleName}; // TODO(conrad): make these configurable. const CLOCK_SKEW_LEEWAY: Duration = Duration::from_secs(30); @@ -669,7 +669,7 @@ mod tests { use tokio::net::TcpListener; use super::*; - use crate::RoleName; + use crate::types::RoleName; fn new_ec_jwk(kid: String) -> (p256::SecretKey, jose_jwk::Jwk) { let sk = p256::SecretKey::random(&mut OsRng); diff --git a/proxy/src/auth/backend/local.rs b/proxy/src/auth/backend/local.rs index e3995ac6c0..f9cb085daf 100644 --- a/proxy/src/auth/backend/local.rs +++ b/proxy/src/auth/backend/local.rs @@ -1,23 +1,33 @@ use std::net::SocketAddr; use arc_swap::ArcSwapOption; +use tokio::sync::Semaphore; use super::jwt::{AuthRule, FetchAuthRules}; use crate::auth::backend::jwt::FetchAuthRulesError; use crate::compute::ConnCfg; +use crate::compute_ctl::ComputeCtlApi; use crate::context::RequestMonitoring; use crate::control_plane::messages::{ColdStartInfo, EndpointJwksResponse, MetricsAuxInfo}; use crate::control_plane::NodeInfo; +use crate::http; use crate::intern::{BranchIdTag, EndpointIdTag, InternId, ProjectIdTag}; -use crate::EndpointId; +use crate::types::EndpointId; +use crate::url::ApiUrl; pub struct LocalBackend { + pub(crate) initialize: Semaphore, + pub(crate) compute_ctl: ComputeCtlApi, pub(crate) node_info: NodeInfo, } impl LocalBackend { - pub fn new(postgres_addr: SocketAddr) -> Self { + pub fn new(postgres_addr: SocketAddr, compute_ctl: ApiUrl) -> Self { LocalBackend { + initialize: Semaphore::new(1), + compute_ctl: ComputeCtlApi { + api: http::Endpoint::new(compute_ctl, http::new_client()), + }, node_info: NodeInfo { config: { let mut cfg = ConnCfg::new(); diff --git a/proxy/src/auth/backend/mod.rs b/proxy/src/auth/backend/mod.rs 
index a4db130b61..17334b9cbb 100644 --- a/proxy/src/auth/backend/mod.rs +++ b/proxy/src/auth/backend/mod.rs @@ -32,7 +32,8 @@ use crate::proxy::connect_compute::ComputeConnectBackend; use crate::proxy::NeonOptions; use crate::rate_limiter::{BucketRateLimiter, EndpointRateLimiter, RateBucketInfo}; use crate::stream::Stream; -use crate::{scram, stream, EndpointCacheKey, EndpointId, RoleName}; +use crate::types::{EndpointCacheKey, EndpointId, RoleName}; +use crate::{scram, stream}; /// Alternative to [`std::borrow::Cow`] but doesn't need `T: ToOwned` as we don't need that functionality pub enum MaybeOwned<'a, T> { @@ -551,7 +552,7 @@ mod tests { async fn get_endpoint_jwks( &self, _ctx: &RequestMonitoring, - _endpoint: crate::EndpointId, + _endpoint: crate::types::EndpointId, ) -> Result, control_plane::errors::GetEndpointJwksError> { unimplemented!() diff --git a/proxy/src/auth/credentials.rs b/proxy/src/auth/credentials.rs index fa6bc4c6f5..ddecae6af5 100644 --- a/proxy/src/auth/credentials.rs +++ b/proxy/src/auth/credentials.rs @@ -15,7 +15,7 @@ use crate::error::{ReportableError, UserFacingError}; use crate::metrics::{Metrics, SniKind}; use crate::proxy::NeonOptions; use crate::serverless::SERVERLESS_DRIVER_SNI; -use crate::{EndpointId, RoleName}; +use crate::types::{EndpointId, RoleName}; #[derive(Debug, Error, PartialEq, Eq, Clone)] pub(crate) enum ComputeUserInfoParseError { @@ -193,7 +193,7 @@ impl<'de> serde::de::Deserialize<'de> for IpPattern { D: serde::Deserializer<'de>, { struct StrVisitor; - impl<'de> serde::de::Visitor<'de> for StrVisitor { + impl serde::de::Visitor<'_> for StrVisitor { type Value = IpPattern; fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { diff --git a/proxy/src/auth/password_hack.rs b/proxy/src/auth/password_hack.rs index 8585b8ff48..b934c28a78 100644 --- a/proxy/src/auth/password_hack.rs +++ b/proxy/src/auth/password_hack.rs @@ -5,7 +5,7 @@ use bstr::ByteSlice; -use crate::EndpointId; +use 
crate::types::EndpointId; pub(crate) struct PasswordHackPayload { pub(crate) endpoint: EndpointId, diff --git a/proxy/src/bin/local_proxy.rs b/proxy/src/bin/local_proxy.rs index e6bc369d9a..df3628465f 100644 --- a/proxy/src/bin/local_proxy.rs +++ b/proxy/src/bin/local_proxy.rs @@ -25,7 +25,8 @@ use proxy::rate_limiter::{ use proxy::scram::threadpool::ThreadPool; use proxy::serverless::cancel_set::CancelSet; use proxy::serverless::{self, GlobalConnPoolOptions}; -use proxy::RoleName; +use proxy::types::RoleName; +use proxy::url::ApiUrl; project_git_version!(GIT_VERSION); project_build_tag!(BUILD_TAG); @@ -80,7 +81,10 @@ struct LocalProxyCliArgs { connect_to_compute_retry: String, /// Address of the postgres server #[clap(long, default_value = "127.0.0.1:5432")] - compute: SocketAddr, + postgres: SocketAddr, + /// Address of the compute-ctl api service + #[clap(long, default_value = "http://127.0.0.1:3080/")] + compute_ctl: ApiUrl, /// Path of the local proxy config file #[clap(long, default_value = "./local_proxy.json")] config_path: Utf8PathBuf, @@ -173,7 +177,7 @@ async fn main() -> anyhow::Result<()> { let mut maintenance_tasks = JoinSet::new(); let refresh_config_notify = Arc::new(Notify::new()); - maintenance_tasks.spawn(proxy::handle_signals(shutdown.clone(), { + maintenance_tasks.spawn(proxy::signals::handle(shutdown.clone(), { let refresh_config_notify = Arc::clone(&refresh_config_notify); move || { refresh_config_notify.notify_one(); @@ -212,7 +216,7 @@ async fn main() -> anyhow::Result<()> { match futures::future::select(pin!(maintenance_tasks.join_next()), pin!(task)).await { // exit immediately on maintenance task completion - Either::Left((Some(res), _)) => match proxy::flatten_err(res)? {}, + Either::Left((Some(res), _)) => match proxy::error::flatten_err(res)? {}, // exit with error immediately if all maintenance tasks have ceased (should be caught by branch above) Either::Left((None, _)) => bail!("no maintenance tasks running. 
invalid state"), // exit immediately on client task error @@ -295,7 +299,7 @@ fn build_auth_backend( args: &LocalProxyCliArgs, ) -> anyhow::Result<&'static auth::Backend<'static, ()>> { let auth_backend = proxy::auth::Backend::Local(proxy::auth::backend::MaybeOwned::Owned( - LocalBackend::new(args.compute), + LocalBackend::new(args.postgres, args.compute_ctl.clone()), )); Ok(Box::leak(Box::new(auth_backend))) diff --git a/proxy/src/bin/pg_sni_router.rs b/proxy/src/bin/pg_sni_router.rs index 00eb830d98..025053d3cb 100644 --- a/proxy/src/bin/pg_sni_router.rs +++ b/proxy/src/bin/pg_sni_router.rs @@ -15,6 +15,7 @@ use proxy::context::RequestMonitoring; use proxy::metrics::{Metrics, ThreadPoolMetrics}; use proxy::proxy::{copy_bidirectional_client_compute, run_until_cancelled, ErrorSource}; use proxy::stream::{PqStream, Stream}; +use rustls::crypto::aws_lc_rs; use rustls::pki_types::PrivateKeyDer; use tokio::io::{AsyncRead, AsyncWrite}; use tokio::net::TcpListener; @@ -104,10 +105,11 @@ async fn main() -> anyhow::Result<()> { let first_cert = cert_chain.first().context("missing certificate")?; let tls_server_end_point = TlsServerEndPoint::new(first_cert)?; - let tls_config = rustls::ServerConfig::builder_with_protocol_versions(&[ - &rustls::version::TLS13, - &rustls::version::TLS12, - ]) + let tls_config = rustls::ServerConfig::builder_with_provider(Arc::new( + aws_lc_rs::default_provider(), + )) + .with_protocol_versions(&[&rustls::version::TLS13, &rustls::version::TLS12]) + .context("aws_lc_rs should support TLS1.2 and TLS1.3")? .with_no_client_auth() .with_single_cert(cert_chain, key)? .into(); @@ -131,14 +133,14 @@ async fn main() -> anyhow::Result<()> { proxy_listener, cancellation_token.clone(), )); - let signals_task = tokio::spawn(proxy::handle_signals(cancellation_token, || {})); + let signals_task = tokio::spawn(proxy::signals::handle(cancellation_token, || {})); // the signal task cant ever succeed. // the main task can error, or can succeed on cancellation. 
// we want to immediately exit on either of these cases let signal = match futures::future::select(signals_task, main).await { - Either::Left((res, _)) => proxy::flatten_err(res)?, - Either::Right((res, _)) => return proxy::flatten_err(res), + Either::Left((res, _)) => proxy::error::flatten_err(res)?, + Either::Right((res, _)) => return proxy::error::flatten_err(res), }; // maintenance tasks return `Infallible` success values, this is an impossible value diff --git a/proxy/src/bin/proxy.rs b/proxy/src/bin/proxy.rs index 96a71e69c6..6e190029aa 100644 --- a/proxy/src/bin/proxy.rs +++ b/proxy/src/bin/proxy.rs @@ -495,7 +495,7 @@ async fn main() -> anyhow::Result<()> { // maintenance tasks. these never return unless there's an error let mut maintenance_tasks = JoinSet::new(); - maintenance_tasks.spawn(proxy::handle_signals(cancellation_token.clone(), || {})); + maintenance_tasks.spawn(proxy::signals::handle(cancellation_token.clone(), || {})); maintenance_tasks.spawn(http::health_server::task_main( http_listener, AppMetrics { @@ -561,11 +561,11 @@ async fn main() -> anyhow::Result<()> { .await { // exit immediately on maintenance task completion - Either::Left((Some(res), _)) => break proxy::flatten_err(res)?, + Either::Left((Some(res), _)) => break proxy::error::flatten_err(res)?, // exit with error immediately if all maintenance tasks have ceased (should be caught by branch above) Either::Left((None, _)) => bail!("no maintenance tasks running. 
invalid state"), // exit immediately on client task error - Either::Right((Some(res), _)) => proxy::flatten_err(res)?, + Either::Right((Some(res), _)) => proxy::error::flatten_err(res)?, // exit if all our client tasks have shutdown gracefully Either::Right((None, _)) => return Ok(()), } diff --git a/proxy/src/cache/endpoints.rs b/proxy/src/cache/endpoints.rs index 82f3247fa7..12c33169bf 100644 --- a/proxy/src/cache/endpoints.rs +++ b/proxy/src/cache/endpoints.rs @@ -17,7 +17,7 @@ use crate::intern::{BranchIdInt, EndpointIdInt, ProjectIdInt}; use crate::metrics::{Metrics, RedisErrors, RedisEventsCount}; use crate::rate_limiter::GlobalRateLimiter; use crate::redis::connection_with_credentials_provider::ConnectionWithCredentialsProvider; -use crate::EndpointId; +use crate::types::EndpointId; #[derive(Deserialize, Debug, Clone)] pub(crate) struct ControlPlaneEventKey { diff --git a/proxy/src/cache/project_info.rs b/proxy/src/cache/project_info.rs index 31d1dc96e7..84430dc812 100644 --- a/proxy/src/cache/project_info.rs +++ b/proxy/src/cache/project_info.rs @@ -17,7 +17,7 @@ use crate::auth::IpPattern; use crate::config::ProjectInfoCacheOptions; use crate::control_plane::AuthSecret; use crate::intern::{EndpointIdInt, ProjectIdInt, RoleNameInt}; -use crate::{EndpointId, RoleName}; +use crate::types::{EndpointId, RoleName}; #[async_trait] pub(crate) trait ProjectInfoCache { @@ -368,7 +368,7 @@ impl Cache for ProjectInfoCacheImpl { mod tests { use super::*; use crate::scram::ServerSecret; - use crate::ProjectId; + use crate::types::ProjectId; #[tokio::test] async fn test_project_info_cache_settings() { diff --git a/proxy/src/compute.rs b/proxy/src/compute.rs index 212e82497f..b97942ee5d 100644 --- a/proxy/src/compute.rs +++ b/proxy/src/compute.rs @@ -8,6 +8,7 @@ use itertools::Itertools; use once_cell::sync::OnceCell; use pq_proto::StartupMessageParams; use rustls::client::danger::ServerCertVerifier; +use rustls::crypto::aws_lc_rs; use 
rustls::pki_types::InvalidDnsNameError; use thiserror::Error; use tokio::net::TcpStream; @@ -24,7 +25,7 @@ use crate::control_plane::provider::ApiLockError; use crate::error::{ReportableError, UserFacingError}; use crate::metrics::{Metrics, NumDbConnectionsGuard}; use crate::proxy::neon_option; -use crate::Host; +use crate::types::Host; pub const COULD_NOT_CONNECT: &str = "Couldn't connect to compute node"; @@ -38,6 +39,9 @@ pub(crate) enum ConnectionError { #[error("{COULD_NOT_CONNECT}: {0}")] CouldNotConnect(#[from] io::Error), + #[error("Couldn't load native TLS certificates: {0:?}")] + TlsCertificateError(Vec), + #[error("{COULD_NOT_CONNECT}: {0}")] TlsError(#[from] InvalidDnsNameError), @@ -84,6 +88,7 @@ impl ReportableError for ConnectionError { } ConnectionError::Postgres(_) => crate::error::ErrorKind::Compute, ConnectionError::CouldNotConnect(_) => crate::error::ErrorKind::Compute, + ConnectionError::TlsCertificateError(_) => crate::error::ErrorKind::Service, ConnectionError::TlsError(_) => crate::error::ErrorKind::Compute, ConnectionError::WakeComputeError(e) => e.get_error_kind(), ConnectionError::TooManyConnectionAttempts(e) => e.get_error_kind(), @@ -293,12 +298,20 @@ impl ConnCfg { let client_config = if allow_self_signed_compute { // Allow all certificates for creating the connection let verifier = Arc::new(AcceptEverythingVerifier); - rustls::ClientConfig::builder() + rustls::ClientConfig::builder_with_provider(Arc::new(aws_lc_rs::default_provider())) + .with_safe_default_protocol_versions() + .expect("aws_lc_rs should support the default protocol versions") .dangerous() .with_custom_certificate_verifier(verifier) } else { - let root_store = TLS_ROOTS.get_or_try_init(load_certs)?.clone(); - rustls::ClientConfig::builder().with_root_certificates(root_store) + let root_store = TLS_ROOTS + .get_or_try_init(load_certs) + .map_err(ConnectionError::TlsCertificateError)? 
+ .clone(); + rustls::ClientConfig::builder_with_provider(Arc::new(aws_lc_rs::default_provider())) + .with_safe_default_protocol_versions() + .expect("aws_lc_rs should support the default protocol versions") + .with_root_certificates(root_store) }; let client_config = client_config.with_no_client_auth(); @@ -359,10 +372,15 @@ fn filtered_options(params: &StartupMessageParams) -> Option { Some(options) } -fn load_certs() -> Result, io::Error> { - let der_certs = rustls_native_certs::load_native_certs()?; +fn load_certs() -> Result, Vec> { + let der_certs = rustls_native_certs::load_native_certs(); + + if !der_certs.errors.is_empty() { + return Err(der_certs.errors); + } + let mut store = rustls::RootCertStore::empty(); - store.add_parsable_certificates(der_certs); + store.add_parsable_certificates(der_certs.certs); Ok(Arc::new(store)) } static TLS_ROOTS: OnceCell> = OnceCell::new(); diff --git a/proxy/src/compute_ctl/mod.rs b/proxy/src/compute_ctl/mod.rs new file mode 100644 index 0000000000..60fdf107d4 --- /dev/null +++ b/proxy/src/compute_ctl/mod.rs @@ -0,0 +1,102 @@ +use compute_api::responses::GenericAPIError; +use hyper::{Method, StatusCode}; +use serde::de::DeserializeOwned; +use serde::{Deserialize, Serialize}; +use thiserror::Error; + +use crate::http; +use crate::types::{DbName, RoleName}; +use crate::url::ApiUrl; + +pub struct ComputeCtlApi { + pub(crate) api: http::Endpoint, +} + +#[derive(Serialize, Debug)] +pub struct ExtensionInstallRequest { + pub extension: &'static str, + pub database: DbName, + pub version: &'static str, +} + +#[derive(Serialize, Debug)] +pub struct SetRoleGrantsRequest { + pub database: DbName, + pub schema: &'static str, + pub privileges: Vec, + pub role: RoleName, +} + +#[derive(Clone, Debug, Deserialize)] +pub struct ExtensionInstallResponse {} + +#[derive(Clone, Debug, Deserialize)] +pub struct SetRoleGrantsResponse {} + +#[derive(Debug, Serialize, Deserialize, Clone, Copy)] +#[serde(rename_all = "UPPERCASE")] +pub enum 
Privilege { + Usage, +} + +#[derive(Error, Debug)] +pub enum ComputeCtlError { + #[error("connection error: {0}")] + ConnectionError(#[source] reqwest_middleware::Error), + #[error("request error [{status}]: {body:?}")] + RequestError { + status: StatusCode, + body: Option, + }, + #[error("response parsing error: {0}")] + ResponseError(#[source] reqwest::Error), +} + +impl ComputeCtlApi { + pub async fn install_extension( + &self, + req: &ExtensionInstallRequest, + ) -> Result { + self.generic_request(req, Method::POST, |url| { + url.path_segments_mut().push("extensions"); + }) + .await + } + + pub async fn grant_role( + &self, + req: &SetRoleGrantsRequest, + ) -> Result { + self.generic_request(req, Method::POST, |url| { + url.path_segments_mut().push("grants"); + }) + .await + } + + async fn generic_request( + &self, + req: &Req, + method: Method, + url: impl for<'a> FnOnce(&'a mut ApiUrl), + ) -> Result + where + Req: Serialize, + Resp: DeserializeOwned, + { + let resp = self + .api + .request_with_url(method, url) + .json(req) + .send() + .await + .map_err(ComputeCtlError::ConnectionError)?; + + let status = resp.status(); + if status.is_client_error() || status.is_server_error() { + let body = resp.json().await.ok(); + return Err(ComputeCtlError::RequestError { status, body }); + } + + resp.json().await.map_err(ComputeCtlError::ResponseError) + } +} diff --git a/proxy/src/config.rs b/proxy/src/config.rs index 2ec8c7adda..5183f22fa3 100644 --- a/proxy/src/config.rs +++ b/proxy/src/config.rs @@ -7,7 +7,7 @@ use anyhow::{bail, ensure, Context, Ok}; use clap::ValueEnum; use itertools::Itertools; use remote_storage::RemoteStorageConfig; -use rustls::crypto::ring::sign; +use rustls::crypto::aws_lc_rs::{self, sign}; use rustls::pki_types::{CertificateDer, PrivateKeyDer}; use sha2::{Digest, Sha256}; use tracing::{error, info}; @@ -20,7 +20,7 @@ use crate::rate_limiter::{RateBucketInfo, RateLimitAlgorithm, RateLimiterConfig} use crate::scram::threadpool::ThreadPool; 
use crate::serverless::cancel_set::CancelSet; use crate::serverless::GlobalConnPoolOptions; -use crate::Host; +use crate::types::Host; pub struct ProxyConfig { pub tls_config: Option, @@ -126,12 +126,12 @@ pub fn configure_tls( let cert_resolver = Arc::new(cert_resolver); // allow TLS 1.2 to be compatible with older client libraries - let mut config = rustls::ServerConfig::builder_with_protocol_versions(&[ - &rustls::version::TLS13, - &rustls::version::TLS12, - ]) - .with_no_client_auth() - .with_cert_resolver(cert_resolver.clone()); + let mut config = + rustls::ServerConfig::builder_with_provider(Arc::new(aws_lc_rs::default_provider())) + .with_protocol_versions(&[&rustls::version::TLS13, &rustls::version::TLS12]) + .context("aws_lc_rs should support TLS1.2 and TLS1.3")? + .with_no_client_auth() + .with_cert_resolver(cert_resolver.clone()); config.alpn_protocols = vec![PG_ALPN_PROTOCOL.to_vec()]; @@ -558,7 +558,7 @@ pub struct RetryConfig { } impl RetryConfig { - /// Default options for RetryConfig. + // Default options for RetryConfig. /// Total delay for 5 retries with 200ms base delay and 2 backoff factor is about 6s. 
pub const CONNECT_TO_COMPUTE_DEFAULT_VALUES: &'static str = diff --git a/proxy/src/context/mod.rs b/proxy/src/context/mod.rs index e2d2c1b766..ca3b808a1b 100644 --- a/proxy/src/context/mod.rs +++ b/proxy/src/context/mod.rs @@ -19,7 +19,7 @@ use crate::intern::{BranchIdInt, ProjectIdInt}; use crate::metrics::{ ConnectOutcome, InvalidEndpointsGroup, LatencyTimer, Metrics, Protocol, Waiting, }; -use crate::{DbName, EndpointId, RoleName}; +use crate::types::{DbName, EndpointId, RoleName}; pub mod parquet; diff --git a/proxy/src/context/parquet.rs b/proxy/src/context/parquet.rs index b0ad0e4566..3432ac5ff6 100644 --- a/proxy/src/context/parquet.rs +++ b/proxy/src/context/parquet.rs @@ -104,7 +104,7 @@ struct Options<'a> { options: &'a StartupMessageParams, } -impl<'a> serde::Serialize for Options<'a> { +impl serde::Serialize for Options<'_> { fn serialize(&self, s: S) -> Result where S: serde::Serializer, diff --git a/proxy/src/control_plane/messages.rs b/proxy/src/control_plane/messages.rs index dae23f7c53..13a54145b1 100644 --- a/proxy/src/control_plane/messages.rs +++ b/proxy/src/control_plane/messages.rs @@ -161,6 +161,9 @@ pub(crate) enum Reason { /// LockAlreadyTaken indicates that the we attempted to take a lock that was already taken. #[serde(rename = "LOCK_ALREADY_TAKEN")] LockAlreadyTaken, + /// ActiveEndpointsLimitExceeded indicates that the limit of concurrently active endpoints was exceeded. + #[serde(rename = "ACTIVE_ENDPOINTS_LIMIT_EXCEEDED")] + ActiveEndpointsLimitExceeded, #[default] #[serde(other)] Unknown, @@ -194,7 +197,8 @@ impl Reason { | Reason::ComputeTimeQuotaExceeded | Reason::WrittenDataQuotaExceeded | Reason::DataTransferQuotaExceeded - | Reason::LogicalSizeQuotaExceeded => false, + | Reason::LogicalSizeQuotaExceeded + | Reason::ActiveEndpointsLimitExceeded => false, // transitive error. 
control plane is currently busy // but might be ready soon Reason::RunningOperations diff --git a/proxy/src/control_plane/provider/mock.rs b/proxy/src/control_plane/provider/mock.rs index fb061376e7..75a242d8d3 100644 --- a/proxy/src/control_plane/provider/mock.rs +++ b/proxy/src/control_plane/provider/mock.rs @@ -21,8 +21,9 @@ use crate::control_plane::messages::MetricsAuxInfo; use crate::control_plane::provider::{CachedAllowedIps, CachedRoleSecret}; use crate::error::io_error; use crate::intern::RoleNameInt; +use crate::types::{BranchId, EndpointId, ProjectId, RoleName}; use crate::url::ApiUrl; -use crate::{compute, scram, BranchId, EndpointId, ProjectId, RoleName}; +use crate::{compute, scram}; #[derive(Debug, Error)] enum MockApiError { diff --git a/proxy/src/control_plane/provider/mod.rs b/proxy/src/control_plane/provider/mod.rs index a4a330cd5f..49e57b6b7e 100644 --- a/proxy/src/control_plane/provider/mod.rs +++ b/proxy/src/control_plane/provider/mod.rs @@ -23,7 +23,8 @@ use crate::error::ReportableError; use crate::intern::ProjectIdInt; use crate::metrics::ApiLockMetrics; use crate::rate_limiter::{DynamicLimiter, Outcome, RateLimiterConfig, Token}; -use crate::{compute, scram, EndpointCacheKey, EndpointId}; +use crate::types::{EndpointCacheKey, EndpointId}; +use crate::{compute, scram}; pub(crate) mod errors { use thiserror::Error; @@ -87,36 +88,8 @@ pub(crate) mod errors { Reason::ConcurrencyLimitReached => ErrorKind::ControlPlane, Reason::LockAlreadyTaken => ErrorKind::ControlPlane, Reason::RunningOperations => ErrorKind::ControlPlane, - Reason::Unknown => match &**e { - ControlPlaneError { - http_status_code: - http::StatusCode::NOT_FOUND | http::StatusCode::NOT_ACCEPTABLE, - .. - } => crate::error::ErrorKind::User, - ControlPlaneError { - http_status_code: http::StatusCode::UNPROCESSABLE_ENTITY, - error, - .. 
- } if error - .contains("compute time quota of non-primary branches is exceeded") => - { - crate::error::ErrorKind::Quota - } - ControlPlaneError { - http_status_code: http::StatusCode::LOCKED, - error, - .. - } if error.contains("quota exceeded") - || error.contains("the limit for current plan reached") => - { - crate::error::ErrorKind::Quota - } - ControlPlaneError { - http_status_code: http::StatusCode::TOO_MANY_REQUESTS, - .. - } => crate::error::ErrorKind::ServiceRateLimit, - ControlPlaneError { .. } => crate::error::ErrorKind::ControlPlane, - }, + Reason::ActiveEndpointsLimitExceeded => ErrorKind::ControlPlane, + Reason::Unknown => ErrorKind::ControlPlane, }, ApiError::Transport(_) => crate::error::ErrorKind::ControlPlane, } diff --git a/proxy/src/control_plane/provider/neon.rs b/proxy/src/control_plane/provider/neon.rs index 5d0692c7ca..8ea91d7875 100644 --- a/proxy/src/control_plane/provider/neon.rs +++ b/proxy/src/control_plane/provider/neon.rs @@ -24,7 +24,8 @@ use crate::control_plane::errors::GetEndpointJwksError; use crate::control_plane::messages::{ColdStartInfo, EndpointJwksResponse, Reason}; use crate::metrics::{CacheOutcome, Metrics}; use crate::rate_limiter::WakeComputeRateLimiter; -use crate::{compute, http, scram, EndpointCacheKey, EndpointId}; +use crate::types::{EndpointCacheKey, EndpointId}; +use crate::{compute, http, scram}; const X_REQUEST_ID: HeaderName = HeaderName::from_static("x-request-id"); diff --git a/proxy/src/error.rs b/proxy/src/error.rs index e71ed0c048..7b693a7418 100644 --- a/proxy/src/error.rs +++ b/proxy/src/error.rs @@ -1,7 +1,9 @@ use std::error::Error as StdError; use std::{fmt, io}; +use anyhow::Context; use measured::FixedCardinalityLabel; +use tokio::task::JoinError; /// Upcast (almost) any error into an opaque [`io::Error`]. pub(crate) fn io_error(e: impl Into>) -> io::Error { @@ -97,3 +99,8 @@ impl ReportableError for tokio_postgres::error::Error { } } } + +/// Flattens `Result>` into `Result`. 
+pub fn flatten_err(r: Result, JoinError>) -> anyhow::Result { + r.context("join error").and_then(|x| x) +} diff --git a/proxy/src/http/mod.rs b/proxy/src/http/mod.rs index fd587e8f01..f1b632e704 100644 --- a/proxy/src/http/mod.rs +++ b/proxy/src/http/mod.rs @@ -8,6 +8,7 @@ use std::time::Duration; use anyhow::bail; use bytes::Bytes; +use http::Method; use http_body_util::BodyExt; use hyper::body::Body; pub(crate) use reqwest::{Request, Response}; @@ -93,9 +94,19 @@ impl Endpoint { /// Return a [builder](RequestBuilder) for a `GET` request, /// accepting a closure to modify the url path segments for more complex paths queries. pub(crate) fn get_with_url(&self, f: impl for<'a> FnOnce(&'a mut ApiUrl)) -> RequestBuilder { + self.request_with_url(Method::GET, f) + } + + /// Return a [builder](RequestBuilder) for a request, + /// accepting a closure to modify the url path segments for more complex paths queries. + pub(crate) fn request_with_url( + &self, + method: Method, + f: impl for<'a> FnOnce(&'a mut ApiUrl), + ) -> RequestBuilder { let mut url = self.endpoint.clone(); f(&mut url); - self.client.get(url.into_inner()) + self.client.request(method, url.into_inner()) } /// Execute a [request](reqwest::Request). 
diff --git a/proxy/src/intern.rs b/proxy/src/intern.rs index 09fd9657d0..f56d92a6b3 100644 --- a/proxy/src/intern.rs +++ b/proxy/src/intern.rs @@ -7,7 +7,7 @@ use std::sync::OnceLock; use lasso::{Capacity, MemoryLimits, Spur, ThreadedRodeo}; use rustc_hash::FxHasher; -use crate::{BranchId, EndpointId, ProjectId, RoleName}; +use crate::types::{BranchId, EndpointId, ProjectId, RoleName}; pub trait InternId: Sized + 'static { fn get_interner() -> &'static StringInterner; @@ -55,7 +55,7 @@ impl std::ops::Deref for InternedString { impl<'de, Id: InternId> serde::de::Deserialize<'de> for InternedString { fn deserialize>(d: D) -> Result { struct Visitor(PhantomData); - impl<'de, Id: InternId> serde::de::Visitor<'de> for Visitor { + impl serde::de::Visitor<'_> for Visitor { type Value = InternedString; fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { diff --git a/proxy/src/lib.rs b/proxy/src/lib.rs index 74bc778a36..f95d645c23 100644 --- a/proxy/src/lib.rs +++ b/proxy/src/lib.rs @@ -76,24 +76,13 @@ ) )] // List of temporarily allowed lints to unblock beta/nightly. -#![allow( - unknown_lints, - // TODO: 1.82: Add `use` where necessary and remove from this list. - impl_trait_overcaptures, -)] - -use std::convert::Infallible; - -use anyhow::{bail, Context}; -use intern::{EndpointIdInt, EndpointIdTag, InternId}; -use tokio::task::JoinError; -use tokio_util::sync::CancellationToken; -use tracing::warn; +#![allow(unknown_lints)] pub mod auth; pub mod cache; pub mod cancellation; pub mod compute; +pub mod compute_ctl; pub mod config; pub mod console_redirect_proxy; pub mod context; @@ -112,165 +101,9 @@ pub mod redis; pub mod sasl; pub mod scram; pub mod serverless; +pub mod signals; pub mod stream; +pub mod types; pub mod url; pub mod usage_metrics; pub mod waiters; - -/// Handle unix signals appropriately. 
-pub async fn handle_signals( - token: CancellationToken, - mut refresh_config: F, -) -> anyhow::Result -where - F: FnMut(), -{ - use tokio::signal::unix::{signal, SignalKind}; - - let mut hangup = signal(SignalKind::hangup())?; - let mut interrupt = signal(SignalKind::interrupt())?; - let mut terminate = signal(SignalKind::terminate())?; - - loop { - tokio::select! { - // Hangup is commonly used for config reload. - _ = hangup.recv() => { - warn!("received SIGHUP"); - refresh_config(); - } - // Shut down the whole application. - _ = interrupt.recv() => { - warn!("received SIGINT, exiting immediately"); - bail!("interrupted"); - } - _ = terminate.recv() => { - warn!("received SIGTERM, shutting down once all existing connections have closed"); - token.cancel(); - } - } - } -} - -/// Flattens `Result>` into `Result`. -pub fn flatten_err(r: Result, JoinError>) -> anyhow::Result { - r.context("join error").and_then(|x| x) -} - -macro_rules! smol_str_wrapper { - ($name:ident) => { - #[derive(Clone, Debug, PartialEq, Eq, PartialOrd, Ord, Hash, Default)] - pub struct $name(smol_str::SmolStr); - - impl $name { - #[allow(unused)] - pub(crate) fn as_str(&self) -> &str { - self.0.as_str() - } - } - - impl std::fmt::Display for $name { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - self.0.fmt(f) - } - } - - impl std::cmp::PartialEq for $name - where - smol_str::SmolStr: std::cmp::PartialEq, - { - fn eq(&self, other: &T) -> bool { - self.0.eq(other) - } - } - - impl From for $name - where - smol_str::SmolStr: From, - { - fn from(x: T) -> Self { - Self(x.into()) - } - } - - impl AsRef for $name { - fn as_ref(&self) -> &str { - self.0.as_ref() - } - } - - impl std::ops::Deref for $name { - type Target = str; - fn deref(&self) -> &str { - &*self.0 - } - } - - impl<'de> serde::de::Deserialize<'de> for $name { - fn deserialize>(d: D) -> Result { - >::deserialize(d).map(Self) - } - } - - impl serde::Serialize for $name { - fn serialize(&self, s: S) -> Result 
{ - self.0.serialize(s) - } - } - }; -} - -const POOLER_SUFFIX: &str = "-pooler"; - -impl EndpointId { - fn normalize(&self) -> Self { - if let Some(stripped) = self.as_ref().strip_suffix(POOLER_SUFFIX) { - stripped.into() - } else { - self.clone() - } - } - - fn normalize_intern(&self) -> EndpointIdInt { - if let Some(stripped) = self.as_ref().strip_suffix(POOLER_SUFFIX) { - EndpointIdTag::get_interner().get_or_intern(stripped) - } else { - self.into() - } - } -} - -// 90% of role name strings are 20 characters or less. -smol_str_wrapper!(RoleName); -// 50% of endpoint strings are 23 characters or less. -smol_str_wrapper!(EndpointId); -// 50% of branch strings are 23 characters or less. -smol_str_wrapper!(BranchId); -// 90% of project strings are 23 characters or less. -smol_str_wrapper!(ProjectId); - -// will usually equal endpoint ID -smol_str_wrapper!(EndpointCacheKey); - -smol_str_wrapper!(DbName); - -// postgres hostname, will likely be a port:ip addr -smol_str_wrapper!(Host); - -// Endpoints are a bit tricky. Rare they might be branches or projects. 
-impl EndpointId { - pub(crate) fn is_endpoint(&self) -> bool { - self.0.starts_with("ep-") - } - pub(crate) fn is_branch(&self) -> bool { - self.0.starts_with("br-") - } - // pub(crate) fn is_project(&self) -> bool { - // !self.is_endpoint() && !self.is_branch() - // } - pub(crate) fn as_branch(&self) -> BranchId { - BranchId(self.0.clone()) - } - pub(crate) fn as_project(&self) -> ProjectId { - ProjectId(self.0.clone()) - } -} diff --git a/proxy/src/metrics.rs b/proxy/src/metrics.rs index 542826e833..f91fcd4120 100644 --- a/proxy/src/metrics.rs +++ b/proxy/src/metrics.rs @@ -14,6 +14,7 @@ use metrics::{CounterPairAssoc, CounterPairVec, HyperLogLog, HyperLogLogVec}; use tokio::time::{self, Instant}; use crate::control_plane::messages::ColdStartInfo; +use crate::error::ErrorKind; #[derive(MetricGroup)] #[metric(new(thread_pool: Arc))] @@ -325,23 +326,10 @@ pub enum ConnectionFailureKind { ComputeUncached, } -#[derive(FixedCardinalityLabel, Copy, Clone)] -#[label(singleton = "kind")] -pub enum WakeupFailureKind { - BadComputeAddress, - ApiTransportError, - QuotaExceeded, - ApiConsoleLocked, - ApiConsoleBadRequest, - ApiConsoleOtherServerError, - ApiConsoleOtherError, - TimeoutError, -} - #[derive(LabelGroup)] #[label(set = ConnectionFailuresBreakdownSet)] pub struct ConnectionFailuresBreakdownGroup { - pub kind: WakeupFailureKind, + pub kind: ErrorKind, pub retry: Bool, } diff --git a/proxy/src/proxy/connect_compute.rs b/proxy/src/proxy/connect_compute.rs index 8e9663626a..659b7afa68 100644 --- a/proxy/src/proxy/connect_compute.rs +++ b/proxy/src/proxy/connect_compute.rs @@ -17,7 +17,7 @@ use crate::metrics::{ }; use crate::proxy::retry::{retry_after, should_retry, CouldRetry}; use crate::proxy::wake_compute::wake_compute; -use crate::Host; +use crate::types::Host; const CONNECT_TIMEOUT: time::Duration = time::Duration::from_secs(2); diff --git a/proxy/src/proxy/mod.rs b/proxy/src/proxy/mod.rs index f646862caa..2970d93393 100644 --- a/proxy/src/proxy/mod.rs +++ 
b/proxy/src/proxy/mod.rs @@ -32,7 +32,8 @@ use crate::protocol2::read_proxy_protocol; use crate::proxy::handshake::{handshake, HandshakeData}; use crate::rate_limiter::EndpointRateLimiter; use crate::stream::{PqStream, Stream}; -use crate::{auth, compute, EndpointCacheKey}; +use crate::types::EndpointCacheKey; +use crate::{auth, compute}; const ERR_INSECURE_CONNECTION: &str = "connection is insecure (try using `sslmode=require`)"; diff --git a/proxy/src/proxy/tests/mod.rs b/proxy/src/proxy/tests/mod.rs index e50ae4bc93..fe62fee204 100644 --- a/proxy/src/proxy/tests/mod.rs +++ b/proxy/src/proxy/tests/mod.rs @@ -9,6 +9,7 @@ use async_trait::async_trait; use http::StatusCode; use retry::{retry_after, ShouldRetryWakeCompute}; use rstest::rstest; +use rustls::crypto::aws_lc_rs; use rustls::pki_types; use tokio_postgres::config::SslMode; use tokio_postgres::tls::{MakeTlsConnect, NoTls}; @@ -27,7 +28,8 @@ use crate::control_plane::provider::{ }; use crate::control_plane::{self, CachedNodeInfo, NodeInfo}; use crate::error::ErrorKind; -use crate::{sasl, scram, BranchId, EndpointId, ProjectId}; +use crate::types::{BranchId, EndpointId, ProjectId}; +use crate::{sasl, scram}; /// Generate a set of TLS certificates: CA + server. fn generate_certs( @@ -38,25 +40,27 @@ fn generate_certs( pki_types::CertificateDer<'static>, pki_types::PrivateKeyDer<'static>, )> { - let ca = rcgen::Certificate::from_params({ + let ca_key = rcgen::KeyPair::generate()?; + let ca = { let mut params = rcgen::CertificateParams::default(); params.is_ca = rcgen::IsCa::Ca(rcgen::BasicConstraints::Unconstrained); - params - })?; + params.self_signed(&ca_key)? 
+ }; - let cert = rcgen::Certificate::from_params({ - let mut params = rcgen::CertificateParams::new(vec![hostname.into()]); + let cert_key = rcgen::KeyPair::generate()?; + let cert = { + let mut params = rcgen::CertificateParams::new(vec![hostname.into()])?; params.distinguished_name = rcgen::DistinguishedName::new(); params .distinguished_name .push(rcgen::DnType::CommonName, common_name); - params - })?; + params.signed_by(&cert_key, &ca, &ca_key)? + }; Ok(( - pki_types::CertificateDer::from(ca.serialize_der()?), - pki_types::CertificateDer::from(cert.serialize_der_with_signer(&ca)?), - pki_types::PrivateKeyDer::Pkcs8(cert.serialize_private_key_der().into()), + ca.der().clone(), + cert.der().clone(), + pki_types::PrivateKeyDer::Pkcs8(cert_key.serialize_der().into()), )) } @@ -70,11 +74,11 @@ impl ClientConfig<'_> { self, ) -> anyhow::Result< impl tokio_postgres::tls::TlsConnect< - S, - Error = impl std::fmt::Debug, - Future = impl Send, - Stream = RustlsStream, - >, + S, + Error = impl std::fmt::Debug + use, + Future = impl Send + use, + Stream = RustlsStream, + > + use, > { let mut mk = MakeRustlsConnect::new(self.config); let tls = MakeTlsConnect::::make_tls_connect(&mut mk, self.hostname)?; @@ -90,10 +94,13 @@ fn generate_tls_config<'a>( let (ca, cert, key) = generate_certs(hostname, common_name)?; let tls_config = { - let config = rustls::ServerConfig::builder() - .with_no_client_auth() - .with_single_cert(vec![cert.clone()], key.clone_key())? - .into(); + let config = + rustls::ServerConfig::builder_with_provider(Arc::new(aws_lc_rs::default_provider())) + .with_safe_default_protocol_versions() + .context("aws_lc_rs should support the default protocol versions")? + .with_no_client_auth() + .with_single_cert(vec![cert.clone()], key.clone_key())? 
+ .into(); let mut cert_resolver = CertResolver::new(); cert_resolver.add_cert(key, vec![cert], true)?; @@ -108,13 +115,16 @@ fn generate_tls_config<'a>( }; let client_config = { - let config = rustls::ClientConfig::builder() - .with_root_certificates({ - let mut store = rustls::RootCertStore::empty(); - store.add(ca)?; - store - }) - .with_no_client_auth(); + let config = + rustls::ClientConfig::builder_with_provider(Arc::new(aws_lc_rs::default_provider())) + .with_safe_default_protocol_versions() + .context("aws_lc_rs should support the default protocol versions")? + .with_root_certificates({ + let mut store = rustls::RootCertStore::empty(); + store.add(ca)?; + store + }) + .with_no_client_auth(); ClientConfig { config, hostname } }; diff --git a/proxy/src/proxy/wake_compute.rs b/proxy/src/proxy/wake_compute.rs index 9dfa485fa4..4e61094264 100644 --- a/proxy/src/proxy/wake_compute.rs +++ b/proxy/src/proxy/wake_compute.rs @@ -1,15 +1,13 @@ -use hyper::StatusCode; use tracing::{error, info, warn}; use super::connect_compute::ComputeConnectBackend; use crate::config::RetryConfig; use crate::context::RequestMonitoring; use crate::control_plane::errors::WakeComputeError; -use crate::control_plane::messages::{ControlPlaneError, Reason}; use crate::control_plane::provider::CachedNodeInfo; +use crate::error::ReportableError; use crate::metrics::{ ConnectOutcome, ConnectionFailuresBreakdownGroup, Metrics, RetriesMetricGroup, RetryType, - WakeupFailureKind, }; use crate::proxy::retry::{retry_after, should_retry}; @@ -60,62 +58,8 @@ pub(crate) async fn wake_compute( } fn report_error(e: &WakeComputeError, retry: bool) { - use crate::control_plane::errors::ApiError; - let kind = match e { - WakeComputeError::BadComputeAddress(_) => WakeupFailureKind::BadComputeAddress, - WakeComputeError::ApiError(ApiError::Transport(_)) => WakeupFailureKind::ApiTransportError, - WakeComputeError::ApiError(ApiError::ControlPlane(e)) => match e.get_reason() { - Reason::RoleProtected => 
WakeupFailureKind::ApiConsoleBadRequest, - Reason::ResourceNotFound => WakeupFailureKind::ApiConsoleBadRequest, - Reason::ProjectNotFound => WakeupFailureKind::ApiConsoleBadRequest, - Reason::EndpointNotFound => WakeupFailureKind::ApiConsoleBadRequest, - Reason::BranchNotFound => WakeupFailureKind::ApiConsoleBadRequest, - Reason::RateLimitExceeded => WakeupFailureKind::ApiConsoleLocked, - Reason::NonDefaultBranchComputeTimeExceeded => WakeupFailureKind::QuotaExceeded, - Reason::ActiveTimeQuotaExceeded => WakeupFailureKind::QuotaExceeded, - Reason::ComputeTimeQuotaExceeded => WakeupFailureKind::QuotaExceeded, - Reason::WrittenDataQuotaExceeded => WakeupFailureKind::QuotaExceeded, - Reason::DataTransferQuotaExceeded => WakeupFailureKind::QuotaExceeded, - Reason::LogicalSizeQuotaExceeded => WakeupFailureKind::QuotaExceeded, - Reason::ConcurrencyLimitReached => WakeupFailureKind::ApiConsoleLocked, - Reason::LockAlreadyTaken => WakeupFailureKind::ApiConsoleLocked, - Reason::RunningOperations => WakeupFailureKind::ApiConsoleLocked, - Reason::Unknown => match **e { - ControlPlaneError { - http_status_code: StatusCode::LOCKED, - ref error, - .. - } if error.contains("written data quota exceeded") - || error.contains("the limit for current plan reached") => - { - WakeupFailureKind::QuotaExceeded - } - ControlPlaneError { - http_status_code: StatusCode::UNPROCESSABLE_ENTITY, - ref error, - .. - } if error.contains("compute time quota of non-primary branches is exceeded") => { - WakeupFailureKind::QuotaExceeded - } - ControlPlaneError { - http_status_code: StatusCode::LOCKED, - .. - } => WakeupFailureKind::ApiConsoleLocked, - ControlPlaneError { - http_status_code: StatusCode::BAD_REQUEST, - .. - } => WakeupFailureKind::ApiConsoleBadRequest, - ControlPlaneError { - http_status_code, .. - } if http_status_code.is_server_error() => { - WakeupFailureKind::ApiConsoleOtherServerError - } - ControlPlaneError { .. 
} => WakeupFailureKind::ApiConsoleOtherError, - }, - }, - WakeComputeError::TooManyConnections => WakeupFailureKind::ApiConsoleLocked, - WakeComputeError::TooManyConnectionAttempts(_) => WakeupFailureKind::TimeoutError, - }; + let kind = e.get_error_kind(); + Metrics::get() .proxy .connection_failures_breakdown diff --git a/proxy/src/rate_limiter/limiter.rs b/proxy/src/rate_limiter/limiter.rs index 5de64c2254..4259fd04f4 100644 --- a/proxy/src/rate_limiter/limiter.rs +++ b/proxy/src/rate_limiter/limiter.rs @@ -250,7 +250,7 @@ mod tests { use super::{BucketRateLimiter, WakeComputeRateLimiter}; use crate::intern::EndpointIdInt; use crate::rate_limiter::RateBucketInfo; - use crate::EndpointId; + use crate::types::EndpointId; #[test] fn rate_bucket_rpi() { diff --git a/proxy/src/redis/notifications.rs b/proxy/src/redis/notifications.rs index e56c5a3414..62e7b1b565 100644 --- a/proxy/src/redis/notifications.rs +++ b/proxy/src/redis/notifications.rs @@ -271,7 +271,7 @@ mod tests { use serde_json::json; use super::*; - use crate::{ProjectId, RoleName}; + use crate::types::{ProjectId, RoleName}; #[test] fn parse_allowed_ips() -> anyhow::Result<()> { diff --git a/proxy/src/scram/exchange.rs b/proxy/src/scram/exchange.rs index 493295c938..6a13f645a5 100644 --- a/proxy/src/scram/exchange.rs +++ b/proxy/src/scram/exchange.rs @@ -218,16 +218,12 @@ impl sasl::Mechanism for Exchange<'_> { self.state = ExchangeState::SaltSent(sent); Ok(Step::Continue(self, msg)) } - #[allow(unreachable_patterns)] // TODO: 1.82: simply drop this match - Step::Success(x, _) => match x {}, Step::Failure(msg) => Ok(Step::Failure(msg)), } } ExchangeState::SaltSent(sent) => { match sent.transition(self.secret, &self.tls_server_end_point, input)? 
{ Step::Success(keys, msg) => Ok(Step::Success(keys, msg)), - #[allow(unreachable_patterns)] // TODO: 1.82: simply drop this match - Step::Continue(x, _) => match x {}, Step::Failure(msg) => Ok(Step::Failure(msg)), } } diff --git a/proxy/src/scram/mod.rs b/proxy/src/scram/mod.rs index 97644b6282..718445f61d 100644 --- a/proxy/src/scram/mod.rs +++ b/proxy/src/scram/mod.rs @@ -62,7 +62,7 @@ mod tests { use super::{Exchange, ServerSecret}; use crate::intern::EndpointIdInt; use crate::sasl::{Mechanism, Step}; - use crate::EndpointId; + use crate::types::EndpointId; #[test] fn snapshot() { diff --git a/proxy/src/scram/threadpool.rs b/proxy/src/scram/threadpool.rs index cc1b69fcf9..ebc6dd2a3c 100644 --- a/proxy/src/scram/threadpool.rs +++ b/proxy/src/scram/threadpool.rs @@ -189,7 +189,7 @@ impl Drop for JobHandle { #[cfg(test)] mod tests { use super::*; - use crate::EndpointId; + use crate::types::EndpointId; #[tokio::test] async fn hash_is_correct() { diff --git a/proxy/src/serverless/backend.rs b/proxy/src/serverless/backend.rs index a180c4c2ed..07e0e30148 100644 --- a/proxy/src/serverless/backend.rs +++ b/proxy/src/serverless/backend.rs @@ -11,12 +11,17 @@ use tokio::net::{lookup_host, TcpStream}; use tracing::field::display; use tracing::{debug, info}; -use super::conn_pool::{poll_client, Client, ConnInfo, GlobalConnPool}; -use super::http_conn_pool::{self, poll_http2_client}; -use super::local_conn_pool::{self, LocalClient, LocalConnPool}; +use super::conn_pool::poll_client; +use super::conn_pool_lib::{Client, ConnInfo, GlobalConnPool}; +use super::http_conn_pool::{self, poll_http2_client, Send}; +use super::local_conn_pool::{self, LocalClient, LocalConnPool, EXT_NAME, EXT_SCHEMA, EXT_VERSION}; use crate::auth::backend::local::StaticAuthRules; use crate::auth::backend::{ComputeCredentials, ComputeUserInfo}; use crate::auth::{self, check_peer_addr_is_in_list, AuthError}; +use crate::compute; +use crate::compute_ctl::{ + ComputeCtlError, ExtensionInstallRequest, 
Privilege, SetRoleGrantsRequest, +}; use crate::config::ProxyConfig; use crate::context::RequestMonitoring; use crate::control_plane::errors::{GetAuthInfoError, WakeComputeError}; @@ -28,12 +33,13 @@ use crate::intern::EndpointIdInt; use crate::proxy::connect_compute::ConnectMechanism; use crate::proxy::retry::{CouldRetry, ShouldRetryWakeCompute}; use crate::rate_limiter::EndpointRateLimiter; -use crate::{compute, EndpointId, Host}; +use crate::types::{EndpointId, Host}; pub(crate) struct PoolingBackend { - pub(crate) http_conn_pool: Arc, + pub(crate) http_conn_pool: Arc>, pub(crate) local_pool: Arc>, pub(crate) pool: Arc>, + pub(crate) config: &'static ProxyConfig, pub(crate) auth_backend: &'static crate::auth::Backend<'static, ()>, pub(crate) endpoint_rate_limiter: Arc, @@ -199,7 +205,7 @@ impl PoolingBackend { &self, ctx: &RequestMonitoring, conn_info: ConnInfo, - ) -> Result { + ) -> Result, HttpConnError> { info!("pool: looking for an existing connection"); if let Some(client) = self.http_conn_pool.get(ctx, &conn_info) { return Ok(client); @@ -249,16 +255,47 @@ impl PoolingBackend { return Ok(client); } + let local_backend = match &self.auth_backend { + auth::Backend::ControlPlane(_, ()) => { + unreachable!("only local_proxy can connect to local postgres") + } + auth::Backend::Local(local) => local, + }; + + if !self.local_pool.initialized(&conn_info) { + // only install and grant usage one at a time. 
+ let _permit = local_backend.initialize.acquire().await.unwrap(); + + // check again for race + if !self.local_pool.initialized(&conn_info) { + local_backend + .compute_ctl + .install_extension(&ExtensionInstallRequest { + extension: EXT_NAME, + database: conn_info.dbname.clone(), + version: EXT_VERSION, + }) + .await?; + + local_backend + .compute_ctl + .grant_role(&SetRoleGrantsRequest { + schema: EXT_SCHEMA, + privileges: vec![Privilege::Usage], + database: conn_info.dbname.clone(), + role: conn_info.user_info.user.clone(), + }) + .await?; + + self.local_pool.set_initialized(&conn_info); + } + } + let conn_id = uuid::Uuid::new_v4(); tracing::Span::current().record("conn_id", display(conn_id)); info!(%conn_id, "local_pool: opening a new connection '{conn_info}'"); - let mut node_info = match &self.auth_backend { - auth::Backend::ControlPlane(_, ()) => { - unreachable!("only local_proxy can connect to local postgres") - } - auth::Backend::Local(local) => local.node_info.clone(), - }; + let mut node_info = local_backend.node_info.clone(); let (key, jwk) = create_random_jwk(); @@ -323,6 +360,8 @@ pub(crate) enum HttpConnError { #[error("could not parse JWT payload")] JwtPayloadError(serde_json::Error), + #[error("could not install extension: {0}")] + ComputeCtl(#[from] ComputeCtlError), #[error("could not get auth info")] GetAuthInfo(#[from] GetAuthInfoError), #[error("user not authenticated")] @@ -347,6 +386,7 @@ impl ReportableError for HttpConnError { HttpConnError::ConnectionClosedAbruptly(_) => ErrorKind::Compute, HttpConnError::PostgresConnectionError(p) => p.get_error_kind(), HttpConnError::LocalProxyConnectionError(_) => ErrorKind::Compute, + HttpConnError::ComputeCtl(_) => ErrorKind::Service, HttpConnError::JwtPayloadError(_) => ErrorKind::User, HttpConnError::GetAuthInfo(a) => a.get_error_kind(), HttpConnError::AuthError(a) => a.get_error_kind(), @@ -362,6 +402,7 @@ impl UserFacingError for HttpConnError { HttpConnError::ConnectionClosedAbruptly(_) => 
self.to_string(), HttpConnError::PostgresConnectionError(p) => p.to_string(), HttpConnError::LocalProxyConnectionError(p) => p.to_string(), + HttpConnError::ComputeCtl(_) => "could not set up the JWT authorization database extension".to_string(), HttpConnError::JwtPayloadError(p) => p.to_string(), HttpConnError::GetAuthInfo(c) => c.to_string_client(), HttpConnError::AuthError(c) => c.to_string_client(), @@ -378,6 +419,7 @@ impl CouldRetry for HttpConnError { match self { HttpConnError::PostgresConnectionError(e) => e.could_retry(), HttpConnError::LocalProxyConnectionError(e) => e.could_retry(), + HttpConnError::ComputeCtl(_) => false, HttpConnError::ConnectionClosedAbruptly(_) => false, HttpConnError::JwtPayloadError(_) => false, HttpConnError::GetAuthInfo(_) => false, @@ -481,7 +523,7 @@ impl ConnectMechanism for TokioMechanism { } struct HyperMechanism { - pool: Arc, + pool: Arc>, conn_info: ConnInfo, conn_id: uuid::Uuid, @@ -491,7 +533,7 @@ struct HyperMechanism { #[async_trait] impl ConnectMechanism for HyperMechanism { - type Connection = http_conn_pool::Client; + type Connection = http_conn_pool::Client; type ConnectError = HttpConnError; type Error = HttpConnError; diff --git a/proxy/src/serverless/conn_pool.rs b/proxy/src/serverless/conn_pool.rs index aa869ff1c0..7fa3357b5b 100644 --- a/proxy/src/serverless/conn_pool.rs +++ b/proxy/src/serverless/conn_pool.rs @@ -1,31 +1,27 @@ -use std::collections::HashMap; use std::fmt; -use std::ops::Deref; use std::pin::pin; -use std::sync::atomic::{self, AtomicUsize}; use std::sync::{Arc, Weak}; use std::task::{ready, Poll}; -use std::time::Duration; -use dashmap::DashMap; use futures::future::poll_fn; use futures::Future; -use parking_lot::RwLock; -use rand::Rng; use smallvec::SmallVec; use tokio::time::Instant; use tokio_postgres::tls::NoTlsStream; -use tokio_postgres::{AsyncMessage, ReadyForQueryStatus, Socket}; +use tokio_postgres::{AsyncMessage, Socket}; use tokio_util::sync::CancellationToken; -use 
tracing::{debug, error, info, info_span, warn, Instrument, Span}; +use tracing::{error, info, info_span, warn, Instrument}; +#[cfg(test)] +use { + super::conn_pool_lib::GlobalConnPoolOptions, + crate::auth::backend::ComputeUserInfo, + std::{sync::atomic, time::Duration}, +}; -use super::backend::HttpConnError; -use crate::auth::backend::ComputeUserInfo; +use super::conn_pool_lib::{Client, ClientInnerExt, ConnInfo, GlobalConnPool}; use crate::context::RequestMonitoring; -use crate::control_plane::messages::{ColdStartInfo, MetricsAuxInfo}; -use crate::metrics::{HttpEndpointPoolsGuard, Metrics}; -use crate::usage_metrics::{Ids, MetricCounter, USAGE_METRICS}; -use crate::{DbName, EndpointCacheKey, RoleName}; +use crate::control_plane::messages::MetricsAuxInfo; +use crate::metrics::Metrics; #[derive(Debug, Clone)] pub(crate) struct ConnInfoWithAuth { @@ -33,34 +29,12 @@ pub(crate) struct ConnInfoWithAuth { pub(crate) auth: AuthData, } -#[derive(Debug, Clone)] -pub(crate) struct ConnInfo { - pub(crate) user_info: ComputeUserInfo, - pub(crate) dbname: DbName, -} - #[derive(Debug, Clone)] pub(crate) enum AuthData { Password(SmallVec<[u8; 16]>), Jwt(String), } -impl ConnInfo { - // hm, change to hasher to avoid cloning? - pub(crate) fn db_and_user(&self) -> (DbName, RoleName) { - (self.dbname.clone(), self.user_info.user.clone()) - } - - pub(crate) fn endpoint_cache_key(&self) -> Option { - // We don't want to cache http connections for ephemeral endpoints. 
- if self.user_info.options.is_ephemeral() { - None - } else { - Some(self.user_info.endpoint_cache_key()) - } - } -} - impl fmt::Display for ConnInfo { // use custom display to avoid logging password fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { @@ -75,402 +49,6 @@ impl fmt::Display for ConnInfo { } } -struct ConnPoolEntry { - conn: ClientInner, - _last_access: std::time::Instant, -} - -// Per-endpoint connection pool, (dbname, username) -> DbUserConnPool -// Number of open connections is limited by the `max_conns_per_endpoint`. -pub(crate) struct EndpointConnPool { - pools: HashMap<(DbName, RoleName), DbUserConnPool>, - total_conns: usize, - max_conns: usize, - _guard: HttpEndpointPoolsGuard<'static>, - global_connections_count: Arc, - global_pool_size_max_conns: usize, -} - -impl EndpointConnPool { - fn get_conn_entry(&mut self, db_user: (DbName, RoleName)) -> Option> { - let Self { - pools, - total_conns, - global_connections_count, - .. - } = self; - pools.get_mut(&db_user).and_then(|pool_entries| { - pool_entries.get_conn_entry(total_conns, global_connections_count.clone()) - }) - } - - fn remove_client(&mut self, db_user: (DbName, RoleName), conn_id: uuid::Uuid) -> bool { - let Self { - pools, - total_conns, - global_connections_count, - .. 
- } = self; - if let Some(pool) = pools.get_mut(&db_user) { - let old_len = pool.conns.len(); - pool.conns.retain(|conn| conn.conn.conn_id != conn_id); - let new_len = pool.conns.len(); - let removed = old_len - new_len; - if removed > 0 { - global_connections_count.fetch_sub(removed, atomic::Ordering::Relaxed); - Metrics::get() - .proxy - .http_pool_opened_connections - .get_metric() - .dec_by(removed as i64); - } - *total_conns -= removed; - removed > 0 - } else { - false - } - } - - fn put(pool: &RwLock, conn_info: &ConnInfo, client: ClientInner) { - let conn_id = client.conn_id; - - if client.is_closed() { - info!(%conn_id, "pool: throwing away connection '{conn_info}' because connection is closed"); - return; - } - let global_max_conn = pool.read().global_pool_size_max_conns; - if pool - .read() - .global_connections_count - .load(atomic::Ordering::Relaxed) - >= global_max_conn - { - info!(%conn_id, "pool: throwing away connection '{conn_info}' because pool is full"); - return; - } - - // return connection to the pool - let mut returned = false; - let mut per_db_size = 0; - let total_conns = { - let mut pool = pool.write(); - - if pool.total_conns < pool.max_conns { - let pool_entries = pool.pools.entry(conn_info.db_and_user()).or_default(); - pool_entries.conns.push(ConnPoolEntry { - conn: client, - _last_access: std::time::Instant::now(), - }); - - returned = true; - per_db_size = pool_entries.conns.len(); - - pool.total_conns += 1; - pool.global_connections_count - .fetch_add(1, atomic::Ordering::Relaxed); - Metrics::get() - .proxy - .http_pool_opened_connections - .get_metric() - .inc(); - } - - pool.total_conns - }; - - // do logging outside of the mutex - if returned { - info!(%conn_id, "pool: returning connection '{conn_info}' back to the pool, total_conns={total_conns}, for this (db, user)={per_db_size}"); - } else { - info!(%conn_id, "pool: throwing away connection '{conn_info}' because pool is full, total_conns={total_conns}"); - } - } -} - -impl 
Drop for EndpointConnPool { - fn drop(&mut self) { - if self.total_conns > 0 { - self.global_connections_count - .fetch_sub(self.total_conns, atomic::Ordering::Relaxed); - Metrics::get() - .proxy - .http_pool_opened_connections - .get_metric() - .dec_by(self.total_conns as i64); - } - } -} - -pub(crate) struct DbUserConnPool { - conns: Vec>, -} - -impl Default for DbUserConnPool { - fn default() -> Self { - Self { conns: Vec::new() } - } -} - -impl DbUserConnPool { - fn clear_closed_clients(&mut self, conns: &mut usize) -> usize { - let old_len = self.conns.len(); - - self.conns.retain(|conn| !conn.conn.is_closed()); - - let new_len = self.conns.len(); - let removed = old_len - new_len; - *conns -= removed; - removed - } - - fn get_conn_entry( - &mut self, - conns: &mut usize, - global_connections_count: Arc, - ) -> Option> { - let mut removed = self.clear_closed_clients(conns); - let conn = self.conns.pop(); - if conn.is_some() { - *conns -= 1; - removed += 1; - } - global_connections_count.fetch_sub(removed, atomic::Ordering::Relaxed); - Metrics::get() - .proxy - .http_pool_opened_connections - .get_metric() - .dec_by(removed as i64); - conn - } -} - -pub(crate) struct GlobalConnPool { - // endpoint -> per-endpoint connection pool - // - // That should be a fairly conteded map, so return reference to the per-endpoint - // pool as early as possible and release the lock. - global_pool: DashMap>>>, - - /// Number of endpoint-connection pools - /// - /// [`DashMap::len`] iterates over all inner pools and acquires a read lock on each. - /// That seems like far too much effort, so we're using a relaxed increment counter instead. - /// It's only used for diagnostics. - global_pool_size: AtomicUsize, - - /// Total number of connections in the pool - global_connections_count: Arc, - - config: &'static crate::config::HttpConfig, -} - -#[derive(Debug, Clone, Copy)] -pub struct GlobalConnPoolOptions { - // Maximum number of connections per one endpoint. 
- // Can mix different (dbname, username) connections. - // When running out of free slots for a particular endpoint, - // falls back to opening a new connection for each request. - pub max_conns_per_endpoint: usize, - - pub gc_epoch: Duration, - - pub pool_shards: usize, - - pub idle_timeout: Duration, - - pub opt_in: bool, - - // Total number of connections in the pool. - pub max_total_conns: usize, -} - -impl GlobalConnPool { - pub(crate) fn new(config: &'static crate::config::HttpConfig) -> Arc { - let shards = config.pool_options.pool_shards; - Arc::new(Self { - global_pool: DashMap::with_shard_amount(shards), - global_pool_size: AtomicUsize::new(0), - config, - global_connections_count: Arc::new(AtomicUsize::new(0)), - }) - } - - #[cfg(test)] - pub(crate) fn get_global_connections_count(&self) -> usize { - self.global_connections_count - .load(atomic::Ordering::Relaxed) - } - - pub(crate) fn get_idle_timeout(&self) -> Duration { - self.config.pool_options.idle_timeout - } - - pub(crate) fn shutdown(&self) { - // drops all strong references to endpoint-pools - self.global_pool.clear(); - } - - pub(crate) async fn gc_worker(&self, mut rng: impl Rng) { - let epoch = self.config.pool_options.gc_epoch; - let mut interval = tokio::time::interval(epoch / (self.global_pool.shards().len()) as u32); - loop { - interval.tick().await; - - let shard = rng.gen_range(0..self.global_pool.shards().len()); - self.gc(shard); - } - } - - fn gc(&self, shard: usize) { - debug!(shard, "pool: performing epoch reclamation"); - - // acquire a random shard lock - let mut shard = self.global_pool.shards()[shard].write(); - - let timer = Metrics::get() - .proxy - .http_pool_reclaimation_lag_seconds - .start_timer(); - let current_len = shard.len(); - let mut clients_removed = 0; - shard.retain(|endpoint, x| { - // if the current endpoint pool is unique (no other strong or weak references) - // then it is currently not in use by any connections. 
- if let Some(pool) = Arc::get_mut(x.get_mut()) { - let EndpointConnPool { - pools, total_conns, .. - } = pool.get_mut(); - - // ensure that closed clients are removed - for db_pool in pools.values_mut() { - clients_removed += db_pool.clear_closed_clients(total_conns); - } - - // we only remove this pool if it has no active connections - if *total_conns == 0 { - info!("pool: discarding pool for endpoint {endpoint}"); - return false; - } - } - - true - }); - - let new_len = shard.len(); - drop(shard); - timer.observe(); - - // Do logging outside of the lock. - if clients_removed > 0 { - let size = self - .global_connections_count - .fetch_sub(clients_removed, atomic::Ordering::Relaxed) - - clients_removed; - Metrics::get() - .proxy - .http_pool_opened_connections - .get_metric() - .dec_by(clients_removed as i64); - info!("pool: performed global pool gc. removed {clients_removed} clients, total number of clients in pool is {size}"); - } - let removed = current_len - new_len; - - if removed > 0 { - let global_pool_size = self - .global_pool_size - .fetch_sub(removed, atomic::Ordering::Relaxed) - - removed; - info!("pool: performed global pool gc. 
size now {global_pool_size}"); - } - } - - pub(crate) fn get( - self: &Arc, - ctx: &RequestMonitoring, - conn_info: &ConnInfo, - ) -> Result>, HttpConnError> { - let mut client: Option> = None; - let Some(endpoint) = conn_info.endpoint_cache_key() else { - return Ok(None); - }; - - let endpoint_pool = self.get_or_create_endpoint_pool(&endpoint); - if let Some(entry) = endpoint_pool - .write() - .get_conn_entry(conn_info.db_and_user()) - { - client = Some(entry.conn); - } - let endpoint_pool = Arc::downgrade(&endpoint_pool); - - // ok return cached connection if found and establish a new one otherwise - if let Some(client) = client { - if client.is_closed() { - info!("pool: cached connection '{conn_info}' is closed, opening a new one"); - return Ok(None); - } - tracing::Span::current().record("conn_id", tracing::field::display(client.conn_id)); - tracing::Span::current().record( - "pid", - tracing::field::display(client.inner.get_process_id()), - ); - info!( - cold_start_info = ColdStartInfo::HttpPoolHit.as_str(), - "pool: reusing connection '{conn_info}'" - ); - client.session.send(ctx.session_id())?; - ctx.set_cold_start_info(ColdStartInfo::HttpPoolHit); - ctx.success(); - return Ok(Some(Client::new(client, conn_info.clone(), endpoint_pool))); - } - Ok(None) - } - - fn get_or_create_endpoint_pool( - self: &Arc, - endpoint: &EndpointCacheKey, - ) -> Arc>> { - // fast path - if let Some(pool) = self.global_pool.get(endpoint) { - return pool.clone(); - } - - // slow path - let new_pool = Arc::new(RwLock::new(EndpointConnPool { - pools: HashMap::new(), - total_conns: 0, - max_conns: self.config.pool_options.max_conns_per_endpoint, - _guard: Metrics::get().proxy.http_endpoint_pools.guard(), - global_connections_count: self.global_connections_count.clone(), - global_pool_size_max_conns: self.config.pool_options.max_total_conns, - })); - - // find or create a pool for this endpoint - let mut created = false; - let pool = self - .global_pool - .entry(endpoint.clone()) - 
.or_insert_with(|| { - created = true; - new_pool - }) - .clone(); - - // log new global pool size - if created { - let global_pool_size = self - .global_pool_size - .fetch_add(1, atomic::Ordering::Relaxed) - + 1; - info!( - "pool: created new pool for '{endpoint}', global pool size now {global_pool_size}" - ); - } - - pool - } -} - pub(crate) fn poll_client( global_pool: Arc>, ctx: &RequestMonitoring, @@ -574,7 +152,7 @@ pub(crate) fn poll_client( } .instrument(span)); - let inner = ClientInner { + let inner = ClientInnerRemote { inner: client, session: tx, cancel, @@ -584,7 +162,7 @@ pub(crate) fn poll_client( Client::new(inner, conn_info, pool_clone) } -struct ClientInner { +pub(crate) struct ClientInnerRemote { inner: C, session: tokio::sync::watch::Sender, cancel: CancellationToken, @@ -592,131 +170,36 @@ struct ClientInner { conn_id: uuid::Uuid, } -impl Drop for ClientInner { - fn drop(&mut self) { - // on client drop, tell the conn to shut down - self.cancel.cancel(); +impl ClientInnerRemote { + pub(crate) fn inner_mut(&mut self) -> &mut C { + &mut self.inner } -} -pub(crate) trait ClientInnerExt: Sync + Send + 'static { - fn is_closed(&self) -> bool; - fn get_process_id(&self) -> i32; -} - -impl ClientInnerExt for tokio_postgres::Client { - fn is_closed(&self) -> bool { - self.is_closed() + pub(crate) fn inner(&self) -> &C { + &self.inner + } + + pub(crate) fn session(&mut self) -> &mut tokio::sync::watch::Sender { + &mut self.session + } + + pub(crate) fn aux(&self) -> &MetricsAuxInfo { + &self.aux + } + + pub(crate) fn get_conn_id(&self) -> uuid::Uuid { + self.conn_id } - fn get_process_id(&self) -> i32 { - self.get_process_id() - } -} -impl ClientInner { pub(crate) fn is_closed(&self) -> bool { self.inner.is_closed() } } -impl Client { - pub(crate) fn metrics(&self) -> Arc { - let aux = &self.inner.as_ref().unwrap().aux; - USAGE_METRICS.register(Ids { - endpoint_id: aux.endpoint_id, - branch_id: aux.branch_id, - }) - } -} - -pub(crate) struct Client { - 
span: Span, - inner: Option>, - conn_info: ConnInfo, - pool: Weak>>, -} - -pub(crate) struct Discard<'a, C: ClientInnerExt> { - conn_info: &'a ConnInfo, - pool: &'a mut Weak>>, -} - -impl Client { - pub(self) fn new( - inner: ClientInner, - conn_info: ConnInfo, - pool: Weak>>, - ) -> Self { - Self { - inner: Some(inner), - span: Span::current(), - conn_info, - pool, - } - } - pub(crate) fn inner(&mut self) -> (&mut C, Discard<'_, C>) { - let Self { - inner, - pool, - conn_info, - span: _, - } = self; - let inner = inner.as_mut().expect("client inner should not be removed"); - (&mut inner.inner, Discard { conn_info, pool }) - } -} - -impl Discard<'_, C> { - pub(crate) fn check_idle(&mut self, status: ReadyForQueryStatus) { - let conn_info = &self.conn_info; - if status != ReadyForQueryStatus::Idle && std::mem::take(self.pool).strong_count() > 0 { - info!("pool: throwing away connection '{conn_info}' because connection is not idle"); - } - } - pub(crate) fn discard(&mut self) { - let conn_info = &self.conn_info; - if std::mem::take(self.pool).strong_count() > 0 { - info!("pool: throwing away connection '{conn_info}' because connection is potentially in a broken state"); - } - } -} - -impl Deref for Client { - type Target = C; - - fn deref(&self) -> &Self::Target { - &self - .inner - .as_ref() - .expect("client inner should not be removed") - .inner - } -} - -impl Client { - fn do_drop(&mut self) -> Option { - let conn_info = self.conn_info.clone(); - let client = self - .inner - .take() - .expect("client inner should not be removed"); - if let Some(conn_pool) = std::mem::take(&mut self.pool).upgrade() { - let current_span = self.span.clone(); - // return connection to the pool - return Some(move || { - let _span = current_span.enter(); - EndpointConnPool::put(&conn_pool, &conn_info, client); - }); - } - None - } -} - -impl Drop for Client { +impl Drop for ClientInnerRemote { fn drop(&mut self) { - if let Some(drop) = self.do_drop() { - 
tokio::task::spawn_blocking(drop); - } + // on client drop, tell the conn to shut down + self.cancel.cancel(); } } @@ -728,7 +211,7 @@ mod tests { use super::*; use crate::proxy::NeonOptions; use crate::serverless::cancel_set::CancelSet; - use crate::{BranchId, EndpointId, ProjectId}; + use crate::types::{BranchId, EndpointId, ProjectId}; struct MockClient(Arc); impl MockClient { @@ -745,12 +228,12 @@ mod tests { } } - fn create_inner() -> ClientInner { + fn create_inner() -> ClientInnerRemote { create_inner_with(MockClient::new(false)) } - fn create_inner_with(client: MockClient) -> ClientInner { - ClientInner { + fn create_inner_with(client: MockClient) -> ClientInnerRemote { + ClientInnerRemote { inner: client, session: tokio::sync::watch::Sender::new(uuid::Uuid::new_v4()), cancel: CancellationToken::new(), @@ -797,7 +280,7 @@ mod tests { { let mut client = Client::new(create_inner(), conn_info.clone(), ep_pool.clone()); assert_eq!(0, pool.get_global_connections_count()); - client.inner().1.discard(); + client.inner_mut().1.discard(); // Discard should not add the connection from the pool. 
assert_eq!(0, pool.get_global_connections_count()); } diff --git a/proxy/src/serverless/conn_pool_lib.rs b/proxy/src/serverless/conn_pool_lib.rs new file mode 100644 index 0000000000..8830cddf0c --- /dev/null +++ b/proxy/src/serverless/conn_pool_lib.rs @@ -0,0 +1,560 @@ +use std::collections::HashMap; +use std::ops::Deref; +use std::sync::atomic::{self, AtomicUsize}; +use std::sync::{Arc, Weak}; +use std::time::Duration; + +use dashmap::DashMap; +use parking_lot::RwLock; +use rand::Rng; +use tokio_postgres::ReadyForQueryStatus; +use tracing::{debug, info, Span}; + +use super::backend::HttpConnError; +use super::conn_pool::ClientInnerRemote; +use crate::auth::backend::ComputeUserInfo; +use crate::context::RequestMonitoring; +use crate::control_plane::messages::ColdStartInfo; +use crate::metrics::{HttpEndpointPoolsGuard, Metrics}; +use crate::types::{DbName, EndpointCacheKey, RoleName}; +use crate::usage_metrics::{Ids, MetricCounter, USAGE_METRICS}; + +#[derive(Debug, Clone)] +pub(crate) struct ConnInfo { + pub(crate) user_info: ComputeUserInfo, + pub(crate) dbname: DbName, +} + +impl ConnInfo { + // hm, change to hasher to avoid cloning? + pub(crate) fn db_and_user(&self) -> (DbName, RoleName) { + (self.dbname.clone(), self.user_info.user.clone()) + } + + pub(crate) fn endpoint_cache_key(&self) -> Option { + // We don't want to cache http connections for ephemeral endpoints. + if self.user_info.options.is_ephemeral() { + None + } else { + Some(self.user_info.endpoint_cache_key()) + } + } +} + +pub(crate) struct ConnPoolEntry { + pub(crate) conn: ClientInnerRemote, + pub(crate) _last_access: std::time::Instant, +} + +// Per-endpoint connection pool, (dbname, username) -> DbUserConnPool +// Number of open connections is limited by the `max_conns_per_endpoint`. 
+pub(crate) struct EndpointConnPool { + pools: HashMap<(DbName, RoleName), DbUserConnPool>, + total_conns: usize, + max_conns: usize, + _guard: HttpEndpointPoolsGuard<'static>, + global_connections_count: Arc, + global_pool_size_max_conns: usize, +} + +impl EndpointConnPool { + fn get_conn_entry(&mut self, db_user: (DbName, RoleName)) -> Option> { + let Self { + pools, + total_conns, + global_connections_count, + .. + } = self; + pools.get_mut(&db_user).and_then(|pool_entries| { + let (entry, removed) = pool_entries.get_conn_entry(total_conns); + global_connections_count.fetch_sub(removed, atomic::Ordering::Relaxed); + entry + }) + } + + pub(crate) fn remove_client( + &mut self, + db_user: (DbName, RoleName), + conn_id: uuid::Uuid, + ) -> bool { + let Self { + pools, + total_conns, + global_connections_count, + .. + } = self; + if let Some(pool) = pools.get_mut(&db_user) { + let old_len = pool.conns.len(); + pool.conns.retain(|conn| conn.conn.get_conn_id() != conn_id); + let new_len = pool.conns.len(); + let removed = old_len - new_len; + if removed > 0 { + global_connections_count.fetch_sub(removed, atomic::Ordering::Relaxed); + Metrics::get() + .proxy + .http_pool_opened_connections + .get_metric() + .dec_by(removed as i64); + } + *total_conns -= removed; + removed > 0 + } else { + false + } + } + + pub(crate) fn put(pool: &RwLock, conn_info: &ConnInfo, client: ClientInnerRemote) { + let conn_id = client.get_conn_id(); + + if client.is_closed() { + info!(%conn_id, "pool: throwing away connection '{conn_info}' because connection is closed"); + return; + } + + let global_max_conn = pool.read().global_pool_size_max_conns; + if pool + .read() + .global_connections_count + .load(atomic::Ordering::Relaxed) + >= global_max_conn + { + info!(%conn_id, "pool: throwing away connection '{conn_info}' because pool is full"); + return; + } + + // return connection to the pool + let mut returned = false; + let mut per_db_size = 0; + let total_conns = { + let mut pool = 
pool.write(); + + if pool.total_conns < pool.max_conns { + let pool_entries = pool.pools.entry(conn_info.db_and_user()).or_default(); + pool_entries.conns.push(ConnPoolEntry { + conn: client, + _last_access: std::time::Instant::now(), + }); + + returned = true; + per_db_size = pool_entries.conns.len(); + + pool.total_conns += 1; + pool.global_connections_count + .fetch_add(1, atomic::Ordering::Relaxed); + Metrics::get() + .proxy + .http_pool_opened_connections + .get_metric() + .inc(); + } + + pool.total_conns + }; + + // do logging outside of the mutex + if returned { + info!(%conn_id, "pool: returning connection '{conn_info}' back to the pool, total_conns={total_conns}, for this (db, user)={per_db_size}"); + } else { + info!(%conn_id, "pool: throwing away connection '{conn_info}' because pool is full, total_conns={total_conns}"); + } + } +} + +impl Drop for EndpointConnPool { + fn drop(&mut self) { + if self.total_conns > 0 { + self.global_connections_count + .fetch_sub(self.total_conns, atomic::Ordering::Relaxed); + Metrics::get() + .proxy + .http_pool_opened_connections + .get_metric() + .dec_by(self.total_conns as i64); + } + } +} + +pub(crate) struct DbUserConnPool { + pub(crate) conns: Vec>, +} + +impl Default for DbUserConnPool { + fn default() -> Self { + Self { conns: Vec::new() } + } +} + +impl DbUserConnPool { + fn clear_closed_clients(&mut self, conns: &mut usize) -> usize { + let old_len = self.conns.len(); + + self.conns.retain(|conn| !conn.conn.is_closed()); + + let new_len = self.conns.len(); + let removed = old_len - new_len; + *conns -= removed; + removed + } + + pub(crate) fn get_conn_entry( + &mut self, + conns: &mut usize, + ) -> (Option>, usize) { + let mut removed = self.clear_closed_clients(conns); + let conn = self.conns.pop(); + if conn.is_some() { + *conns -= 1; + removed += 1; + } + + Metrics::get() + .proxy + .http_pool_opened_connections + .get_metric() + .dec_by(removed as i64); + + (conn, removed) + } +} + +pub(crate) struct 
GlobalConnPool { + // endpoint -> per-endpoint connection pool + // + // That should be a fairly conteded map, so return reference to the per-endpoint + // pool as early as possible and release the lock. + global_pool: DashMap>>>, + + /// Number of endpoint-connection pools + /// + /// [`DashMap::len`] iterates over all inner pools and acquires a read lock on each. + /// That seems like far too much effort, so we're using a relaxed increment counter instead. + /// It's only used for diagnostics. + global_pool_size: AtomicUsize, + + /// Total number of connections in the pool + global_connections_count: Arc, + + config: &'static crate::config::HttpConfig, +} + +#[derive(Debug, Clone, Copy)] +pub struct GlobalConnPoolOptions { + // Maximum number of connections per one endpoint. + // Can mix different (dbname, username) connections. + // When running out of free slots for a particular endpoint, + // falls back to opening a new connection for each request. + pub max_conns_per_endpoint: usize, + + pub gc_epoch: Duration, + + pub pool_shards: usize, + + pub idle_timeout: Duration, + + pub opt_in: bool, + + // Total number of connections in the pool. 
+ pub max_total_conns: usize, +} + +impl GlobalConnPool { + pub(crate) fn new(config: &'static crate::config::HttpConfig) -> Arc { + let shards = config.pool_options.pool_shards; + Arc::new(Self { + global_pool: DashMap::with_shard_amount(shards), + global_pool_size: AtomicUsize::new(0), + config, + global_connections_count: Arc::new(AtomicUsize::new(0)), + }) + } + + #[cfg(test)] + pub(crate) fn get_global_connections_count(&self) -> usize { + self.global_connections_count + .load(atomic::Ordering::Relaxed) + } + + pub(crate) fn get_idle_timeout(&self) -> Duration { + self.config.pool_options.idle_timeout + } + + pub(crate) fn shutdown(&self) { + // drops all strong references to endpoint-pools + self.global_pool.clear(); + } + + pub(crate) async fn gc_worker(&self, mut rng: impl Rng) { + let epoch = self.config.pool_options.gc_epoch; + let mut interval = tokio::time::interval(epoch / (self.global_pool.shards().len()) as u32); + loop { + interval.tick().await; + + let shard = rng.gen_range(0..self.global_pool.shards().len()); + self.gc(shard); + } + } + + pub(crate) fn gc(&self, shard: usize) { + debug!(shard, "pool: performing epoch reclamation"); + + // acquire a random shard lock + let mut shard = self.global_pool.shards()[shard].write(); + + let timer = Metrics::get() + .proxy + .http_pool_reclaimation_lag_seconds + .start_timer(); + let current_len = shard.len(); + let mut clients_removed = 0; + shard.retain(|endpoint, x| { + // if the current endpoint pool is unique (no other strong or weak references) + // then it is currently not in use by any connections. + if let Some(pool) = Arc::get_mut(x.get_mut()) { + let EndpointConnPool { + pools, total_conns, .. 
+ } = pool.get_mut(); + + // ensure that closed clients are removed + for db_pool in pools.values_mut() { + clients_removed += db_pool.clear_closed_clients(total_conns); + } + + // we only remove this pool if it has no active connections + if *total_conns == 0 { + info!("pool: discarding pool for endpoint {endpoint}"); + return false; + } + } + + true + }); + + let new_len = shard.len(); + drop(shard); + timer.observe(); + + // Do logging outside of the lock. + if clients_removed > 0 { + let size = self + .global_connections_count + .fetch_sub(clients_removed, atomic::Ordering::Relaxed) + - clients_removed; + Metrics::get() + .proxy + .http_pool_opened_connections + .get_metric() + .dec_by(clients_removed as i64); + info!("pool: performed global pool gc. removed {clients_removed} clients, total number of clients in pool is {size}"); + } + let removed = current_len - new_len; + + if removed > 0 { + let global_pool_size = self + .global_pool_size + .fetch_sub(removed, atomic::Ordering::Relaxed) + - removed; + info!("pool: performed global pool gc. 
size now {global_pool_size}"); + } + } + + pub(crate) fn get_or_create_endpoint_pool( + self: &Arc, + endpoint: &EndpointCacheKey, + ) -> Arc>> { + // fast path + if let Some(pool) = self.global_pool.get(endpoint) { + return pool.clone(); + } + + // slow path + let new_pool = Arc::new(RwLock::new(EndpointConnPool { + pools: HashMap::new(), + total_conns: 0, + max_conns: self.config.pool_options.max_conns_per_endpoint, + _guard: Metrics::get().proxy.http_endpoint_pools.guard(), + global_connections_count: self.global_connections_count.clone(), + global_pool_size_max_conns: self.config.pool_options.max_total_conns, + })); + + // find or create a pool for this endpoint + let mut created = false; + let pool = self + .global_pool + .entry(endpoint.clone()) + .or_insert_with(|| { + created = true; + new_pool + }) + .clone(); + + // log new global pool size + if created { + let global_pool_size = self + .global_pool_size + .fetch_add(1, atomic::Ordering::Relaxed) + + 1; + info!( + "pool: created new pool for '{endpoint}', global pool size now {global_pool_size}" + ); + } + + pool + } + + pub(crate) fn get( + self: &Arc, + ctx: &RequestMonitoring, + conn_info: &ConnInfo, + ) -> Result>, HttpConnError> { + let mut client: Option> = None; + let Some(endpoint) = conn_info.endpoint_cache_key() else { + return Ok(None); + }; + + let endpoint_pool = self.get_or_create_endpoint_pool(&endpoint); + if let Some(entry) = endpoint_pool + .write() + .get_conn_entry(conn_info.db_and_user()) + { + client = Some(entry.conn); + } + let endpoint_pool = Arc::downgrade(&endpoint_pool); + + // ok return cached connection if found and establish a new one otherwise + if let Some(mut client) = client { + if client.is_closed() { + info!("pool: cached connection '{conn_info}' is closed, opening a new one"); + return Ok(None); + } + tracing::Span::current() + .record("conn_id", tracing::field::display(client.get_conn_id())); + tracing::Span::current().record( + "pid", + 
tracing::field::display(client.inner().get_process_id()), + ); + info!( + cold_start_info = ColdStartInfo::HttpPoolHit.as_str(), + "pool: reusing connection '{conn_info}'" + ); + + client.session().send(ctx.session_id())?; + ctx.set_cold_start_info(ColdStartInfo::HttpPoolHit); + ctx.success(); + return Ok(Some(Client::new(client, conn_info.clone(), endpoint_pool))); + } + Ok(None) + } +} + +impl Client { + pub(crate) fn new( + inner: ClientInnerRemote, + conn_info: ConnInfo, + pool: Weak>>, + ) -> Self { + Self { + inner: Some(inner), + span: Span::current(), + conn_info, + pool, + } + } + + pub(crate) fn inner_mut(&mut self) -> (&mut C, Discard<'_, C>) { + let Self { + inner, + pool, + conn_info, + span: _, + } = self; + let inner = inner.as_mut().expect("client inner should not be removed"); + let inner_ref = inner.inner_mut(); + (inner_ref, Discard { conn_info, pool }) + } + + pub(crate) fn metrics(&self) -> Arc { + let aux = &self.inner.as_ref().unwrap().aux(); + USAGE_METRICS.register(Ids { + endpoint_id: aux.endpoint_id, + branch_id: aux.branch_id, + }) + } + + pub(crate) fn do_drop(&mut self) -> Option> { + let conn_info = self.conn_info.clone(); + let client = self + .inner + .take() + .expect("client inner should not be removed"); + if let Some(conn_pool) = std::mem::take(&mut self.pool).upgrade() { + let current_span = self.span.clone(); + // return connection to the pool + return Some(move || { + let _span = current_span.enter(); + EndpointConnPool::put(&conn_pool, &conn_info, client); + }); + } + None + } +} + +pub(crate) struct Client { + span: Span, + inner: Option>, + conn_info: ConnInfo, + pool: Weak>>, +} + +impl Drop for Client { + fn drop(&mut self) { + if let Some(drop) = self.do_drop() { + tokio::task::spawn_blocking(drop); + } + } +} + +impl Deref for Client { + type Target = C; + + fn deref(&self) -> &Self::Target { + self.inner + .as_ref() + .expect("client inner should not be removed") + .inner() + } +} + +pub(crate) trait ClientInnerExt: 
Sync + Send + 'static { + fn is_closed(&self) -> bool; + fn get_process_id(&self) -> i32; +} + +impl ClientInnerExt for tokio_postgres::Client { + fn is_closed(&self) -> bool { + self.is_closed() + } + + fn get_process_id(&self) -> i32 { + self.get_process_id() + } +} + +pub(crate) struct Discard<'a, C: ClientInnerExt> { + conn_info: &'a ConnInfo, + pool: &'a mut Weak>>, +} + +impl Discard<'_, C> { + pub(crate) fn check_idle(&mut self, status: ReadyForQueryStatus) { + let conn_info = &self.conn_info; + if status != ReadyForQueryStatus::Idle && std::mem::take(self.pool).strong_count() > 0 { + info!("pool: throwing away connection '{conn_info}' because connection is not idle"); + } + } + pub(crate) fn discard(&mut self) { + let conn_info = &self.conn_info; + if std::mem::take(self.pool).strong_count() > 0 { + info!("pool: throwing away connection '{conn_info}' because connection is potentially in a broken state"); + } + } +} diff --git a/proxy/src/serverless/http_conn_pool.rs b/proxy/src/serverless/http_conn_pool.rs index 9b6bc98557..934a50c14f 100644 --- a/proxy/src/serverless/http_conn_pool.rs +++ b/proxy/src/serverless/http_conn_pool.rs @@ -10,27 +10,27 @@ use rand::Rng; use tokio::net::TcpStream; use tracing::{debug, error, info, info_span, Instrument}; -use super::conn_pool::ConnInfo; +use super::conn_pool_lib::{ClientInnerExt, ConnInfo}; use crate::context::RequestMonitoring; use crate::control_plane::messages::{ColdStartInfo, MetricsAuxInfo}; use crate::metrics::{HttpEndpointPoolsGuard, Metrics}; +use crate::types::EndpointCacheKey; use crate::usage_metrics::{Ids, MetricCounter, USAGE_METRICS}; -use crate::EndpointCacheKey; pub(crate) type Send = http2::SendRequest; pub(crate) type Connect = http2::Connection, hyper::body::Incoming, TokioExecutor>; #[derive(Clone)] -struct ConnPoolEntry { - conn: Send, +pub(crate) struct ConnPoolEntry { + conn: C, conn_id: uuid::Uuid, aux: MetricsAuxInfo, } // Per-endpoint connection pool // Number of open connections is 
limited by the `max_conns_per_endpoint`. -pub(crate) struct EndpointConnPool { +pub(crate) struct EndpointConnPool { // TODO(conrad): // either we should open more connections depending on stream count // (not exposed by hyper, need our own counter) @@ -40,13 +40,13 @@ pub(crate) struct EndpointConnPool { // seems somewhat redundant though. // // Probably we should run a semaphore and just the single conn. TBD. - conns: VecDeque, + conns: VecDeque>, _guard: HttpEndpointPoolsGuard<'static>, global_connections_count: Arc, } -impl EndpointConnPool { - fn get_conn_entry(&mut self) -> Option { +impl EndpointConnPool { + fn get_conn_entry(&mut self) -> Option> { let Self { conns, .. } = self; loop { @@ -81,7 +81,7 @@ impl EndpointConnPool { } } -impl Drop for EndpointConnPool { +impl Drop for EndpointConnPool { fn drop(&mut self) { if !self.conns.is_empty() { self.global_connections_count @@ -95,12 +95,12 @@ impl Drop for EndpointConnPool { } } -pub(crate) struct GlobalConnPool { +pub(crate) struct GlobalConnPool { // endpoint -> per-endpoint connection pool // // That should be a fairly conteded map, so return reference to the per-endpoint // pool as early as possible and release the lock. 
- global_pool: DashMap>>, + global_pool: DashMap>>>, /// Number of endpoint-connection pools /// @@ -115,7 +115,7 @@ pub(crate) struct GlobalConnPool { config: &'static crate::config::HttpConfig, } -impl GlobalConnPool { +impl GlobalConnPool { pub(crate) fn new(config: &'static crate::config::HttpConfig) -> Arc { let shards = config.pool_options.pool_shards; Arc::new(Self { @@ -210,7 +210,7 @@ impl GlobalConnPool { self: &Arc, ctx: &RequestMonitoring, conn_info: &ConnInfo, - ) -> Option { + ) -> Option> { let endpoint = conn_info.endpoint_cache_key()?; let endpoint_pool = self.get_or_create_endpoint_pool(&endpoint); let client = endpoint_pool.write().get_conn_entry()?; @@ -228,7 +228,7 @@ impl GlobalConnPool { fn get_or_create_endpoint_pool( self: &Arc, endpoint: &EndpointCacheKey, - ) -> Arc> { + ) -> Arc>> { // fast path if let Some(pool) = self.global_pool.get(endpoint) { return pool.clone(); @@ -268,14 +268,14 @@ impl GlobalConnPool { } pub(crate) fn poll_http2_client( - global_pool: Arc, + global_pool: Arc>, ctx: &RequestMonitoring, conn_info: &ConnInfo, client: Send, connection: Connect, conn_id: uuid::Uuid, aux: MetricsAuxInfo, -) -> Client { +) -> Client { let conn_gauge = Metrics::get().proxy.db_connections.guard(ctx.protocol()); let session_id = ctx.session_id(); @@ -322,13 +322,13 @@ pub(crate) fn poll_http2_client( Client::new(client, aux) } -pub(crate) struct Client { - pub(crate) inner: Send, +pub(crate) struct Client { + pub(crate) inner: C, aux: MetricsAuxInfo, } -impl Client { - pub(self) fn new(inner: Send, aux: MetricsAuxInfo) -> Self { +impl Client { + pub(self) fn new(inner: C, aux: MetricsAuxInfo) -> Self { Self { inner, aux } } @@ -339,3 +339,14 @@ impl Client { }) } } + +impl ClientInnerExt for Send { + fn is_closed(&self) -> bool { + self.is_closed() + } + + fn get_process_id(&self) -> i32 { + // ideally throw something meaningful + -1 + } +} diff --git a/proxy/src/serverless/json.rs b/proxy/src/serverless/json.rs index 
8c56d317cc..569e2da571 100644 --- a/proxy/src/serverless/json.rs +++ b/proxy/src/serverless/json.rs @@ -155,10 +155,10 @@ fn pg_text_to_json(pg_value: Option<&str>, pg_type: &Type) -> Result Result { - _pg_array_parse(pg_array, elem_type, false).map(|(v, _)| v) + pg_array_parse_inner(pg_array, elem_type, false).map(|(v, _)| v) } -fn _pg_array_parse( +fn pg_array_parse_inner( pg_array: &str, elem_type: &Type, nested: bool, @@ -211,7 +211,7 @@ fn _pg_array_parse( '{' if !quote => { level += 1; if level > 1 { - let (res, off) = _pg_array_parse(&pg_array[i..], elem_type, true)?; + let (res, off) = pg_array_parse_inner(&pg_array[i..], elem_type, true)?; entries.push(res); for _ in 0..off - 1 { pg_array_chr.next(); diff --git a/proxy/src/serverless/local_conn_pool.rs b/proxy/src/serverless/local_conn_pool.rs index 5df37a8762..064e7db7b3 100644 --- a/proxy/src/serverless/local_conn_pool.rs +++ b/proxy/src/serverless/local_conn_pool.rs @@ -1,3 +1,14 @@ +//! Manages the pool of connections between local_proxy and postgres. +//! +//! The pool is keyed by database and role_name, and can contain multiple connections +//! shared between users. +//! +//! The pool manages the pg_session_jwt extension used for authorizing +//! requests in the db. +//! +//! The first time a db/role pair is seen, local_proxy attempts to install the extension +//! and grant usage to the role on the given schema. 
+ use std::collections::HashMap; use std::pin::pin; use std::sync::{Arc, Weak}; @@ -20,21 +31,22 @@ use tokio_util::sync::CancellationToken; use tracing::{error, info, info_span, warn, Instrument, Span}; use super::backend::HttpConnError; -use super::conn_pool::{ClientInnerExt, ConnInfo}; +use super::conn_pool_lib::{ClientInnerExt, ConnInfo}; use crate::context::RequestMonitoring; use crate::control_plane::messages::{ColdStartInfo, MetricsAuxInfo}; use crate::metrics::Metrics; +use crate::types::{DbName, RoleName}; use crate::usage_metrics::{Ids, MetricCounter, USAGE_METRICS}; -use crate::{DbName, RoleName}; + +pub(crate) const EXT_NAME: &str = "pg_session_jwt"; +pub(crate) const EXT_VERSION: &str = "0.1.2"; +pub(crate) const EXT_SCHEMA: &str = "auth"; struct ConnPoolEntry { conn: ClientInner, _last_access: std::time::Instant, } -// /// key id for the pg_session_jwt state -// static PG_SESSION_JWT_KID: AtomicU64 = AtomicU64::new(1); - // Per-endpoint connection pool, (dbname, username) -> DbUserConnPool // Number of open connections is limited by the `max_conns_per_endpoint`. pub(crate) struct EndpointConnPool { @@ -140,11 +152,18 @@ impl Drop for EndpointConnPool { pub(crate) struct DbUserConnPool { conns: Vec>, + + // true if we have definitely installed the extension and + // granted the role access to the auth schema. 
+ initialized: bool, } impl Default for DbUserConnPool { fn default() -> Self { - Self { conns: Vec::new() } + Self { + conns: Vec::new(), + initialized: false, + } } } @@ -199,25 +218,16 @@ impl LocalConnPool { self.config.pool_options.idle_timeout } - // pub(crate) fn shutdown(&self) { - // let mut pool = self.global_pool.write(); - // pool.pools.clear(); - // pool.total_conns = 0; - // } - pub(crate) fn get( self: &Arc, ctx: &RequestMonitoring, conn_info: &ConnInfo, ) -> Result>, HttpConnError> { - let mut client: Option> = None; - if let Some(entry) = self + let client = self .global_pool .write() .get_conn_entry(conn_info.db_and_user()) - { - client = Some(entry.conn); - } + .map(|entry| entry.conn); // ok return cached connection if found and establish a new one otherwise if let Some(client) = client { @@ -245,6 +255,23 @@ impl LocalConnPool { } Ok(None) } + + pub(crate) fn initialized(self: &Arc, conn_info: &ConnInfo) -> bool { + self.global_pool + .read() + .pools + .get(&conn_info.db_and_user()) + .map_or(false, |pool| pool.initialized) + } + + pub(crate) fn set_initialized(self: &Arc, conn_info: &ConnInfo) { + self.global_pool + .write() + .pools + .entry(conn_info.db_and_user()) + .or_default() + .initialized = true; + } } #[allow(clippy::too_many_arguments)] @@ -362,7 +389,7 @@ pub(crate) fn poll_client( LocalClient::new(inner, conn_info, pool_clone) } -struct ClientInner { +pub(crate) struct ClientInner { inner: C, session: tokio::sync::watch::Sender, cancel: CancellationToken, @@ -387,13 +414,24 @@ impl ClientInner { } } -impl LocalClient { - pub(crate) fn metrics(&self) -> Arc { - let aux = &self.inner.as_ref().unwrap().aux; - USAGE_METRICS.register(Ids { - endpoint_id: aux.endpoint_id, - branch_id: aux.branch_id, - }) +impl ClientInner { + pub(crate) async fn set_jwt_session(&mut self, payload: &[u8]) -> Result<(), HttpConnError> { + self.jti += 1; + let token = resign_jwt(&self.key, payload, self.jti)?; + + // initiates the auth session + 
self.inner.simple_query("discard all").await?; + self.inner + .query( + "select auth.jwt_session_init($1)", + &[&token as &(dyn ToSql + Sync)], + ) + .await?; + + let pid = self.inner.get_process_id(); + info!(pid, jti = self.jti, "user session state init"); + + Ok(()) } } @@ -422,6 +460,18 @@ impl LocalClient { pool, } } + + pub(crate) fn client_inner(&mut self) -> (&mut ClientInner, Discard<'_, C>) { + let Self { + inner, + pool, + conn_info, + span: _, + } = self; + let inner_m = inner.as_mut().expect("client inner should not be removed"); + (inner_m, Discard { conn_info, pool }) + } + pub(crate) fn inner(&mut self) -> (&mut C, Discard<'_, C>) { let Self { inner, @@ -434,33 +484,6 @@ impl LocalClient { } } -impl LocalClient { - pub(crate) async fn set_jwt_session(&mut self, payload: &[u8]) -> Result<(), HttpConnError> { - let inner = self - .inner - .as_mut() - .expect("client inner should not be removed"); - - inner.jti += 1; - let token = resign_jwt(&inner.key, payload, inner.jti)?; - - // initiates the auth session - inner.inner.simple_query("discard all").await?; - inner - .inner - .query( - "select auth.jwt_session_init($1)", - &[&token as &(dyn ToSql + Sync)], - ) - .await?; - - let pid = inner.inner.get_process_id(); - info!(pid, jti = inner.jti, "user session state init"); - - Ok(()) - } -} - /// implements relatively efficient in-place json object key upserting /// /// only supports top-level keys @@ -524,25 +547,16 @@ fn sign_jwt(sk: &SigningKey, payload: &[u8]) -> String { jwt } -impl Discard<'_, C> { - pub(crate) fn check_idle(&mut self, status: ReadyForQueryStatus) { - let conn_info = &self.conn_info; - if status != ReadyForQueryStatus::Idle && std::mem::take(self.pool).strong_count() > 0 { - info!( - "local_pool: throwing away connection '{conn_info}' because connection is not idle" - ); - } - } - pub(crate) fn discard(&mut self) { - let conn_info = &self.conn_info; - if std::mem::take(self.pool).strong_count() > 0 { - info!("local_pool: throwing 
away connection '{conn_info}' because connection is potentially in a broken state"); - } - } -} - impl LocalClient { - fn do_drop(&mut self) -> Option { + pub(crate) fn metrics(&self) -> Arc { + let aux = &self.inner.as_ref().unwrap().aux; + USAGE_METRICS.register(Ids { + endpoint_id: aux.endpoint_id, + branch_id: aux.branch_id, + }) + } + + fn do_drop(&mut self) -> Option> { let conn_info = self.conn_info.clone(); let client = self .inner @@ -568,6 +582,23 @@ impl Drop for LocalClient { } } +impl Discard<'_, C> { + pub(crate) fn check_idle(&mut self, status: ReadyForQueryStatus) { + let conn_info = &self.conn_info; + if status != ReadyForQueryStatus::Idle && std::mem::take(self.pool).strong_count() > 0 { + info!( + "local_pool: throwing away connection '{conn_info}' because connection is not idle" + ); + } + } + pub(crate) fn discard(&mut self) { + let conn_info = &self.conn_info; + if std::mem::take(self.pool).strong_count() > 0 { + info!("local_pool: throwing away connection '{conn_info}' because connection is potentially in a broken state"); + } + } +} + #[cfg(test)] mod tests { use p256::ecdsa::SigningKey; diff --git a/proxy/src/serverless/mod.rs b/proxy/src/serverless/mod.rs index 3ed3b6c845..29ff7b9d91 100644 --- a/proxy/src/serverless/mod.rs +++ b/proxy/src/serverless/mod.rs @@ -5,6 +5,7 @@ mod backend; pub mod cancel_set; mod conn_pool; +mod conn_pool_lib; mod http_conn_pool; mod http_util; mod json; @@ -20,7 +21,7 @@ use anyhow::Context; use async_trait::async_trait; use atomic_take::AtomicTake; use bytes::Bytes; -pub use conn_pool::GlobalConnPoolOptions; +pub use conn_pool_lib::GlobalConnPoolOptions; use futures::future::{select, Either}; use futures::TryFutureExt; use http::{Method, Response, StatusCode}; @@ -65,7 +66,7 @@ pub async fn task_main( } let local_pool = local_conn_pool::LocalConnPool::new(&config.http_config); - let conn_pool = conn_pool::GlobalConnPool::new(&config.http_config); + let conn_pool = 
conn_pool_lib::GlobalConnPool::new(&config.http_config); { let conn_pool = Arc::clone(&conn_pool); tokio::spawn(async move { diff --git a/proxy/src/serverless/sql_over_http.rs b/proxy/src/serverless/sql_over_http.rs index 3d8a2adef1..8e2d4c126a 100644 --- a/proxy/src/serverless/sql_over_http.rs +++ b/proxy/src/serverless/sql_over_http.rs @@ -25,10 +25,11 @@ use urlencoding; use utils::http::error::ApiError; use super::backend::{LocalProxyConnError, PoolingBackend}; -use super::conn_pool::{AuthData, ConnInfo, ConnInfoWithAuth}; +use super::conn_pool::{AuthData, ConnInfoWithAuth}; +use super::conn_pool_lib::{self, ConnInfo}; use super::http_util::json_response; use super::json::{json_to_pg_text, pg_text_row_to_json, JsonConversionError}; -use super::{conn_pool, local_conn_pool}; +use super::local_conn_pool; use crate::auth::backend::{ComputeCredentialKeys, ComputeUserInfo}; use crate::auth::{endpoint_sni, ComputeUserInfoParseError}; use crate::config::{AuthenticationConfig, HttpConfig, ProxyConfig, TlsConfig}; @@ -37,8 +38,8 @@ use crate::error::{ErrorKind, ReportableError, UserFacingError}; use crate::metrics::{HttpDirection, Metrics}; use crate::proxy::{run_until_cancelled, NeonOptions}; use crate::serverless::backend::HttpConnError; +use crate::types::{DbName, RoleName}; use crate::usage_metrics::{MetricCounter, MetricCounterRecorder}; -use crate::{DbName, RoleName}; #[derive(serde::Deserialize)] #[serde(rename_all = "camelCase")] @@ -607,7 +608,8 @@ async fn handle_db_inner( let client = match keys.keys { ComputeCredentialKeys::JwtPayload(payload) if is_local_proxy => { let mut client = backend.connect_to_local_postgres(ctx, conn_info).await?; - client.set_jwt_session(&payload).await?; + let (cli_inner, _dsc) = client.client_inner(); + cli_inner.set_jwt_session(&payload).await?; Client::Local(client) } _ => { @@ -1021,12 +1023,12 @@ async fn query_to_json( } enum Client { - Remote(conn_pool::Client), + Remote(conn_pool_lib::Client), 
Local(local_conn_pool::LocalClient), } enum Discard<'a> { - Remote(conn_pool::Discard<'a, tokio_postgres::Client>), + Remote(conn_pool_lib::Discard<'a, tokio_postgres::Client>), Local(local_conn_pool::Discard<'a, tokio_postgres::Client>), } @@ -1041,7 +1043,7 @@ impl Client { fn inner(&mut self) -> (&mut tokio_postgres::Client, Discard<'_>) { match self { Client::Remote(client) => { - let (c, d) = client.inner(); + let (c, d) = client.inner_mut(); (c, Discard::Remote(d)) } Client::Local(local_client) => { diff --git a/proxy/src/signals.rs b/proxy/src/signals.rs new file mode 100644 index 0000000000..514a83d5eb --- /dev/null +++ b/proxy/src/signals.rs @@ -0,0 +1,39 @@ +use std::convert::Infallible; + +use anyhow::bail; +use tokio_util::sync::CancellationToken; +use tracing::warn; + +/// Handle unix signals appropriately. +pub async fn handle( + token: CancellationToken, + mut refresh_config: F, +) -> anyhow::Result +where + F: FnMut(), +{ + use tokio::signal::unix::{signal, SignalKind}; + + let mut hangup = signal(SignalKind::hangup())?; + let mut interrupt = signal(SignalKind::interrupt())?; + let mut terminate = signal(SignalKind::terminate())?; + + loop { + tokio::select! { + // Hangup is commonly used for config reload. + _ = hangup.recv() => { + warn!("received SIGHUP"); + refresh_config(); + } + // Shut down the whole application. + _ = interrupt.recv() => { + warn!("received SIGINT, exiting immediately"); + bail!("interrupted"); + } + _ = terminate.recv() => { + warn!("received SIGTERM, shutting down once all existing connections have closed"); + token.cancel(); + } + } + } +} diff --git a/proxy/src/types.rs b/proxy/src/types.rs new file mode 100644 index 0000000000..b0408a51d1 --- /dev/null +++ b/proxy/src/types.rs @@ -0,0 +1,122 @@ +use crate::intern::{EndpointIdInt, EndpointIdTag, InternId}; + +macro_rules! 
smol_str_wrapper { + ($name:ident) => { + #[derive(Clone, Debug, PartialEq, Eq, PartialOrd, Ord, Hash, Default)] + pub struct $name(smol_str::SmolStr); + + impl $name { + #[allow(unused)] + pub(crate) fn as_str(&self) -> &str { + self.0.as_str() + } + } + + impl std::fmt::Display for $name { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + self.0.fmt(f) + } + } + + impl std::cmp::PartialEq for $name + where + smol_str::SmolStr: std::cmp::PartialEq, + { + fn eq(&self, other: &T) -> bool { + self.0.eq(other) + } + } + + impl From for $name + where + smol_str::SmolStr: From, + { + fn from(x: T) -> Self { + Self(x.into()) + } + } + + impl AsRef for $name { + fn as_ref(&self) -> &str { + self.0.as_ref() + } + } + + impl std::ops::Deref for $name { + type Target = str; + fn deref(&self) -> &str { + &*self.0 + } + } + + impl<'de> serde::de::Deserialize<'de> for $name { + fn deserialize>(d: D) -> Result { + >::deserialize(d).map(Self) + } + } + + impl serde::Serialize for $name { + fn serialize(&self, s: S) -> Result { + self.0.serialize(s) + } + } + }; +} + +const POOLER_SUFFIX: &str = "-pooler"; + +impl EndpointId { + #[must_use] + pub fn normalize(&self) -> Self { + if let Some(stripped) = self.as_ref().strip_suffix(POOLER_SUFFIX) { + stripped.into() + } else { + self.clone() + } + } + + #[must_use] + pub fn normalize_intern(&self) -> EndpointIdInt { + if let Some(stripped) = self.as_ref().strip_suffix(POOLER_SUFFIX) { + EndpointIdTag::get_interner().get_or_intern(stripped) + } else { + self.into() + } + } +} + +// 90% of role name strings are 20 characters or less. +smol_str_wrapper!(RoleName); +// 50% of endpoint strings are 23 characters or less. +smol_str_wrapper!(EndpointId); +// 50% of branch strings are 23 characters or less. +smol_str_wrapper!(BranchId); +// 90% of project strings are 23 characters or less. 
+smol_str_wrapper!(ProjectId); + +// will usually equal endpoint ID +smol_str_wrapper!(EndpointCacheKey); + +smol_str_wrapper!(DbName); + +// postgres hostname, will likely be a port:ip addr +smol_str_wrapper!(Host); + +// Endpoints are a bit tricky. Rare they might be branches or projects. +impl EndpointId { + pub(crate) fn is_endpoint(&self) -> bool { + self.0.starts_with("ep-") + } + pub(crate) fn is_branch(&self) -> bool { + self.0.starts_with("br-") + } + // pub(crate) fn is_project(&self) -> bool { + // !self.is_endpoint() && !self.is_branch() + // } + pub(crate) fn as_branch(&self) -> BranchId { + BranchId(self.0.clone()) + } + pub(crate) fn as_project(&self) -> ProjectId { + ProjectId(self.0.clone()) + } +} diff --git a/proxy/src/usage_metrics.rs b/proxy/src/usage_metrics.rs index c5384c0b0e..c5e8588623 100644 --- a/proxy/src/usage_metrics.rs +++ b/proxy/src/usage_metrics.rs @@ -375,7 +375,7 @@ pub async fn task_backup( let now = Utc::now(); collect_metrics_backup_iteration( &USAGE_METRICS.backup_endpoints, - &storage, + storage.as_ref(), &hostname, prev, now, @@ -395,7 +395,7 @@ pub async fn task_backup( #[instrument(skip_all)] async fn collect_metrics_backup_iteration( endpoints: &DashMap, FastHasher>, - storage: &Option, + storage: Option<&GenericRemoteStorage>, hostname: &str, prev: DateTime, now: DateTime, @@ -446,7 +446,7 @@ async fn collect_metrics_backup_iteration( } async fn upload_events_chunk( - storage: &Option, + storage: Option<&GenericRemoteStorage>, chunk: EventChunk<'_, Event>, remote_path: &RemotePath, cancel: &CancellationToken, @@ -497,7 +497,8 @@ mod tests { use url::Url; use super::*; - use crate::{http, BranchId, EndpointId}; + use crate::http; + use crate::types::{BranchId, EndpointId}; #[tokio::test] async fn metrics() { @@ -577,10 +578,10 @@ mod tests { // counter is unregistered assert!(metrics.endpoints.is_empty()); - collect_metrics_backup_iteration(&metrics.backup_endpoints, &None, "foo", now, now, 1000) + 
collect_metrics_backup_iteration(&metrics.backup_endpoints, None, "foo", now, now, 1000) .await; assert!(!metrics.backup_endpoints.is_empty()); - collect_metrics_backup_iteration(&metrics.backup_endpoints, &None, "foo", now, now, 1000) + collect_metrics_backup_iteration(&metrics.backup_endpoints, None, "foo", now, now, 1000) .await; // backup counter is unregistered after the second iteration assert!(metrics.backup_endpoints.is_empty()); diff --git a/proxy/src/waiters.rs b/proxy/src/waiters.rs index 7e07f6a2af..330e73f02f 100644 --- a/proxy/src/waiters.rs +++ b/proxy/src/waiters.rs @@ -73,7 +73,7 @@ struct DropKey<'a, T> { registry: &'a Waiters, } -impl<'a, T> Drop for DropKey<'a, T> { +impl Drop for DropKey<'_, T> { fn drop(&mut self) { self.registry.0.lock().remove(&self.key); } diff --git a/pyproject.toml b/pyproject.toml index 9cd315bb96..862ed49638 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -45,7 +45,7 @@ kafka-python = "^2.0.2" [tool.poetry.group.dev.dependencies] mypy = "==1.3.0" -ruff = "^0.2.2" +ruff = "^0.7.0" [build-system] requires = ["poetry-core>=1.0.0"] diff --git a/rust-toolchain.toml b/rust-toolchain.toml index 3c5d0b12a6..92b7929c7f 100644 --- a/rust-toolchain.toml +++ b/rust-toolchain.toml @@ -1,5 +1,5 @@ [toolchain] -channel = "1.81.0" +channel = "1.82.0" profile = "default" # The default profile includes rustc, rust-std, cargo, rust-docs, rustfmt and clippy. # https://rust-lang.github.io/rustup/concepts/profiles.html diff --git a/safekeeper/src/control_file.rs b/safekeeper/src/control_file.rs index 8b252b4ab4..cd82e43780 100644 --- a/safekeeper/src/control_file.rs +++ b/safekeeper/src/control_file.rs @@ -66,22 +66,25 @@ impl FileStorage { }) } - /// Create file storage for a new timeline, but don't persist it yet. - pub fn create_new( - timeline_dir: Utf8PathBuf, + /// Create and reliably persist new control file at given location. 
+ /// + /// Note: we normally call this in temp directory for atomic init, so + /// interested in FileStorage as a result only in tests. + pub async fn create_new( + dir: Utf8PathBuf, conf: &SafeKeeperConf, state: TimelinePersistentState, ) -> Result { // we don't support creating new timelines in offloaded state assert!(matches!(state.eviction_state, EvictionState::Present)); - let store = FileStorage { - timeline_dir, + let mut store = FileStorage { + timeline_dir: dir, no_sync: conf.no_sync, - state, + state: state.clone(), last_persist_at: Instant::now(), }; - + store.persist(&state).await?; Ok(store) } @@ -190,8 +193,6 @@ impl TimelinePersistentState { impl Storage for FileStorage { /// Persists state durably to the underlying storage. - /// - /// For a description, see . async fn persist(&mut self, s: &TimelinePersistentState) -> Result<()> { let _timer = PERSIST_CONTROL_FILE_SECONDS.start_timer(); @@ -269,7 +270,7 @@ mod test { .await .expect("failed to create timeline dir"); let state = TimelinePersistentState::empty(); - let storage = FileStorage::create_new(timeline_dir, conf, state.clone())?; + let storage = FileStorage::create_new(timeline_dir, conf, state.clone()).await?; Ok((storage, state)) } diff --git a/safekeeper/src/copy_timeline.rs b/safekeeper/src/copy_timeline.rs index 220988c3ce..52b13dc5e3 100644 --- a/safekeeper/src/copy_timeline.rs +++ b/safekeeper/src/copy_timeline.rs @@ -12,10 +12,10 @@ use tracing::{info, warn}; use utils::{id::TenantTimelineId, lsn::Lsn}; use crate::{ - control_file::{FileStorage, Storage}, - pull_timeline::{create_temp_timeline_dir, load_temp_timeline, validate_temp_timeline}, + control_file::FileStorage, state::TimelinePersistentState, timeline::{Timeline, TimelineError, WalResidentTimeline}, + timelines_global_map::{create_temp_timeline_dir, validate_temp_timeline}, wal_backup::copy_s3_segments, wal_storage::{wal_file_paths, WalReader}, GlobalTimelines, @@ -149,17 +149,16 @@ pub async fn handle_request(request: 
Request) -> Result<()> { vec![], request.until_lsn, start_lsn, - ); + )?; new_state.timeline_start_lsn = start_lsn; new_state.peer_horizon_lsn = request.until_lsn; new_state.backup_lsn = new_backup_lsn; - let mut file_storage = FileStorage::create_new(tli_dir_path.clone(), conf, new_state.clone())?; - file_storage.persist(&new_state).await?; + FileStorage::create_new(tli_dir_path.clone(), conf, new_state.clone()).await?; // now we have a ready timeline in a temp directory validate_temp_timeline(conf, request.destination_ttid, &tli_dir_path).await?; - load_temp_timeline(conf, request.destination_ttid, &tli_dir_path).await?; + GlobalTimelines::load_temp_timeline(request.destination_ttid, &tli_dir_path, true).await?; Ok(()) } diff --git a/safekeeper/src/pull_timeline.rs b/safekeeper/src/pull_timeline.rs index c772ae6de7..c7f5165f90 100644 --- a/safekeeper/src/pull_timeline.rs +++ b/safekeeper/src/pull_timeline.rs @@ -1,7 +1,6 @@ use anyhow::{anyhow, bail, Context, Result}; use bytes::Bytes; use camino::Utf8PathBuf; -use camino_tempfile::Utf8TempDir; use chrono::{DateTime, Utc}; use futures::{SinkExt, StreamExt, TryStreamExt}; use postgres_ffi::{XLogFileName, XLogSegNo, PG_TLI}; @@ -9,7 +8,6 @@ use serde::{Deserialize, Serialize}; use std::{ cmp::min, io::{self, ErrorKind}, - sync::Arc, }; use tokio::{fs::OpenOptions, io::AsyncWrite, sync::mpsc, task}; use tokio_tar::{Archive, Builder, Header}; @@ -20,7 +18,7 @@ use tokio_util::{ use tracing::{error, info, instrument}; use crate::{ - control_file::{self, CONTROL_FILE_NAME}, + control_file::CONTROL_FILE_NAME, debug_dump, http::{ client::{self, Client}, @@ -28,13 +26,14 @@ use crate::{ }, safekeeper::Term, state::TimelinePersistentState, - timeline::{get_tenant_dir, get_timeline_dir, Timeline, TimelineError, WalResidentTimeline}, + timeline::WalResidentTimeline, + timelines_global_map::{create_temp_timeline_dir, validate_temp_timeline}, wal_backup, - wal_storage::{self, open_wal_file, Storage}, - GlobalTimelines, 
SafeKeeperConf, + wal_storage::open_wal_file, + GlobalTimelines, }; use utils::{ - crashsafe::{durable_rename, fsync_async_opt}, + crashsafe::fsync_async_opt, id::{NodeId, TenantId, TenantTimelineId, TimelineId}, logging::SecretString, lsn::Lsn, @@ -428,100 +427,9 @@ async fn pull_timeline( assert!(status.commit_lsn <= status.flush_lsn); // Finally, load the timeline. - let _tli = load_temp_timeline(conf, ttid, &tli_dir_path).await?; + let _tli = GlobalTimelines::load_temp_timeline(ttid, &tli_dir_path, false).await?; Ok(Response { safekeeper_host: host, }) } - -/// Create temp directory for a new timeline. It needs to be located on the same -/// filesystem as the rest of the timelines. It will be automatically deleted when -/// Utf8TempDir goes out of scope. -pub async fn create_temp_timeline_dir( - conf: &SafeKeeperConf, - ttid: TenantTimelineId, -) -> Result<(Utf8TempDir, Utf8PathBuf)> { - // conf.workdir is usually /storage/safekeeper/data - // will try to transform it into /storage/safekeeper/tmp - let temp_base = conf - .workdir - .parent() - .ok_or(anyhow::anyhow!("workdir has no parent"))? - .join("tmp"); - - tokio::fs::create_dir_all(&temp_base).await?; - - let tli_dir = camino_tempfile::Builder::new() - .suffix("_temptli") - .prefix(&format!("{}_{}_", ttid.tenant_id, ttid.timeline_id)) - .tempdir_in(temp_base)?; - - let tli_dir_path = tli_dir.path().to_path_buf(); - - Ok((tli_dir, tli_dir_path)) -} - -/// Do basic validation of a temp timeline, before moving it to the global map. 
-pub async fn validate_temp_timeline( - conf: &SafeKeeperConf, - ttid: TenantTimelineId, - path: &Utf8PathBuf, -) -> Result<(Lsn, Lsn)> { - let control_path = path.join("safekeeper.control"); - - let control_store = control_file::FileStorage::load_control_file(control_path)?; - if control_store.server.wal_seg_size == 0 { - bail!("wal_seg_size is not set"); - } - - let wal_store = wal_storage::PhysicalStorage::new(&ttid, path.clone(), conf, &control_store)?; - - let commit_lsn = control_store.commit_lsn; - let flush_lsn = wal_store.flush_lsn(); - - Ok((commit_lsn, flush_lsn)) -} - -/// Move timeline from a temp directory to the main storage, and load it to the global map. -/// -/// This operation is done under a lock to prevent bugs if several concurrent requests are -/// trying to load the same timeline. Note that it doesn't guard against creating the -/// timeline with the same ttid, but no one should be doing this anyway. -pub async fn load_temp_timeline( - conf: &SafeKeeperConf, - ttid: TenantTimelineId, - tmp_path: &Utf8PathBuf, -) -> Result> { - // Take a lock to prevent concurrent loadings - let load_lock = GlobalTimelines::loading_lock().await; - let guard = load_lock.lock().await; - - if !matches!(GlobalTimelines::get(ttid), Err(TimelineError::NotFound(_))) { - bail!("timeline already exists, cannot overwrite it") - } - - // Move timeline dir to the correct location - let timeline_path = get_timeline_dir(conf, &ttid); - - info!( - "moving timeline {} from {} to {}", - ttid, tmp_path, timeline_path - ); - tokio::fs::create_dir_all(get_tenant_dir(conf, &ttid.tenant_id)).await?; - // fsync tenant dir creation - fsync_async_opt(&conf.workdir, !conf.no_sync).await?; - durable_rename(tmp_path, &timeline_path, !conf.no_sync).await?; - - let tli = GlobalTimelines::load_timeline(&guard, ttid) - .await - .context("Failed to load timeline after copy")?; - - info!( - "loaded timeline {}, flush_lsn={}", - ttid, - tli.get_flush_lsn().await - ); - - Ok(tli) -} diff --git 
a/safekeeper/src/receive_wal.rs b/safekeeper/src/receive_wal.rs index e35f806e90..3dbf72298f 100644 --- a/safekeeper/src/receive_wal.rs +++ b/safekeeper/src/receive_wal.rs @@ -339,7 +339,8 @@ impl<'a, IO: AsyncRead + AsyncWrite + Unpin> NetworkReader<'a, IO> { }; let tli = GlobalTimelines::create(self.ttid, server_info, Lsn::INVALID, Lsn::INVALID) - .await?; + .await + .context("create timeline")?; tli.wal_residence_guard().await? } _ => { @@ -498,21 +499,18 @@ impl WalAcceptor { // we will send keepalives by replying to these requests once per second. let mut next_keepalive = Instant::now(); - loop { - let opt_msg = self.msg_rx.recv().await; - if opt_msg.is_none() { - return Ok(()); // chan closed, streaming terminated - } - let mut next_msg = opt_msg.unwrap(); - + while let Some(mut next_msg) = self.msg_rx.recv().await { // Update walreceiver state in shmem for reporting. if let ProposerAcceptorMessage::Elected(_) = &next_msg { walreceiver_guard.get().status = WalReceiverStatus::Streaming; } let reply_msg = if matches!(next_msg, ProposerAcceptorMessage::AppendRequest(_)) { - // loop through AppendRequest's while it's readily available to - // write as many WAL as possible without fsyncing + // Loop through AppendRequests while available to write as many WAL records as + // possible without fsyncing. + // + // Make sure the WAL is flushed before returning, see: + // https://github.com/neondatabase/neon/issues/9259 // // Note: this will need to be rewritten if we want to read non-AppendRequest messages here. // Otherwise, we might end up in a situation where we read a message, but don't @@ -522,7 +520,7 @@ impl WalAcceptor { if let Some(reply) = self.tli.process_msg(&noflush_msg).await? 
{ if self.reply_tx.send(reply).await.is_err() { - return Ok(()); // chan closed, streaming terminated + break; // disconnected, flush WAL and return on next send/recv } } @@ -531,11 +529,13 @@ impl WalAcceptor { break; } + // continue pulling AppendRequests if available match self.msg_rx.try_recv() { Ok(msg) => next_msg = msg, Err(TryRecvError::Empty) => break, - Err(TryRecvError::Disconnected) => return Ok(()), // chan closed, streaming terminated - } + // on disconnect, flush WAL and return on next send/recv + Err(TryRecvError::Disconnected) => break, + }; } // flush all written WAL to the disk @@ -555,5 +555,6 @@ impl WalAcceptor { next_keepalive = Instant::now() + KEEPALIVE_INTERVAL; } } + Ok(()) } } diff --git a/safekeeper/src/state.rs b/safekeeper/src/state.rs index 8ae749ded5..8dd873ee77 100644 --- a/safekeeper/src/state.rs +++ b/safekeeper/src/state.rs @@ -3,7 +3,7 @@ use std::{cmp::max, ops::Deref}; -use anyhow::Result; +use anyhow::{bail, Result}; use safekeeper_api::models::TimelineTermBumpResponse; use serde::{Deserialize, Serialize}; use utils::{ @@ -13,7 +13,11 @@ use utils::{ use crate::{ control_file, - safekeeper::{AcceptorState, PersistedPeerInfo, PgUuid, ServerInfo, Term, TermHistory}, + safekeeper::{ + AcceptorState, PersistedPeerInfo, PgUuid, ServerInfo, Term, TermHistory, + UNKNOWN_SERVER_VERSION, + }, + timeline::TimelineError, wal_backup_partial::{self}, }; @@ -91,8 +95,24 @@ impl TimelinePersistentState { peers: Vec, commit_lsn: Lsn, local_start_lsn: Lsn, - ) -> TimelinePersistentState { - TimelinePersistentState { + ) -> anyhow::Result { + if server_info.wal_seg_size == 0 { + bail!(TimelineError::UninitializedWalSegSize(*ttid)); + } + + if server_info.pg_version == UNKNOWN_SERVER_VERSION { + bail!(TimelineError::UninitialinzedPgVersion(*ttid)); + } + + if commit_lsn < local_start_lsn { + bail!( + "commit_lsn {} is smaller than local_start_lsn {}", + commit_lsn, + local_start_lsn + ); + } + + Ok(TimelinePersistentState { tenant_id: 
ttid.tenant_id, timeline_id: ttid.timeline_id, acceptor_state: AcceptorState { @@ -115,24 +135,23 @@ impl TimelinePersistentState { ), partial_backup: wal_backup_partial::State::default(), eviction_state: EvictionState::Present, - } + }) } #[cfg(test)] pub fn empty() -> Self { - use crate::safekeeper::UNKNOWN_SERVER_VERSION; - TimelinePersistentState::new( &TenantTimelineId::empty(), ServerInfo { - pg_version: UNKNOWN_SERVER_VERSION, /* Postgres server version */ - system_id: 0, /* Postgres system identifier */ - wal_seg_size: 0, + pg_version: 17, /* Postgres server version */ + system_id: 0, /* Postgres system identifier */ + wal_seg_size: 16 * 1024 * 1024, }, vec![], Lsn::INVALID, Lsn::INVALID, ) + .unwrap() } } diff --git a/safekeeper/src/timeline.rs b/safekeeper/src/timeline.rs index 3494b0b764..dd4d161226 100644 --- a/safekeeper/src/timeline.rs +++ b/safekeeper/src/timeline.rs @@ -27,11 +27,11 @@ use utils::{ use storage_broker::proto::SafekeeperTimelineInfo; use storage_broker::proto::TenantTimelineId as ProtoTenantTimelineId; +use crate::control_file; use crate::rate_limit::RateLimiter; use crate::receive_wal::WalReceivers; use crate::safekeeper::{ - AcceptorProposerMessage, ProposerAcceptorMessage, SafeKeeper, ServerInfo, Term, TermLsn, - INVALID_TERM, + AcceptorProposerMessage, ProposerAcceptorMessage, SafeKeeper, Term, TermLsn, }; use crate::send_wal::WalSenders; use crate::state::{EvictionState, TimelineMemState, TimelinePersistentState, TimelineState}; @@ -40,7 +40,6 @@ use crate::timeline_manager::{AtomicStatus, ManagerCtl}; use crate::timelines_set::TimelinesSet; use crate::wal_backup::{self, remote_timeline_path}; use crate::wal_backup_partial::PartialRemoteSegment; -use crate::{control_file, safekeeper::UNKNOWN_SERVER_VERSION}; use crate::metrics::{FullTimelineInfo, WalStorageMetrics, MISC_OPERATION_SECONDS}; use crate::wal_storage::{Storage as wal_storage_iface, WalReader}; @@ -122,7 +121,7 @@ impl<'a> WriteGuardSharedState<'a> { } } -impl<'a> 
Deref for WriteGuardSharedState<'a> { +impl Deref for WriteGuardSharedState<'_> { type Target = SharedState; fn deref(&self) -> &Self::Target { @@ -130,13 +129,13 @@ impl<'a> Deref for WriteGuardSharedState<'a> { } } -impl<'a> DerefMut for WriteGuardSharedState<'a> { +impl DerefMut for WriteGuardSharedState<'_> { fn deref_mut(&mut self) -> &mut Self::Target { &mut self.guard } } -impl<'a> Drop for WriteGuardSharedState<'a> { +impl Drop for WriteGuardSharedState<'_> { fn drop(&mut self) { let term_flush_lsn = TermLsn::from((self.guard.sk.last_log_term(), self.guard.sk.flush_lsn())); @@ -326,44 +325,6 @@ pub struct SharedState { } impl SharedState { - /// Initialize fresh timeline state without persisting anything to disk. - fn create_new( - conf: &SafeKeeperConf, - ttid: &TenantTimelineId, - state: TimelinePersistentState, - ) -> Result { - if state.server.wal_seg_size == 0 { - bail!(TimelineError::UninitializedWalSegSize(*ttid)); - } - - if state.server.pg_version == UNKNOWN_SERVER_VERSION { - bail!(TimelineError::UninitialinzedPgVersion(*ttid)); - } - - if state.commit_lsn < state.local_start_lsn { - bail!( - "commit_lsn {} is higher than local_start_lsn {}", - state.commit_lsn, - state.local_start_lsn - ); - } - - // We don't want to write anything to disk, because we may have existing timeline there. - // These functions should not change anything on disk. - let timeline_dir = get_timeline_dir(conf, ttid); - let control_store = - control_file::FileStorage::create_new(timeline_dir.clone(), conf, state)?; - let wal_store = - wal_storage::PhysicalStorage::new(ttid, timeline_dir, conf, &control_store)?; - let sk = SafeKeeper::new(TimelineState::new(control_store), wal_store, conf.my_id)?; - - Ok(Self { - sk: StateSK::Loaded(sk), - peers_info: PeersInfo(vec![]), - wal_removal_on_hold: false, - }) - } - /// Restore SharedState from control file. If file doesn't exist, bails out. 
fn restore(conf: &SafeKeeperConf, ttid: &TenantTimelineId) -> Result { let timeline_dir = get_timeline_dir(conf, ttid); @@ -450,6 +411,8 @@ pub enum TimelineError { Cancelled(TenantTimelineId), #[error("Timeline {0} was not found in global map")] NotFound(TenantTimelineId), + #[error("Timeline {0} creation is in progress")] + CreationInProgress(TenantTimelineId), #[error("Timeline {0} exists on disk, but wasn't loaded on startup")] Invalid(TenantTimelineId), #[error("Timeline {0} is already exists")] @@ -514,7 +477,7 @@ pub struct Timeline { impl Timeline { /// Load existing timeline from disk. - pub fn load_timeline(conf: &SafeKeeperConf, ttid: TenantTimelineId) -> Result { + pub fn load_timeline(conf: &SafeKeeperConf, ttid: TenantTimelineId) -> Result> { let _enter = info_span!("load_timeline", timeline = %ttid.timeline_id).entered(); let shared_state = SharedState::restore(conf, &ttid)?; @@ -528,7 +491,7 @@ impl Timeline { let walreceivers = WalReceivers::new(); let remote_path = remote_timeline_path(&ttid)?; - Ok(Timeline { + Ok(Arc::new(Timeline { ttid, remote_path, commit_lsn_watch_tx, @@ -547,47 +510,7 @@ impl Timeline { wal_backup_active: AtomicBool::new(false), last_removed_segno: AtomicU64::new(0), mgr_status: AtomicStatus::new(), - }) - } - - /// Create a new timeline, which is not yet persisted to disk. 
- pub fn create_empty( - conf: &SafeKeeperConf, - ttid: TenantTimelineId, - server_info: ServerInfo, - commit_lsn: Lsn, - local_start_lsn: Lsn, - ) -> Result { - let (commit_lsn_watch_tx, commit_lsn_watch_rx) = watch::channel(Lsn::INVALID); - let (term_flush_lsn_watch_tx, term_flush_lsn_watch_rx) = - watch::channel(TermLsn::from((INVALID_TERM, Lsn::INVALID))); - let (shared_state_version_tx, shared_state_version_rx) = watch::channel(0); - - let state = - TimelinePersistentState::new(&ttid, server_info, vec![], commit_lsn, local_start_lsn); - - let walreceivers = WalReceivers::new(); - let remote_path = remote_timeline_path(&ttid)?; - Ok(Timeline { - ttid, - remote_path, - commit_lsn_watch_tx, - commit_lsn_watch_rx, - term_flush_lsn_watch_tx, - term_flush_lsn_watch_rx, - shared_state_version_tx, - shared_state_version_rx, - mutex: RwLock::new(SharedState::create_new(conf, &ttid, state)?), - walsenders: WalSenders::new(walreceivers.clone()), - walreceivers, - cancel: CancellationToken::default(), - timeline_dir: get_timeline_dir(conf, &ttid), - manager_ctl: ManagerCtl::new(), - broker_active: AtomicBool::new(false), - wal_backup_active: AtomicBool::new(false), - last_removed_segno: AtomicU64::new(0), - mgr_status: AtomicStatus::new(), - }) + })) } /// Initialize fresh timeline on disk and start background tasks. 
If init diff --git a/safekeeper/src/timelines_global_map.rs b/safekeeper/src/timelines_global_map.rs index 866cde3339..538bb6e5d2 100644 --- a/safekeeper/src/timelines_global_map.rs +++ b/safekeeper/src/timelines_global_map.rs @@ -5,11 +5,14 @@ use crate::defaults::DEFAULT_EVICTION_CONCURRENCY; use crate::rate_limit::RateLimiter; use crate::safekeeper::ServerInfo; +use crate::state::TimelinePersistentState; use crate::timeline::{get_tenant_dir, get_timeline_dir, Timeline, TimelineError}; use crate::timelines_set::TimelinesSet; -use crate::SafeKeeperConf; +use crate::wal_storage::Storage; +use crate::{control_file, wal_storage, SafeKeeperConf}; use anyhow::{bail, Context, Result}; use camino::Utf8PathBuf; +use camino_tempfile::Utf8TempDir; use once_cell::sync::Lazy; use serde::Serialize; use std::collections::HashMap; @@ -17,12 +20,22 @@ use std::str::FromStr; use std::sync::atomic::Ordering; use std::sync::{Arc, Mutex}; use std::time::{Duration, Instant}; +use tokio::fs; use tracing::*; +use utils::crashsafe::{durable_rename, fsync_async_opt}; use utils::id::{TenantId, TenantTimelineId, TimelineId}; use utils::lsn::Lsn; +// Timeline entry in the global map: either a ready timeline, or mark that it is +// being created. +#[derive(Clone)] +enum GlobalMapTimeline { + CreationInProgress, + Timeline(Arc), +} + struct GlobalTimelinesState { - timelines: HashMap>, + timelines: HashMap, // A tombstone indicates this timeline used to exist has been deleted. These are used to prevent // on-demand timeline creation from recreating deleted timelines. This is only soft-enforced, as @@ -31,13 +44,9 @@ struct GlobalTimelinesState { conf: Option, broker_active_set: Arc, - load_lock: Arc>, global_rate_limiter: RateLimiter, } -// Used to prevent concurrent timeline loading. -pub struct TimelineLoadLock; - impl GlobalTimelinesState { /// Get configuration, which must be set once during init. 
fn get_conf(&self) -> &SafeKeeperConf { @@ -55,22 +64,16 @@ impl GlobalTimelinesState { ) } - /// Insert timeline into the map. Returns error if timeline with the same id already exists. - fn try_insert(&mut self, timeline: Arc) -> Result<()> { - let ttid = timeline.ttid; - if self.timelines.contains_key(&ttid) { - bail!(TimelineError::AlreadyExists(ttid)); - } - self.timelines.insert(ttid, timeline); - Ok(()) - } - - /// Get timeline from the map. Returns error if timeline doesn't exist. + /// Get timeline from the map. Returns error if timeline doesn't exist or + /// creation is in progress. fn get(&self, ttid: &TenantTimelineId) -> Result, TimelineError> { - self.timelines - .get(ttid) - .cloned() - .ok_or(TimelineError::NotFound(*ttid)) + match self.timelines.get(ttid).cloned() { + Some(GlobalMapTimeline::Timeline(tli)) => Ok(tli), + Some(GlobalMapTimeline::CreationInProgress) => { + Err(TimelineError::CreationInProgress(*ttid)) + } + None => Err(TimelineError::NotFound(*ttid)), + } } fn delete(&mut self, ttid: TenantTimelineId) { @@ -85,7 +88,6 @@ static TIMELINES_STATE: Lazy> = Lazy::new(|| { tombstones: HashMap::new(), conf: None, broker_active_set: Arc::new(TimelinesSet::default()), - load_lock: Arc::new(tokio::sync::Mutex::new(TimelineLoadLock)), global_rate_limiter: RateLimiter::new(1, 1), }) }); @@ -141,11 +143,10 @@ impl GlobalTimelines { /// Loads all timelines for the given tenant to memory. Returns fs::read_dir /// errors if any. /// - /// It is async for update_status_notify sake. Since TIMELINES_STATE lock is - /// sync and there is no important reason to make it async (it is always - /// held for a short while) we just lock and unlock it for each timeline -- - /// this function is called during init when nothing else is running, so - /// this is fine. 
+ /// It is async, but TIMELINES_STATE lock is sync and there is no important + /// reason to make it async (it is always held for a short while), so we + /// just lock and unlock it for each timeline -- this function is called + /// during init when nothing else is running, so this is fine. async fn load_tenant_timelines(tenant_id: TenantId) -> Result<()> { let (conf, broker_active_set, partial_backup_rate_limiter) = { let state = TIMELINES_STATE.lock().unwrap(); @@ -163,14 +164,13 @@ impl GlobalTimelines { { let ttid = TenantTimelineId::new(tenant_id, timeline_id); match Timeline::load_timeline(&conf, ttid) { - Ok(timeline) => { - let tli = Arc::new(timeline); + Ok(tli) => { let mut shared_state = tli.write_shared_state().await; TIMELINES_STATE .lock() .unwrap() .timelines - .insert(ttid, tli.clone()); + .insert(ttid, GlobalMapTimeline::Timeline(tli.clone())); tli.bootstrap( &mut shared_state, &conf, @@ -199,51 +199,6 @@ impl GlobalTimelines { Ok(()) } - /// Take a lock for timeline loading. - pub async fn loading_lock() -> Arc> { - TIMELINES_STATE.lock().unwrap().load_lock.clone() - } - - /// Load timeline from disk to the memory. - pub async fn load_timeline<'a>( - _guard: &tokio::sync::MutexGuard<'a, TimelineLoadLock>, - ttid: TenantTimelineId, - ) -> Result> { - let (conf, broker_active_set, partial_backup_rate_limiter) = - TIMELINES_STATE.lock().unwrap().get_dependencies(); - - match Timeline::load_timeline(&conf, ttid) { - Ok(timeline) => { - let tli = Arc::new(timeline); - let mut shared_state = tli.write_shared_state().await; - - // TODO: prevent concurrent timeline creation/loading - { - let mut state = TIMELINES_STATE.lock().unwrap(); - - // We may be have been asked to load a timeline that was previously deleted (e.g. from `pull_timeline.rs`). We trust - // that the human doing this manual intervention knows what they are doing, and remove its tombstone. 
- if state.tombstones.remove(&ttid).is_some() { - warn!("Un-deleted timeline {ttid}"); - } - - state.timelines.insert(ttid, tli.clone()); - } - - tli.bootstrap( - &mut shared_state, - &conf, - broker_active_set, - partial_backup_rate_limiter, - ); - drop(shared_state); - Ok(tli) - } - // If we can't load a timeline, it's bad. Caller will figure it out. - Err(e) => bail!("failed to load timeline {}, reason: {:?}", ttid, e), - } - } - /// Get the number of timelines in the map. pub fn timelines_count() -> usize { TIMELINES_STATE.lock().unwrap().timelines.len() @@ -266,7 +221,7 @@ impl GlobalTimelines { commit_lsn: Lsn, local_start_lsn: Lsn, ) -> Result> { - let (conf, broker_active_set, partial_backup_rate_limiter) = { + let (conf, _, _) = { let state = TIMELINES_STATE.lock().unwrap(); if let Ok(timeline) = state.get(&ttid) { // Timeline already exists, return it. @@ -282,55 +237,146 @@ impl GlobalTimelines { info!("creating new timeline {}", ttid); - let timeline = Arc::new(Timeline::create_empty( - &conf, - ttid, - server_info, - commit_lsn, - local_start_lsn, - )?); + // Do on disk initialization in tmp dir. + let (_tmp_dir, tmp_dir_path) = create_temp_timeline_dir(&conf, ttid).await?; - // Take a lock and finish the initialization holding this mutex. No other threads - // can interfere with creation after we will insert timeline into the map. - { - let mut shared_state = timeline.write_shared_state().await; + // TODO: currently we create only cfile. It would be reasonable to + // immediately initialize first WAL segment as well. + let state = + TimelinePersistentState::new(&ttid, server_info, vec![], commit_lsn, local_start_lsn)?; + control_file::FileStorage::create_new(tmp_dir_path.clone(), &conf, state).await?; + let timeline = GlobalTimelines::load_temp_timeline(ttid, &tmp_dir_path, true).await?; + Ok(timeline) + } - // We can get a race condition here in case of concurrent create calls, but only - // in theory. 
create() will return valid timeline on the next try. - TIMELINES_STATE - .lock() - .unwrap() - .try_insert(timeline.clone())?; + /// Move timeline from a temp directory to the main storage, and load it to + /// the global map. Creating timeline in this way ensures atomicity: rename + /// is atomic, so either move of the whole datadir succeeds or it doesn't, + /// but corrupted data dir shouldn't be possible. + /// + /// We'd like to avoid holding map lock while doing IO, so it's a 3 step + /// process: + /// 1) check the global map that timeline doesn't exist and mark that we're + /// creating it; + /// 2) move the directory and load the timeline + /// 3) take lock again and insert the timeline into the global map. + pub async fn load_temp_timeline( + ttid: TenantTimelineId, + tmp_path: &Utf8PathBuf, + check_tombstone: bool, + ) -> Result> { + // Check for existence and mark that we're creating it. + let (conf, broker_active_set, partial_backup_rate_limiter) = { + let mut state = TIMELINES_STATE.lock().unwrap(); + match state.timelines.get(&ttid) { + Some(GlobalMapTimeline::CreationInProgress) => { + bail!(TimelineError::CreationInProgress(ttid)); + } + Some(GlobalMapTimeline::Timeline(_)) => { + bail!(TimelineError::AlreadyExists(ttid)); + } + _ => {} + } + if check_tombstone { + if state.tombstones.contains_key(&ttid) { + anyhow::bail!("timeline {ttid} is deleted, refusing to recreate"); + } + } else { + // We may be have been asked to load a timeline that was previously deleted (e.g. from `pull_timeline.rs`). We trust + // that the human doing this manual intervention knows what they are doing, and remove its tombstone. + if state.tombstones.remove(&ttid).is_some() { + warn!("un-deleted timeline {ttid}"); + } + } + state + .timelines + .insert(ttid, GlobalMapTimeline::CreationInProgress); + state.get_dependencies() + }; - // Write the new timeline to the disk and start background workers. 
- // Bootstrap is transactional, so if it fails, the timeline will be deleted, - // and the state on disk should remain unchanged. - if let Err(e) = timeline - .init_new( - &mut shared_state, + // Do the actual move and reflect the result in the map. + match GlobalTimelines::install_temp_timeline(ttid, tmp_path, &conf).await { + Ok(timeline) => { + let mut timeline_shared_state = timeline.write_shared_state().await; + let mut state = TIMELINES_STATE.lock().unwrap(); + assert!(matches!( + state.timelines.get(&ttid), + Some(GlobalMapTimeline::CreationInProgress) + )); + + state + .timelines + .insert(ttid, GlobalMapTimeline::Timeline(timeline.clone())); + drop(state); + timeline.bootstrap( + &mut timeline_shared_state, &conf, broker_active_set, partial_backup_rate_limiter, - ) - .await - { - // Note: the most likely reason for init failure is that the timeline - // directory already exists on disk. This happens when timeline is corrupted - // and wasn't loaded from disk on startup because of that. We want to preserve - // the timeline directory in this case, for further inspection. - - // TODO: this is an unusual error, perhaps we should send it to sentry - // TODO: compute will try to create timeline every second, we should add backoff - error!("failed to init new timeline {}: {}", ttid, e); - - // Timeline failed to init, it cannot be used. Remove it from the map. - TIMELINES_STATE.lock().unwrap().timelines.remove(&ttid); - return Err(e); + ); + drop(timeline_shared_state); + Ok(timeline) + } + Err(e) => { + // Init failed, remove the marker from the map + let mut state = TIMELINES_STATE.lock().unwrap(); + assert!(matches!( + state.timelines.get(&ttid), + Some(GlobalMapTimeline::CreationInProgress) + )); + state.timelines.remove(&ttid); + Err(e) } - // We are done with bootstrap, release the lock, return the timeline. - // {} block forces release before .await } - Ok(timeline) + } + + /// Main part of load_temp_timeline: do the move and load. 
+ async fn install_temp_timeline( + ttid: TenantTimelineId, + tmp_path: &Utf8PathBuf, + conf: &SafeKeeperConf, + ) -> Result> { + let tenant_path = get_tenant_dir(conf, &ttid.tenant_id); + let timeline_path = get_timeline_dir(conf, &ttid); + + // We must have already checked that timeline doesn't exist in the map, + // but there might be existing datadir: if timeline is corrupted it is + // not loaded. We don't want to overwrite such a dir, so check for its + // existence. + match fs::metadata(&timeline_path).await { + Ok(_) => { + // Timeline directory exists on disk, we should leave state unchanged + // and return error. + bail!(TimelineError::Invalid(ttid)); + } + Err(e) if e.kind() == std::io::ErrorKind::NotFound => {} + Err(e) => { + return Err(e.into()); + } + } + + info!( + "moving timeline {} from {} to {}", + ttid, tmp_path, timeline_path + ); + + // Now it is safe to move the timeline directory to the correct + // location. First, create tenant directory. Ignore error if it already + // exists. + if let Err(e) = tokio::fs::create_dir(&tenant_path).await { + if e.kind() != std::io::ErrorKind::AlreadyExists { + return Err(e.into()); + } + } + // fsync it + fsync_async_opt(&tenant_path, !conf.no_sync).await?; + // and its creation + fsync_async_opt(&conf.workdir, !conf.no_sync).await?; + + // Do the move. + durable_rename(tmp_path, &timeline_path, !conf.no_sync).await?; + + Timeline::load_timeline(conf, ttid) } /// Get a timeline from the global map. 
If it's not present, it doesn't exist on disk, @@ -358,8 +404,16 @@ impl GlobalTimelines { global_lock .timelines .values() - .filter(|t| !t.is_cancelled()) - .cloned() + .filter_map(|t| match t { + GlobalMapTimeline::Timeline(t) => { + if t.is_cancelled() { + None + } else { + Some(t.clone()) + } + } + _ => None, + }) .collect() } @@ -370,8 +424,11 @@ impl GlobalTimelines { global_lock .timelines .values() + .filter_map(|t| match t { + GlobalMapTimeline::Timeline(t) => Some(t.clone()), + _ => None, + }) .filter(|t| t.ttid.tenant_id == tenant_id) - .cloned() .collect() } @@ -504,3 +561,45 @@ fn delete_dir(path: Utf8PathBuf) -> Result { Err(e) => Err(e.into()), } } + +/// Create temp directory for a new timeline. It needs to be located on the same +/// filesystem as the rest of the timelines. It will be automatically deleted when +/// Utf8TempDir goes out of scope. +pub async fn create_temp_timeline_dir( + conf: &SafeKeeperConf, + ttid: TenantTimelineId, +) -> Result<(Utf8TempDir, Utf8PathBuf)> { + let temp_base = conf.workdir.join("tmp"); + + tokio::fs::create_dir_all(&temp_base).await?; + + let tli_dir = camino_tempfile::Builder::new() + .suffix("_temptli") + .prefix(&format!("{}_{}_", ttid.tenant_id, ttid.timeline_id)) + .tempdir_in(temp_base)?; + + let tli_dir_path = tli_dir.path().to_path_buf(); + + Ok((tli_dir, tli_dir_path)) +} + +/// Do basic validation of a temp timeline, before moving it to the global map. 
+pub async fn validate_temp_timeline( + conf: &SafeKeeperConf, + ttid: TenantTimelineId, + path: &Utf8PathBuf, +) -> Result<(Lsn, Lsn)> { + let control_path = path.join("safekeeper.control"); + + let control_store = control_file::FileStorage::load_control_file(control_path)?; + if control_store.server.wal_seg_size == 0 { + bail!("wal_seg_size is not set"); + } + + let wal_store = wal_storage::PhysicalStorage::new(&ttid, path.clone(), conf, &control_store)?; + + let commit_lsn = control_store.commit_lsn; + let flush_lsn = wal_store.flush_lsn(); + + Ok((commit_lsn, flush_lsn)) +} diff --git a/safekeeper/src/wal_storage.rs b/safekeeper/src/wal_storage.rs index 6e7da94973..61d7825ae6 100644 --- a/safekeeper/src/wal_storage.rs +++ b/safekeeper/src/wal_storage.rs @@ -186,8 +186,14 @@ impl PhysicalStorage { "initialized storage for timeline {}, flush_lsn={}, commit_lsn={}, peer_horizon_lsn={}", ttid.timeline_id, flush_lsn, state.commit_lsn, state.peer_horizon_lsn, ); - if flush_lsn < state.commit_lsn || flush_lsn < state.peer_horizon_lsn { - warn!("timeline {} potential data loss: flush_lsn by find_end_of_wal is less than either commit_lsn or peer_horizon_lsn from control file", ttid.timeline_id); + if flush_lsn < state.commit_lsn { + bail!("timeline {} potential data loss: flush_lsn {} by find_end_of_wal is less than commit_lsn {} from control file", ttid.timeline_id, flush_lsn, state.commit_lsn); + } + if flush_lsn < state.peer_horizon_lsn { + warn!( + "timeline {}: flush_lsn {} is less than cfile peer_horizon_lsn {}", + ttid.timeline_id, flush_lsn, state.peer_horizon_lsn + ); } Ok(PhysicalStorage { diff --git a/safekeeper/tests/walproposer_sim/safekeeper.rs b/safekeeper/tests/walproposer_sim/safekeeper.rs index 047b4be8fa..12aa025771 100644 --- a/safekeeper/tests/walproposer_sim/safekeeper.rs +++ b/safekeeper/tests/walproposer_sim/safekeeper.rs @@ -59,7 +59,7 @@ impl GlobalMap { if state.commit_lsn < state.local_start_lsn { bail!( - "commit_lsn {} is higher than 
local_start_lsn {}", + "commit_lsn {} is smaller than local_start_lsn {}", state.commit_lsn, state.local_start_lsn ); @@ -96,23 +96,7 @@ impl GlobalMap { let local_start_lsn = Lsn::INVALID; let state = - TimelinePersistentState::new(&ttid, server_info, vec![], commit_lsn, local_start_lsn); - - if state.server.wal_seg_size == 0 { - bail!(TimelineError::UninitializedWalSegSize(ttid)); - } - - if state.server.pg_version == UNKNOWN_SERVER_VERSION { - bail!(TimelineError::UninitialinzedPgVersion(ttid)); - } - - if state.commit_lsn < state.local_start_lsn { - bail!( - "commit_lsn {} is higher than local_start_lsn {}", - state.commit_lsn, - state.local_start_lsn - ); - } + TimelinePersistentState::new(&ttid, server_info, vec![], commit_lsn, local_start_lsn)?; let disk_timeline = self.disk.put_state(&ttid, state); let control_store = DiskStateStorage::new(disk_timeline.clone()); diff --git a/storage_controller/src/compute_hook.rs b/storage_controller/src/compute_hook.rs index bafae1f551..b63a322b87 100644 --- a/storage_controller/src/compute_hook.rs +++ b/storage_controller/src/compute_hook.rs @@ -28,7 +28,7 @@ struct UnshardedComputeHookTenant { node_id: NodeId, // Must hold this lock to send a notification. - send_lock: Arc>>, + send_lock: Arc>>, } struct ShardedComputeHookTenant { stripe_size: ShardStripeSize, @@ -38,7 +38,22 @@ struct ShardedComputeHookTenant { // Must hold this lock to send a notification. The contents represent // the last successfully sent notification, and are used to coalesce multiple // updates by only sending when there is a chance since our last successful send. - send_lock: Arc>>, + send_lock: Arc>>, +} + +/// Represents our knowledge of the compute's state: we can update this when we get a +/// response from a notify API call, which tells us what has been applied. +/// +/// Should be wrapped in an Option<>, as we cannot always know the remote state. 
+#[derive(PartialEq, Eq, Debug)] +struct ComputeRemoteState { + // The request body which was acked by the compute + request: ComputeHookNotifyRequest, + + // Whether the cplane indicated that the state was applied to running computes, or just + // persisted. In the Neon control plane, this is the difference between a 423 response (meaning + // persisted but not applied), and a 2xx response (both persisted and applied) + applied: bool, } enum ComputeHookTenant { @@ -64,7 +79,7 @@ impl ComputeHookTenant { } } - fn get_send_lock(&self) -> &Arc>> { + fn get_send_lock(&self) -> &Arc>> { match self { Self::Unsharded(unsharded_tenant) => &unsharded_tenant.send_lock, Self::Sharded(sharded_tenant) => &sharded_tenant.send_lock, @@ -188,11 +203,11 @@ enum MaybeSendResult { Transmit( ( ComputeHookNotifyRequest, - tokio::sync::OwnedMutexGuard>, + tokio::sync::OwnedMutexGuard>, ), ), // Something requires sending, but you must wait for a current sender then call again - AwaitLock(Arc>>), + AwaitLock(Arc>>), // Nothing requires sending Noop, } @@ -201,7 +216,7 @@ impl ComputeHookTenant { fn maybe_send( &self, tenant_id: TenantId, - lock: Option>>, + lock: Option>>, ) -> MaybeSendResult { let locked = match lock { Some(already_locked) => already_locked, @@ -257,11 +272,22 @@ impl ComputeHookTenant { tracing::info!("Tenant isn't yet ready to emit a notification"); MaybeSendResult::Noop } - Some(request) if Some(&request) == locked.as_ref() => { - // No change from the last value successfully sent + Some(request) + if Some(&request) == locked.as_ref().map(|s| &s.request) + && locked.as_ref().map(|s| s.applied).unwrap_or(false) => + { + tracing::info!( + "Skipping notification because remote state already matches ({:?})", + &request + ); + // No change from the last value successfully sent, and our state indicates that the last + // value sent was fully applied on the control plane side. 
MaybeSendResult::Noop } - Some(request) => MaybeSendResult::Transmit((request, locked)), + Some(request) => { + // Our request differs from the last one sent, or the last one sent was not fully applied on the compute side + MaybeSendResult::Transmit((request, locked)) + } } } } @@ -550,10 +576,28 @@ impl ComputeHook { }) }; - if result.is_ok() { - // Before dropping the send lock, stash the request we just sent so that - // subsequent callers can avoid redundantly re-sending the same thing. - *send_lock_guard = Some(request); + match result { + Ok(_) => { + // Before dropping the send lock, stash the request we just sent so that + // subsequent callers can avoid redundantly re-sending the same thing. + *send_lock_guard = Some(ComputeRemoteState { + request, + applied: true, + }); + } + Err(NotifyError::Busy) => { + // Busy result means that the server responded and has stored the new configuration, + // but was not able to fully apply it to the compute + *send_lock_guard = Some(ComputeRemoteState { + request, + applied: false, + }); + } + Err(_) => { + // General error case: we can no longer know the remote state, so clear it. This will result in + // the logic in maybe_send recognizing that we should call the hook again. 
+ *send_lock_guard = None; + } } result } @@ -707,7 +751,10 @@ pub(crate) mod tests { assert!(request.stripe_size.is_none()); // Simulate successful send - *guard = Some(request); + *guard = Some(ComputeRemoteState { + request, + applied: true, + }); drop(guard); // Try asking again: this should be a no-op @@ -750,7 +797,10 @@ pub(crate) mod tests { assert_eq!(request.stripe_size, Some(ShardStripeSize(32768))); // Simulate successful send - *guard = Some(request); + *guard = Some(ComputeRemoteState { + request, + applied: true, + }); drop(guard); Ok(()) diff --git a/storage_controller/src/http.rs b/storage_controller/src/http.rs index 46b6f4f2bf..afefe8598c 100644 --- a/storage_controller/src/http.rs +++ b/storage_controller/src/http.rs @@ -381,14 +381,16 @@ async fn handle_tenant_timeline_delete( R: std::future::Future> + Send + 'static, F: Fn(Arc) -> R + Send + Sync + 'static, { + // On subsequent retries, wait longer. + // Enable callers with a 25 second request timeout to reliably get a response + const MAX_WAIT: Duration = Duration::from_secs(25); + const MAX_RETRY_PERIOD: Duration = Duration::from_secs(5); + let started_at = Instant::now(); + // To keep deletion reasonably snappy for small tenants, initially check after 1 second if deletion // completed. let mut retry_period = Duration::from_secs(1); - // On subsequent retries, wait longer. 
- let max_retry_period = Duration::from_secs(5); - // Enable callers with a 30 second request timeout to reliably get a response - let max_wait = Duration::from_secs(25); loop { let status = f(service.clone()).await?; @@ -396,7 +398,11 @@ async fn handle_tenant_timeline_delete( StatusCode::ACCEPTED => { tracing::info!("Deletion accepted, waiting to try again..."); tokio::time::sleep(retry_period).await; - retry_period = max_retry_period; + retry_period = MAX_RETRY_PERIOD; + } + StatusCode::CONFLICT => { + tracing::info!("Deletion already in progress, waiting to try again..."); + tokio::time::sleep(retry_period).await; } StatusCode::NOT_FOUND => { tracing::info!("Deletion complete"); @@ -409,7 +415,7 @@ async fn handle_tenant_timeline_delete( } let now = Instant::now(); - if now + retry_period > started_at + max_wait { + if now + retry_period > started_at + MAX_WAIT { tracing::info!("Deletion timed out waiting for 404"); // REQUEST_TIMEOUT would be more appropriate, but CONFLICT is already part of // the pageserver's swagger definition for this endpoint, and has the same desired diff --git a/storage_controller/src/metrics.rs b/storage_controller/src/metrics.rs index 5989aeba91..a1f7bc2457 100644 --- a/storage_controller/src/metrics.rs +++ b/storage_controller/src/metrics.rs @@ -37,6 +37,12 @@ pub(crate) struct StorageControllerMetricGroup { /// Count of how many times we spawn a reconcile task pub(crate) storage_controller_reconcile_spawn: measured::Counter, + /// Size of the in-memory map of tenant shards + pub(crate) storage_controller_tenant_shards: measured::Gauge, + + /// Size of the in-memory map of pageserver_nodes + pub(crate) storage_controller_pageserver_nodes: measured::Gauge, + /// Reconciler tasks completed, broken down by success/failure/cancelled pub(crate) storage_controller_reconcile_complete: measured::CounterVec, diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs index 25e1fb5e1f..2cde1d6a3d 100644 --- 
a/storage_controller/src/service.rs +++ b/storage_controller/src/service.rs @@ -934,7 +934,6 @@ impl Service { self.startup_complete.clone().wait().await; const BACKGROUND_RECONCILE_PERIOD: Duration = Duration::from_secs(20); - let mut interval = tokio::time::interval(BACKGROUND_RECONCILE_PERIOD); while !self.reconcilers_cancel.is_cancelled() { tokio::select! { @@ -1272,6 +1271,10 @@ impl Service { .collect::>(); let nodes: HashMap = nodes.into_iter().map(|n| (n.get_id(), n)).collect(); tracing::info!("Loaded {} nodes from database.", nodes.len()); + metrics::METRICS_REGISTRY + .metrics_group + .storage_controller_pageserver_nodes + .set(nodes.len() as i64); tracing::info!("Loading shards from database..."); let mut tenant_shard_persistence = persistence.list_tenant_shards().await?; @@ -2862,17 +2865,12 @@ impl Service { let _tenant_lock = trace_exclusive_lock(&self.tenant_op_locks, tenant_id, TenantOperations::Delete).await; - // Detach all shards - let (detach_waiters, shard_ids, node) = { - let mut shard_ids = Vec::new(); + // Detach all shards. This also deletes local pageserver shard data. 
+ let (detach_waiters, node) = { let mut detach_waiters = Vec::new(); let mut locked = self.inner.write().unwrap(); let (nodes, tenants, scheduler) = locked.parts_mut(); - for (tenant_shard_id, shard) in - tenants.range_mut(TenantShardId::tenant_range(tenant_id)) - { - shard_ids.push(*tenant_shard_id); - + for (_, shard) in tenants.range_mut(TenantShardId::tenant_range(tenant_id)) { // Update the tenant's intent to remove all attachments shard.policy = PlacementPolicy::Detached; shard @@ -2892,7 +2890,7 @@ impl Service { let node = nodes .get(&node_id) .expect("Pageservers may not be deleted while lock is active"); - (detach_waiters, shard_ids, node.clone()) + (detach_waiters, node.clone()) }; // This reconcile wait can fail in a few ways: @@ -2907,38 +2905,34 @@ impl Service { self.await_waiters(detach_waiters, RECONCILE_TIMEOUT) .await?; - let locations = shard_ids - .into_iter() - .map(|s| (s, node.clone())) - .collect::>(); - let results = self.tenant_for_shards_api( - locations, - |tenant_shard_id, client| async move { client.tenant_delete(tenant_shard_id).await }, - 1, - 3, - RECONCILE_TIMEOUT, - &self.cancel, - ) - .await; - for result in results { - match result { - Ok(StatusCode::ACCEPTED) => { - // This should never happen: we waited for detaches to finish above - return Err(ApiError::InternalServerError(anyhow::anyhow!( - "Unexpectedly still attached on {}", - node - ))); - } - Ok(_) => {} - Err(mgmt_api::Error::Cancelled) => { - return Err(ApiError::ShuttingDown); - } - Err(e) => { - // This is unexpected: remote deletion should be infallible, unless the object store - // at large is unavailable. - tracing::error!("Error deleting via node {}: {e}", node); - return Err(ApiError::InternalServerError(anyhow::anyhow!(e))); - } + // Delete the entire tenant (all shards) from remote storage via a random pageserver. 
+ // Passing an unsharded tenant ID will cause the pageserver to remove all remote paths with + // the tenant ID prefix, including all shards (even possibly stale ones). + match node + .with_client_retries( + |client| async move { + client + .tenant_delete(TenantShardId::unsharded(tenant_id)) + .await + }, + &self.config.jwt_token, + 1, + 3, + RECONCILE_TIMEOUT, + &self.cancel, + ) + .await + .unwrap_or(Err(mgmt_api::Error::Cancelled)) + { + Ok(_) => {} + Err(mgmt_api::Error::Cancelled) => { + return Err(ApiError::ShuttingDown); + } + Err(e) => { + // This is unexpected: remote deletion should be infallible, unless the object store + // at large is unavailable. + tracing::error!("Error deleting via node {node}: {e}"); + return Err(ApiError::InternalServerError(anyhow::anyhow!(e))); } } @@ -3639,14 +3633,21 @@ impl Service { ); let client = PageserverClient::new(node.get_id(), node.base_url(), jwt.as_deref()); - client + let res = client .timeline_delete(tenant_shard_id, timeline_id) - .await - .map_err(|e| { - ApiError::InternalServerError(anyhow::anyhow!( - "Error deleting timeline {timeline_id} on {tenant_shard_id} on node {node}: {e}", - )) - }) + .await; + + match res { + Ok(ok) => Ok(ok), + Err(mgmt_api::Error::ApiError(StatusCode::CONFLICT, _)) => Ok(StatusCode::CONFLICT), + Err(e) => { + Err( + ApiError::InternalServerError(anyhow::anyhow!( + "Error deleting timeline {timeline_id} on {tenant_shard_id} on node {node}: {e}", + )) + ) + } + } } let locations = targets.0.iter().map(|t| (*t.0, t.1.latest.node.clone())).collect(); @@ -3661,7 +3662,13 @@ impl Service { }) .await?; - // If any shards >0 haven't finished deletion yet, don't start deletion on shard zero + // If any shards >0 haven't finished deletion yet, don't start deletion on shard zero. + // We return 409 (Conflict) if deletion was already in progress on any of the shards + // and 202 (Accepted) if deletion was not already in progress on any of the shards. 
+ if statuses.iter().any(|s| s == &StatusCode::CONFLICT) { + return Ok(StatusCode::CONFLICT); + } + if statuses.iter().any(|s| s != &StatusCode::NOT_FOUND) { return Ok(StatusCode::ACCEPTED); } @@ -4106,9 +4113,9 @@ impl Service { ( old_attached, generation, - old_state.policy, + old_state.policy.clone(), old_state.shard, - old_state.config, + old_state.config.clone(), ) }; @@ -5071,6 +5078,10 @@ impl Service { let mut nodes = (*locked.nodes).clone(); nodes.remove(&node_id); locked.nodes = Arc::new(nodes); + metrics::METRICS_REGISTRY + .metrics_group + .storage_controller_pageserver_nodes + .set(locked.nodes.len() as i64); locked.scheduler.node_remove(node_id); @@ -5154,6 +5165,10 @@ impl Service { removed_node.set_availability(NodeAvailability::Offline); } *nodes = Arc::new(nodes_mut); + metrics::METRICS_REGISTRY + .metrics_group + .storage_controller_pageserver_nodes + .set(nodes.len() as i64); } } @@ -5342,6 +5357,11 @@ impl Service { locked.nodes = Arc::new(new_nodes); + metrics::METRICS_REGISTRY + .metrics_group + .storage_controller_pageserver_nodes + .set(locked.nodes.len() as i64); + tracing::info!( "Registered pageserver {}, now have {} pageservers", register_req.node_id, diff --git a/storage_controller/src/tenant_shard.rs b/storage_controller/src/tenant_shard.rs index 8a7ff866e6..e696c72ba7 100644 --- a/storage_controller/src/tenant_shard.rs +++ b/storage_controller/src/tenant_shard.rs @@ -473,6 +473,11 @@ impl TenantShard { shard: ShardIdentity, policy: PlacementPolicy, ) -> Self { + metrics::METRICS_REGISTRY + .metrics_group + .storage_controller_tenant_shards + .inc(); + Self { tenant_shard_id, policy, @@ -1384,6 +1389,11 @@ impl TenantShard { let tenant_shard_id = tsp.get_tenant_shard_id()?; let shard_identity = tsp.get_shard_identity()?; + metrics::METRICS_REGISTRY + .metrics_group + .storage_controller_tenant_shards + .inc(); + Ok(Self { tenant_shard_id, shard: shard_identity, @@ -1512,6 +1522,15 @@ impl TenantShard { } } +impl Drop for TenantShard { 
+ fn drop(&mut self) { + metrics::METRICS_REGISTRY + .metrics_group + .storage_controller_tenant_shards + .dec(); + } +} + #[cfg(test)] pub(crate) mod tests { use std::{cell::RefCell, rc::Rc}; diff --git a/storage_scrubber/src/cloud_admin_api.rs b/storage_scrubber/src/cloud_admin_api.rs index 70b108cf23..7b82a0b116 100644 --- a/storage_scrubber/src/cloud_admin_api.rs +++ b/storage_scrubber/src/cloud_admin_api.rs @@ -138,7 +138,7 @@ pub struct ProjectData { pub name: String, pub region_id: String, pub platform_id: String, - pub user_id: String, + pub user_id: Option, pub pageserver_id: Option, #[serde(deserialize_with = "from_nullable_id")] pub tenant: TenantId, diff --git a/storage_scrubber/src/garbage.rs b/storage_scrubber/src/garbage.rs index d53611ed6e..a0040ada08 100644 --- a/storage_scrubber/src/garbage.rs +++ b/storage_scrubber/src/garbage.rs @@ -16,13 +16,13 @@ use remote_storage::{GenericRemoteStorage, ListingMode, ListingObject, RemotePat use serde::{Deserialize, Serialize}; use tokio_stream::StreamExt; use tokio_util::sync::CancellationToken; -use utils::id::TenantId; +use utils::{backoff, id::TenantId}; use crate::{ cloud_admin_api::{CloudAdminApiClient, MaybeDeleted, ProjectData}, init_remote, list_objects_with_retries, metadata_stream::{stream_tenant_timelines, stream_tenants}, - BucketConfig, ConsoleConfig, NodeKind, TenantShardTimelineId, TraversingDepth, + BucketConfig, ConsoleConfig, NodeKind, TenantShardTimelineId, TraversingDepth, MAX_RETRIES, }; #[derive(Serialize, Deserialize, Debug)] @@ -250,13 +250,16 @@ async fn find_garbage_inner( &target.tenant_root(&tenant_shard_id), ) .await?; - let object = tenant_objects.keys.first().unwrap(); - if object.key.get_path().as_str().ends_with("heatmap-v1.json") { - tracing::info!("Tenant {tenant_shard_id}: is missing in console and is only a heatmap (known historic deletion bug)"); - garbage.append_buggy(GarbageEntity::Tenant(tenant_shard_id)); - continue; + if let Some(object) = 
tenant_objects.keys.first() { + if object.key.get_path().as_str().ends_with("heatmap-v1.json") { + tracing::info!("Tenant {tenant_shard_id}: is missing in console and is only a heatmap (known historic deletion bug)"); + garbage.append_buggy(GarbageEntity::Tenant(tenant_shard_id)); + continue; + } else { + tracing::info!("Tenant {tenant_shard_id} is missing in console and contains one object: {}", object.key); + } } else { - tracing::info!("Tenant {tenant_shard_id} is missing in console and contains one object: {}", object.key); + tracing::info!("Tenant {tenant_shard_id} is missing in console appears to have been deleted while we ran"); } } else { // A console-unknown tenant with timelines: check if these timelines only contain initdb.tar.zst, from the initial @@ -406,14 +409,17 @@ pub async fn get_tenant_objects( // TODO: apply extra validation based on object modification time. Don't purge // tenants where any timeline's index_part.json has been touched recently. - let list = s3_client - .list( - Some(&tenant_root), - ListingMode::NoDelimiter, - None, - &CancellationToken::new(), - ) - .await?; + let cancel = CancellationToken::new(); + let list = backoff::retry( + || s3_client.list(Some(&tenant_root), ListingMode::NoDelimiter, None, &cancel), + |_| false, + 3, + MAX_RETRIES as u32, + "get_tenant_objects", + &cancel, + ) + .await + .expect("dummy cancellation token")?; Ok(list.keys) } @@ -424,14 +430,25 @@ pub async fn get_timeline_objects( tracing::debug!("Listing objects in timeline {ttid}"); let timeline_root = super::remote_timeline_path_id(&ttid); - let list = s3_client - .list( - Some(&timeline_root), - ListingMode::NoDelimiter, - None, - &CancellationToken::new(), - ) - .await?; + let cancel = CancellationToken::new(); + let list = backoff::retry( + || { + s3_client.list( + Some(&timeline_root), + ListingMode::NoDelimiter, + None, + &cancel, + ) + }, + |_| false, + 3, + MAX_RETRIES as u32, + "get_timeline_objects", + &cancel, + ) + .await + .expect("dummy 
cancellation token")?; + Ok(list.keys) } diff --git a/storage_scrubber/src/scan_safekeeper_metadata.rs b/storage_scrubber/src/scan_safekeeper_metadata.rs index 15f3665fac..6c312d0036 100644 --- a/storage_scrubber/src/scan_safekeeper_metadata.rs +++ b/storage_scrubber/src/scan_safekeeper_metadata.rs @@ -1,10 +1,12 @@ use std::{collections::HashSet, str::FromStr, sync::Arc}; +use anyhow::{bail, Context}; use futures::stream::{StreamExt, TryStreamExt}; use once_cell::sync::OnceCell; use pageserver_api::shard::TenantShardId; use postgres_ffi::{XLogFileName, PG_TLI}; use remote_storage::GenericRemoteStorage; +use rustls::crypto::aws_lc_rs; use serde::Serialize; use tokio_postgres::types::PgLsn; use tracing::{debug, error, info}; @@ -231,10 +233,15 @@ async fn check_timeline( }) } -fn load_certs() -> Result, std::io::Error> { - let der_certs = rustls_native_certs::load_native_certs()?; +fn load_certs() -> anyhow::Result> { + let der_certs = rustls_native_certs::load_native_certs(); + + if !der_certs.errors.is_empty() { + bail!("could not load native tls certs: {:?}", der_certs.errors); + } + let mut store = rustls::RootCertStore::empty(); - store.add_parsable_certificates(der_certs); + store.add_parsable_certificates(der_certs.certs); Ok(Arc::new(store)) } static TLS_ROOTS: OnceCell> = OnceCell::new(); @@ -248,9 +255,12 @@ async fn load_timelines_from_db( // Use rustls (Neon requires TLS) let root_store = TLS_ROOTS.get_or_try_init(load_certs)?.clone(); - let client_config = rustls::ClientConfig::builder() - .with_root_certificates(root_store) - .with_no_client_auth(); + let client_config = + rustls::ClientConfig::builder_with_provider(Arc::new(aws_lc_rs::default_provider())) + .with_safe_default_protocol_versions() + .context("aws_lc_rs should support the default protocol versions")? 
+ .with_root_certificates(root_store) + .with_no_client_auth(); let tls_connector = tokio_postgres_rustls::MakeRustlsConnect::new(client_config); let (client, connection) = tokio_postgres::connect(&dump_db_connstr, tls_connector).await?; // The connection object performs the actual communication with the database, diff --git a/test_runner/README.md b/test_runner/README.md index e087241c1f..55d8d2faa9 100644 --- a/test_runner/README.md +++ b/test_runner/README.md @@ -6,7 +6,7 @@ Prerequisites: - Correctly configured Python, see [`/docs/sourcetree.md`](/docs/sourcetree.md#using-python) - Neon and Postgres binaries - See the root [README.md](/README.md) for build directions - If you want to test tests with test-only APIs, you would need to add `--features testing` to Rust code build commands. + To run tests you need to add `--features testing` to Rust code build commands. For convenience, repository cargo config contains `build_testing` alias, that serves as a subcommand, adding the required feature flags. 
Usage example: `cargo build_testing --release` is equivalent to `cargo build --features testing --release` - Tests can be run from the git tree; or see the environment variables diff --git a/test_runner/fixtures/endpoint/http.py b/test_runner/fixtures/endpoint/http.py index 26895df8a6..ea8291c1e0 100644 --- a/test_runner/fixtures/endpoint/http.py +++ b/test_runner/fixtures/endpoint/http.py @@ -28,3 +28,21 @@ class EndpointHttpClient(requests.Session): res = self.get(f"http://localhost:{self.port}/installed_extensions") res.raise_for_status() return res.json() + + def extensions(self, extension: str, version: str, database: str): + body = { + "extension": extension, + "version": version, + "database": database, + } + res = self.post(f"http://localhost:{self.port}/extensions", json=body) + res.raise_for_status() + return res.json() + + def set_role_grants(self, database: str, role: str, schema: str, privileges: list[str]): + res = self.post( + f"http://localhost:{self.port}/grants", + json={"database": database, "schema": schema, "role": role, "privileges": privileges}, + ) + res.raise_for_status() + return res.json() diff --git a/test_runner/fixtures/neon_cli.py b/test_runner/fixtures/neon_cli.py index 0d3dcd1671..1b2767e296 100644 --- a/test_runner/fixtures/neon_cli.py +++ b/test_runner/fixtures/neon_cli.py @@ -1,6 +1,5 @@ from __future__ import annotations -import abc import json import os import re @@ -30,7 +29,8 @@ if TYPE_CHECKING: T = TypeVar("T") -class AbstractNeonCli(abc.ABC): +# Used to be an ABC. abc.ABC removed due to linter without name change. +class AbstractNeonCli: """ A typed wrapper around an arbitrary Neon CLI tool. Supports a way to run arbitrary command directly via CLI. 
diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index a313ac2ed3..747c2c0d63 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -386,9 +386,9 @@ class NeonEnvBuilder: self.pageserver_virtual_file_io_engine: Optional[str] = pageserver_virtual_file_io_engine - self.pageserver_default_tenant_config_compaction_algorithm: Optional[ - dict[str, Any] - ] = pageserver_default_tenant_config_compaction_algorithm + self.pageserver_default_tenant_config_compaction_algorithm: Optional[dict[str, Any]] = ( + pageserver_default_tenant_config_compaction_algorithm + ) if self.pageserver_default_tenant_config_compaction_algorithm is not None: log.debug( f"Overriding pageserver default compaction algorithm to {self.pageserver_default_tenant_config_compaction_algorithm}" @@ -1062,9 +1062,9 @@ class NeonEnv: ps_cfg["virtual_file_io_engine"] = self.pageserver_virtual_file_io_engine if config.pageserver_default_tenant_config_compaction_algorithm is not None: tenant_config = ps_cfg.setdefault("tenant_config", {}) - tenant_config[ - "compaction_algorithm" - ] = config.pageserver_default_tenant_config_compaction_algorithm + tenant_config["compaction_algorithm"] = ( + config.pageserver_default_tenant_config_compaction_algorithm + ) if self.pageserver_remote_storage is not None: ps_cfg["remote_storage"] = remote_storage_to_toml_dict( @@ -1108,9 +1108,9 @@ class NeonEnv: if config.auth_enabled: sk_cfg["auth_enabled"] = True if self.safekeepers_remote_storage is not None: - sk_cfg[ - "remote_storage" - ] = self.safekeepers_remote_storage.to_toml_inline_table().strip() + sk_cfg["remote_storage"] = ( + self.safekeepers_remote_storage.to_toml_inline_table().strip() + ) self.safekeepers.append( Safekeeper(env=self, id=id, port=port, extra_opts=config.safekeeper_extra_opts) ) @@ -3175,10 +3175,13 @@ class NeonProxy(PgProtocol): # two seconds. Raises subprocess.TimeoutExpired if the proxy does not exit in time. 
def wait_for_exit(self, timeout=2): if self._popen: - self._popen.wait(timeout=2) + self._popen.wait(timeout=timeout) @backoff.on_exception(backoff.expo, requests.exceptions.RequestException, max_time=10) def _wait_until_ready(self): + assert ( + self._popen and self._popen.poll() is None + ), "Proxy exited unexpectedly. Check test log." requests.get(f"http://{self.host}:{self.http_port}/v1/status") def http_query(self, query, args, **kwargs): diff --git a/test_runner/fixtures/pageserver/utils.py b/test_runner/fixtures/pageserver/utils.py index 377a95fbeb..4c4306be9e 100644 --- a/test_runner/fixtures/pageserver/utils.py +++ b/test_runner/fixtures/pageserver/utils.py @@ -303,9 +303,10 @@ def assert_prefix_empty( remote_storage: Optional[RemoteStorage], prefix: Optional[str] = None, allowed_postfix: Optional[str] = None, + delimiter: str = "/", ) -> None: assert remote_storage is not None - response = list_prefix(remote_storage, prefix) + response = list_prefix(remote_storage, prefix, delimiter) keys = response["KeyCount"] objects: list[ObjectTypeDef] = response.get("Contents", []) common_prefixes = response.get("CommonPrefixes", []) @@ -338,16 +339,18 @@ def assert_prefix_empty( if not (allowed_postfix.endswith(key)): filtered_count += 1 - assert ( - filtered_count == 0 - ), f"remote dir with prefix {prefix} is not empty after deletion: {objects}" + assert filtered_count == 0, f"remote prefix {prefix} is not empty: {objects}" # remote_storage must not be None, but that's easier for callers to make mypy happy -def assert_prefix_not_empty(remote_storage: Optional[RemoteStorage], prefix: Optional[str] = None): +def assert_prefix_not_empty( + remote_storage: Optional[RemoteStorage], + prefix: Optional[str] = None, + delimiter: str = "/", +): assert remote_storage is not None response = list_prefix(remote_storage, prefix) - assert response["KeyCount"] != 0, f"remote dir with prefix {prefix} is empty: {response}" + assert response["KeyCount"] != 0, f"remote prefix 
{prefix} is empty: {response}" def list_prefix( diff --git a/test_runner/fixtures/utils.py b/test_runner/fixtures/utils.py index 76575d330c..d12fa59abc 100644 --- a/test_runner/fixtures/utils.py +++ b/test_runner/fixtures/utils.py @@ -16,6 +16,7 @@ from typing import TYPE_CHECKING, Any, Callable, TypeVar from urllib.parse import urlencode import allure +import pytest import zstandard from psycopg2.extensions import cursor from typing_extensions import override @@ -417,7 +418,7 @@ def wait_until( time.sleep(interval) continue return res - raise Exception("timed out while waiting for %s" % func) from last_exception + raise Exception(f"timed out while waiting for {func}") from last_exception def assert_eq(a, b) -> None: @@ -634,9 +635,27 @@ def allpairs_versions(): the different versions. """ ids = [] + argvalues = [] + compat_not_defined = ( + os.getenv("COMPATIBILITY_POSTGRES_DISTRIB_DIR") is None + or os.getenv("COMPATIBILITY_NEON_BIN") is None + ) for pair in VERSIONS_COMBINATIONS: cur_id = [] + all_new = all(v == "new" for v in pair.values()) for component in sorted(pair.keys()): cur_id.append(pair[component][0]) + # Adding None if all versions are new, so no need to mix at all + # If COMPATIBILITY_NEON_BIN or COMPATIBILITY_POSTGRES_DISTRIB_DIR are not defined, + # we will skip all the tests which include the versions mix. 
+ argvalues.append( + pytest.param( + None if all_new else pair, + marks=pytest.mark.skipif( + compat_not_defined and not all_new, + reason="COMPATIBILITY_NEON_BIN or COMPATIBILITY_POSTGRES_DISTRIB_DIR is not set", + ), + ) + ) ids.append(f"combination_{''.join(cur_id)}") - return {"argnames": "combination", "argvalues": VERSIONS_COMBINATIONS, "ids": ids} + return {"argnames": "combination", "argvalues": tuple(argvalues), "ids": ids} diff --git a/test_runner/performance/test_logical_replication.py b/test_runner/performance/test_logical_replication.py index dbf94a2cf5..815d186ab9 100644 --- a/test_runner/performance/test_logical_replication.py +++ b/test_runner/performance/test_logical_replication.py @@ -144,9 +144,10 @@ def test_subscriber_lag( check_pgbench_still_running(pub_workload, "pub") check_pgbench_still_running(sub_workload, "sub") - with psycopg2.connect(pub_connstr) as pub_conn, psycopg2.connect( - sub_connstr - ) as sub_conn: + with ( + psycopg2.connect(pub_connstr) as pub_conn, + psycopg2.connect(sub_connstr) as sub_conn, + ): with pub_conn.cursor() as pub_cur, sub_conn.cursor() as sub_cur: lag = measure_logical_replication_lag(sub_cur, pub_cur) @@ -242,9 +243,10 @@ def test_publisher_restart( ["pgbench", "-c10", pgbench_duration, "-Mprepared"], env=pub_env, ) - with psycopg2.connect(pub_connstr) as pub_conn, psycopg2.connect( - sub_connstr - ) as sub_conn: + with ( + psycopg2.connect(pub_connstr) as pub_conn, + psycopg2.connect(sub_connstr) as sub_conn, + ): with pub_conn.cursor() as pub_cur, sub_conn.cursor() as sub_cur: lag = measure_logical_replication_lag(sub_cur, pub_cur) diff --git a/test_runner/performance/test_physical_replication.py b/test_runner/performance/test_physical_replication.py index 14b527acca..8b368977df 100644 --- a/test_runner/performance/test_physical_replication.py +++ b/test_runner/performance/test_physical_replication.py @@ -102,10 +102,14 @@ def test_ro_replica_lag( check_pgbench_still_running(master_workload) 
check_pgbench_still_running(replica_workload) time.sleep(sync_interval_min * 60) - with psycopg2.connect(master_connstr) as conn_master, psycopg2.connect( - replica_connstr - ) as conn_replica: - with conn_master.cursor() as cur_master, conn_replica.cursor() as cur_replica: + with ( + psycopg2.connect(master_connstr) as conn_master, + psycopg2.connect(replica_connstr) as conn_replica, + ): + with ( + conn_master.cursor() as cur_master, + conn_replica.cursor() as cur_replica, + ): lag = measure_replication_lag(cur_master, cur_replica) log.info(f"Replica lagged behind master by {lag} seconds") zenbenchmark.record("replica_lag", lag, "s", MetricReport.LOWER_IS_BETTER) diff --git a/test_runner/regress/test_aux_files.py b/test_runner/regress/test_aux_files.py deleted file mode 100644 index 91d674d0db..0000000000 --- a/test_runner/regress/test_aux_files.py +++ /dev/null @@ -1,78 +0,0 @@ -from __future__ import annotations - -from fixtures.log_helper import log -from fixtures.neon_fixtures import ( - AuxFileStore, - NeonEnvBuilder, - logical_replication_sync, -) - - -def test_aux_v2_config_switch(neon_env_builder: NeonEnvBuilder, vanilla_pg): - env = neon_env_builder.init_start() - endpoint = env.endpoints.create_start("main") - client = env.pageserver.http_client() - - tenant_id = env.initial_tenant - timeline_id = env.initial_timeline - - tenant_config = client.tenant_config(tenant_id).effective_config - tenant_config["switch_aux_file_policy"] = AuxFileStore.V2 - client.set_tenant_config(tenant_id, tenant_config) - # aux file v2 is enabled on the write path, so for now, it should be unset (or null) - assert ( - client.timeline_detail(tenant_id=tenant_id, timeline_id=timeline_id)["last_aux_file_policy"] - is None - ) - - pg_conn = endpoint.connect() - cur = pg_conn.cursor() - - cur.execute("create table t(pk integer primary key, payload integer)") - cur.execute( - "CREATE TABLE replication_example(id SERIAL PRIMARY KEY, somedata int, text varchar(120));" - ) - 
cur.execute("create publication pub1 for table t, replication_example") - - # now start subscriber, aux files will be created at this point. TODO: find better ways of testing aux files (i.e., neon_test_utils) - # instead of going through the full logical replication process. - vanilla_pg.start() - vanilla_pg.safe_psql("create table t(pk integer primary key, payload integer)") - vanilla_pg.safe_psql( - "CREATE TABLE replication_example(id SERIAL PRIMARY KEY, somedata int, text varchar(120), testcolumn1 int, testcolumn2 int, testcolumn3 int);" - ) - connstr = endpoint.connstr().replace("'", "''") - log.info(f"ep connstr is {endpoint.connstr()}, subscriber connstr {vanilla_pg.connstr()}") - vanilla_pg.safe_psql(f"create subscription sub1 connection '{connstr}' publication pub1") - - # Wait logical replication channel to be established - logical_replication_sync(vanilla_pg, endpoint) - vanilla_pg.stop() - endpoint.stop() - - with env.pageserver.http_client() as client: - # aux file v2 flag should be enabled at this point - assert ( - client.timeline_detail(tenant_id, timeline_id)["last_aux_file_policy"] - == AuxFileStore.V2 - ) - with env.pageserver.http_client() as client: - tenant_config = client.tenant_config(tenant_id).effective_config - tenant_config["switch_aux_file_policy"] = "V1" - client.set_tenant_config(tenant_id, tenant_config) - # the flag should still be enabled - assert ( - client.timeline_detail(tenant_id=tenant_id, timeline_id=timeline_id)[ - "last_aux_file_policy" - ] - == AuxFileStore.V2 - ) - env.pageserver.restart() - with env.pageserver.http_client() as client: - # aux file v2 flag should be persisted - assert ( - client.timeline_detail(tenant_id=tenant_id, timeline_id=timeline_id)[ - "last_aux_file_policy" - ] - == AuxFileStore.V2 - ) diff --git a/test_runner/regress/test_download_extensions.py b/test_runner/regress/test_download_extensions.py index 04916a6b6f..0134f80769 100644 --- a/test_runner/regress/test_download_extensions.py +++ 
b/test_runner/regress/test_download_extensions.py @@ -74,7 +74,7 @@ def test_remote_extensions( mimetype="application/octet-stream", headers=[ ("Content-Length", str(file_size)), - ("Content-Disposition", 'attachment; filename="%s"' % file_name), + ("Content-Disposition", f'attachment; filename="{file_name}"'), ], direct_passthrough=True, ) diff --git a/test_runner/regress/test_extensions.py b/test_runner/regress/test_extensions.py new file mode 100644 index 0000000000..100fd4b048 --- /dev/null +++ b/test_runner/regress/test_extensions.py @@ -0,0 +1,50 @@ +from logging import info + +from fixtures.neon_fixtures import NeonEnv + + +def test_extensions(neon_simple_env: NeonEnv): + """basic test for the extensions endpoint testing installing extensions""" + + env = neon_simple_env + + env.create_branch("test_extensions") + + endpoint = env.endpoints.create_start("test_extensions") + extension = "neon_test_utils" + database = "test_extensions" + + endpoint.safe_psql("CREATE DATABASE test_extensions") + + with endpoint.connect(dbname=database) as pg_conn: + with pg_conn.cursor() as cur: + cur.execute( + "SELECT default_version FROM pg_available_extensions WHERE name = 'neon_test_utils'" + ) + res = cur.fetchone() + assert res is not None + version = res[0] + + with pg_conn.cursor() as cur: + cur.execute( + "SELECT extname, extversion FROM pg_extension WHERE extname = 'neon_test_utils'", + ) + res = cur.fetchone() + assert not res, "The 'neon_test_utils' extension is installed" + + client = endpoint.http_client() + install_res = client.extensions(extension, version, database) + + info("Extension install result: %s", res) + assert install_res["extension"] == extension and install_res["version"] == version + + with endpoint.connect(dbname=database) as pg_conn: + with pg_conn.cursor() as cur: + cur.execute( + "SELECT extname, extversion FROM pg_extension WHERE extname = 'neon_test_utils'", + ) + res = cur.fetchone() + assert res is not None + (db_extension_name, 
db_extension_version) = res + + assert db_extension_name == extension and db_extension_version == version diff --git a/test_runner/regress/test_logical_replication.py b/test_runner/regress/test_logical_replication.py index 87991eadf1..c26bf058e2 100644 --- a/test_runner/regress/test_logical_replication.py +++ b/test_runner/regress/test_logical_replication.py @@ -558,10 +558,10 @@ select sent_lsn, flush_lsn, pg_current_wal_flush_lsn() from pg_stat_replication return publisher_flush_lsn -# Test that subscriber takes into account quorum committed flush_lsn in -# flush_lsn reporting to publisher. Without this, it may ack too far, losing -# data on restart because publisher advances START_REPLICATION position to the -# confirmed_flush_lsn of the slot. +# Test that neon subscriber takes into account quorum committed flush_lsn in +# flush_lsn reporting to publisher. Without this, subscriber may ack too far, +# losing data on restart because publisher implicitly advances position given +# in START_REPLICATION to the confirmed_flush_lsn of the slot. def test_subscriber_synchronous_commit(neon_simple_env: NeonEnv, vanilla_pg): env = neon_simple_env # use vanilla as publisher to allow writes on it when safekeeper is down @@ -578,7 +578,10 @@ def test_subscriber_synchronous_commit(neon_simple_env: NeonEnv, vanilla_pg): vanilla_pg.safe_psql("create extension neon;") env.create_branch("subscriber") - sub = env.endpoints.create("subscriber") + # We want all data to fit into shared_buffers because later we stop + # safekeeper and insert more; this shouldn't cause page requests as they + # will be stuck. 
+ sub = env.endpoints.create("subscriber", config_lines=["shared_buffers=128MB"]) sub.start() with vanilla_pg.cursor() as pcur: diff --git a/test_runner/regress/test_next_xid.py b/test_runner/regress/test_next_xid.py index 980f6b5694..db8da51125 100644 --- a/test_runner/regress/test_next_xid.py +++ b/test_runner/regress/test_next_xid.py @@ -254,13 +254,13 @@ def advance_multixid_to( # missing. That's OK for our purposes. Autovacuum will print some warnings about the # missing segments, but will clean it up by truncating the SLRUs up to the new value, # closing the gap. - segname = "%04X" % MultiXactIdToOffsetSegment(next_multi_xid) + segname = f"{MultiXactIdToOffsetSegment(next_multi_xid):04X}" log.info(f"Creating dummy segment pg_multixact/offsets/{segname}") with open(vanilla_pg.pgdatadir / "pg_multixact" / "offsets" / segname, "w") as of: of.write("\0" * SLRU_PAGES_PER_SEGMENT * BLCKSZ) of.flush() - segname = "%04X" % MXOffsetToMemberSegment(next_multi_offset) + segname = f"{MXOffsetToMemberSegment(next_multi_offset):04X}" log.info(f"Creating dummy segment pg_multixact/members/{segname}") with open(vanilla_pg.pgdatadir / "pg_multixact" / "members" / segname, "w") as of: of.write("\0" * SLRU_PAGES_PER_SEGMENT * BLCKSZ) diff --git a/test_runner/regress/test_role_grants.py b/test_runner/regress/test_role_grants.py new file mode 100644 index 0000000000..b2251875f0 --- /dev/null +++ b/test_runner/regress/test_role_grants.py @@ -0,0 +1,41 @@ +import psycopg2 +from fixtures.neon_fixtures import NeonEnv + + +def test_role_grants(neon_simple_env: NeonEnv): + """basic test for the endpoint that grants permissions for a role against a schema""" + + env = neon_simple_env + + env.create_branch("test_role_grants") + + endpoint = env.endpoints.create_start("test_role_grants") + + endpoint.safe_psql("CREATE DATABASE test_role_grants") + endpoint.safe_psql("CREATE SCHEMA IF NOT EXISTS test_schema", dbname="test_role_grants") + endpoint.safe_psql("CREATE ROLE test_role WITH 
LOGIN", dbname="test_role_grants") + + # confirm we do not yet have access + pg_conn = endpoint.connect(dbname="test_role_grants", user="test_role") + with pg_conn.cursor() as cur: + try: + cur.execute('CREATE TABLE "test_schema"."test_table" (id integer primary key)') + raise ValueError("create table should not succeed") + except psycopg2.errors.InsufficientPrivilege: + pass + except BaseException as e: + raise e + + client = endpoint.http_client() + res = client.set_role_grants( + "test_role_grants", "test_role", "test_schema", ["CREATE", "USAGE"] + ) + + # confirm we have access + with pg_conn.cursor() as cur: + cur.execute('CREATE TABLE "test_schema"."test_table" (id integer primary key)') + cur.execute('INSERT INTO "test_schema"."test_table" (id) VALUES (1)') + cur.execute('SELECT id from "test_schema"."test_table"') + res = cur.fetchall() + + assert res == [(1,)], "select should not succeed" diff --git a/test_runner/regress/test_storage_controller.py b/test_runner/regress/test_storage_controller.py index 1dcc37c407..d4bc4b1a4f 100644 --- a/test_runner/regress/test_storage_controller.py +++ b/test_runner/regress/test_storage_controller.py @@ -107,6 +107,15 @@ def test_storage_controller_smoke(neon_env_builder: NeonEnvBuilder, combination) for tid in tenant_ids: env.create_tenant(tid, shard_count=shards_per_tenant) + # Validate high level metrics + assert ( + env.storage_controller.get_metric_value("storage_controller_tenant_shards") + == len(tenant_ids) * shards_per_tenant + ) + assert env.storage_controller.get_metric_value("storage_controller_pageserver_nodes") == len( + env.storage_controller.node_list() + ) + # Repeating a creation should be idempotent (we are just testing it doesn't return an error) env.storage_controller.tenant_create( tenant_id=next(iter(tenant_ids)), shard_count=shards_per_tenant @@ -576,6 +585,14 @@ def test_storage_controller_compute_hook( env.storage_controller.consistency_check() +NOTIFY_BLOCKED_LOG = ".*Live migration blocked.*" 
+NOTIFY_FAILURE_LOGS = [ + ".*Failed to notify compute.*", + ".*Reconcile error.*Cancelled", + ".*Reconcile error.*Control plane tenant busy", +] + + def test_storage_controller_stuck_compute_hook( httpserver: HTTPServer, neon_env_builder: NeonEnvBuilder, @@ -620,15 +637,8 @@ def test_storage_controller_stuck_compute_hook( dest_pageserver = env.get_pageserver(dest_ps_id) shard_0_id = TenantShardId(tenant_id, 0, 0) - NOTIFY_BLOCKED_LOG = ".*Live migration blocked.*" - env.storage_controller.allowed_errors.extend( - [ - NOTIFY_BLOCKED_LOG, - ".*Failed to notify compute.*", - ".*Reconcile error.*Cancelled", - ".*Reconcile error.*Control plane tenant busy", - ] - ) + env.storage_controller.allowed_errors.append(NOTIFY_BLOCKED_LOG) + env.storage_controller.allowed_errors.extend(NOTIFY_FAILURE_LOGS) with concurrent.futures.ThreadPoolExecutor(max_workers=2) as executor: # We expect the controller to hit the 423 (locked) and retry. Migration shouldn't complete until that @@ -719,6 +729,114 @@ def test_storage_controller_stuck_compute_hook( env.storage_controller.consistency_check() +@run_only_on_default_postgres("this test doesn't start an endpoint") +def test_storage_controller_compute_hook_revert( + httpserver: HTTPServer, + neon_env_builder: NeonEnvBuilder, + httpserver_listen_address, +): + """ + 'revert' in the sense of a migration which gets reversed shortly after, as may happen during + a rolling upgrade. + + This is a reproducer for https://github.com/neondatabase/neon/issues/9417 + + The buggy behavior was that when the compute hook gave us errors, we assumed our last successfully + sent state was still in effect, so when migrating back to the original pageserver we didn't bother + notifying of that. This is wrong because even a failed request might mutate the state on the server. + """ + + # We will run two pageserver to migrate and check that the storage controller sends notifications + # when migrating. 
+ neon_env_builder.num_pageservers = 2 + (host, port) = httpserver_listen_address + neon_env_builder.control_plane_compute_hook_api = f"http://{host}:{port}/notify" + + # Set up fake HTTP notify endpoint + notifications = [] + + handle_params = {"status": 200} + + def handler(request: Request): + status = handle_params["status"] + log.info(f"Notify request[{status}]: {request}") + notifications.append(request.json) + return Response(status=status) + + httpserver.expect_request("/notify", method="PUT").respond_with_handler(handler) + + # Start running + env = neon_env_builder.init_start(initial_tenant_conf={"lsn_lease_length": "0s"}) + tenant_id = env.initial_tenant + tenant_shard_id = TenantShardId(tenant_id, 0, 0) + + pageserver_a = env.get_tenant_pageserver(tenant_id) + pageserver_b = [p for p in env.pageservers if p.id != pageserver_a.id][0] + + def notified_ps(ps_id: int) -> None: + latest = notifications[-1] + log.info(f"Waiting for {ps_id}, have {latest}") + assert latest is not None + assert latest["shards"] is not None + assert latest["shards"][0]["node_id"] == ps_id + + wait_until(30, 1, lambda: notified_ps(pageserver_a.id)) + + env.storage_controller.allowed_errors.append(NOTIFY_BLOCKED_LOG) + env.storage_controller.allowed_errors.extend(NOTIFY_FAILURE_LOGS) + + # Migrate A -> B, and make notifications fail while this is happening + handle_params["status"] = 423 + + with pytest.raises(StorageControllerApiException, match="Timeout waiting for shard"): + # We expect the controller to give us an error because its reconciliation timed out + # waiting for the compute hook. 
+ env.storage_controller.tenant_shard_migrate(tenant_shard_id, pageserver_b.id) + + # Although the migration API failed, the hook should still see pageserver B (it remembers what + # was posted even when returning an error code) + wait_until(30, 1, lambda: notified_ps(pageserver_b.id)) + + # Although the migration API failed, the tenant should still have moved to the right pageserver + assert len(pageserver_b.http_client().tenant_list()) == 1 + + # Before we clear the failure on the migration hook, we need the controller to give up + # trying to notify about B -- the bug case we're reproducing is when the controller + # _never_ successfully notified for B, then tries to notify for A. + # + # The controller will give up notifying if the origin of a migration becomes unavailable. + pageserver_a.stop() + + # Preempt heartbeats for a faster test + env.storage_controller.node_configure(pageserver_a.id, {"availability": "Offline"}) + + def logged_giving_up(): + env.storage_controller.assert_log_contains(".*Giving up on compute notification.*") + + wait_until(30, 1, logged_giving_up) + + pageserver_a.start() + + # Preempt heartbeats for determinism + env.storage_controller.node_configure(pageserver_a.id, {"availability": "Active"}) + # Starting node will prompt a reconcile to clean up old AttachedStale location, for a deterministic test + # we want that complete before we start our migration. Tolerate failure because our compute hook is + # still configured to fail + try: + env.storage_controller.reconcile_all() + except StorageControllerApiException as e: + # This exception _might_ be raised: it depends if our reconcile_all hit the on-node-activation + # Reconciler lifetime or ran after it already completed. + log.info(f"Expected error from reconcile_all: {e}") + + # Migrate B -> A, with a working compute hook: the controller should notify the hook because the + # last update it made that was acked (423) by the compute was for node B. 
+ handle_params["status"] = 200 + env.storage_controller.tenant_shard_migrate(tenant_shard_id, pageserver_a.id) + + wait_until(30, 1, lambda: notified_ps(pageserver_a.id)) + + def test_storage_controller_debug_apis(neon_env_builder: NeonEnvBuilder): """ Verify that occasional-use debug APIs work as expected. This is a lightweight test diff --git a/test_runner/regress/test_tenant_delete.py b/test_runner/regress/test_tenant_delete.py index 294c1248c5..f486327445 100644 --- a/test_runner/regress/test_tenant_delete.py +++ b/test_runner/regress/test_tenant_delete.py @@ -20,6 +20,7 @@ from fixtures.pageserver.utils import ( ) from fixtures.remote_storage import RemoteStorageKind, s3_storage from fixtures.utils import run_pg_bench_small, wait_until +from fixtures.workload import Workload from requests.exceptions import ReadTimeout from werkzeug.wrappers.request import Request from werkzeug.wrappers.response import Response @@ -404,3 +405,57 @@ def test_tenant_delete_scrubber(pg_bin: PgBin, make_httpserver, neon_env_builder cloud_admin_api_token=cloud_admin_token, ) assert healthy + + +def test_tenant_delete_stale_shards(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin): + """ + Deleting a tenant should also delete any stale (pre-split) shards from remote storage. + """ + remote_storage_kind = s3_storage() + neon_env_builder.enable_pageserver_remote_storage(remote_storage_kind) + + env = neon_env_builder.init_start() + + # Create an unsharded tenant. + tenant_id, timeline_id = env.create_tenant() + + # Write some data. + workload = Workload(env, tenant_id, timeline_id, branch_name="main") + workload.init() + workload.write_rows(256) + workload.validate() + + assert_prefix_not_empty( + neon_env_builder.pageserver_remote_storage, + prefix="/".join(("tenants", str(tenant_id))), + ) + + # Upload a heatmap as well. + env.pageserver.http_client().tenant_heatmap_upload(tenant_id) + + # Split off a few shards, in two rounds. 
+ env.storage_controller.tenant_shard_split(tenant_id, shard_count=4) + env.storage_controller.tenant_shard_split(tenant_id, shard_count=16) + + # Delete the tenant. This should also delete data for the unsharded and count=4 parents. + env.storage_controller.pageserver_api().tenant_delete(tenant_id=tenant_id) + + assert_prefix_empty( + neon_env_builder.pageserver_remote_storage, + prefix="/".join(("tenants", str(tenant_id))), + delimiter="", # match partial prefixes, i.e. all shards + ) + + dirs = list(env.pageserver.tenant_dir(None).glob(f"{tenant_id}*")) + assert dirs == [], f"found tenant directories: {dirs}" + + # The initial tenant created by the test harness should still be there. + # Only the tenant we deleted should be removed. + assert_prefix_not_empty( + neon_env_builder.pageserver_remote_storage, + prefix="/".join(("tenants", str(env.initial_tenant))), + ) + dirs = list(env.pageserver.tenant_dir(None).glob(f"{env.initial_tenant}*")) + assert dirs != [], "missing initial tenant directory" + + env.stop() diff --git a/test_runner/regress/test_timeline_archive.py b/test_runner/regress/test_timeline_archive.py index ffaed5e130..cb8724dd1c 100644 --- a/test_runner/regress/test_timeline_archive.py +++ b/test_runner/regress/test_timeline_archive.py @@ -4,8 +4,11 @@ import pytest from fixtures.common_types import TenantId, TimelineArchivalState, TimelineId from fixtures.neon_fixtures import ( NeonEnvBuilder, + last_flush_lsn_upload, ) from fixtures.pageserver.http import PageserverApiException +from fixtures.pageserver.utils import assert_prefix_empty, assert_prefix_not_empty +from fixtures.remote_storage import s3_storage from fixtures.utils import wait_until @@ -119,6 +122,10 @@ def test_timeline_archive(neon_env_builder: NeonEnvBuilder, shard_count: int): @pytest.mark.parametrize("manual_offload", [False, True]) def test_timeline_offloading(neon_env_builder: NeonEnvBuilder, manual_offload: bool): + if not manual_offload: + # (automatic) timeline offloading 
defaults to false for now + neon_env_builder.pageserver_config_override = "timeline_offloading = true" + env = neon_env_builder.init_start() ps_http = env.pageserver.http_client() @@ -164,7 +171,7 @@ def test_timeline_offloading(neon_env_builder: NeonEnvBuilder, manual_offload: b state=TimelineArchivalState.ARCHIVED, ) - def timeline_offloaded(timeline_id: TimelineId) -> bool: + def timeline_offloaded_logged(timeline_id: TimelineId) -> bool: return ( env.pageserver.log_contains(f".*{timeline_id}.* offloading archived timeline.*") is not None @@ -182,12 +189,12 @@ def test_timeline_offloading(neon_env_builder: NeonEnvBuilder, manual_offload: b def parent_offloaded(): if manual_offload: ps_http.timeline_offload(tenant_id=tenant_id, timeline_id=parent_timeline_id) - assert timeline_offloaded(parent_timeline_id) + assert timeline_offloaded_logged(parent_timeline_id) def leaf_offloaded(): if manual_offload: ps_http.timeline_offload(tenant_id=tenant_id, timeline_id=leaf_timeline_id) - assert timeline_offloaded(leaf_timeline_id) + assert timeline_offloaded_logged(leaf_timeline_id) wait_until(30, 1, leaf_offloaded) wait_until(30, 1, parent_offloaded) @@ -214,4 +221,118 @@ def test_timeline_offloading(neon_env_builder: NeonEnvBuilder, manual_offload: b sum_again = endpoint.safe_psql("SELECT sum(key) from foo where key > 50") assert sum == sum_again - assert not timeline_offloaded(initial_timeline_id) + assert not timeline_offloaded_logged(initial_timeline_id) + + +def test_timeline_offload_persist(neon_env_builder: NeonEnvBuilder): + """ + Test for persistence of timeline offload state + """ + remote_storage_kind = s3_storage() + neon_env_builder.enable_pageserver_remote_storage(remote_storage_kind) + + env = neon_env_builder.init_start() + ps_http = env.pageserver.http_client() + + # Turn off gc and compaction loops: we want to issue them manually for better reliability + tenant_id, root_timeline_id = env.create_tenant( + conf={ + "gc_period": "0s", + "compaction_period": 
"0s", + "checkpoint_distance": f"{1024 ** 2}", + } + ) + + # Create a branch and archive it + child_timeline_id = env.create_branch("test_archived_branch_persisted", tenant_id) + + with env.endpoints.create_start( + "test_archived_branch_persisted", tenant_id=tenant_id + ) as endpoint: + endpoint.safe_psql_many( + [ + "CREATE TABLE foo(key serial primary key, t text default 'data_content')", + "INSERT INTO foo SELECT FROM generate_series(1,2048)", + ] + ) + sum = endpoint.safe_psql("SELECT sum(key) from foo where key < 500") + last_flush_lsn_upload(env, endpoint, tenant_id, child_timeline_id) + + assert_prefix_not_empty( + neon_env_builder.pageserver_remote_storage, + prefix=f"tenants/{str(tenant_id)}/", + ) + assert_prefix_empty( + neon_env_builder.pageserver_remote_storage, + prefix=f"tenants/{str(tenant_id)}/tenant-manifest", + ) + + ps_http.timeline_archival_config( + tenant_id, + child_timeline_id, + state=TimelineArchivalState.ARCHIVED, + ) + leaf_detail = ps_http.timeline_detail( + tenant_id, + child_timeline_id, + ) + assert leaf_detail["is_archived"] is True + + def timeline_offloaded_api(timeline_id: TimelineId) -> bool: + # TODO add a proper API to check if a timeline has been offloaded or not + return not any( + timeline["timeline_id"] == str(timeline_id) + for timeline in ps_http.timeline_list(tenant_id=tenant_id) + ) + + def child_offloaded(): + ps_http.timeline_offload(tenant_id=tenant_id, timeline_id=child_timeline_id) + assert timeline_offloaded_api(child_timeline_id) + + wait_until(30, 1, child_offloaded) + + assert timeline_offloaded_api(child_timeline_id) + assert not timeline_offloaded_api(root_timeline_id) + + assert_prefix_not_empty( + neon_env_builder.pageserver_remote_storage, + prefix=f"tenants/{str(tenant_id)}/tenant-manifest", + ) + + # Test persistence, is the timeline still offloaded? 
+ env.pageserver.stop() + env.pageserver.start() + + assert timeline_offloaded_api(child_timeline_id) + assert not timeline_offloaded_api(root_timeline_id) + + ps_http.timeline_archival_config( + tenant_id, + child_timeline_id, + state=TimelineArchivalState.UNARCHIVED, + ) + child_detail = ps_http.timeline_detail( + tenant_id, + child_timeline_id, + ) + assert child_detail["is_archived"] is False + + with env.endpoints.create_start( + "test_archived_branch_persisted", tenant_id=tenant_id + ) as endpoint: + sum_again = endpoint.safe_psql("SELECT sum(key) from foo where key < 500") + assert sum == sum_again + + assert_prefix_empty( + neon_env_builder.pageserver_remote_storage, + prefix=f"tenants/{str(env.initial_tenant)}/tenant-manifest", + ) + + assert not timeline_offloaded_api(root_timeline_id) + + ps_http.tenant_delete(tenant_id) + + assert_prefix_empty( + neon_env_builder.pageserver_remote_storage, + prefix=f"tenants/{str(tenant_id)}/", + ) diff --git a/test_runner/regress/test_timeline_delete.py b/test_runner/regress/test_timeline_delete.py index 306f22acf9..155709e106 100644 --- a/test_runner/regress/test_timeline_delete.py +++ b/test_runner/regress/test_timeline_delete.py @@ -649,7 +649,7 @@ def test_timeline_delete_works_for_remote_smoke( env = neon_env_builder.init_start() ps_http = env.pageserver.http_client() - pg = env.endpoints.create_start("main") + env.endpoints.create_start("main") tenant_id = env.initial_tenant timeline_id = env.initial_timeline diff --git a/workspace_hack/Cargo.toml b/workspace_hack/Cargo.toml index 1347d6ddff..28c51b8ac1 100644 --- a/workspace_hack/Cargo.toml +++ b/workspace_hack/Cargo.toml @@ -32,7 +32,6 @@ deranged = { version = "0.3", default-features = false, features = ["powerfmt", digest = { version = "0.10", features = ["mac", "oid", "std"] } either = { version = "1" } fail = { version = "0.5", default-features = false, features = ["failpoints"] } -futures = { version = "0.3" } futures-channel = { version = "0.3", features 
= ["sink"] } futures-executor = { version = "0.3" } futures-io = { version = "0.3" } @@ -48,7 +47,7 @@ hyper-dff4ba8e3ae991db = { package = "hyper", version = "1", features = ["full"] hyper-util = { version = "0.1", features = ["client-legacy", "server-auto", "service"] } indexmap-dff4ba8e3ae991db = { package = "indexmap", version = "1", default-features = false, features = ["std"] } indexmap-f595c2ba2a3f28df = { package = "indexmap", version = "2", features = ["serde"] } -itertools = { version = "0.12" } +itertools = { version = "0.10" } lazy_static = { version = "1", default-features = false, features = ["spin_no_std"] } libc = { version = "0.2", features = ["extra_traits", "use_std"] } log = { version = "0.4", default-features = false, features = ["std"] } @@ -66,6 +65,8 @@ regex = { version = "1" } regex-automata = { version = "0.4", default-features = false, features = ["dfa-onepass", "hybrid", "meta", "nfa-backtrack", "perf-inline", "perf-literal", "unicode"] } regex-syntax = { version = "0.8" } reqwest = { version = "0.12", default-features = false, features = ["blocking", "json", "rustls-tls", "stream"] } +rustls = { version = "0.23", features = ["ring"] } +rustls-webpki = { version = "0.102", default-features = false, features = ["aws_lc_rs", "ring", "std"] } scopeguard = { version = "1" } serde = { version = "1", features = ["alloc", "derive"] } serde_json = { version = "1", features = ["alloc", "raw_value"] } @@ -79,6 +80,7 @@ tikv-jemalloc-sys = { version = "0.5" } time = { version = "0.3", features = ["macros", "serde-well-known"] } tokio = { version = "1", features = ["fs", "io-std", "io-util", "macros", "net", "process", "rt-multi-thread", "signal", "test-util"] } tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev = "20031d7a9ee1addeae6e0968e3899ae6bf01cee2", features = ["with-serde_json-1"] } +tokio-rustls = { version = "0.26", features = ["ring"] } tokio-stream = { version = "0.1", features = ["net"] } tokio-util = { 
version = "0.7", features = ["codec", "compat", "io", "rt"] } toml_edit = { version = "0.22", features = ["serde"] } @@ -104,7 +106,7 @@ half = { version = "2", default-features = false, features = ["num-traits"] } hashbrown = { version = "0.14", features = ["raw"] } indexmap-dff4ba8e3ae991db = { package = "indexmap", version = "1", default-features = false, features = ["std"] } indexmap-f595c2ba2a3f28df = { package = "indexmap", version = "2", features = ["serde"] } -itertools = { version = "0.12" } +itertools = { version = "0.10" } libc = { version = "0.2", features = ["extra_traits", "use_std"] } log = { version = "0.4", default-features = false, features = ["std"] } memchr = { version = "2" } @@ -122,8 +124,7 @@ regex = { version = "1" } regex-automata = { version = "0.4", default-features = false, features = ["dfa-onepass", "hybrid", "meta", "nfa-backtrack", "perf-inline", "perf-literal", "unicode"] } regex-syntax = { version = "0.8" } serde = { version = "1", features = ["alloc", "derive"] } -syn-dff4ba8e3ae991db = { package = "syn", version = "1", features = ["extra-traits", "full", "visit"] } -syn-f595c2ba2a3f28df = { package = "syn", version = "2", features = ["extra-traits", "fold", "full", "visit", "visit-mut"] } +syn = { version = "2", features = ["extra-traits", "fold", "full", "visit", "visit-mut"] } time-macros = { version = "0.2", default-features = false, features = ["formatting", "parsing", "serde"] } toml_edit = { version = "0.22", features = ["serde"] } zstd = { version = "0.13" }