diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 8215c02291..cd4906579e 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -722,6 +722,35 @@ jobs: --dockerfile Dockerfile.compute-node --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}} --destination neondatabase/compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}} + --cleanup + + # Due to a kaniko bug, we can't use cache for extensions image, thus it takes about the same amount of time as compute-node image to build (~10 min) + # During the transition period we need to have extensions in both places (in S3 and in compute-node image), + # so we won't build extension twice, but extract them from compute-node. + # + # For now we use extensions image only for new custom extensitons + - name: Kaniko build extensions only + run: | + # Kaniko is suposed to clean up after itself if --cleanup flag is set, but it doesn't. + # Despite some fixes were made in https://github.com/GoogleContainerTools/kaniko/pull/2504 (in kaniko v1.11.0), + # it still fails with error: + # error building image: could not save file: copying file: symlink postgres /kaniko/1/usr/local/pgsql/bin/postmaster: file exists + # + # Ref https://github.com/GoogleContainerTools/kaniko/issues/1406 + find /kaniko -maxdepth 1 -mindepth 1 -type d -regex "/kaniko/[0-9]*" -exec rm -rv {} \; + + /kaniko/executor --reproducible --snapshot-mode=redo --skip-unused-stages --cache=true \ + --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache \ + --context . \ + --build-arg GIT_VERSION=${{ github.sha }} \ + --build-arg PG_VERSION=${{ matrix.version }} \ + --build-arg BUILD_TAG=${{needs.tag.outputs.build-tag}} \ + --build-arg REPOSITORY=369495373322.dkr.ecr.eu-central-1.amazonaws.com \ + --dockerfile Dockerfile.compute-node \ + --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/extensions-${{ matrix.version }}:${{needs.tag.outputs.build-tag}} \ + --destination neondatabase/extensions-${{ matrix.version }}:${{needs.tag.outputs.build-tag}} \ + --cleanup \ + --target postgres-extensions # Cleanup script fails otherwise - rm: cannot remove '/nvme/actions-runner/_work/_temp/_github_home/.ecr': Permission denied - name: Cleanup ECR folder @@ -738,7 +767,7 @@ jobs: run: shell: sh -eu {0} env: - VM_BUILDER_VERSION: v0.11.0 + VM_BUILDER_VERSION: v0.11.1 steps: - name: Checkout @@ -840,8 +869,10 @@ jobs: crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:${{needs.tag.outputs.build-tag}} latest crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v14:${{needs.tag.outputs.build-tag}} latest crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v14:${{needs.tag.outputs.build-tag}} latest + crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/extensions-v14:${{needs.tag.outputs.build-tag}} latest crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v15:${{needs.tag.outputs.build-tag}} latest crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v15:${{needs.tag.outputs.build-tag}} latest + crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/extensions-v15:${{needs.tag.outputs.build-tag}} latest - name: Push images to production ECR if: | @@ -852,8 +883,10 @@ jobs: crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:latest crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v14:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v14:latest crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v14:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v14:latest + crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/extensions-v14:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/extensions-v14:latest crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v15:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v15:latest crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v15:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v15:latest + crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/extensions-v15:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/extensions-v15:latest - name: Configure Docker Hub login run: | @@ -875,16 +908,93 @@ jobs: crane tag neondatabase/compute-tools:${{needs.tag.outputs.build-tag}} latest crane tag neondatabase/compute-node-v14:${{needs.tag.outputs.build-tag}} latest crane tag neondatabase/vm-compute-node-v14:${{needs.tag.outputs.build-tag}} latest + crane tag neondatabase/extensions-v14:${{needs.tag.outputs.build-tag}} latest crane tag neondatabase/compute-node-v15:${{needs.tag.outputs.build-tag}} latest crane tag neondatabase/vm-compute-node-v15:${{needs.tag.outputs.build-tag}} latest + crane tag neondatabase/extensions-v15:${{needs.tag.outputs.build-tag}} latest - name: Cleanup ECR folder run: rm -rf ~/.ecr + upload-postgres-extensions-to-s3: + if: | + (github.ref_name == 'main' || github.ref_name == 'release') && + github.event_name != 'workflow_dispatch' + runs-on: ${{ github.ref_name == 'release' && fromJSON('["self-hosted", "prod", "x64"]') || fromJSON('["self-hosted", "gen3", "small"]') }} + needs: [ tag, promote-images ] + strategy: + fail-fast: false + matrix: + version: [ v14, v15 ] + + env: + # While on transition period we extract public extensions from compute-node image and custom extensions from extensions image. + # Later all the extensions will be moved to extensions image. + EXTENSIONS_IMAGE: ${{ github.ref_name == 'release' && '093970136003' || '369495373322'}}.dkr.ecr.eu-central-1.amazonaws.com/extensions-${{ matrix.version }}:latest + COMPUTE_NODE_IMAGE: ${{ github.ref_name == 'release' && '093970136003' || '369495373322'}}.dkr.ecr.eu-central-1.amazonaws.com/compute-node-${{ matrix.version }}:latest + AWS_ACCESS_KEY_ID: ${{ github.ref_name == 'release' && secrets.AWS_ACCESS_KEY_PROD || secrets.AWS_ACCESS_KEY_DEV }} + AWS_SECRET_ACCESS_KEY: ${{ github.ref_name == 'release' && secrets.AWS_SECRET_KEY_PROD || secrets.AWS_SECRET_KEY_DEV }} + S3_BUCKETS: | + ${{ github.ref_name == 'release' && + 'neon-prod-extensions-ap-southeast-1 neon-prod-extensions-eu-central-1 neon-prod-extensions-us-east-1 neon-prod-extensions-us-east-2 neon-prod-extensions-us-west-2' || + 'neon-dev-extensions-eu-central-1 neon-dev-extensions-eu-west-1 neon-dev-extensions-us-east-2' }} + + steps: + - name: Pull postgres-extensions image + run: | + docker pull ${EXTENSIONS_IMAGE} + docker pull ${COMPUTE_NODE_IMAGE} + + - name: Create postgres-extensions container + id: create-container + run: | + EID=$(docker create ${EXTENSIONS_IMAGE} true) + echo "EID=${EID}" >> $GITHUB_OUTPUT + + CID=$(docker create ${COMPUTE_NODE_IMAGE} true) + echo "CID=${CID}" >> $GITHUB_OUTPUT + + - name: Extract postgres-extensions from container + run: | + rm -rf ./extensions-to-upload ./custom-extensions # Just in case + + # In compute image we have a bit different directory layout + mkdir -p extensions-to-upload/share + docker cp ${{ steps.create-container.outputs.CID }}:/usr/local/share/extension ./extensions-to-upload/share/extension + docker cp ${{ steps.create-container.outputs.CID }}:/usr/local/lib ./extensions-to-upload/lib + + # Delete Neon extensitons (they always present on compute-node image) + rm -rf ./extensions-to-upload/share/extension/neon* + rm -rf ./extensions-to-upload/lib/neon* + + # Delete leftovers from the extension build step + rm -rf ./extensions-to-upload/lib/pgxs + rm -rf ./extensions-to-upload/lib/pkgconfig + + docker cp ${{ steps.create-container.outputs.EID }}:/extensions ./custom-extensions + for EXT_NAME in $(ls ./custom-extensions); do + mkdir -p ./extensions-to-upload/${EXT_NAME}/share + + mv ./custom-extensions/${EXT_NAME}/share/extension ./extensions-to-upload/${EXT_NAME}/share/extension + mv ./custom-extensions/${EXT_NAME}/lib ./extensions-to-upload/${EXT_NAME}/lib + done + + - name: Upload postgres-extensions to S3 + run: | + for BUCKET in $(echo ${S3_BUCKETS}); do + aws s3 cp --recursive --only-show-errors ./extensions-to-upload s3://${BUCKET}/${{ needs.tag.outputs.build-tag }}/${{ matrix.version }} + done + + - name: Cleanup + if: ${{ always() && (steps.create-container.outputs.CID || steps.create-container.outputs.EID) }} + run: | + docker rm ${{ steps.create-container.outputs.CID }} || true + docker rm ${{ steps.create-container.outputs.EID }} || true + deploy: runs-on: [ self-hosted, gen3, small ] container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/ansible:latest - needs: [ promote-images, tag, regress-tests ] + needs: [ upload-postgres-extensions-to-s3, promote-images, tag, regress-tests ] if: ( github.ref_name == 'main' || github.ref_name == 'release' ) && github.event_name != 'workflow_dispatch' steps: - name: Fix git ownership diff --git a/Cargo.lock b/Cargo.lock index 4be74614c2..b163d4fe46 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -200,17 +200,6 @@ dependencies = [ "critical-section", ] -[[package]] -name = "atty" -version = "0.2.14" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d9b39be18770d11421cdb1b9947a45dd3f37e93092cbf377614828a319d5fee8" -dependencies = [ - "hermit-abi 0.1.19", - "libc", - "winapi", -] - [[package]] name = "autocfg" version = "1.1.0" @@ -805,18 +794,6 @@ dependencies = [ "libloading", ] -[[package]] -name = "clap" -version = "3.2.25" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4ea181bf566f71cb9a5d17a59e1871af638180a18fb0035c92ae62b705207123" -dependencies = [ - "bitflags", - "clap_lex 0.2.4", - "indexmap", - "textwrap", -] - [[package]] name = "clap" version = "4.3.0" @@ -837,7 +814,7 @@ dependencies = [ "anstream", "anstyle", "bitflags", - "clap_lex 0.5.0", + "clap_lex", "strsim", ] @@ -853,15 +830,6 @@ dependencies = [ "syn 2.0.16", ] -[[package]] -name = "clap_lex" -version = "0.2.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2850f2f5a82cbf437dd5af4d49848fbdfc27c157c3d010345776f952765261c5" -dependencies = [ - "os_str_bytes", -] - [[package]] name = "clap_lex" version = "0.5.0" @@ -915,7 +883,7 @@ version = "0.1.0" dependencies = [ "anyhow", "chrono", - "clap 4.3.0", + "clap", "compute_api", "futures", "hyper", @@ -977,7 +945,7 @@ name = "control_plane" version = "0.1.0" dependencies = [ "anyhow", - "clap 4.3.0", + "clap", "comfy-table", "compute_api", "git-version", @@ -1047,19 +1015,19 @@ dependencies = [ [[package]] name = "criterion" -version = "0.4.0" +version = "0.5.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e7c76e09c1aae2bc52b3d2f29e13c6572553b30c4aa1b8a49fd70de6412654cb" +checksum = "f2b12d017a929603d80db1831cd3a24082f8137ce19c69e6447f54f5fc8d692f" dependencies = [ "anes", - "atty", "cast", "ciborium", - "clap 3.2.25", + "clap", "criterion-plot", + "is-terminal", "itertools", - "lazy_static", "num-traits", + "once_cell", "oorandom", "plotters", "rayon", @@ -1140,7 +1108,7 @@ dependencies = [ "crossterm_winapi", "libc", "mio", - "parking_lot", + "parking_lot 0.12.1", "signal-hook", "signal-hook-mio", "winapi", @@ -1210,7 +1178,7 @@ dependencies = [ "hashbrown 0.12.3", "lock_api", "once_cell", - "parking_lot_core", + "parking_lot_core 0.9.7", ] [[package]] @@ -1676,15 +1644,6 @@ version = "0.4.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "95505c38b4572b2d910cecb0281560f54b440a19336cbbcb27bf6ce6adc6f5a8" -[[package]] -name = "hermit-abi" -version = "0.1.19" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "62b467343b94ba476dcb2500d242dadbb39557df889310ac77c5d99100aaac33" -dependencies = [ - "libc", -] - [[package]] name = "hermit-abi" version = "0.2.6" @@ -1939,6 +1898,9 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7a5bbe824c507c5da5956355e86a746d82e0e1464f65d862cc5e71da70e94b2c" dependencies = [ "cfg-if", + "js-sys", + "wasm-bindgen", + "web-sys", ] [[package]] @@ -2267,16 +2229,6 @@ dependencies = [ "windows-sys 0.45.0", ] -[[package]] -name = "nu-ansi-term" -version = "0.46.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "77a8165726e8236064dbb45459242600304b42a5ea24ee2948e18e023bf7ba84" -dependencies = [ - "overload", - "winapi", -] - [[package]] name = "num-bigint" version = "0.4.3" @@ -2504,31 +2456,19 @@ dependencies = [ "winapi", ] -[[package]] -name = "os_str_bytes" -version = "6.5.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ceedf44fb00f2d1984b0bc98102627ce622e083e49a5bacdb3e514fa4238e267" - [[package]] name = "outref" version = "0.5.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4030760ffd992bef45b0ae3f10ce1aba99e33464c90d14dd7c039884963ddc7a" -[[package]] -name = "overload" -version = "0.1.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b15813163c1d831bf4a13c3610c05c0d03b39feb07f7e09fa234dac9b15aaf39" - [[package]] name = "pagectl" version = "0.1.0" dependencies = [ "anyhow", "bytes", - "clap 4.3.0", + "clap", "git-version", "pageserver", "postgres_ffi", @@ -2547,7 +2487,7 @@ dependencies = [ "byteorder", "bytes", "chrono", - "clap 4.3.0", + "clap", "close_fds", "const_format", "consumption_metrics", @@ -2629,6 +2569,17 @@ dependencies = [ "workspace_hack", ] +[[package]] +name = "parking_lot" +version = "0.11.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7d17b78036a60663b797adeaee46f5c9dfebb86948d1255007a1d6be0271ff99" +dependencies = [ + "instant", + "lock_api", + "parking_lot_core 0.8.6", +] + [[package]] name = "parking_lot" version = "0.12.1" @@ -2636,7 +2587,21 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3742b2c103b9f06bc9fff0a37ff4912935851bee6d36f3c02bcc755bcfec228f" dependencies = [ "lock_api", - "parking_lot_core", + "parking_lot_core 0.9.7", +] + +[[package]] +name = "parking_lot_core" +version = "0.8.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "60a2cfe6f0ad2bfc16aefa463b497d5c7a5ecd44a23efa72aa342d90177356dc" +dependencies = [ + "cfg-if", + "instant", + "libc", + "redox_syscall 0.2.16", + "smallvec", + "winapi", ] [[package]] @@ -2652,6 +2617,16 @@ dependencies = [ "windows-sys 0.45.0", ] +[[package]] +name = "pbkdf2" +version = "0.12.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f0ca0b5a68607598bf3bad68f32227a8164f6254833f84eafaac409cd6746c31" +dependencies = [ + "digest", + "hmac", +] + [[package]] name = "peeking_take_while" version = "0.1.2" @@ -2957,7 +2932,7 @@ dependencies = [ "lazy_static", "libc", "memchr", - "parking_lot", + "parking_lot 0.12.1", "procfs", "thiserror", ] @@ -3022,12 +2997,11 @@ version = "0.1.0" dependencies = [ "anyhow", "async-trait", - "atty", "base64 0.13.1", "bstr", "bytes", "chrono", - "clap 4.3.0", + "clap", "consumption_metrics", "futures", "git-version", @@ -3045,7 +3019,8 @@ dependencies = [ "native-tls", "once_cell", "opentelemetry", - "parking_lot", + "parking_lot 0.12.1", + "pbkdf2", "pin-project-lite", "postgres-native-tls", "postgres_backend", @@ -3056,6 +3031,7 @@ dependencies = [ "regex", "reqwest", "reqwest-middleware", + "reqwest-retry", "reqwest-tracing", "routerify", "rstest", @@ -3291,6 +3267,29 @@ dependencies = [ "thiserror", ] +[[package]] +name = "reqwest-retry" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "48d0fd6ef4c6d23790399fe15efc8d12cd9f3d4133958f9bd7801ee5cbaec6c4" +dependencies = [ + "anyhow", + "async-trait", + "chrono", + "futures", + "getrandom", + "http", + "hyper", + "parking_lot 0.11.2", + "reqwest", + "reqwest-middleware", + "retry-policies", + "task-local-extensions", + "tokio", + "tracing", + "wasm-timer", +] + [[package]] name = "reqwest-tracing" version = "0.4.4" @@ -3309,6 +3308,17 @@ dependencies = [ "tracing-opentelemetry", ] +[[package]] +name = "retry-policies" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e09bbcb5003282bcb688f0bae741b278e9c7e8f378f561522c9806c58e075d9b" +dependencies = [ + "anyhow", + "chrono", + "rand", +] + [[package]] name = "ring" version = "0.16.20" @@ -3507,7 +3517,7 @@ dependencies = [ "byteorder", "bytes", "chrono", - "clap 4.3.0", + "clap", "const_format", "crc32c", "fs2", @@ -3518,7 +3528,7 @@ dependencies = [ "hyper", "metrics", "once_cell", - "parking_lot", + "parking_lot 0.12.1", "postgres", "postgres-protocol", "postgres_backend", @@ -3937,7 +3947,7 @@ dependencies = [ "anyhow", "async-stream", "bytes", - "clap 4.3.0", + "clap", "const_format", "futures", "futures-core", @@ -3947,7 +3957,7 @@ dependencies = [ "hyper", "metrics", "once_cell", - "parking_lot", + "parking_lot 0.12.1", "prost", "tokio", "tokio-stream", @@ -4118,12 +4128,6 @@ dependencies = [ "syn 1.0.109", ] -[[package]] -name = "textwrap" -version = "0.16.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "222a222a5bfe1bba4a77b45ec488a741b3cb8872e5e499451fd7d0129c9c7c3d" - [[package]] name = "thiserror" version = "1.0.40" @@ -4281,7 +4285,7 @@ dependencies = [ "futures-channel", "futures-util", "log", - "parking_lot", + "parking_lot 0.12.1", "percent-encoding", "phf", "pin-project-lite", @@ -4539,7 +4543,7 @@ name = "trace" version = "0.1.0" dependencies = [ "anyhow", - "clap 4.3.0", + "clap", "pageserver_api", "utils", "workspace_hack", @@ -4641,7 +4645,6 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "30a651bc37f915e81f087d86e62a18eec5f79550c7faff886f7090b4ea757c77" dependencies = [ "matchers", - "nu-ansi-term", "once_cell", "regex", "serde", @@ -4810,7 +4813,6 @@ version = "0.1.0" dependencies = [ "anyhow", "async-trait", - "atty", "bincode", "byteorder", "bytes", @@ -4887,7 +4889,7 @@ name = "wal_craft" version = "0.1.0" dependencies = [ "anyhow", - "clap 4.3.0", + "clap", "env_logger", "log", "once_cell", @@ -4991,6 +4993,21 @@ version = "0.2.86" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ed9d5b4305409d1fc9482fee2d7f9bcbf24b3972bf59817ef757e23982242a93" +[[package]] +name = "wasm-timer" +version = "0.2.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "be0ecb0db480561e9a7642b5d3e4187c128914e58aa84330b9493e3eb68c5e7f" +dependencies = [ + "futures", + "js-sys", + "parking_lot 0.11.2", + "pin-utils", + "wasm-bindgen", + "wasm-bindgen-futures", + "web-sys", +] + [[package]] name = "web-sys" version = "0.3.63" @@ -5252,7 +5269,7 @@ dependencies = [ "anyhow", "bytes", "chrono", - "clap 4.3.0", + "clap", "clap_builder", "crossbeam-utils", "either", diff --git a/Cargo.toml b/Cargo.toml index 551a9dc783..f36e8f6569 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -34,7 +34,6 @@ license = "Apache-2.0" anyhow = { version = "1.0", features = ["backtrace"] } async-stream = "0.3" async-trait = "0.1" -atty = "0.2.14" aws-config = { version = "0.55", default-features = false, features=["rustls"] } aws-sdk-s3 = "0.27" aws-smithy-http = "0.55" @@ -87,6 +86,7 @@ opentelemetry = "0.18.0" opentelemetry-otlp = { version = "0.11.0", default_features=false, features = ["http-proto", "trace", "http", "reqwest-client"] } opentelemetry-semantic-conventions = "0.10.0" parking_lot = "0.12" +pbkdf2 = "0.12.1" pin-project-lite = "0.2" prometheus = {version = "0.13", default_features=false, features = ["process"]} # removes protobuf dependency prost = "0.11" @@ -95,6 +95,7 @@ regex = "1.4" reqwest = { version = "0.11", default-features = false, features = ["rustls-tls"] } reqwest-tracing = { version = "0.4.0", features = ["opentelemetry_0_18"] } reqwest-middleware = "0.2.0" +reqwest-retry = "0.2.2" routerify = "3" rpds = "0.13" rustls = "0.20" @@ -128,7 +129,7 @@ tonic = {version = "0.9", features = ["tls", "tls-roots"]} tracing = "0.1" tracing-error = "0.2.0" tracing-opentelemetry = "0.18.0" -tracing-subscriber = { version = "0.3", features = ["env-filter"] } +tracing-subscriber = { version = "0.3", default_features = false, features = ["smallvec", "fmt", "tracing-log", "std", "env-filter"] } url = "2.2" uuid = { version = "1.2", features = ["v4", "serde"] } walkdir = "2.3.2" @@ -170,7 +171,7 @@ utils = { version = "0.1", path = "./libs/utils/" } workspace_hack = { version = "0.1", path = "./workspace_hack/" } ## Build dependencies -criterion = "0.4" +criterion = "0.5.1" rcgen = "0.10" rstest = "0.17" tempfile = "3.4" diff --git a/Dockerfile.compute-node b/Dockerfile.compute-node index d69b2cf174..310e4c32a3 100644 --- a/Dockerfile.compute-node +++ b/Dockerfile.compute-node @@ -189,8 +189,8 @@ RUN wget https://github.com/df7cb/postgresql-unit/archive/refs/tags/7.7.tar.gz - FROM build-deps AS vector-pg-build COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ -RUN wget https://github.com/pgvector/pgvector/archive/refs/tags/v0.4.0.tar.gz -O pgvector.tar.gz && \ - echo "b76cf84ddad452cc880a6c8c661d137ddd8679c000a16332f4f03ecf6e10bcc8 pgvector.tar.gz" | sha256sum --check && \ +RUN wget https://github.com/pgvector/pgvector/archive/refs/tags/v0.4.4.tar.gz -O pgvector.tar.gz && \ + echo "1cb70a63f8928e396474796c22a20be9f7285a8a013009deb8152445b61b72e6 pgvector.tar.gz" | sha256sum --check && \ mkdir pgvector-src && cd pgvector-src && tar xvzf ../pgvector.tar.gz --strip-components=1 -C . && \ make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config && \ make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \ @@ -515,6 +515,26 @@ RUN wget https://github.com/ChenHuajun/pg_roaringbitmap/archive/refs/tags/v0.5.4 make -j $(getconf _NPROCESSORS_ONLN) install && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/roaringbitmap.control +######################################################################################### +# +# Layer "pg-anon-pg-build" +# compile anon extension +# +######################################################################################### +FROM build-deps AS pg-anon-pg-build +COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ + +# Kaniko doesn't allow to do `${from#/usr/local/pgsql/}`, so we use `${from:17}` instead +ENV PATH "/usr/local/pgsql/bin/:$PATH" +RUN wget https://gitlab.com/dalibo/postgresql_anonymizer/-/archive/1.1.0/postgresql_anonymizer-1.1.0.tar.gz -O pg_anon.tar.gz && \ + echo "08b09d2ff9b962f96c60db7e6f8e79cf7253eb8772516998fc35ece08633d3ad pg_anon.tar.gz" | sha256sum --check && \ + mkdir pg_anon-src && cd pg_anon-src && tar xvzf ../pg_anon.tar.gz --strip-components=1 -C . && \ + find /usr/local/pgsql -type f | sort > /before.txt && \ + make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \ + echo 'trusted = true' >> /usr/local/pgsql/share/extension/anon.control && \ + find /usr/local/pgsql -type f | sort > /after.txt && \ + /bin/bash -c 'for from in $(comm -13 /before.txt /after.txt); do to=/extensions/anon/${from:17} && mkdir -p $(dirname ${to}) && cp -a ${from} ${to}; done' + ######################################################################################### # # Layer "rust extensions" @@ -623,6 +643,7 @@ RUN wget https://github.com/pksunkara/pgx_ulid/archive/refs/tags/v0.1.0.tar.gz - # ######################################################################################### FROM build-deps AS neon-pg-ext-build +# Public extensions COPY --from=postgis-build /usr/local/pgsql/ /usr/local/pgsql/ COPY --from=postgis-build /sfcgal/* / COPY --from=plv8-build /usr/local/pgsql/ /usr/local/pgsql/ @@ -698,6 +719,22 @@ RUN rm -r /usr/local/pgsql/include # if they were to be used by other libraries. RUN rm /usr/local/pgsql/lib/lib*.a +######################################################################################### +# +# Extenstion only +# +######################################################################################### +FROM scratch AS postgres-extensions +# After the transition this layer will include all extensitons. +# As for now, it's only for new custom ones +# +# # Default extensions +# COPY --from=postgres-cleanup-layer /usr/local/pgsql/share/extension /usr/local/pgsql/share/extension +# COPY --from=postgres-cleanup-layer /usr/local/pgsql/lib /usr/local/pgsql/lib +# Custom extensions +COPY --from=pg-anon-pg-build /extensions/anon/lib/ /extensions/anon/lib +COPY --from=pg-anon-pg-build /extensions/anon/share/extension /extensions/anon/share/extension + ######################################################################################### # # Final layer diff --git a/README.md b/README.md index efa714e5be..1c2452f435 100644 --- a/README.md +++ b/README.md @@ -132,13 +132,13 @@ Python (3.9 or higher), and install python3 packages using `./scripts/pysync` (r # Create repository in .neon with proper paths to binaries and data # Later that would be responsibility of a package install script > cargo neon init -Starting pageserver at '127.0.0.1:64000' in '.neon'. +Initializing pageserver node 1 at '127.0.0.1:64000' in ".neon" # start pageserver, safekeeper, and broker for their intercommunication > cargo neon start -Starting neon broker at 127.0.0.1:50051 +Starting neon broker at 127.0.0.1:50051. storage_broker started, pid: 2918372 -Starting pageserver at '127.0.0.1:64000' in '.neon'. +Starting pageserver node 1 at '127.0.0.1:64000' in ".neon". pageserver started, pid: 2918386 Starting safekeeper at '127.0.0.1:5454' in '.neon/safekeepers/sk1'. safekeeper 1 started, pid: 2918437 @@ -152,8 +152,7 @@ Setting tenant 9ef87a5bf0d92544f6fafeeb3239695c as a default one # start postgres compute node > cargo neon endpoint start main Starting new endpoint main (PostgreSQL v14) on timeline de200bd42b49cc1814412c7e592dd6e9 ... -Extracting base backup to create postgres instance: path=.neon/pgdatadirs/tenants/9ef87a5bf0d92544f6fafeeb3239695c/main port=55432 -Starting postgres at 'host=127.0.0.1 port=55432 user=cloud_admin dbname=postgres' +Starting postgres at 'postgresql://cloud_admin@127.0.0.1:55432/postgres' # check list of running postgres instances > cargo neon endpoint list @@ -189,18 +188,17 @@ Created timeline 'b3b863fa45fa9e57e615f9f2d944e601' at Lsn 0/16F9A00 for tenant: # start postgres on that branch > cargo neon endpoint start migration_check --branch-name migration_check Starting new endpoint migration_check (PostgreSQL v14) on timeline b3b863fa45fa9e57e615f9f2d944e601 ... -Extracting base backup to create postgres instance: path=.neon/pgdatadirs/tenants/9ef87a5bf0d92544f6fafeeb3239695c/migration_check port=55433 -Starting postgres at 'host=127.0.0.1 port=55433 user=cloud_admin dbname=postgres' +Starting postgres at 'postgresql://cloud_admin@127.0.0.1:55434/postgres' # check the new list of running postgres instances > cargo neon endpoint list ENDPOINT ADDRESS TIMELINE BRANCH NAME LSN STATUS main 127.0.0.1:55432 de200bd42b49cc1814412c7e592dd6e9 main 0/16F9A38 running - migration_check 127.0.0.1:55433 b3b863fa45fa9e57e615f9f2d944e601 migration_check 0/16F9A70 running + migration_check 127.0.0.1:55434 b3b863fa45fa9e57e615f9f2d944e601 migration_check 0/16F9A70 running # this new postgres instance will have all the data from 'main' postgres, # but all modifications would not affect data in original postgres -> psql -p55433 -h 127.0.0.1 -U cloud_admin postgres +> psql -p55434 -h 127.0.0.1 -U cloud_admin postgres postgres=# select * from t; key | value -----+------- diff --git a/compute_tools/src/bin/compute_ctl.rs b/compute_tools/src/bin/compute_ctl.rs index 90b39e9dd9..68f6bf3844 100644 --- a/compute_tools/src/bin/compute_ctl.rs +++ b/compute_tools/src/bin/compute_ctl.rs @@ -256,6 +256,16 @@ fn main() -> Result<()> { exit_code = ecode.code() } + // Maybe sync safekeepers again, to speed up next startup + let compute_state = compute.state.lock().unwrap().clone(); + let pspec = compute_state.pspec.as_ref().expect("spec must be set"); + if matches!(pspec.spec.mode, compute_api::spec::ComputeMode::Primary) { + info!("syncing safekeepers on shutdown"); + let storage_auth_token = pspec.storage_auth_token.clone(); + let lsn = compute.sync_safekeepers(storage_auth_token)?; + info!("synced safekeepers at lsn {lsn}"); + } + if let Err(err) = compute.check_for_core_dumps() { error!("error while checking for core dumps: {err:?}"); } diff --git a/compute_tools/src/compute.rs b/compute_tools/src/compute.rs index 6b549e198b..aec4e49725 100644 --- a/compute_tools/src/compute.rs +++ b/compute_tools/src/compute.rs @@ -235,7 +235,7 @@ impl ComputeNode { // Get basebackup from the libpq connection to pageserver using `connstr` and // unarchive it to `pgdata` directory overriding all its previous content. - #[instrument(skip(self, compute_state))] + #[instrument(skip_all, fields(%lsn))] fn get_basebackup(&self, compute_state: &ComputeState, lsn: Lsn) -> Result<()> { let spec = compute_state.pspec.as_ref().expect("spec must be set"); let start_time = Utc::now(); @@ -277,8 +277,8 @@ impl ComputeNode { // Run `postgres` in a special mode with `--sync-safekeepers` argument // and return the reported LSN back to the caller. - #[instrument(skip(self, storage_auth_token))] - fn sync_safekeepers(&self, storage_auth_token: Option) -> Result { + #[instrument(skip_all)] + pub fn sync_safekeepers(&self, storage_auth_token: Option) -> Result { let start_time = Utc::now(); let sync_handle = Command::new(&self.pgbin) @@ -322,7 +322,7 @@ impl ComputeNode { /// Do all the preparations like PGDATA directory creation, configuration, /// safekeepers sync, basebackup, etc. - #[instrument(skip(self, compute_state))] + #[instrument(skip_all)] pub fn prepare_pgdata(&self, compute_state: &ComputeState) -> Result<()> { let pspec = compute_state.pspec.as_ref().expect("spec must be set"); let spec = &pspec.spec; @@ -380,7 +380,7 @@ impl ComputeNode { /// Start Postgres as a child process and manage DBs/roles. /// After that this will hang waiting on the postmaster process to exit. - #[instrument(skip(self))] + #[instrument(skip_all)] pub fn start_postgres( &self, storage_auth_token: Option, @@ -404,7 +404,7 @@ impl ComputeNode { } /// Do initial configuration of the already started Postgres. - #[instrument(skip(self, compute_state))] + #[instrument(skip_all)] pub fn apply_config(&self, compute_state: &ComputeState) -> Result<()> { // If connection fails, // it may be the old node with `zenith_admin` superuser. @@ -458,7 +458,7 @@ impl ComputeNode { // We could've wrapped this around `pg_ctl reload`, but right now we don't use // `pg_ctl` for start / stop, so this just seems much easier to do as we already // have opened connection to Postgres and superuser access. - #[instrument(skip(self, client))] + #[instrument(skip_all)] fn pg_reload_conf(&self, client: &mut Client) -> Result<()> { client.simple_query("SELECT pg_reload_conf()")?; Ok(()) @@ -466,7 +466,7 @@ impl ComputeNode { /// Similar to `apply_config()`, but does a bit different sequence of operations, /// as it's used to reconfigure a previously started and configured Postgres node. - #[instrument(skip(self))] + #[instrument(skip_all)] pub fn reconfigure(&self) -> Result<()> { let spec = self.state.lock().unwrap().pspec.clone().unwrap().spec; @@ -501,7 +501,7 @@ impl ComputeNode { Ok(()) } - #[instrument(skip(self))] + #[instrument(skip_all)] pub fn start_compute(&self) -> Result { let compute_state = self.state.lock().unwrap().clone(); let pspec = compute_state.pspec.as_ref().expect("spec must be set"); @@ -516,9 +516,9 @@ impl ComputeNode { self.prepare_pgdata(&compute_state)?; let start_time = Utc::now(); - let pg = self.start_postgres(pspec.storage_auth_token.clone())?; + let config_time = Utc::now(); if pspec.spec.mode == ComputeMode::Primary && !pspec.spec.skip_pg_catalog_updates { self.apply_config(&compute_state)?; } @@ -526,11 +526,16 @@ impl ComputeNode { let startup_end_time = Utc::now(); { let mut state = self.state.lock().unwrap(); - state.metrics.config_ms = startup_end_time + state.metrics.start_postgres_ms = config_time .signed_duration_since(start_time) .to_std() .unwrap() .as_millis() as u64; + state.metrics.config_ms = startup_end_time + .signed_duration_since(config_time) + .to_std() + .unwrap() + .as_millis() as u64; state.metrics.total_startup_ms = startup_end_time .signed_duration_since(compute_state.start_time) .to_std() diff --git a/compute_tools/src/configurator.rs b/compute_tools/src/configurator.rs index a07fd0b8cd..13550e0176 100644 --- a/compute_tools/src/configurator.rs +++ b/compute_tools/src/configurator.rs @@ -8,7 +8,7 @@ use compute_api::responses::ComputeStatus; use crate::compute::ComputeNode; -#[instrument(skip(compute))] +#[instrument(skip_all)] fn configurator_main_loop(compute: &Arc) { info!("waiting for reconfiguration requests"); loop { diff --git a/compute_tools/src/logger.rs b/compute_tools/src/logger.rs index f6fc882968..3ae68de8ef 100644 --- a/compute_tools/src/logger.rs +++ b/compute_tools/src/logger.rs @@ -18,6 +18,7 @@ pub fn init_tracing_and_logging(default_log_level: &str) -> anyhow::Result<()> { .unwrap_or_else(|_| tracing_subscriber::EnvFilter::new(default_log_level)); let fmt_layer = tracing_subscriber::fmt::layer() + .with_ansi(false) .with_target(false) .with_writer(std::io::stderr); diff --git a/compute_tools/src/pg_helpers.rs b/compute_tools/src/pg_helpers.rs index b76ed1fd85..6a78bffd1b 100644 --- a/compute_tools/src/pg_helpers.rs +++ b/compute_tools/src/pg_helpers.rs @@ -215,7 +215,7 @@ pub fn get_existing_dbs(client: &mut Client) -> Result> { /// Wait for Postgres to become ready to accept connections. It's ready to /// accept connections when the state-field in `pgdata/postmaster.pid` says /// 'ready'. -#[instrument(skip(pg))] +#[instrument(skip_all, fields(pgdata = %pgdata.display()))] pub fn wait_for_postgres(pg: &mut Child, pgdata: &Path) -> Result<()> { let pid_path = pgdata.join("postmaster.pid"); diff --git a/control_plane/src/background_process.rs b/control_plane/src/background_process.rs index 1f3f8f45ea..00af1a1d53 100644 --- a/control_plane/src/background_process.rs +++ b/control_plane/src/background_process.rs @@ -180,6 +180,11 @@ pub fn stop_process(immediate: bool, process_name: &str, pid_file: &Path) -> any } // Wait until process is gone + wait_until_stopped(process_name, pid)?; + Ok(()) +} + +pub fn wait_until_stopped(process_name: &str, pid: Pid) -> anyhow::Result<()> { for retries in 0..RETRIES { match process_has_stopped(pid) { Ok(true) => { diff --git a/control_plane/src/bin/neon_local.rs b/control_plane/src/bin/neon_local.rs index 52af936d7b..8995a18564 100644 --- a/control_plane/src/bin/neon_local.rs +++ b/control_plane/src/bin/neon_local.rs @@ -308,7 +308,8 @@ fn handle_init(init_match: &ArgMatches) -> anyhow::Result { let mut env = LocalEnv::parse_config(&toml_file).context("Failed to create neon configuration")?; - env.init(pg_version) + let force = init_match.get_flag("force"); + env.init(pg_version, force) .context("Failed to initialize neon repository")?; // Initialize pageserver, create initial tenant and timeline. @@ -1013,6 +1014,13 @@ fn cli() -> Command { .help("If set, the node will be a hot replica on the specified timeline") .required(false); + let force_arg = Arg::new("force") + .value_parser(value_parser!(bool)) + .long("force") + .action(ArgAction::SetTrue) + .help("Force initialization even if the repository is not empty") + .required(false); + Command::new("Neon CLI") .arg_required_else_help(true) .version(GIT_VERSION) @@ -1028,6 +1036,7 @@ fn cli() -> Command { .value_name("config"), ) .arg(pg_version_arg.clone()) + .arg(force_arg) ) .subcommand( Command::new("timeline") diff --git a/control_plane/src/endpoint.rs b/control_plane/src/endpoint.rs index 52683ff1c3..ab921d096f 100644 --- a/control_plane/src/endpoint.rs +++ b/control_plane/src/endpoint.rs @@ -405,6 +405,16 @@ impl Endpoint { String::from_utf8_lossy(&pg_ctl.stderr), ); } + + // Also wait for the compute_ctl process to die. It might have some cleanup + // work to do after postgres stops, like syncing safekeepers, etc. + // + // TODO use background_process::stop_process instead + let pidfile_path = self.endpoint_path().join("compute_ctl.pid"); + let pid: u32 = std::fs::read_to_string(pidfile_path)?.parse()?; + let pid = nix::unistd::Pid::from_raw(pid as i32); + crate::background_process::wait_until_stopped("compute_ctl", pid)?; + Ok(()) } @@ -507,7 +517,13 @@ impl Endpoint { .stdin(std::process::Stdio::null()) .stderr(logfile.try_clone()?) .stdout(logfile); - let _child = cmd.spawn()?; + let child = cmd.spawn()?; + + // Write down the pid so we can wait for it when we want to stop + // TODO use background_process::start_process instead + let pid = child.id(); + let pidfile_path = self.endpoint_path().join("compute_ctl.pid"); + std::fs::write(pidfile_path, pid.to_string())?; // Wait for it to start let mut attempt = 0; diff --git a/control_plane/src/local_env.rs b/control_plane/src/local_env.rs index df70cb3139..208eb9e7ec 100644 --- a/control_plane/src/local_env.rs +++ b/control_plane/src/local_env.rs @@ -364,7 +364,7 @@ impl LocalEnv { // // Initialize a new Neon repository // - pub fn init(&mut self, pg_version: u32) -> anyhow::Result<()> { + pub fn init(&mut self, pg_version: u32, force: bool) -> anyhow::Result<()> { // check if config already exists let base_path = &self.base_data_dir; ensure!( @@ -372,11 +372,29 @@ impl LocalEnv { "repository base path is missing" ); - ensure!( - !base_path.exists(), - "directory '{}' already exists. Perhaps already initialized?", - base_path.display() - ); + if base_path.exists() { + if force { + println!("removing all contents of '{}'", base_path.display()); + // instead of directly calling `remove_dir_all`, we keep the original dir but removing + // all contents inside. This helps if the developer symbol links another directory (i.e., + // S3 local SSD) to the `.neon` base directory. + for entry in std::fs::read_dir(base_path)? { + let entry = entry?; + let path = entry.path(); + if path.is_dir() { + fs::remove_dir_all(&path)?; + } else { + fs::remove_file(&path)?; + } + } + } else { + bail!( + "directory '{}' already exists. Perhaps already initialized? (Hint: use --force to remove all contents)", + base_path.display() + ); + } + } + if !self.pg_bin_dir(pg_version)?.join("postgres").exists() { bail!( "Can't find postgres binary at {}", @@ -392,7 +410,9 @@ impl LocalEnv { } } - fs::create_dir(base_path)?; + if !base_path.exists() { + fs::create_dir(base_path)?; + } // Generate keypair for JWT. // diff --git a/libs/compute_api/src/responses.rs b/libs/compute_api/src/responses.rs index ce73dda08a..80e5341216 100644 --- a/libs/compute_api/src/responses.rs +++ b/libs/compute_api/src/responses.rs @@ -71,6 +71,7 @@ pub struct ComputeMetrics { pub wait_for_spec_ms: u64, pub sync_safekeepers_ms: u64, pub basebackup_ms: u64, + pub start_postgres_ms: u64, pub config_ms: u64, pub total_startup_ms: u64, } diff --git a/libs/remote_storage/tests/test_real_s3.rs b/libs/remote_storage/tests/test_real_s3.rs index 6fe65a0362..0917bab39c 100644 --- a/libs/remote_storage/tests/test_real_s3.rs +++ b/libs/remote_storage/tests/test_real_s3.rs @@ -173,10 +173,15 @@ async fn s3_delete_objects_works(ctx: &mut MaybeEnabledS3) -> anyhow::Result<()> let path2 = RemotePath::new(&PathBuf::from(format!("{}/path2", ctx.base_prefix,))) .with_context(|| "RemotePath conversion")?; + let path3 = RemotePath::new(&PathBuf::from(format!("{}/path3", ctx.base_prefix,))) + .with_context(|| "RemotePath conversion")?; + let data1 = "remote blob data1".as_bytes(); let data1_len = data1.len(); let data2 = "remote blob data2".as_bytes(); let data2_len = data2.len(); + let data3 = "remote blob data3".as_bytes(); + let data3_len = data3.len(); ctx.client .upload(std::io::Cursor::new(data1), data1_len, &path1, None) .await?; @@ -185,8 +190,18 @@ async fn s3_delete_objects_works(ctx: &mut MaybeEnabledS3) -> anyhow::Result<()> .upload(std::io::Cursor::new(data2), data2_len, &path2, None) .await?; + ctx.client + .upload(std::io::Cursor::new(data3), data3_len, &path3, None) + .await?; + ctx.client.delete_objects(&[path1, path2]).await?; + let prefixes = ctx.client.list_prefixes(None).await?; + + assert_eq!(prefixes.len(), 1); + + ctx.client.delete_objects(&[path3]).await?; + Ok(()) } diff --git a/libs/utils/Cargo.toml b/libs/utils/Cargo.toml index 8239ffff57..87b0082356 100644 --- a/libs/utils/Cargo.toml +++ b/libs/utils/Cargo.toml @@ -5,7 +5,6 @@ edition.workspace = true license.workspace = true [dependencies] -atty.workspace = true sentry.workspace = true async-trait.workspace = true anyhow.workspace = true diff --git a/libs/utils/src/logging.rs b/libs/utils/src/logging.rs index 2b8c852d86..7b32bf2841 100644 --- a/libs/utils/src/logging.rs +++ b/libs/utils/src/logging.rs @@ -84,7 +84,7 @@ pub fn init( let r = r.with({ let log_layer = tracing_subscriber::fmt::layer() .with_target(false) - .with_ansi(atty::is(atty::Stream::Stdout)) + .with_ansi(false) .with_writer(std::io::stdout); let log_layer = match log_format { LogFormat::Json => log_layer.json().boxed(), diff --git a/pageserver/benches/bench_layer_map.rs b/pageserver/benches/bench_layer_map.rs index 45dc9fad4a..03bb7a5bfd 100644 --- a/pageserver/benches/bench_layer_map.rs +++ b/pageserver/benches/bench_layer_map.rs @@ -1,22 +1,23 @@ use pageserver::keyspace::{KeyPartitioning, KeySpace}; use pageserver::repository::Key; use pageserver::tenant::layer_map::LayerMap; -use pageserver::tenant::storage_layer::{Layer, LayerDescriptor, LayerFileName}; +use pageserver::tenant::storage_layer::{tests::LayerDescriptor, Layer, LayerFileName}; +use pageserver::tenant::storage_layer::{PersistentLayer, PersistentLayerDesc}; use rand::prelude::{SeedableRng, SliceRandom, StdRng}; use std::cmp::{max, min}; use std::fs::File; use std::io::{BufRead, BufReader}; use std::path::PathBuf; use std::str::FromStr; -use std::sync::Arc; use std::time::Instant; +use utils::id::{TenantId, TimelineId}; use utils::lsn::Lsn; use criterion::{black_box, criterion_group, criterion_main, Criterion}; -fn build_layer_map(filename_dump: PathBuf) -> LayerMap { - let mut layer_map = LayerMap::::default(); +fn build_layer_map(filename_dump: PathBuf) -> LayerMap { + let mut layer_map = LayerMap::default(); let mut min_lsn = Lsn(u64::MAX); let mut max_lsn = Lsn(0); @@ -33,7 +34,7 @@ fn build_layer_map(filename_dump: PathBuf) -> LayerMap { min_lsn = min(min_lsn, lsn_range.start); max_lsn = max(max_lsn, Lsn(lsn_range.end.0 - 1)); - updates.insert_historic(layer.get_persistent_layer_desc(), Arc::new(layer)); + updates.insert_historic(layer.layer_desc().clone()); } println!("min: {min_lsn}, max: {max_lsn}"); @@ -43,7 +44,7 @@ fn build_layer_map(filename_dump: PathBuf) -> LayerMap { } /// Construct a layer map query pattern for benchmarks -fn uniform_query_pattern(layer_map: &LayerMap) -> Vec<(Key, Lsn)> { +fn uniform_query_pattern(layer_map: &LayerMap) -> Vec<(Key, Lsn)> { // For each image layer we query one of the pages contained, at LSN right // before the image layer was created. This gives us a somewhat uniform // coverage of both the lsn and key space because image layers have @@ -69,7 +70,7 @@ fn uniform_query_pattern(layer_map: &LayerMap) -> Vec<(Key, Lsn // Construct a partitioning for testing get_difficulty map when we // don't have an exact result of `collect_keyspace` to work with. -fn uniform_key_partitioning(layer_map: &LayerMap, _lsn: Lsn) -> KeyPartitioning { +fn uniform_key_partitioning(layer_map: &LayerMap, _lsn: Lsn) -> KeyPartitioning { let mut parts = Vec::new(); // We add a partition boundary at the start of each image layer, @@ -209,13 +210,15 @@ fn bench_sequential(c: &mut Criterion) { for i in 0..100_000 { let i32 = (i as u32) % 100; let zero = Key::from_hex("000000000000000000000000000000000000").unwrap(); - let layer = LayerDescriptor { - key: zero.add(10 * i32)..zero.add(10 * i32 + 1), - lsn: Lsn(i)..Lsn(i + 1), - is_incremental: false, - short_id: format!("Layer {}", i), - }; - updates.insert_historic(layer.get_persistent_layer_desc(), Arc::new(layer)); + let layer = LayerDescriptor::from(PersistentLayerDesc::new_img( + TenantId::generate(), + TimelineId::generate(), + zero.add(10 * i32)..zero.add(10 * i32 + 1), + Lsn(i), + false, + 0, + )); + updates.insert_historic(layer.layer_desc().clone()); } updates.flush(); println!("Finished layer map init in {:?}", now.elapsed()); diff --git a/pageserver/src/consumption_metrics.rs b/pageserver/src/consumption_metrics.rs index ca7b9650e8..f7592afc12 100644 --- a/pageserver/src/consumption_metrics.rs +++ b/pageserver/src/consumption_metrics.rs @@ -24,6 +24,8 @@ const RESIDENT_SIZE: &str = "resident_size"; const REMOTE_STORAGE_SIZE: &str = "remote_storage_size"; const TIMELINE_LOGICAL_SIZE: &str = "timeline_logical_size"; +const DEFAULT_HTTP_REPORTING_TIMEOUT: Duration = Duration::from_secs(60); + #[serde_as] #[derive(Serialize, Debug)] struct Ids { @@ -73,7 +75,10 @@ pub async fn collect_metrics( ); // define client here to reuse it for all requests - let client = reqwest::Client::new(); + let client = reqwest::ClientBuilder::new() + .timeout(DEFAULT_HTTP_REPORTING_TIMEOUT) + .build() + .expect("Failed to create http client with timeout"); let mut cached_metrics: HashMap = HashMap::new(); let mut prev_iteration_time: std::time::Instant = std::time::Instant::now(); @@ -83,7 +88,7 @@ pub async fn collect_metrics( info!("collect_metrics received cancellation request"); return Ok(()); }, - _ = ticker.tick() => { + tick_at = ticker.tick() => { // send cached metrics every cached_metric_collection_interval let send_cached = prev_iteration_time.elapsed() >= cached_metric_collection_interval; @@ -93,6 +98,12 @@ pub async fn collect_metrics( } collect_metrics_iteration(&client, &mut cached_metrics, metric_collection_endpoint, node_id, &ctx, send_cached).await; + + crate::tenant::tasks::warn_when_period_overrun( + tick_at.elapsed(), + metric_collection_interval, + "consumption_metrics_collect_metrics", + ); } } } @@ -273,31 +284,42 @@ pub async fn collect_metrics_iteration( }) .expect("PageserverConsumptionMetric should not fail serialization"); - let res = client - .post(metric_collection_endpoint.clone()) - .json(&chunk_json) - .send() - .await; + const MAX_RETRIES: u32 = 3; - match res { - Ok(res) => { - if res.status().is_success() { - // update cached metrics after they were sent successfully - for (curr_key, curr_val) in chunk.iter() { - cached_metrics.insert(curr_key.clone(), *curr_val); - } - } else { - error!("metrics endpoint refused the sent metrics: {:?}", res); - for metric in chunk_to_send.iter() { - // Report if the metric value is suspiciously large - if metric.value > (1u64 << 40) { + for attempt in 0..MAX_RETRIES { + let res = client + .post(metric_collection_endpoint.clone()) + .json(&chunk_json) + .send() + .await; + + match res { + Ok(res) => { + if res.status().is_success() { + // update cached metrics after they were sent successfully + for (curr_key, curr_val) in chunk.iter() { + cached_metrics.insert(curr_key.clone(), *curr_val); + } + } else { + error!("metrics endpoint refused the sent metrics: {:?}", res); + for metric in chunk_to_send + .iter() + .filter(|metric| metric.value > (1u64 << 40)) + { + // Report if the metric value is suspiciously large error!("potentially abnormal metric value: {:?}", metric); } } + break; + } + Err(err) if err.is_timeout() => { + error!(attempt, "timeout sending metrics, retrying immediately"); + continue; + } + Err(err) => { + error!(attempt, ?err, "failed to send metrics"); + break; } - } - Err(err) => { - error!("failed to send metrics: {:?}", err); } } } @@ -317,7 +339,7 @@ pub async fn calculate_synthetic_size_worker( _ = task_mgr::shutdown_watcher() => { return Ok(()); }, - _ = ticker.tick() => { + tick_at = ticker.tick() => { let tenants = match mgr::list_tenants().await { Ok(tenants) => tenants, @@ -343,6 +365,12 @@ pub async fn calculate_synthetic_size_worker( } } + + crate::tenant::tasks::warn_when_period_overrun( + tick_at.elapsed(), + synthetic_size_calculation_interval, + "consumption_metrics_synthetic_size_worker", + ); } } } diff --git a/pageserver/src/http/openapi_spec.yml b/pageserver/src/http/openapi_spec.yml index 50614653be..290492203a 100644 --- a/pageserver/src/http/openapi_spec.yml +++ b/pageserver/src/http/openapi_spec.yml @@ -186,10 +186,8 @@ paths: schema: $ref: "#/components/schemas/Error" delete: - description: "Attempts to delete specified timeline. On 500 errors should be retried" + description: "Attempts to delete specified timeline. 500 and 409 errors should be retried" responses: - "200": - description: Ok "400": description: Error when no tenant id found in path or no timeline id content: @@ -214,6 +212,12 @@ paths: application/json: schema: $ref: "#/components/schemas/NotFoundError" + "409": + description: Deletion is already in progress, continue polling + content: + application/json: + schema: + $ref: "#/components/schemas/ConflictError" "412": description: Tenant is missing, or timeline has children content: @@ -718,6 +722,12 @@ paths: application/json: schema: $ref: "#/components/schemas/ForbiddenError" + "406": + description: Permanently unsatisfiable request, don't retry. + content: + application/json: + schema: + $ref: "#/components/schemas/Error" "409": description: Timeline already exists, creation skipped content: diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index 8d3bb5552b..58dcbb2aac 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -23,7 +23,6 @@ use super::models::{ TimelineCreateRequest, TimelineGcRequest, TimelineInfo, }; use crate::context::{DownloadBehavior, RequestContext}; -use crate::disk_usage_eviction_task; use crate::metrics::{StorageTimeOperation, STORAGE_TIME_GLOBAL}; use crate::pgdatadir_mapping::LsnForTimestamp; use crate::task_mgr::TaskKind; @@ -35,6 +34,7 @@ use crate::tenant::size::ModelInputs; use crate::tenant::storage_layer::LayerAccessStatsReset; use crate::tenant::{LogicalSizeCalculationCause, PageReconstructError, Timeline}; use crate::{config::PageServerConf, tenant::mgr}; +use crate::{disk_usage_eviction_task, tenant}; use utils::{ auth::JwtAuth, http::{ @@ -187,6 +187,7 @@ impl From for ApiError { format!("Cannot delete timeline which has child timelines: {children:?}") .into_boxed_str(), ), + a @ AlreadyInProgress => ApiError::Conflict(a.to_string()), Other(e) => ApiError::InternalServerError(e), } } @@ -327,15 +328,22 @@ async fn timeline_create_handler( &ctx, ) .await { - Ok(Some(new_timeline)) => { + Ok(new_timeline) => { // Created. Construct a TimelineInfo for it. let timeline_info = build_timeline_info_common(&new_timeline, &ctx) .await .map_err(ApiError::InternalServerError)?; json_response(StatusCode::CREATED, timeline_info) } - Ok(None) => json_response(StatusCode::CONFLICT, ()), // timeline already exists - Err(err) => Err(ApiError::InternalServerError(err)), + Err(tenant::CreateTimelineError::AlreadyExists) => { + json_response(StatusCode::CONFLICT, ()) + } + Err(tenant::CreateTimelineError::AncestorLsn(err)) => { + json_response(StatusCode::NOT_ACCEPTABLE, HttpErrorBody::from_msg( + format!("{err:#}") + )) + } + Err(tenant::CreateTimelineError::Other(err)) => Err(ApiError::InternalServerError(err)), } } .instrument(info_span!("timeline_create", tenant = %tenant_id, timeline_id = %new_timeline_id, lsn=?request_data.ancestor_start_lsn, pg_version=?request_data.pg_version)) diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs index 00745143bd..db5bccdbba 100644 --- a/pageserver/src/metrics.rs +++ b/pageserver/src/metrics.rs @@ -130,6 +130,79 @@ pub static MATERIALIZED_PAGE_CACHE_HIT: Lazy = Lazy::new(|| { .expect("failed to define a metric") }); +pub struct PageCacheMetrics { + pub read_accesses_materialized_page: IntCounter, + pub read_accesses_ephemeral: IntCounter, + pub read_accesses_immutable: IntCounter, + + pub read_hits_ephemeral: IntCounter, + pub read_hits_immutable: IntCounter, + pub read_hits_materialized_page_exact: IntCounter, + pub read_hits_materialized_page_older_lsn: IntCounter, +} + +static PAGE_CACHE_READ_HITS: Lazy = Lazy::new(|| { + register_int_counter_vec!( + "pageserver_page_cache_read_hits_total", + "Number of read accesses to the page cache that hit", + &["key_kind", "hit_kind"] + ) + .expect("failed to define a metric") +}); + +static PAGE_CACHE_READ_ACCESSES: Lazy = Lazy::new(|| { + register_int_counter_vec!( + "pageserver_page_cache_read_accesses_total", + "Number of read accesses to the page cache", + &["key_kind"] + ) + .expect("failed to define a metric") +}); + +pub static PAGE_CACHE: Lazy = Lazy::new(|| PageCacheMetrics { + read_accesses_materialized_page: { + PAGE_CACHE_READ_ACCESSES + .get_metric_with_label_values(&["materialized_page"]) + .unwrap() + }, + + read_accesses_ephemeral: { + PAGE_CACHE_READ_ACCESSES + .get_metric_with_label_values(&["ephemeral"]) + .unwrap() + }, + + read_accesses_immutable: { + PAGE_CACHE_READ_ACCESSES + .get_metric_with_label_values(&["immutable"]) + .unwrap() + }, + + read_hits_ephemeral: { + PAGE_CACHE_READ_HITS + .get_metric_with_label_values(&["ephemeral", "-"]) + .unwrap() + }, + + read_hits_immutable: { + PAGE_CACHE_READ_HITS + .get_metric_with_label_values(&["immutable", "-"]) + .unwrap() + }, + + read_hits_materialized_page_exact: { + PAGE_CACHE_READ_HITS + .get_metric_with_label_values(&["materialized_page", "exact"]) + .unwrap() + }, + + read_hits_materialized_page_older_lsn: { + PAGE_CACHE_READ_HITS + .get_metric_with_label_values(&["materialized_page", "older_lsn"]) + .unwrap() + }, +}); + static WAIT_LSN_TIME: Lazy = Lazy::new(|| { register_histogram_vec!( "pageserver_wait_lsn_seconds", @@ -204,11 +277,11 @@ pub static TENANT_STATE_METRIC: Lazy = Lazy::new(|| { pub static TENANT_SYNTHETIC_SIZE_METRIC: Lazy = Lazy::new(|| { register_uint_gauge_vec!( - "pageserver_tenant_synthetic_size", - "Synthetic size of each tenant", + "pageserver_tenant_synthetic_cached_size_bytes", + "Synthetic size of each tenant in bytes", &["tenant_id"] ) - .expect("Failed to register pageserver_tenant_synthetic_size metric") + .expect("Failed to register pageserver_tenant_synthetic_cached_size_bytes metric") }); // Metrics for cloud upload. These metrics reflect data uploaded to cloud storage, @@ -968,7 +1041,6 @@ impl RemoteTimelineClientMetrics { op_kind: &RemoteOpKind, status: &'static str, ) -> Histogram { - // XXX would be nice to have an upgradable RwLock let mut guard = self.remote_operation_time.lock().unwrap(); let key = (file_kind.as_str(), op_kind.as_str(), status); let metric = guard.entry(key).or_insert_with(move || { @@ -990,7 +1062,6 @@ impl RemoteTimelineClientMetrics { file_kind: &RemoteOpFileKind, op_kind: &RemoteOpKind, ) -> IntGauge { - // XXX would be nice to have an upgradable RwLock let mut guard = self.calls_unfinished_gauge.lock().unwrap(); let key = (file_kind.as_str(), op_kind.as_str()); let metric = guard.entry(key).or_insert_with(move || { @@ -1011,7 +1082,6 @@ impl RemoteTimelineClientMetrics { file_kind: &RemoteOpFileKind, op_kind: &RemoteOpKind, ) -> Histogram { - // XXX would be nice to have an upgradable RwLock let mut guard = self.calls_started_hist.lock().unwrap(); let key = (file_kind.as_str(), op_kind.as_str()); let metric = guard.entry(key).or_insert_with(move || { @@ -1032,7 +1102,6 @@ impl RemoteTimelineClientMetrics { file_kind: &RemoteOpFileKind, op_kind: &RemoteOpKind, ) -> IntCounter { - // XXX would be nice to have an upgradable RwLock let mut guard = self.bytes_started_counter.lock().unwrap(); let key = (file_kind.as_str(), op_kind.as_str()); let metric = guard.entry(key).or_insert_with(move || { @@ -1053,7 +1122,6 @@ impl RemoteTimelineClientMetrics { file_kind: &RemoteOpFileKind, op_kind: &RemoteOpKind, ) -> IntCounter { - // XXX would be nice to have an upgradable RwLock let mut guard = self.bytes_finished_counter.lock().unwrap(); let key = (file_kind.as_str(), op_kind.as_str()); let metric = guard.entry(key).or_insert_with(move || { diff --git a/pageserver/src/page_cache.rs b/pageserver/src/page_cache.rs index d2fe06697e..ef0e748d10 100644 --- a/pageserver/src/page_cache.rs +++ b/pageserver/src/page_cache.rs @@ -313,6 +313,10 @@ impl PageCache { key: &Key, lsn: Lsn, ) -> Option<(Lsn, PageReadGuard)> { + crate::metrics::PAGE_CACHE + .read_accesses_materialized_page + .inc(); + let mut cache_key = CacheKey::MaterializedPage { hash_key: MaterializedPageHashKey { tenant_id, @@ -323,8 +327,21 @@ impl PageCache { }; if let Some(guard) = self.try_lock_for_read(&mut cache_key) { - if let CacheKey::MaterializedPage { hash_key: _, lsn } = cache_key { - Some((lsn, guard)) + if let CacheKey::MaterializedPage { + hash_key: _, + lsn: available_lsn, + } = cache_key + { + if available_lsn == lsn { + crate::metrics::PAGE_CACHE + .read_hits_materialized_page_exact + .inc(); + } else { + crate::metrics::PAGE_CACHE + .read_hits_materialized_page_older_lsn + .inc(); + } + Some((available_lsn, guard)) } else { panic!("unexpected key type in slot"); } @@ -499,11 +516,31 @@ impl PageCache { /// ``` /// fn lock_for_read(&self, cache_key: &mut CacheKey) -> anyhow::Result { + let (read_access, hit) = match cache_key { + CacheKey::MaterializedPage { .. } => { + unreachable!("Materialized pages use lookup_materialized_page") + } + CacheKey::EphemeralPage { .. } => ( + &crate::metrics::PAGE_CACHE.read_accesses_ephemeral, + &crate::metrics::PAGE_CACHE.read_hits_ephemeral, + ), + CacheKey::ImmutableFilePage { .. } => ( + &crate::metrics::PAGE_CACHE.read_accesses_immutable, + &crate::metrics::PAGE_CACHE.read_hits_immutable, + ), + }; + read_access.inc(); + + let mut is_first_iteration = true; loop { // First check if the key already exists in the cache. if let Some(read_guard) = self.try_lock_for_read(cache_key) { + if is_first_iteration { + hit.inc(); + } return Ok(ReadBufResult::Found(read_guard)); } + is_first_iteration = false; // Not found. Find a victim buffer let (slot_idx, mut inner) = diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index d32518b513..8728559d72 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -904,7 +904,7 @@ where self.check_permission(Some(tenant_id))?; - let lsn = if params.len() == 3 { + let lsn = if params.len() >= 3 { Some( Lsn::from_str(params[2]) .with_context(|| format!("Failed to parse Lsn from {}", params[2]))?, diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs index 998c199ba6..a54cf9f91b 100644 --- a/pageserver/src/pgdatadir_mapping.rs +++ b/pageserver/src/pgdatadir_mapping.rs @@ -887,7 +887,7 @@ impl<'a> DatadirModification<'a> { ctx: &RequestContext, ) -> Result<(), RelationError> { if rel.relnode == 0 { - return Err(RelationError::AlreadyExists); + return Err(RelationError::InvalidRelnode); } // It's possible that this is the first rel for this db in this // tablespace. Create the reldir entry for it if so. diff --git a/pageserver/src/task_mgr.rs b/pageserver/src/task_mgr.rs index d8db12a113..13db38d956 100644 --- a/pageserver/src/task_mgr.rs +++ b/pageserver/src/task_mgr.rs @@ -506,17 +506,17 @@ pub async fn shutdown_tasks( warn!(name = task.name, tenant_id = ?tenant_id, timeline_id = ?timeline_id, kind = ?task_kind, "stopping left-over"); } } - let completed = tokio::select! { + let join_handle = tokio::select! { biased; - _ = &mut join_handle => { true }, + _ = &mut join_handle => { None }, _ = tokio::time::sleep(std::time::Duration::from_secs(1)) => { // allow some time to elapse before logging to cut down the number of log // lines. info!("waiting for {} to shut down", task.name); - false + Some(join_handle) } }; - if !completed { + if let Some(join_handle) = join_handle { // we never handled this return value, but: // - we don't deschedule which would lead to is_cancelled // - panics are already logged (is_panicked) diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 0e8d6b1287..3fa2a4bab4 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -440,8 +440,13 @@ pub enum GetTimelineError { pub enum DeleteTimelineError { #[error("NotFound")] NotFound, + #[error("HasChildren")] HasChildren(Vec), + + #[error("Timeline deletion is already in progress")] + AlreadyInProgress, + #[error(transparent)] Other(#[from] anyhow::Error), } @@ -496,6 +501,16 @@ impl DeletionGuard { } } +#[derive(thiserror::Error, Debug)] +pub enum CreateTimelineError { + #[error("a timeline with the given ID already exists")] + AlreadyExists, + #[error(transparent)] + AncestorLsn(anyhow::Error), + #[error(transparent)] + Other(#[from] anyhow::Error), +} + impl Tenant { /// Yet another helper for timeline initialization. /// Contains the common part of `load_local_timeline` and `load_remote_timeline`. @@ -585,6 +600,7 @@ impl Tenant { .layers .read() .await + .0 .iter_historic_layers() .next() .is_some(), @@ -1369,8 +1385,7 @@ impl Tenant { /// Returns the new timeline ID and reference to its Timeline object. /// /// If the caller specified the timeline ID to use (`new_timeline_id`), and timeline with - /// the same timeline ID already exists, returns None. If `new_timeline_id` is not given, - /// a new unique ID is generated. + /// the same timeline ID already exists, returns CreateTimelineError::AlreadyExists. pub async fn create_timeline( &self, new_timeline_id: TimelineId, @@ -1379,11 +1394,12 @@ impl Tenant { pg_version: u32, broker_client: storage_broker::BrokerClientChannel, ctx: &RequestContext, - ) -> anyhow::Result>> { - anyhow::ensure!( - self.is_active(), - "Cannot create timelines on inactive tenant" - ); + ) -> Result, CreateTimelineError> { + if !self.is_active() { + return Err(CreateTimelineError::Other(anyhow::anyhow!( + "Cannot create timelines on inactive tenant" + ))); + } if let Ok(existing) = self.get_timeline(new_timeline_id, false) { debug!("timeline {new_timeline_id} already exists"); @@ -1403,7 +1419,7 @@ impl Tenant { .context("wait for timeline uploads to complete")?; } - return Ok(None); + return Err(CreateTimelineError::AlreadyExists); } let loaded_timeline = match ancestor_timeline_id { @@ -1418,12 +1434,12 @@ impl Tenant { let ancestor_ancestor_lsn = ancestor_timeline.get_ancestor_lsn(); if ancestor_ancestor_lsn > *lsn { // can we safely just branch from the ancestor instead? - bail!( + return Err(CreateTimelineError::AncestorLsn(anyhow::anyhow!( "invalid start lsn {} for ancestor timeline {}: less than timeline ancestor lsn {}", lsn, ancestor_timeline_id, ancestor_ancestor_lsn, - ); + ))); } // Wait for the WAL to arrive and be processed on the parent branch up @@ -1457,7 +1473,7 @@ impl Tenant { })?; } - Ok(Some(loaded_timeline)) + Ok(loaded_timeline) } /// perform one garbage collection iteration, removing old data files from disk. @@ -1755,14 +1771,11 @@ impl Tenant { timeline = Arc::clone(timeline_entry.get()); // Prevent two tasks from trying to delete the timeline at the same time. - delete_lock_guard = - DeletionGuard(Arc::clone(&timeline.delete_lock).try_lock_owned().map_err( - |_| { - DeleteTimelineError::Other(anyhow::anyhow!( - "timeline deletion is already in progress" - )) - }, - )?); + delete_lock_guard = DeletionGuard( + Arc::clone(&timeline.delete_lock) + .try_lock_owned() + .map_err(|_| DeleteTimelineError::AlreadyInProgress)?, + ); // If another task finished the deletion just before we acquired the lock, // return success. @@ -2704,7 +2717,7 @@ impl Tenant { dst_id: TimelineId, start_lsn: Option, ctx: &RequestContext, - ) -> anyhow::Result> { + ) -> Result, CreateTimelineError> { let tl = self .branch_timeline_impl(src_timeline, dst_id, start_lsn, ctx) .await?; @@ -2721,7 +2734,7 @@ impl Tenant { dst_id: TimelineId, start_lsn: Option, ctx: &RequestContext, - ) -> anyhow::Result> { + ) -> Result, CreateTimelineError> { self.branch_timeline_impl(src_timeline, dst_id, start_lsn, ctx) .await } @@ -2732,7 +2745,7 @@ impl Tenant { dst_id: TimelineId, start_lsn: Option, _ctx: &RequestContext, - ) -> anyhow::Result> { + ) -> Result, CreateTimelineError> { let src_id = src_timeline.timeline_id; // If no start LSN is specified, we branch the new timeline from the source timeline's last record LSN @@ -2772,16 +2785,17 @@ impl Tenant { .context(format!( "invalid branch start lsn: less than latest GC cutoff {}", *latest_gc_cutoff_lsn, - ))?; + )) + .map_err(CreateTimelineError::AncestorLsn)?; // and then the planned GC cutoff { let gc_info = src_timeline.gc_info.read().unwrap(); let cutoff = min(gc_info.pitr_cutoff, gc_info.horizon_cutoff); if start_lsn < cutoff { - bail!(format!( + return Err(CreateTimelineError::AncestorLsn(anyhow::anyhow!( "invalid branch start lsn: less than planned GC cutoff {cutoff}" - )); + ))); } } @@ -3814,6 +3828,9 @@ mod tests { { Ok(_) => panic!("branching should have failed"), Err(err) => { + let CreateTimelineError::AncestorLsn(err) = err else { + panic!("wrong error type") + }; assert!(err.to_string().contains("invalid branch start lsn")); assert!(err .source() @@ -3843,6 +3860,9 @@ mod tests { { Ok(_) => panic!("branching should have failed"), Err(err) => { + let CreateTimelineError::AncestorLsn(err) = err else { + panic!("wrong error type"); + }; assert!(&err.to_string().contains("invalid branch start lsn")); assert!(&err .source() diff --git a/pageserver/src/tenant/layer_map.rs b/pageserver/src/tenant/layer_map.rs index ca1a71b623..dee02ac433 100644 --- a/pageserver/src/tenant/layer_map.rs +++ b/pageserver/src/tenant/layer_map.rs @@ -51,25 +51,23 @@ use crate::keyspace::KeyPartitioning; use crate::repository::Key; use crate::tenant::storage_layer::InMemoryLayer; use crate::tenant::storage_layer::Layer; -use anyhow::Context; use anyhow::Result; -use std::collections::HashMap; use std::collections::VecDeque; use std::ops::Range; use std::sync::Arc; use utils::lsn::Lsn; use historic_layer_coverage::BufferedHistoricLayerCoverage; -pub use historic_layer_coverage::Replacement; +pub use historic_layer_coverage::LayerKey; use super::storage_layer::range_eq; use super::storage_layer::PersistentLayerDesc; -use super::storage_layer::PersistentLayerKey; /// /// LayerMap tracks what layers exist on a timeline. /// -pub struct LayerMap { +#[derive(Default)] +pub struct LayerMap { // // 'open_layer' holds the current InMemoryLayer that is accepting new // records. If it is None, 'next_open_layer_at' will be set instead, indicating @@ -95,24 +93,6 @@ pub struct LayerMap { /// L0 layers have key range Key::MIN..Key::MAX, and locating them using R-Tree search is very inefficient. /// So L0 layers are held in l0_delta_layers vector, in addition to the R-tree. l0_delta_layers: Vec>, - - /// Mapping from persistent layer key to the actual layer object. Currently, it stores delta, image, and - /// remote layers. In future refactors, this will be eventually moved out of LayerMap into Timeline, and - /// RemoteLayer will be removed. - mapping: HashMap>, -} - -impl Default for LayerMap { - fn default() -> Self { - Self { - open_layer: None, - next_open_layer_at: None, - frozen_layers: VecDeque::default(), - l0_delta_layers: Vec::default(), - historic: BufferedHistoricLayerCoverage::default(), - mapping: HashMap::default(), - } - } } /// The primary update API for the layer map. @@ -120,24 +100,21 @@ impl Default for LayerMap { /// Batching historic layer insertions and removals is good for /// performance and this struct helps us do that correctly. #[must_use] -pub struct BatchedUpdates<'a, L: ?Sized + Layer> { +pub struct BatchedUpdates<'a> { // While we hold this exclusive reference to the layer map the type checker // will prevent us from accidentally reading any unflushed updates. - layer_map: &'a mut LayerMap, + layer_map: &'a mut LayerMap, } /// Provide ability to batch more updates while hiding the read /// API so we don't accidentally read without flushing. -impl BatchedUpdates<'_, L> -where - L: ?Sized + Layer, -{ +impl BatchedUpdates<'_> { /// /// Insert an on-disk layer. /// // TODO remove the `layer` argument when `mapping` is refactored out of `LayerMap` - pub fn insert_historic(&mut self, layer_desc: PersistentLayerDesc, layer: Arc) { - self.layer_map.insert_historic_noflush(layer_desc, layer) + pub fn insert_historic(&mut self, layer_desc: PersistentLayerDesc) { + self.layer_map.insert_historic_noflush(layer_desc) } /// @@ -145,31 +122,8 @@ where /// /// This should be called when the corresponding file on disk has been deleted. /// - pub fn remove_historic(&mut self, layer_desc: PersistentLayerDesc, layer: Arc) { - self.layer_map.remove_historic_noflush(layer_desc, layer) - } - - /// Replaces existing layer iff it is the `expected`. - /// - /// If the expected layer has been removed it will not be inserted by this function. - /// - /// Returned `Replacement` describes succeeding in replacement or the reason why it could not - /// be done. - /// - /// TODO replacement can be done without buffering and rebuilding layer map updates. - /// One way to do that is to add a layer of indirection for returned values, so - /// that we can replace values only by updating a hashmap. - pub fn replace_historic( - &mut self, - expected_desc: PersistentLayerDesc, - expected: &Arc, - new_desc: PersistentLayerDesc, - new: Arc, - ) -> anyhow::Result>> { - fail::fail_point!("layermap-replace-notfound", |_| Ok(Replacement::NotFound)); - - self.layer_map - .replace_historic_noflush(expected_desc, expected, new_desc, new) + pub fn remove_historic(&mut self, layer_desc: PersistentLayerDesc) { + self.layer_map.remove_historic_noflush(layer_desc) } // We will flush on drop anyway, but this method makes it @@ -185,25 +139,19 @@ where // than panic later or read without flushing. // // TODO maybe warn if flush hasn't explicitly been called -impl Drop for BatchedUpdates<'_, L> -where - L: ?Sized + Layer, -{ +impl Drop for BatchedUpdates<'_> { fn drop(&mut self) { self.layer_map.flush_updates(); } } /// Return value of LayerMap::search -pub struct SearchResult { - pub layer: Arc, +pub struct SearchResult { + pub layer: Arc, pub lsn_floor: Lsn, } -impl LayerMap -where - L: ?Sized + Layer, -{ +impl LayerMap { /// /// Find the latest layer (by lsn.end) that covers the given /// 'key', with lsn.start < 'end_lsn'. @@ -235,7 +183,7 @@ where /// NOTE: This only searches the 'historic' layers, *not* the /// 'open' and 'frozen' layers! /// - pub fn search(&self, key: Key, end_lsn: Lsn) -> Option> { + pub fn search(&self, key: Key, end_lsn: Lsn) -> Option { let version = self.historic.get().unwrap().get_version(end_lsn.0 - 1)?; let latest_delta = version.delta_coverage.query(key.to_i128()); let latest_image = version.image_coverage.query(key.to_i128()); @@ -244,7 +192,6 @@ where (None, None) => None, (None, Some(image)) => { let lsn_floor = image.get_lsn_range().start; - let image = self.get_layer_from_mapping(&image.key()).clone(); Some(SearchResult { layer: image, lsn_floor, @@ -252,7 +199,6 @@ where } (Some(delta), None) => { let lsn_floor = delta.get_lsn_range().start; - let delta = self.get_layer_from_mapping(&delta.key()).clone(); Some(SearchResult { layer: delta, lsn_floor, @@ -263,7 +209,6 @@ where let image_is_newer = image.get_lsn_range().end >= delta.get_lsn_range().end; let image_exact_match = img_lsn + 1 == end_lsn; if image_is_newer || image_exact_match { - let image = self.get_layer_from_mapping(&image.key()).clone(); Some(SearchResult { layer: image, lsn_floor: img_lsn, @@ -271,7 +216,6 @@ where } else { let lsn_floor = std::cmp::max(delta.get_lsn_range().start, image.get_lsn_range().start + 1); - let delta = self.get_layer_from_mapping(&delta.key()).clone(); Some(SearchResult { layer: delta, lsn_floor, @@ -282,7 +226,7 @@ where } /// Start a batch of updates, applied on drop - pub fn batch_update(&mut self) -> BatchedUpdates<'_, L> { + pub fn batch_update(&mut self) -> BatchedUpdates<'_> { BatchedUpdates { layer_map: self } } @@ -292,48 +236,32 @@ where /// Helper function for BatchedUpdates::insert_historic /// /// TODO(chi): remove L generic so that we do not need to pass layer object. - pub(self) fn insert_historic_noflush( - &mut self, - layer_desc: PersistentLayerDesc, - layer: Arc, - ) { - self.mapping.insert(layer_desc.key(), layer.clone()); - + pub(self) fn insert_historic_noflush(&mut self, layer_desc: PersistentLayerDesc) { // TODO: See #3869, resulting #4088, attempted fix and repro #4094 - if Self::is_l0(&layer) { + if Self::is_l0(&layer_desc) { self.l0_delta_layers.push(layer_desc.clone().into()); } self.historic.insert( - historic_layer_coverage::LayerKey::from(&*layer), + historic_layer_coverage::LayerKey::from(&layer_desc), layer_desc.into(), ); } - fn get_layer_from_mapping(&self, key: &PersistentLayerKey) -> &Arc { - let layer = self - .mapping - .get(key) - .with_context(|| format!("{key:?}")) - .expect("inconsistent layer mapping"); - layer - } - /// /// Remove an on-disk layer from the map. /// /// Helper function for BatchedUpdates::remove_historic /// - pub fn remove_historic_noflush(&mut self, layer_desc: PersistentLayerDesc, layer: Arc) { + pub fn remove_historic_noflush(&mut self, layer_desc: PersistentLayerDesc) { self.historic - .remove(historic_layer_coverage::LayerKey::from(&*layer)); - if Self::is_l0(&layer) { + .remove(historic_layer_coverage::LayerKey::from(&layer_desc)); + let layer_key = layer_desc.key(); + if Self::is_l0(&layer_desc) { let len_before = self.l0_delta_layers.len(); let mut l0_delta_layers = std::mem::take(&mut self.l0_delta_layers); - l0_delta_layers.retain(|other| { - !Self::compare_arced_layers(self.get_layer_from_mapping(&other.key()), &layer) - }); + l0_delta_layers.retain(|other| other.key() != layer_key); self.l0_delta_layers = l0_delta_layers; // this assertion is related to use of Arc::ptr_eq in Self::compare_arced_layers, // there's a chance that the comparison fails at runtime due to it comparing (pointer, @@ -344,69 +272,6 @@ where "failed to locate removed historic layer from l0_delta_layers" ); } - self.mapping.remove(&layer_desc.key()); - } - - pub(self) fn replace_historic_noflush( - &mut self, - expected_desc: PersistentLayerDesc, - expected: &Arc, - new_desc: PersistentLayerDesc, - new: Arc, - ) -> anyhow::Result>> { - let key = historic_layer_coverage::LayerKey::from(&**expected); - let other = historic_layer_coverage::LayerKey::from(&*new); - - let expected_l0 = Self::is_l0(expected); - let new_l0 = Self::is_l0(&new); - - anyhow::ensure!( - key == other, - "expected and new must have equal LayerKeys: {key:?} != {other:?}" - ); - - anyhow::ensure!( - expected_l0 == new_l0, - "expected and new must both be l0 deltas or neither should be: {expected_l0} != {new_l0}" - ); - - let l0_index = if expected_l0 { - // find the index in case replace worked, we need to replace that as well - let pos = self.l0_delta_layers.iter().position(|slot| { - Self::compare_arced_layers(self.get_layer_from_mapping(&slot.key()), expected) - }); - - if pos.is_none() { - return Ok(Replacement::NotFound); - } - pos - } else { - None - }; - - let new_desc = Arc::new(new_desc); - let replaced = self.historic.replace(&key, new_desc.clone(), |existing| { - **existing == expected_desc - }); - - if let Replacement::Replaced { .. } = &replaced { - self.mapping.remove(&expected_desc.key()); - self.mapping.insert(new_desc.key(), new); - if let Some(index) = l0_index { - self.l0_delta_layers[index] = new_desc; - } - } - - let replaced = match replaced { - Replacement::Replaced { in_buffered } => Replacement::Replaced { in_buffered }, - Replacement::NotFound => Replacement::NotFound, - Replacement::RemovalBuffered => Replacement::RemovalBuffered, - Replacement::Unexpected(x) => { - Replacement::Unexpected(self.get_layer_from_mapping(&x.key()).clone()) - } - }; - - Ok(replaced) } /// Helper function for BatchedUpdates::drop. @@ -454,10 +319,8 @@ where Ok(true) } - pub fn iter_historic_layers(&self) -> impl '_ + Iterator> { - self.historic - .iter() - .map(|x| self.get_layer_from_mapping(&x.key()).clone()) + pub fn iter_historic_layers(&self) -> impl '_ + Iterator> { + self.historic.iter() } /// @@ -472,7 +335,7 @@ where &self, key_range: &Range, lsn: Lsn, - ) -> Result, Option>)>> { + ) -> Result, Option>)>> { let version = match self.historic.get().unwrap().get_version(lsn.0) { Some(v) => v, None => return Ok(vec![]), @@ -482,36 +345,26 @@ where let end = key_range.end.to_i128(); // Initialize loop variables - let mut coverage: Vec<(Range, Option>)> = vec![]; + let mut coverage: Vec<(Range, Option>)> = vec![]; let mut current_key = start; let mut current_val = version.image_coverage.query(start); // Loop through the change events and push intervals for (change_key, change_val) in version.image_coverage.range(start..end) { let kr = Key::from_i128(current_key)..Key::from_i128(change_key); - coverage.push(( - kr, - current_val - .take() - .map(|l| self.get_layer_from_mapping(&l.key()).clone()), - )); + coverage.push((kr, current_val.take())); current_key = change_key; current_val = change_val.clone(); } // Add the final interval let kr = Key::from_i128(current_key)..Key::from_i128(end); - coverage.push(( - kr, - current_val - .take() - .map(|l| self.get_layer_from_mapping(&l.key()).clone()), - )); + coverage.push((kr, current_val.take())); Ok(coverage) } - pub fn is_l0(layer: &L) -> bool { + pub fn is_l0(layer: &PersistentLayerDesc) -> bool { range_eq(&layer.get_key_range(), &(Key::MIN..Key::MAX)) } @@ -537,7 +390,7 @@ where /// TODO The optimal number should probably be slightly higher than 1, but to /// implement that we need to plumb a lot more context into this function /// than just the current partition_range. - pub fn is_reimage_worthy(layer: &L, partition_range: &Range) -> bool { + pub fn is_reimage_worthy(layer: &PersistentLayerDesc, partition_range: &Range) -> bool { // Case 1 if !Self::is_l0(layer) { return true; @@ -595,9 +448,7 @@ where let kr = Key::from_i128(current_key)..Key::from_i128(change_key); let lr = lsn.start..val.get_lsn_range().start; if !kr.is_empty() { - let base_count = - Self::is_reimage_worthy(self.get_layer_from_mapping(&val.key()), key) - as usize; + let base_count = Self::is_reimage_worthy(&val, key) as usize; let new_limit = limit.map(|l| l - base_count); let max_stacked_deltas_underneath = self.count_deltas(&kr, &lr, new_limit)?; @@ -620,9 +471,7 @@ where let lr = lsn.start..val.get_lsn_range().start; if !kr.is_empty() { - let base_count = - Self::is_reimage_worthy(self.get_layer_from_mapping(&val.key()), key) - as usize; + let base_count = Self::is_reimage_worthy(&val, key) as usize; let new_limit = limit.map(|l| l - base_count); let max_stacked_deltas_underneath = self.count_deltas(&kr, &lr, new_limit)?; max_stacked_deltas = std::cmp::max( @@ -772,12 +621,8 @@ where } /// Return all L0 delta layers - pub fn get_level0_deltas(&self) -> Result>> { - Ok(self - .l0_delta_layers - .iter() - .map(|x| self.get_layer_from_mapping(&x.key()).clone()) - .collect()) + pub fn get_level0_deltas(&self) -> Result>> { + Ok(self.l0_delta_layers.to_vec()) } /// debugging function to print out the contents of the layer map @@ -802,72 +647,51 @@ where println!("End dump LayerMap"); Ok(()) } - - /// Similar to `Arc::ptr_eq`, but only compares the object pointers, not vtables. - /// - /// Returns `true` if the two `Arc` point to the same layer, false otherwise. - #[inline(always)] - pub fn compare_arced_layers(left: &Arc, right: &Arc) -> bool { - // "dyn Trait" objects are "fat pointers" in that they have two components: - // - pointer to the object - // - pointer to the vtable - // - // rust does not provide a guarantee that these vtables are unique, but however - // `Arc::ptr_eq` as of writing (at least up to 1.67) uses a comparison where both the - // pointer and the vtable need to be equal. - // - // See: https://github.com/rust-lang/rust/issues/103763 - // - // A future version of rust will most likely use this form below, where we cast each - // pointer into a pointer to unit, which drops the inaccessible vtable pointer, making it - // not affect the comparison. - // - // See: https://github.com/rust-lang/rust/pull/106450 - let left = Arc::as_ptr(left) as *const (); - let right = Arc::as_ptr(right) as *const (); - - left == right - } } #[cfg(test)] mod tests { - use super::{LayerMap, Replacement}; - use crate::tenant::storage_layer::{Layer, LayerDescriptor, LayerFileName}; + use super::LayerMap; + use crate::tenant::storage_layer::{tests::LayerDescriptor, LayerFileName}; use std::str::FromStr; use std::sync::Arc; mod l0_delta_layers_updated { + use crate::tenant::{ + storage_layer::{PersistentLayer, PersistentLayerDesc}, + timeline::LayerFileManager, + }; + use super::*; #[test] fn for_full_range_delta() { // l0_delta_layers are used by compaction, and should observe all buffered updates l0_delta_layers_updated_scenario( - "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000053423C21-0000000053424D69", - true - ) + "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000053423C21-0000000053424D69", + true + ) } #[test] fn for_non_full_range_delta() { // has minimal uncovered areas compared to l0_delta_layers_updated_on_insert_replace_remove_for_full_range_delta l0_delta_layers_updated_scenario( - "000000000000000000000000000000000001-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFE__0000000053423C21-0000000053424D69", - // because not full range - false - ) + "000000000000000000000000000000000001-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFE__0000000053423C21-0000000053424D69", + // because not full range + false + ) } #[test] fn for_image() { l0_delta_layers_updated_scenario( - "000000000000000000000000000000000000-000000000000000000000000000000010000__0000000053424D69", - // code only checks if it is a full range layer, doesn't care about images, which must - // mean we should in practice never have full range images - false - ) + "000000000000000000000000000000000000-000000000000000000000000000000010000__0000000053424D69", + // code only checks if it is a full range layer, doesn't care about images, which must + // mean we should in practice never have full range images + false + ) } #[test] @@ -883,16 +707,16 @@ mod tests { let not_found = Arc::new(layer.clone()); let new_version = Arc::new(layer); - let mut map = LayerMap::default(); + // after the immutable storage state refactor, the replace operation + // will not use layer map any more. We keep it here for consistency in test cases + // and can remove it in the future. + let _map = LayerMap::default(); - let res = map.batch_update().replace_historic( - not_found.get_persistent_layer_desc(), - ¬_found, - new_version.get_persistent_layer_desc(), - new_version, - ); + let mut mapping = LayerFileManager::new(); - assert!(matches!(res, Ok(Replacement::NotFound)), "{res:?}"); + mapping + .replace_and_verify(not_found, new_version) + .unwrap_err(); } fn l0_delta_layers_updated_scenario(layer_name: &str, expected_l0: bool) { @@ -903,49 +727,44 @@ mod tests { let downloaded = Arc::new(skeleton); let mut map = LayerMap::default(); + let mut mapping = LayerFileManager::new(); // two disjoint Arcs in different lifecycle phases. even if it seems they must be the // same layer, we use LayerMap::compare_arced_layers as the identity of layers. - assert!(!LayerMap::compare_arced_layers(&remote, &downloaded)); + assert_eq!(remote.layer_desc(), downloaded.layer_desc()); let expected_in_counts = (1, usize::from(expected_l0)); map.batch_update() - .insert_historic(remote.get_persistent_layer_desc(), remote.clone()); - assert_eq!(count_layer_in(&map, &remote), expected_in_counts); - - let replaced = map - .batch_update() - .replace_historic( - remote.get_persistent_layer_desc(), - &remote, - downloaded.get_persistent_layer_desc(), - downloaded.clone(), - ) - .expect("name derived attributes are the same"); - assert!( - matches!(replaced, Replacement::Replaced { .. }), - "{replaced:?}" + .insert_historic(remote.layer_desc().clone()); + mapping.insert(remote.clone()); + assert_eq!( + count_layer_in(&map, remote.layer_desc()), + expected_in_counts + ); + + mapping + .replace_and_verify(remote, downloaded.clone()) + .expect("name derived attributes are the same"); + assert_eq!( + count_layer_in(&map, downloaded.layer_desc()), + expected_in_counts ); - assert_eq!(count_layer_in(&map, &downloaded), expected_in_counts); map.batch_update() - .remove_historic(downloaded.get_persistent_layer_desc(), downloaded.clone()); - assert_eq!(count_layer_in(&map, &downloaded), (0, 0)); + .remove_historic(downloaded.layer_desc().clone()); + assert_eq!(count_layer_in(&map, downloaded.layer_desc()), (0, 0)); } - fn count_layer_in(map: &LayerMap, layer: &Arc) -> (usize, usize) { + fn count_layer_in(map: &LayerMap, layer: &PersistentLayerDesc) -> (usize, usize) { let historic = map .iter_historic_layers() - .filter(|x| LayerMap::compare_arced_layers(x, layer)) + .filter(|x| x.key() == layer.key()) .count(); let l0s = map .get_level0_deltas() .expect("why does this return a result"); - let l0 = l0s - .iter() - .filter(|x| LayerMap::compare_arced_layers(x, layer)) - .count(); + let l0 = l0s.iter().filter(|x| x.key() == layer.key()).count(); (historic, l0) } diff --git a/pageserver/src/tenant/layer_map/historic_layer_coverage.rs b/pageserver/src/tenant/layer_map/historic_layer_coverage.rs index 49dcbc63c2..0f51597027 100644 --- a/pageserver/src/tenant/layer_map/historic_layer_coverage.rs +++ b/pageserver/src/tenant/layer_map/historic_layer_coverage.rs @@ -3,6 +3,8 @@ use std::ops::Range; use tracing::info; +use crate::tenant::storage_layer::PersistentLayerDesc; + use super::layer_coverage::LayerCoverageTuple; /// Layers in this module are identified and indexed by this data. @@ -41,8 +43,8 @@ impl Ord for LayerKey { } } -impl<'a, L: crate::tenant::storage_layer::Layer + ?Sized> From<&'a L> for LayerKey { - fn from(layer: &'a L) -> Self { +impl From<&PersistentLayerDesc> for LayerKey { + fn from(layer: &PersistentLayerDesc) -> Self { let kr = layer.get_key_range(); let lr = layer.get_lsn_range(); LayerKey { @@ -454,59 +456,6 @@ impl BufferedHistoricLayerCoverage { self.buffer.insert(layer_key, None); } - /// Replaces a previous layer with a new layer value. - /// - /// The replacement is conditional on: - /// - there is an existing `LayerKey` record - /// - there is no buffered removal for the given `LayerKey` - /// - the given closure returns true for the current `Value` - /// - /// The closure is used to compare the latest value (buffered insert, or existing layer) - /// against some expectation. This allows to use `Arc::ptr_eq` or similar which would be - /// inaccessible via `PartialEq` trait. - /// - /// Returns a `Replacement` value describing the outcome; only the case of - /// `Replacement::Replaced` modifies the map and requires a rebuild. - pub fn replace( - &mut self, - layer_key: &LayerKey, - new: Value, - check_expected: F, - ) -> Replacement - where - F: FnOnce(&Value) -> bool, - { - let (slot, in_buffered) = match self.buffer.get(layer_key) { - Some(inner @ Some(_)) => { - // we compare against the buffered version, because there will be a later - // rebuild before querying - (inner.as_ref(), true) - } - Some(None) => { - // buffer has removal for this key; it will not be equivalent by any check_expected. - return Replacement::RemovalBuffered; - } - None => { - // no pending modification for the key, check layers - (self.layers.get(layer_key), false) - } - }; - - match slot { - Some(existing) if !check_expected(existing) => { - // unfortunate clone here, but otherwise the nll borrowck grows the region of - // 'a to cover the whole function, and we could not mutate in the other - // Some(existing) branch - Replacement::Unexpected(existing.clone()) - } - None => Replacement::NotFound, - Some(_existing) => { - self.insert(layer_key.to_owned(), new); - Replacement::Replaced { in_buffered } - } - } - } - pub fn rebuild(&mut self) { // Find the first LSN that needs to be rebuilt let rebuild_since: u64 = match self.buffer.iter().next() { @@ -575,22 +524,6 @@ impl BufferedHistoricLayerCoverage { } } -/// Outcome of the replace operation. -#[derive(Debug)] -pub enum Replacement { - /// Previous value was replaced with the new value. - Replaced { - /// Replacement happened for a scheduled insert. - in_buffered: bool, - }, - /// Key was not found buffered updates or existing layers. - NotFound, - /// Key has been scheduled for removal, it was not replaced. - RemovalBuffered, - /// Previous value was rejected by the closure. - Unexpected(Value), -} - #[test] fn test_retroactive_regression_1() { let mut map = BufferedHistoricLayerCoverage::new(); @@ -699,139 +632,3 @@ fn test_retroactive_simple() { assert_eq!(version.image_coverage.query(8), Some("Image 4".to_string())); } } - -#[test] -fn test_retroactive_replacement() { - let mut map = BufferedHistoricLayerCoverage::new(); - - let keys = [ - LayerKey { - key: 0..5, - lsn: 100..101, - is_image: true, - }, - LayerKey { - key: 3..9, - lsn: 110..111, - is_image: true, - }, - LayerKey { - key: 4..6, - lsn: 120..121, - is_image: true, - }, - ]; - - let layers = [ - "Image 1".to_string(), - "Image 2".to_string(), - "Image 3".to_string(), - ]; - - for (key, layer) in keys.iter().zip(layers.iter()) { - map.insert(key.to_owned(), layer.to_owned()); - } - - // rebuild is not necessary here, because replace works for both buffered updates and existing - // layers. - - for (key, orig_layer) in keys.iter().zip(layers.iter()) { - let replacement = format!("Remote {orig_layer}"); - - // evict - let ret = map.replace(key, replacement.clone(), |l| l == orig_layer); - assert!( - matches!(ret, Replacement::Replaced { .. }), - "replace {orig_layer}: {ret:?}" - ); - map.rebuild(); - - let at = key.lsn.end + 1; - - let version = map.get().expect("rebuilt").get_version(at).unwrap(); - assert_eq!( - version.image_coverage.query(4).as_deref(), - Some(replacement.as_str()), - "query for 4 at version {at} after eviction", - ); - - // download - let ret = map.replace(key, orig_layer.clone(), |l| l == &replacement); - assert!( - matches!(ret, Replacement::Replaced { .. }), - "replace {orig_layer} back: {ret:?}" - ); - map.rebuild(); - let version = map.get().expect("rebuilt").get_version(at).unwrap(); - assert_eq!( - version.image_coverage.query(4).as_deref(), - Some(orig_layer.as_str()), - "query for 4 at version {at} after download", - ); - } -} - -#[test] -fn missing_key_is_not_inserted_with_replace() { - let mut map = BufferedHistoricLayerCoverage::new(); - let key = LayerKey { - key: 0..5, - lsn: 100..101, - is_image: true, - }; - - let ret = map.replace(&key, "should not replace", |_| true); - assert!(matches!(ret, Replacement::NotFound), "{ret:?}"); - map.rebuild(); - assert!(map - .get() - .expect("no changes to rebuild") - .get_version(102) - .is_none()); -} - -#[test] -fn replacing_buffered_insert_and_remove() { - let mut map = BufferedHistoricLayerCoverage::new(); - let key = LayerKey { - key: 0..5, - lsn: 100..101, - is_image: true, - }; - - map.insert(key.clone(), "Image 1"); - let ret = map.replace(&key, "Remote Image 1", |&l| l == "Image 1"); - assert!( - matches!(ret, Replacement::Replaced { in_buffered: true }), - "{ret:?}" - ); - map.rebuild(); - - assert_eq!( - map.get() - .expect("rebuilt") - .get_version(102) - .unwrap() - .image_coverage - .query(4), - Some("Remote Image 1") - ); - - map.remove(key.clone()); - let ret = map.replace(&key, "should not replace", |_| true); - assert!( - matches!(ret, Replacement::RemovalBuffered), - "cannot replace after scheduled remove: {ret:?}" - ); - - map.rebuild(); - - let ret = map.replace(&key, "should not replace", |_| true); - assert!( - matches!(ret, Replacement::NotFound), - "cannot replace after remove + rebuild: {ret:?}" - ); - - let at_version = map.get().expect("rebuilt").get_version(102); - assert!(at_version.is_none()); -} diff --git a/pageserver/src/tenant/remote_timeline_client.rs b/pageserver/src/tenant/remote_timeline_client.rs index 7808b64d35..190512f48f 100644 --- a/pageserver/src/tenant/remote_timeline_client.rs +++ b/pageserver/src/tenant/remote_timeline_client.rs @@ -862,10 +862,8 @@ impl RemoteTimelineClient { "Found {} files not bound to index_file.json, proceeding with their deletion", remaining.len() ); - for file in remaining { - warn!("Removing {}", file.object_name().unwrap_or_default()); - self.storage_impl.delete(&file).await?; - } + warn!("About to remove {} files", remaining.len()); + self.storage_impl.delete_objects(&remaining).await?; } let index_file_path = timeline_storage_path.join(Path::new(IndexPart::FILE_NAME)); diff --git a/pageserver/src/tenant/storage_layer.rs b/pageserver/src/tenant/storage_layer.rs index 0af3d4ce39..7bc513b3a1 100644 --- a/pageserver/src/tenant/storage_layer.rs +++ b/pageserver/src/tenant/storage_layer.rs @@ -176,13 +176,10 @@ impl LayerAccessStats { /// Create an empty stats object and record a [`LayerLoad`] event with the given residence status. /// /// See [`record_residence_event`] for why you need to do this while holding the layer map lock. - pub(crate) fn for_loading_layer( - layer_map_lock_held_witness: &BatchedUpdates<'_, L>, + pub(crate) fn for_loading_layer( + layer_map_lock_held_witness: &BatchedUpdates<'_>, status: LayerResidenceStatus, - ) -> Self - where - L: ?Sized + Layer, - { + ) -> Self { let new = LayerAccessStats(Mutex::new(LayerAccessStatsLocked::default())); new.record_residence_event( layer_map_lock_held_witness, @@ -197,14 +194,11 @@ impl LayerAccessStats { /// The `new_status` is not recorded in `self`. /// /// See [`record_residence_event`] for why you need to do this while holding the layer map lock. - pub(crate) fn clone_for_residence_change( + pub(crate) fn clone_for_residence_change( &self, - layer_map_lock_held_witness: &BatchedUpdates<'_, L>, + layer_map_lock_held_witness: &BatchedUpdates<'_>, new_status: LayerResidenceStatus, - ) -> LayerAccessStats - where - L: ?Sized + Layer, - { + ) -> LayerAccessStats { let clone = { let inner = self.0.lock().unwrap(); inner.clone() @@ -232,14 +226,12 @@ impl LayerAccessStats { /// - Compact: Grab layer map lock, add the new L1 to layer map and remove the L0s, release layer map lock. /// - Eviction: observes the new L1 layer whose only activity timestamp is the LayerCreate event. /// - pub(crate) fn record_residence_event( + pub(crate) fn record_residence_event( &self, - _layer_map_lock_held_witness: &BatchedUpdates<'_, L>, + _layer_map_lock_held_witness: &BatchedUpdates<'_>, status: LayerResidenceStatus, reason: LayerResidenceEventReason, - ) where - L: ?Sized + Layer, - { + ) { let mut locked = self.0.lock().unwrap(); locked.iter_mut().for_each(|inner| { inner @@ -473,94 +465,125 @@ pub fn downcast_remote_layer( } } -/// Holds metadata about a layer without any content. Used mostly for testing. -/// -/// To use filenames as fixtures, parse them as [`LayerFileName`] then convert from that to a -/// LayerDescriptor. -#[derive(Clone, Debug)] -pub struct LayerDescriptor { - pub key: Range, - pub lsn: Range, - pub is_incremental: bool, - pub short_id: String, -} +pub mod tests { + use super::*; -impl LayerDescriptor { - /// `LayerDescriptor` is only used for testing purpose so it does not matter whether it is image / delta, - /// and the tenant / timeline id does not matter. - pub fn get_persistent_layer_desc(&self) -> PersistentLayerDesc { - PersistentLayerDesc::new_delta( - TenantId::from_array([0; 16]), - TimelineId::from_array([0; 16]), - self.key.clone(), - self.lsn.clone(), - 233, - ) - } -} - -impl Layer for LayerDescriptor { - fn get_key_range(&self) -> Range { - self.key.clone() + /// Holds metadata about a layer without any content. Used mostly for testing. + /// + /// To use filenames as fixtures, parse them as [`LayerFileName`] then convert from that to a + /// LayerDescriptor. + #[derive(Clone, Debug)] + pub struct LayerDescriptor { + base: PersistentLayerDesc, } - fn get_lsn_range(&self) -> Range { - self.lsn.clone() - } - - fn is_incremental(&self) -> bool { - self.is_incremental - } - - fn get_value_reconstruct_data( - &self, - _key: Key, - _lsn_range: Range, - _reconstruct_data: &mut ValueReconstructState, - _ctx: &RequestContext, - ) -> Result { - todo!("This method shouldn't be part of the Layer trait") - } - - fn short_id(&self) -> String { - self.short_id.clone() - } - - fn dump(&self, _verbose: bool, _ctx: &RequestContext) -> Result<()> { - todo!() - } -} - -impl From for LayerDescriptor { - fn from(value: DeltaFileName) -> Self { - let short_id = value.to_string(); - LayerDescriptor { - key: value.key_range, - lsn: value.lsn_range, - is_incremental: true, - short_id, + impl From for LayerDescriptor { + fn from(base: PersistentLayerDesc) -> Self { + Self { base } } } -} -impl From for LayerDescriptor { - fn from(value: ImageFileName) -> Self { - let short_id = value.to_string(); - let lsn = value.lsn_as_range(); - LayerDescriptor { - key: value.key_range, - lsn, - is_incremental: false, - short_id, + impl Layer for LayerDescriptor { + fn get_value_reconstruct_data( + &self, + _key: Key, + _lsn_range: Range, + _reconstruct_data: &mut ValueReconstructState, + _ctx: &RequestContext, + ) -> Result { + todo!("This method shouldn't be part of the Layer trait") + } + + fn dump(&self, _verbose: bool, _ctx: &RequestContext) -> Result<()> { + todo!() + } + + /// Boilerplate to implement the Layer trait, always use layer_desc for persistent layers. + fn get_key_range(&self) -> Range { + self.layer_desc().key_range.clone() + } + + /// Boilerplate to implement the Layer trait, always use layer_desc for persistent layers. + fn get_lsn_range(&self) -> Range { + self.layer_desc().lsn_range.clone() + } + + /// Boilerplate to implement the Layer trait, always use layer_desc for persistent layers. + fn is_incremental(&self) -> bool { + self.layer_desc().is_incremental + } + + /// Boilerplate to implement the Layer trait, always use layer_desc for persistent layers. + fn short_id(&self) -> String { + self.layer_desc().short_id() } } -} -impl From for LayerDescriptor { - fn from(value: LayerFileName) -> Self { - match value { - LayerFileName::Delta(d) => Self::from(d), - LayerFileName::Image(i) => Self::from(i), + impl PersistentLayer for LayerDescriptor { + fn layer_desc(&self) -> &PersistentLayerDesc { + &self.base + } + + fn local_path(&self) -> Option { + unimplemented!() + } + + fn iter(&self, _: &RequestContext) -> Result> { + unimplemented!() + } + + fn key_iter(&self, _: &RequestContext) -> Result> { + unimplemented!() + } + + fn delete_resident_layer_file(&self) -> Result<()> { + unimplemented!() + } + + fn info(&self, _: LayerAccessStatsReset) -> HistoricLayerInfo { + unimplemented!() + } + + fn access_stats(&self) -> &LayerAccessStats { + unimplemented!() + } + } + + impl From for LayerDescriptor { + fn from(value: DeltaFileName) -> Self { + LayerDescriptor { + base: PersistentLayerDesc::new_delta( + TenantId::from_array([0; 16]), + TimelineId::from_array([0; 16]), + value.key_range, + value.lsn_range, + 233, + ), + } + } + } + + impl From for LayerDescriptor { + fn from(value: ImageFileName) -> Self { + LayerDescriptor { + base: PersistentLayerDesc::new_img( + TenantId::from_array([0; 16]), + TimelineId::from_array([0; 16]), + value.key_range, + value.lsn, + false, + 233, + ), + } + } + } + + impl From for LayerDescriptor { + fn from(value: LayerFileName) -> Self { + match value { + LayerFileName::Delta(d) => Self::from(d), + LayerFileName::Image(i) => Self::from(i), + } } } } diff --git a/pageserver/src/tenant/storage_layer/remote_layer.rs b/pageserver/src/tenant/storage_layer/remote_layer.rs index 387bae5b1f..9d423ed815 100644 --- a/pageserver/src/tenant/storage_layer/remote_layer.rs +++ b/pageserver/src/tenant/storage_layer/remote_layer.rs @@ -218,15 +218,12 @@ impl RemoteLayer { } /// Create a Layer struct representing this layer, after it has been downloaded. - pub fn create_downloaded_layer( + pub fn create_downloaded_layer( &self, - layer_map_lock_held_witness: &BatchedUpdates<'_, L>, + layer_map_lock_held_witness: &BatchedUpdates<'_>, conf: &'static PageServerConf, file_size: u64, - ) -> Arc - where - L: ?Sized + Layer, - { + ) -> Arc { if self.desc.is_delta { let fname = self.desc.delta_file_name(); Arc::new(DeltaLayer::new( diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 060000a01a..39c72a7e47 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -3,7 +3,7 @@ mod eviction_task; mod walreceiver; -use anyhow::{anyhow, bail, ensure, Context}; +use anyhow::{anyhow, bail, ensure, Context, Result}; use bytes::Bytes; use fail::fail_point; use futures::StreamExt; @@ -85,7 +85,9 @@ use super::config::TenantConf; use super::layer_map::BatchedUpdates; use super::remote_timeline_client::index::IndexPart; use super::remote_timeline_client::RemoteTimelineClient; -use super::storage_layer::{DeltaLayer, ImageLayer, Layer, LayerAccessStatsReset}; +use super::storage_layer::{ + DeltaLayer, ImageLayer, Layer, LayerAccessStatsReset, PersistentLayerDesc, PersistentLayerKey, +}; #[derive(Debug, PartialEq, Eq, Clone, Copy)] pub(super) enum FlushLoopState { @@ -118,6 +120,92 @@ impl PartialOrd for Hole { } } +pub struct LayerFileManager(HashMap>); + +impl LayerFileManager { + fn get_from_desc(&self, desc: &PersistentLayerDesc) -> Arc { + // The assumption for the `expect()` is that all code maintains the following invariant: + // A layer's descriptor is present in the LayerMap => the LayerFileManager contains a layer for the descriptor. + self.0 + .get(&desc.key()) + .with_context(|| format!("get layer from desc: {}", desc.filename().file_name())) + .expect("not found") + .clone() + } + + pub(crate) fn insert(&mut self, layer: Arc) { + let present = self.0.insert(layer.layer_desc().key(), layer.clone()); + if present.is_some() && cfg!(debug_assertions) { + panic!("overwriting a layer: {:?}", layer.layer_desc()) + } + } + + pub(crate) fn new() -> Self { + Self(HashMap::new()) + } + + pub(crate) fn remove(&mut self, layer: Arc) { + let present = self.0.remove(&layer.layer_desc().key()); + if present.is_none() && cfg!(debug_assertions) { + panic!( + "removing layer that is not present in layer mapping: {:?}", + layer.layer_desc() + ) + } + } + + pub(crate) fn replace_and_verify( + &mut self, + expected: Arc, + new: Arc, + ) -> Result<()> { + let key = expected.layer_desc().key(); + let other = new.layer_desc().key(); + + let expected_l0 = LayerMap::is_l0(expected.layer_desc()); + let new_l0 = LayerMap::is_l0(new.layer_desc()); + + fail::fail_point!("layermap-replace-notfound", |_| anyhow::bail!( + "layermap-replace-notfound" + )); + + anyhow::ensure!( + key == other, + "expected and new layer have different keys: {key:?} != {other:?}" + ); + + anyhow::ensure!( + expected_l0 == new_l0, + "one layer is l0 while the other is not: {expected_l0} != {new_l0}" + ); + + if let Some(layer) = self.0.get_mut(&expected.layer_desc().key()) { + anyhow::ensure!( + compare_arced_layers(&expected, layer), + "another layer was found instead of expected, expected={expected:?}, new={new:?}", + expected = Arc::as_ptr(&expected), + new = Arc::as_ptr(layer), + ); + *layer = new; + Ok(()) + } else { + anyhow::bail!("layer was not found"); + } + } +} + +/// Temporary function for immutable storage state refactor, ensures we are dropping mutex guard instead of other things. +/// Can be removed after all refactors are done. +fn drop_rlock(rlock: tokio::sync::OwnedRwLockReadGuard) { + drop(rlock) +} + +/// Temporary function for immutable storage state refactor, ensures we are dropping mutex guard instead of other things. +/// Can be removed after all refactors are done. +fn drop_wlock(rlock: tokio::sync::RwLockWriteGuard<'_, T>) { + drop(rlock) +} + pub struct Timeline { conf: &'static PageServerConf, tenant_conf: Arc>, @@ -129,7 +217,24 @@ pub struct Timeline { pub pg_version: u32, - pub(crate) layers: Arc>>, + /// The tuple has two elements. + /// 1. `LayerFileManager` keeps track of the various physical representations of the layer files (inmem, local, remote). + /// 2. `LayerMap`, the acceleration data structure for `get_reconstruct_data`. + /// + /// `LayerMap` maps out the `(PAGE,LSN) / (KEY,LSN)` space, which is composed of `(KeyRange, LsnRange)` rectangles. + /// We describe these rectangles through the `PersistentLayerDesc` struct. + /// + /// When we want to reconstruct a page, we first find the `PersistentLayerDesc`'s that we need for page reconstruction, + /// using `LayerMap`. Then, we use `LayerFileManager` to get the `PersistentLayer`'s that correspond to the + /// `PersistentLayerDesc`'s. + /// + /// Hence, it's important to keep things coherent. The `LayerFileManager` must always have an entry for all + /// `PersistentLayerDesc`'s in the `LayerMap`. If it doesn't, `LayerFileManager::get_from_desc` will panic at + /// runtime, e.g., during page reconstruction. + /// + /// In the future, we'll be able to split up the tuple of LayerMap and `LayerFileManager`, + /// so that e.g. on-demand-download/eviction, and layer spreading, can operate just on `LayerFileManager`. + pub(crate) layers: Arc>, /// Set of key ranges which should be covered by image layers to /// allow GC to remove old layers. This set is created by GC and its cutoff LSN is also stored. @@ -599,7 +704,8 @@ impl Timeline { /// This method makes no distinction between local and remote layers. /// Hence, the result **does not represent local filesystem usage**. pub async fn layer_size_sum(&self) -> u64 { - let layer_map = self.layers.read().await; + let guard = self.layers.read().await; + let (layer_map, _) = &*guard; let mut size = 0; for l in layer_map.iter_historic_layers() { size += l.file_size(); @@ -909,7 +1015,8 @@ impl Timeline { pub async fn check_checkpoint_distance(self: &Arc) -> anyhow::Result<()> { let last_lsn = self.get_last_record_lsn(); let open_layer_size = { - let layers = self.layers.read().await; + let guard = self.layers.read().await; + let (layers, _) = &*guard; let Some(open_layer) = layers.open_layer.as_ref() else { return Ok(()); }; @@ -1040,7 +1147,8 @@ impl Timeline { } pub async fn layer_map_info(&self, reset: LayerAccessStatsReset) -> LayerMapInfo { - let layer_map = self.layers.read().await; + let guard = self.layers.read().await; + let (layer_map, mapping) = &*guard; let mut in_memory_layers = Vec::with_capacity(layer_map.frozen_layers.len() + 1); if let Some(open_layer) = &layer_map.open_layer { in_memory_layers.push(open_layer.info()); @@ -1051,6 +1159,7 @@ impl Timeline { let mut historic_layers = Vec::new(); for historic_layer in layer_map.iter_historic_layers() { + let historic_layer = mapping.get_from_desc(&historic_layer); historic_layers.push(historic_layer.info(reset)); } @@ -1160,7 +1269,8 @@ impl Timeline { } // start the batch update - let mut layer_map = self.layers.write().await; + let mut guard = self.layers.write().await; + let (layer_map, mapping) = &mut *guard; let mut batch_updates = layer_map.batch_update(); let mut results = Vec::with_capacity(layers_to_evict.len()); @@ -1169,14 +1279,19 @@ impl Timeline { let res = if cancel.is_cancelled() { None } else { - Some(self.evict_layer_batch_impl(&layer_removal_guard, l, &mut batch_updates)) + Some(self.evict_layer_batch_impl( + &layer_removal_guard, + l, + &mut batch_updates, + mapping, + )) }; results.push(res); } // commit the updates & release locks batch_updates.flush(); - drop(layer_map); + drop_wlock(guard); drop(layer_removal_guard); assert_eq!(results.len(), layers_to_evict.len()); @@ -1187,10 +1302,9 @@ impl Timeline { &self, _layer_removal_cs: &tokio::sync::MutexGuard<'_, ()>, local_layer: &Arc, - batch_updates: &mut BatchedUpdates<'_, dyn PersistentLayer>, + batch_updates: &mut BatchedUpdates<'_>, + mapping: &mut LayerFileManager, ) -> anyhow::Result { - use super::layer_map::Replacement; - if local_layer.is_remote_layer() { // TODO(issue #3851): consider returning an err here instead of false, // which is the same out the match later @@ -1238,13 +1352,10 @@ impl Timeline { ), }); - let replaced = match batch_updates.replace_historic( - local_layer.layer_desc().clone(), - local_layer, - new_remote_layer.layer_desc().clone(), - new_remote_layer, - )? { - Replacement::Replaced { .. } => { + assert_eq!(local_layer.layer_desc(), new_remote_layer.layer_desc()); + + let succeed = match mapping.replace_and_verify(local_layer.clone(), new_remote_layer) { + Ok(()) => { if let Err(e) = local_layer.delete_resident_layer_file() { error!("failed to remove layer file on evict after replacement: {e:#?}"); } @@ -1277,27 +1388,23 @@ impl Timeline { true } - Replacement::NotFound => { - debug!(evicted=?local_layer, "layer was no longer in layer map"); - false - } - Replacement::RemovalBuffered => { - unreachable!("not doing anything else in this batch") - } - Replacement::Unexpected(other) => { - error!( - local_layer.ptr=?Arc::as_ptr(local_layer), - other.ptr=?Arc::as_ptr(&other), - ?other, - "failed to replace"); + Err(err) => { + if cfg!(debug_assertions) { + panic!("failed to replace: {err}, evicted: {local_layer:?}"); + } else { + error!(evicted=?local_layer, "failed to replace: {err}"); + } false } }; - Ok(replaced) + Ok(succeed) } } +/// Number of times we will compute partition within a checkpoint distance. +const REPARTITION_FREQ_IN_CHECKPOINT_DISTANCE: u64 = 10; + // Private functions impl Timeline { fn get_checkpoint_distance(&self) -> u64 { @@ -1418,7 +1525,10 @@ impl Timeline { timeline_id, tenant_id, pg_version, - layers: Arc::new(tokio::sync::RwLock::new(LayerMap::default())), + layers: Arc::new(tokio::sync::RwLock::new(( + LayerMap::default(), + LayerFileManager::new(), + ))), wanted_image_layers: Mutex::new(None), walredo_mgr, @@ -1492,7 +1602,8 @@ impl Timeline { initial_logical_size_can_start, initial_logical_size_attempt: Mutex::new(initial_logical_size_attempt), }; - result.repartition_threshold = result.get_checkpoint_distance() / 10; + result.repartition_threshold = + result.get_checkpoint_distance() / REPARTITION_FREQ_IN_CHECKPOINT_DISTANCE; result .metrics .last_record_gauge @@ -1602,14 +1713,15 @@ impl Timeline { let mut layers = self.layers.try_write().expect( "in the context where we call this function, no other task has access to the object", ); - layers.next_open_layer_at = Some(Lsn(start_lsn.0)); + layers.0.next_open_layer_at = Some(Lsn(start_lsn.0)); } /// /// Scan the timeline directory to populate the layer map. /// pub(super) async fn load_layer_map(&self, disk_consistent_lsn: Lsn) -> anyhow::Result<()> { - let mut layers = self.layers.write().await; + let mut guard = self.layers.write().await; + let (layers, mapping) = &mut *guard; let mut updates = layers.batch_update(); let mut num_layers = 0; @@ -1652,7 +1764,7 @@ impl Timeline { trace!("found layer {}", layer.path().display()); total_physical_size += file_size; - updates.insert_historic(layer.layer_desc().clone(), Arc::new(layer)); + self.insert_historic_layer(Arc::new(layer), &mut updates, mapping); num_layers += 1; } else if let Some(deltafilename) = DeltaFileName::parse_str(&fname) { // Create a DeltaLayer struct for each delta file. @@ -1684,7 +1796,7 @@ impl Timeline { trace!("found layer {}", layer.path().display()); total_physical_size += file_size; - updates.insert_historic(layer.layer_desc().clone(), Arc::new(layer)); + self.insert_historic_layer(Arc::new(layer), &mut updates, mapping); num_layers += 1; } else if fname == METADATA_FILE_NAME || fname.ends_with(".old") { // ignore these @@ -1738,7 +1850,8 @@ impl Timeline { // We're holding a layer map lock for a while but this // method is only called during init so it's fine. - let mut layer_map = self.layers.write().await; + let mut guard = self.layers.write().await; + let (layer_map, mapping) = &mut *guard; let mut updates = layer_map.batch_update(); for remote_layer_name in &index_part.timeline_layers { let local_layer = local_only_layers.remove(remote_layer_name); @@ -1783,7 +1896,7 @@ impl Timeline { anyhow::bail!("could not rename file {local_layer_path:?}: {err:?}"); } else { self.metrics.resident_physical_size_gauge.sub(local_size); - updates.remove_historic(local_layer.layer_desc().clone(), local_layer); + self.remove_historic_layer(local_layer, &mut updates, mapping); // fall-through to adding the remote layer } } else { @@ -1822,7 +1935,7 @@ impl Timeline { ); let remote_layer = Arc::new(remote_layer); - updates.insert_historic(remote_layer.layer_desc().clone(), remote_layer); + self.insert_historic_layer(remote_layer, &mut updates, mapping); } LayerFileName::Delta(deltafilename) => { // Create a RemoteLayer for the delta file. @@ -1849,7 +1962,7 @@ impl Timeline { ), ); let remote_layer = Arc::new(remote_layer); - updates.insert_historic(remote_layer.layer_desc().clone(), remote_layer); + self.insert_historic_layer(remote_layer, &mut updates, mapping); } } } @@ -1888,13 +2001,14 @@ impl Timeline { let disk_consistent_lsn = up_to_date_metadata.disk_consistent_lsn(); - let local_layers = self - .layers - .read() - .await - .iter_historic_layers() - .map(|l| (l.filename(), l)) - .collect::>(); + let local_layers = { + let guard = self.layers.read().await; + let (layers, mapping) = &*guard; + layers + .iter_historic_layers() + .map(|l| (l.filename(), mapping.get_from_desc(&l))) + .collect::>() + }; // If no writes happen, new branches do not have any layers, only the metadata file. let has_local_layers = !local_layers.is_empty(); @@ -2265,25 +2379,53 @@ impl Timeline { } async fn find_layer(&self, layer_file_name: &str) -> Option> { - for historic_layer in self.layers.read().await.iter_historic_layers() { + let guard = self.layers.read().await; + let (layers, mapping) = &*guard; + for historic_layer in layers.iter_historic_layers() { let historic_layer_name = historic_layer.filename().file_name(); if layer_file_name == historic_layer_name { - return Some(historic_layer); + return Some(mapping.get_from_desc(&historic_layer)); } } None } + /// Helper function to insert a layer from both layer map and layer file manager. Will be removed in the future + /// after we introduce `LayerMapManager`. + fn insert_historic_layer( + &self, + layer: Arc, + updates: &mut BatchedUpdates<'_>, + mapping: &mut LayerFileManager, + ) { + updates.insert_historic(layer.layer_desc().clone()); + mapping.insert(layer); + } + + /// Helper function to remove a layer from both layer map and layer file manager. Will be removed in the future + /// after we introduce `LayerMapManager`. + fn remove_historic_layer( + &self, + layer: Arc, + updates: &mut BatchedUpdates<'_>, + mapping: &mut LayerFileManager, + ) { + updates.remove_historic(layer.layer_desc().clone()); + mapping.remove(layer); + } + /// Removes the layer from local FS (if present) and from memory. /// Remote storage is not affected by this operation. fn delete_historic_layer( &self, // we cannot remove layers otherwise, since gc and compaction will race _layer_removal_cs: Arc>, - layer: Arc, - updates: &mut BatchedUpdates<'_, dyn PersistentLayer>, + layer: Arc, + updates: &mut BatchedUpdates<'_>, + mapping: &mut LayerFileManager, ) -> anyhow::Result<()> { + let layer = mapping.get_from_desc(&layer); if !layer.is_remote_layer() { layer.delete_resident_layer_file()?; let layer_file_size = layer.file_size(); @@ -2297,7 +2439,8 @@ impl Timeline { // won't be needed for page reconstruction for this timeline, // and mark what we can't delete yet as deleted from the layer // map index without actually rebuilding the index. - updates.remove_historic(layer.layer_desc().clone(), layer); + updates.remove_historic(layer.layer_desc().clone()); + mapping.remove(layer); Ok(()) } @@ -2482,7 +2625,8 @@ impl Timeline { #[allow(clippy::never_loop)] // see comment at bottom of this loop 'layer_map_search: loop { let remote_layer = { - let layers = timeline.layers.read().await; + let guard = timeline.layers.read().await; + let (layers, mapping) = &*guard; // Check the open and frozen in-memory layers first, in order from newest // to oldest. @@ -2544,6 +2688,7 @@ impl Timeline { } if let Some(SearchResult { lsn_floor, layer }) = layers.search(key, cont_lsn) { + let layer = mapping.get_from_desc(&layer); // If it's a remote layer, download it and retry. if let Some(remote_layer) = super::storage_layer::downcast_remote_layer(&layer) @@ -2665,7 +2810,8 @@ impl Timeline { /// Get a handle to the latest layer for appending. /// async fn get_layer_for_write(&self, lsn: Lsn) -> anyhow::Result> { - let mut layers = self.layers.write().await; + let mut guard = self.layers.write().await; + let (layers, _) = &mut *guard; ensure!(lsn.is_aligned()); @@ -2742,7 +2888,8 @@ impl Timeline { } else { Some(self.write_lock.lock().await) }; - let mut layers = self.layers.write().await; + let mut guard = self.layers.write().await; + let (layers, _) = &mut *guard; if let Some(open_layer) = &layers.open_layer { let open_layer_rc = Arc::clone(open_layer); // Does this layer need freezing? @@ -2756,7 +2903,7 @@ impl Timeline { layers.next_open_layer_at = Some(end_lsn); self.last_freeze_at.store(end_lsn); } - drop(layers); + drop_wlock(guard); } /// Layer flusher task's main loop. @@ -2780,7 +2927,8 @@ impl Timeline { let flush_counter = *layer_flush_start_rx.borrow(); let result = loop { let layer_to_flush = { - let layers = self.layers.read().await; + let guard = self.layers.read().await; + let (layers, _) = &*guard; layers.frozen_layers.front().cloned() // drop 'layers' lock to allow concurrent reads and writes }; @@ -2903,15 +3051,17 @@ impl Timeline { fail_point!("flush-frozen-before-sync"); // The new on-disk layers are now in the layer map. We can remove the - // in-memory layer from the map now. + // in-memory layer from the map now. We do not modify `LayerFileManager` because + // it only contains persistent layers. The flushed layer is stored in + // the mapping in `create_delta_layer`. { let mut layers = self.layers.write().await; - let l = layers.frozen_layers.pop_front(); + let l = layers.0.frozen_layers.pop_front(); // Only one thread may call this function at a time (for this // timeline). If two threads tried to flush the same frozen // layer to disk at the same time, that would not work. - assert!(LayerMap::compare_arced_layers(&l.unwrap(), &frozen_layer)); + assert!(compare_arced_layers(&l.unwrap(), &frozen_layer)); // release lock on 'layers' } @@ -3044,14 +3194,15 @@ impl Timeline { // Add it to the layer map let l = Arc::new(new_delta); - let mut layers = self.layers.write().await; + let mut guard = self.layers.write().await; + let (layers, mapping) = &mut *guard; let mut batch_updates = layers.batch_update(); l.access_stats().record_residence_event( &batch_updates, LayerResidenceStatus::Resident, LayerResidenceEventReason::LayerCreate, ); - batch_updates.insert_historic(l.layer_desc().clone(), l); + self.insert_historic_layer(l, &mut batch_updates, mapping); batch_updates.flush(); // update metrics @@ -3100,7 +3251,8 @@ impl Timeline { ) -> anyhow::Result { let threshold = self.get_image_creation_threshold(); - let layers = self.layers.read().await; + let guard = self.layers.read().await; + let (layers, _) = &*guard; let mut max_deltas = 0; { @@ -3278,7 +3430,8 @@ impl Timeline { let mut layer_paths_to_upload = HashMap::with_capacity(image_layers.len()); - let mut layers = self.layers.write().await; + let mut guard = self.layers.write().await; + let (layers, mapping) = &mut *guard; let mut updates = layers.batch_update(); let timeline_path = self.conf.timeline_path(&self.timeline_id, &self.tenant_id); @@ -3300,10 +3453,10 @@ impl Timeline { LayerResidenceStatus::Resident, LayerResidenceEventReason::LayerCreate, ); - updates.insert_historic(l.layer_desc().clone(), l); + self.insert_historic_layer(l, &mut updates, mapping); } updates.flush(); - drop(layers); + drop_wlock(guard); timer.stop_and_record(); Ok(layer_paths_to_upload) @@ -3313,7 +3466,7 @@ impl Timeline { #[derive(Default)] struct CompactLevel0Phase1Result { new_layers: Vec, - deltas_to_compact: Vec>, + deltas_to_compact: Vec>, } /// Top-level failure to compact. @@ -3464,14 +3617,19 @@ impl Timeline { fn compact_level0_phase1( self: Arc, _layer_removal_cs: Arc>, - layers: tokio::sync::OwnedRwLockReadGuard>, + guard: tokio::sync::OwnedRwLockReadGuard<(LayerMap, LayerFileManager)>, mut stats: CompactLevel0Phase1StatsBuilder, target_file_size: u64, ctx: &RequestContext, ) -> Result { stats.read_lock_held_spawn_blocking_startup_micros = stats.read_lock_acquisition_micros.till_now(); // set by caller - let mut level0_deltas = layers.get_level0_deltas()?; + let (layers, mapping) = &*guard; + let level0_deltas = layers.get_level0_deltas()?; + let mut level0_deltas = level0_deltas + .into_iter() + .map(|x| mapping.get_from_desc(&x)) + .collect_vec(); stats.level0_deltas_count = Some(level0_deltas.len()); // Only compact if enough layers have accumulated. let threshold = self.get_compaction_threshold(); @@ -3591,7 +3749,7 @@ impl Timeline { } stats.read_lock_held_compute_holes_micros = stats.read_lock_held_prerequisites_micros.till_now(); - drop(layers); + drop_rlock(guard); stats.read_lock_drop_micros = stats.read_lock_held_compute_holes_micros.till_now(); let mut holes = heap.into_vec(); holes.sort_unstable_by_key(|hole| hole.key_range.start); @@ -3818,7 +3976,10 @@ impl Timeline { Ok(CompactLevel0Phase1Result { new_layers, - deltas_to_compact, + deltas_to_compact: deltas_to_compact + .into_iter() + .map(|x| Arc::new(x.layer_desc().clone())) + .collect(), }) } @@ -3882,7 +4043,8 @@ impl Timeline { .context("wait for layer upload ops to complete")?; } - let mut layers = self.layers.write().await; + let mut guard = self.layers.write().await; + let (layers, mapping) = &mut *guard; let mut updates = layers.batch_update(); let mut new_layer_paths = HashMap::with_capacity(new_layers.len()); for l in new_layers { @@ -3914,7 +4076,7 @@ impl Timeline { LayerResidenceStatus::Resident, LayerResidenceEventReason::LayerCreate, ); - updates.insert_historic(x.layer_desc().clone(), x); + self.insert_historic_layer(x, &mut updates, mapping); } // Now that we have reshuffled the data to set of new delta layers, we can @@ -3922,10 +4084,13 @@ impl Timeline { let mut layer_names_to_delete = Vec::with_capacity(deltas_to_compact.len()); for l in deltas_to_compact { layer_names_to_delete.push(l.filename()); - self.delete_historic_layer(layer_removal_cs.clone(), l, &mut updates)?; + // NB: the layer file identified by descriptor `l` is guaranteed to be present + // in the LayerFileManager because we kept holding `layer_removal_cs` the entire + // time, even though we dropped `Timeline::layers` inbetween. + self.delete_historic_layer(layer_removal_cs.clone(), l, &mut updates, mapping)?; } updates.flush(); - drop(layers); + drop_wlock(guard); // Also schedule the deletions in remote storage if let Some(remote_client) = &self.remote_client { @@ -4142,7 +4307,8 @@ impl Timeline { // 4. newer on-disk image layers cover the layer's whole key range // // TODO holding a write lock is too agressive and avoidable - let mut layers = self.layers.write().await; + let mut guard = self.layers.write().await; + let (layers, mapping) = &mut *guard; 'outer: for l in layers.iter_historic_layers() { result.layers_total += 1; @@ -4221,7 +4387,7 @@ impl Timeline { // delta layers. Image layers can form "stairs" preventing old image from been deleted. // But image layers are in any case less sparse than delta layers. Also we need some // protection from replacing recent image layers with new one after each GC iteration. - if self.get_gc_feedback() && l.is_incremental() && !LayerMap::is_l0(&*l) { + if self.get_gc_feedback() && l.is_incremental() && !LayerMap::is_l0(&l) { wanted_image_layers.add_range(l.get_key_range()); } result.layers_not_updated += 1; @@ -4258,6 +4424,7 @@ impl Timeline { layer_removal_cs.clone(), doomed_layer, &mut updates, + mapping, )?; // FIXME: schedule succeeded deletions before returning? result.layers_removed += 1; } @@ -4442,42 +4609,15 @@ impl Timeline { // Download complete. Replace the RemoteLayer with the corresponding // Delta- or ImageLayer in the layer map. - let mut layers = self_clone.layers.write().await; - let mut updates = layers.batch_update(); - let new_layer = remote_layer.create_downloaded_layer(&updates, self_clone.conf, *size); + let mut guard = self_clone.layers.write().await; + let (layers, mapping) = &mut *guard; + let updates = layers.batch_update(); + let new_layer = + remote_layer.create_downloaded_layer(&updates, self_clone.conf, *size); { - use crate::tenant::layer_map::Replacement; let l: Arc = remote_layer.clone(); - let failure = match updates.replace_historic(l.layer_desc().clone(), &l, new_layer.layer_desc().clone(), new_layer) { - Ok(Replacement::Replaced { .. }) => false, - Ok(Replacement::NotFound) => { - // TODO: the downloaded file should probably be removed, otherwise - // it will be added to the layermap on next load? we should - // probably restart any get_reconstruct_data search as well. - // - // See: https://github.com/neondatabase/neon/issues/3533 - error!("replacing downloaded layer into layermap failed because layer was not found"); - true - } - Ok(Replacement::RemovalBuffered) => { - unreachable!("current implementation does not remove anything") - } - Ok(Replacement::Unexpected(other)) => { - // if the other layer would have the same pointer value as - // expected, it means they differ only on vtables. - // - // otherwise there's no known reason for this to happen as - // compacted layers should have different covering rectangle - // leading to produce Replacement::NotFound. - - error!( - expected.ptr = ?Arc::as_ptr(&l), - other.ptr = ?Arc::as_ptr(&other), - ?other, - "replacing downloaded layer into layermap failed because another layer was found instead of expected" - ); - true - } + let failure = match mapping.replace_and_verify(l, new_layer) { + Ok(()) => false, Err(e) => { // this is a precondition failure, the layer filename derived // attributes didn't match up, which doesn't seem likely. @@ -4505,7 +4645,7 @@ impl Timeline { } } updates.flush(); - drop(layers); + drop_wlock(guard); info!("on-demand download successful"); @@ -4516,7 +4656,10 @@ impl Timeline { remote_layer.ongoing_download.close(); } else { // Keep semaphore open. We'll drop the permit at the end of the function. - error!("layer file download failed: {:?}", result.as_ref().unwrap_err()); + error!( + "layer file download failed: {:?}", + result.as_ref().unwrap_err() + ); } // Don't treat it as an error if the task that triggered the download @@ -4530,7 +4673,8 @@ impl Timeline { drop(permit); Ok(()) - }.in_current_span(), + } + .in_current_span(), ); receiver.await.context("download task cancelled")? @@ -4600,9 +4744,11 @@ impl Timeline { ) { let mut downloads = Vec::new(); { - let layers = self.layers.read().await; + let guard = self.layers.read().await; + let (layers, mapping) = &*guard; layers .iter_historic_layers() + .map(|l| mapping.get_from_desc(&l)) .filter_map(|l| l.downcast_remote_layer()) .map(|l| self.download_remote_layer(l)) .for_each(|dl| downloads.push(dl)) @@ -4703,7 +4849,8 @@ impl LocalLayerInfoForDiskUsageEviction { impl Timeline { pub(crate) async fn get_local_layers_for_disk_usage_eviction(&self) -> DiskUsageEvictionInfo { - let layers = self.layers.read().await; + let guard = self.layers.read().await; + let (layers, mapping) = &*guard; let mut max_layer_size: Option = None; let mut resident_layers = Vec::new(); @@ -4712,6 +4859,8 @@ impl Timeline { let file_size = l.file_size(); max_layer_size = max_layer_size.map_or(Some(file_size), |m| Some(m.max(file_size))); + let l = mapping.get_from_desc(&l); + if l.is_remote_layer() { continue; } @@ -4870,3 +5019,31 @@ pub(crate) fn debug_assert_current_span_has_tenant_and_timeline_id() { ), } } + +/// Similar to `Arc::ptr_eq`, but only compares the object pointers, not vtables. +/// +/// Returns `true` if the two `Arc` point to the same layer, false otherwise. +/// +/// If comparing persistent layers, ALWAYS compare the layer descriptor key. +#[inline(always)] +pub fn compare_arced_layers(left: &Arc, right: &Arc) -> bool { + // "dyn Trait" objects are "fat pointers" in that they have two components: + // - pointer to the object + // - pointer to the vtable + // + // rust does not provide a guarantee that these vtables are unique, but however + // `Arc::ptr_eq` as of writing (at least up to 1.67) uses a comparison where both the + // pointer and the vtable need to be equal. + // + // See: https://github.com/rust-lang/rust/issues/103763 + // + // A future version of rust will most likely use this form below, where we cast each + // pointer into a pointer to unit, which drops the inaccessible vtable pointer, making it + // not affect the comparison. + // + // See: https://github.com/rust-lang/rust/pull/106450 + let left = Arc::as_ptr(left) as *const (); + let right = Arc::as_ptr(right) as *const (); + + left == right +} diff --git a/pageserver/src/tenant/timeline/eviction_task.rs b/pageserver/src/tenant/timeline/eviction_task.rs index 80c5210211..03cf2d89ad 100644 --- a/pageserver/src/tenant/timeline/eviction_task.rs +++ b/pageserver/src/tenant/timeline/eviction_task.rs @@ -197,9 +197,11 @@ impl Timeline { // We don't want to hold the layer map lock during eviction. // So, we just need to deal with this. let candidates: Vec> = { - let layers = self.layers.read().await; + let guard = self.layers.read().await; + let (layers, mapping) = &*guard; let mut candidates = Vec::new(); for hist_layer in layers.iter_historic_layers() { + let hist_layer = mapping.get_from_desc(&hist_layer); if hist_layer.is_remote_layer() { continue; } diff --git a/pageserver/src/virtual_file.rs b/pageserver/src/virtual_file.rs index fb216123c1..b86d14f158 100644 --- a/pageserver/src/virtual_file.rs +++ b/pageserver/src/virtual_file.rs @@ -302,15 +302,6 @@ impl VirtualFile { .observe_closure_duration(|| self.open_options.open(&self.path))?; // Perform the requested operation on it - // - // TODO: We could downgrade the locks to read mode before calling - // 'func', to allow a little bit more concurrency, but the standard - // library RwLock doesn't allow downgrading without releasing the lock, - // and that doesn't seem worth the trouble. - // - // XXX: `parking_lot::RwLock` can enable such downgrades, yet its implementation is fair and - // may deadlock on subsequent read calls. - // Simply replacing all `RwLock` in project causes deadlocks, so use it sparingly. let result = STORAGE_IO_TIME .with_label_values(&[op, &self.tenant_id, &self.timeline_id]) .observe_closure_duration(|| func(&file)); diff --git a/pgxn/hnsw/hnsw.c b/pgxn/hnsw/hnsw.c index 434f4986f8..45bf78ed3b 100644 --- a/pgxn/hnsw/hnsw.c +++ b/pgxn/hnsw/hnsw.c @@ -122,6 +122,43 @@ hnsw_populate(HierarchicalNSW* hnsw, Relation indexRel, Relation heapRel) true, true, hnsw_build_callback, (void *) hnsw, NULL); } +#ifdef __APPLE__ + +#include +#include + +static void +hnsw_check_available_memory(Size requested) +{ + size_t total; + if (sysctlbyname("hw.memsize", NULL, &total, NULL, 0) < 0) + elog(ERROR, "Failed to get amount of RAM: %m"); + + if ((Size)NBuffers*BLCKSZ + requested >= total) + elog(ERROR, "HNSW index requeries %ld bytes while only %ld are available", + requested, total - (Size)NBuffers*BLCKSZ); +} + +#else + +#include + +static void +hnsw_check_available_memory(Size requested) +{ + struct sysinfo si; + Size total; + if (sysinfo(&si) < 0) + elog(ERROR, "Failed to get amount of RAM: %n"); + + total = si.totalram*si.mem_unit; + if ((Size)NBuffers*BLCKSZ + requested >= total) + elog(ERROR, "HNSW index requeries %ld bytes while only %ld are available", + requested, total - (Size)NBuffers*BLCKSZ); +} + +#endif + static HierarchicalNSW* hnsw_get_index(Relation indexRel, Relation heapRel) { @@ -156,6 +193,8 @@ hnsw_get_index(Relation indexRel, Relation heapRel) size_data_per_element = size_links_level0 + data_size + sizeof(label_t); shmem_size = hnsw_sizeof() + maxelements * size_data_per_element; + hnsw_check_available_memory(shmem_size); + /* first try to attach to existed index */ if (!dsm_impl_op(DSM_OP_ATTACH, handle, 0, &impl_private, &mapped_address, &mapped_size, DEBUG1)) diff --git a/pgxn/hnsw/hnsw.control b/pgxn/hnsw/hnsw.control index b292b96026..8b75c350a8 100644 --- a/pgxn/hnsw/hnsw.control +++ b/pgxn/hnsw/hnsw.control @@ -1,4 +1,4 @@ -comment = 'hNsw index' +comment = 'hnsw index' default_version = '0.1.0' module_pathname = '$libdir/hnsw' relocatable = true diff --git a/pgxn/neon/control_plane_connector.c b/pgxn/neon/control_plane_connector.c index 82e4af4b4a..debbbce117 100644 --- a/pgxn/neon/control_plane_connector.c +++ b/pgxn/neon/control_plane_connector.c @@ -32,6 +32,7 @@ #include "port.h" #include #include "utils/jsonb.h" +#include "libpq/crypt.h" static ProcessUtility_hook_type PreviousProcessUtilityHook = NULL; @@ -161,7 +162,22 @@ ConstructDeltaMessage() PushKeyValue(&state, "name", entry->name); if (entry->password) { +#if PG_MAJORVERSION_NUM == 14 + char *logdetail; +#else + const char *logdetail; +#endif PushKeyValue(&state, "password", (char *) entry->password); + char *encrypted_password = get_role_password(entry->name, &logdetail); + + if (encrypted_password) + { + PushKeyValue(&state, "encrypted_password", encrypted_password); + } + else + { + elog(ERROR, "Failed to get encrypted password: %s", logdetail); + } } if (entry->old_name[0] != '\0') { diff --git a/pgxn/neon/file_cache.c b/pgxn/neon/file_cache.c index 1ab2ae668a..02795bc8b8 100644 --- a/pgxn/neon/file_cache.c +++ b/pgxn/neon/file_cache.c @@ -190,7 +190,7 @@ lfc_change_limit_hook(int newval, void *extra) hash_search(lfc_hash, &victim->key, HASH_REMOVE, NULL); lfc_ctl->used -= 1; } - elog(LOG, "set local file cache limit to %d", new_size); + elog(DEBUG1, "set local file cache limit to %d", new_size); LWLockRelease(lfc_lock); } diff --git a/pgxn/neon/pagestore_smgr.c b/pgxn/neon/pagestore_smgr.c index 528d4eb051..79dec0881d 100644 --- a/pgxn/neon/pagestore_smgr.c +++ b/pgxn/neon/pagestore_smgr.c @@ -2675,7 +2675,6 @@ bool neon_redo_read_buffer_filter(XLogReaderState *record, uint8 block_id) { XLogRecPtr end_recptr = record->EndRecPtr; - XLogRecPtr prev_end_recptr = record->ReadRecPtr - 1; RelFileNode rnode; ForkNumber forknum; BlockNumber blkno; @@ -2719,16 +2718,15 @@ neon_redo_read_buffer_filter(XLogReaderState *record, uint8 block_id) no_redo_needed = buffer < 0; - /* we don't have the buffer in memory, update lwLsn past this record */ + /* In both cases st lwlsn past this WAL record */ + SetLastWrittenLSNForBlock(end_recptr, rnode, forknum, blkno); + + /* we don't have the buffer in memory, update lwLsn past this record, + * also evict page fro file cache + */ if (no_redo_needed) - { - SetLastWrittenLSNForBlock(end_recptr, rnode, forknum, blkno); lfc_evict(rnode, forknum, blkno); - } - else - { - SetLastWrittenLSNForBlock(prev_end_recptr, rnode, forknum, blkno); - } + LWLockRelease(partitionLock); @@ -2736,7 +2734,10 @@ neon_redo_read_buffer_filter(XLogReaderState *record, uint8 block_id) if (get_cached_relsize(rnode, forknum, &relsize)) { if (relsize < blkno + 1) + { update_cached_relsize(rnode, forknum, blkno + 1); + SetLastWrittenLSNForRelation(end_recptr, rnode, forknum); + } } else { @@ -2768,6 +2769,7 @@ neon_redo_read_buffer_filter(XLogReaderState *record, uint8 block_id) Assert(nbresponse->n_blocks > blkno); set_cached_relsize(rnode, forknum, nbresponse->n_blocks); + SetLastWrittenLSNForRelation(end_recptr, rnode, forknum); elog(SmgrTrace, "Set length to %d", nbresponse->n_blocks); } diff --git a/poetry.lock b/poetry.lock index 8dc45f68b8..aa76ac6dbd 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1654,71 +1654,74 @@ test = ["enum34", "ipaddress", "mock", "pywin32", "wmi"] [[package]] name = "psycopg2-binary" -version = "2.9.3" +version = "2.9.6" description = "psycopg2 - Python-PostgreSQL Database Adapter" category = "main" optional = false python-versions = ">=3.6" files = [ - {file = "psycopg2-binary-2.9.3.tar.gz", hash = "sha256:761df5313dc15da1502b21453642d7599d26be88bff659382f8f9747c7ebea4e"}, - {file = "psycopg2_binary-2.9.3-cp310-cp310-macosx_10_14_x86_64.macosx_10_9_intel.macosx_10_9_x86_64.macosx_10_10_intel.macosx_10_10_x86_64.whl", hash = "sha256:539b28661b71da7c0e428692438efbcd048ca21ea81af618d845e06ebfd29478"}, - {file = "psycopg2_binary-2.9.3-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:2f2534ab7dc7e776a263b463a16e189eb30e85ec9bbe1bff9e78dae802608932"}, - {file = "psycopg2_binary-2.9.3-cp310-cp310-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:6e82d38390a03da28c7985b394ec3f56873174e2c88130e6966cb1c946508e65"}, - {file = "psycopg2_binary-2.9.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:57804fc02ca3ce0dbfbef35c4b3a4a774da66d66ea20f4bda601294ad2ea6092"}, - {file = "psycopg2_binary-2.9.3-cp310-cp310-manylinux_2_24_aarch64.whl", hash = "sha256:083a55275f09a62b8ca4902dd11f4b33075b743cf0d360419e2051a8a5d5ff76"}, - {file = "psycopg2_binary-2.9.3-cp310-cp310-manylinux_2_24_ppc64le.whl", hash = "sha256:0a29729145aaaf1ad8bafe663131890e2111f13416b60e460dae0a96af5905c9"}, - {file = "psycopg2_binary-2.9.3-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:3a79d622f5206d695d7824cbf609a4f5b88ea6d6dab5f7c147fc6d333a8787e4"}, - {file = "psycopg2_binary-2.9.3-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:090f3348c0ab2cceb6dfbe6bf721ef61262ddf518cd6cc6ecc7d334996d64efa"}, - {file = "psycopg2_binary-2.9.3-cp310-cp310-musllinux_1_1_ppc64le.whl", hash = "sha256:a9e1f75f96ea388fbcef36c70640c4efbe4650658f3d6a2967b4cc70e907352e"}, - {file = "psycopg2_binary-2.9.3-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:c3ae8e75eb7160851e59adc77b3a19a976e50622e44fd4fd47b8b18208189d42"}, - {file = "psycopg2_binary-2.9.3-cp310-cp310-win32.whl", hash = "sha256:7b1e9b80afca7b7a386ef087db614faebbf8839b7f4db5eb107d0f1a53225029"}, - {file = "psycopg2_binary-2.9.3-cp310-cp310-win_amd64.whl", hash = "sha256:8b344adbb9a862de0c635f4f0425b7958bf5a4b927c8594e6e8d261775796d53"}, - {file = "psycopg2_binary-2.9.3-cp36-cp36m-macosx_10_14_x86_64.macosx_10_9_intel.macosx_10_9_x86_64.macosx_10_10_intel.macosx_10_10_x86_64.whl", hash = "sha256:e847774f8ffd5b398a75bc1c18fbb56564cda3d629fe68fd81971fece2d3c67e"}, - {file = "psycopg2_binary-2.9.3-cp36-cp36m-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:68641a34023d306be959101b345732360fc2ea4938982309b786f7be1b43a4a1"}, - {file = "psycopg2_binary-2.9.3-cp36-cp36m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3303f8807f342641851578ee7ed1f3efc9802d00a6f83c101d21c608cb864460"}, - {file = "psycopg2_binary-2.9.3-cp36-cp36m-manylinux_2_24_aarch64.whl", hash = "sha256:e3699852e22aa68c10de06524a3721ade969abf382da95884e6a10ff798f9281"}, - {file = "psycopg2_binary-2.9.3-cp36-cp36m-manylinux_2_24_ppc64le.whl", hash = "sha256:526ea0378246d9b080148f2d6681229f4b5964543c170dd10bf4faaab6e0d27f"}, - {file = "psycopg2_binary-2.9.3-cp36-cp36m-musllinux_1_1_aarch64.whl", hash = "sha256:b1c8068513f5b158cf7e29c43a77eb34b407db29aca749d3eb9293ee0d3103ca"}, - {file = "psycopg2_binary-2.9.3-cp36-cp36m-musllinux_1_1_i686.whl", hash = "sha256:15803fa813ea05bef089fa78835118b5434204f3a17cb9f1e5dbfd0b9deea5af"}, - {file = "psycopg2_binary-2.9.3-cp36-cp36m-musllinux_1_1_ppc64le.whl", hash = "sha256:152f09f57417b831418304c7f30d727dc83a12761627bb826951692cc6491e57"}, - {file = "psycopg2_binary-2.9.3-cp36-cp36m-musllinux_1_1_x86_64.whl", hash = "sha256:404224e5fef3b193f892abdbf8961ce20e0b6642886cfe1fe1923f41aaa75c9d"}, - {file = "psycopg2_binary-2.9.3-cp36-cp36m-win32.whl", hash = "sha256:1f6b813106a3abdf7b03640d36e24669234120c72e91d5cbaeb87c5f7c36c65b"}, - {file = "psycopg2_binary-2.9.3-cp36-cp36m-win_amd64.whl", hash = "sha256:2d872e3c9d5d075a2e104540965a1cf898b52274a5923936e5bfddb58c59c7c2"}, - {file = "psycopg2_binary-2.9.3-cp37-cp37m-macosx_10_14_x86_64.macosx_10_9_intel.macosx_10_9_x86_64.macosx_10_10_intel.macosx_10_10_x86_64.whl", hash = "sha256:10bb90fb4d523a2aa67773d4ff2b833ec00857f5912bafcfd5f5414e45280fb1"}, - {file = "psycopg2_binary-2.9.3-cp37-cp37m-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:874a52ecab70af13e899f7847b3e074eeb16ebac5615665db33bce8a1009cf33"}, - {file = "psycopg2_binary-2.9.3-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a29b3ca4ec9defec6d42bf5feb36bb5817ba3c0230dd83b4edf4bf02684cd0ae"}, - {file = "psycopg2_binary-2.9.3-cp37-cp37m-manylinux_2_24_aarch64.whl", hash = "sha256:12b11322ea00ad8db8c46f18b7dfc47ae215e4df55b46c67a94b4effbaec7094"}, - {file = "psycopg2_binary-2.9.3-cp37-cp37m-manylinux_2_24_ppc64le.whl", hash = "sha256:53293533fcbb94c202b7c800a12c873cfe24599656b341f56e71dd2b557be063"}, - {file = "psycopg2_binary-2.9.3-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:c381bda330ddf2fccbafab789d83ebc6c53db126e4383e73794c74eedce855ef"}, - {file = "psycopg2_binary-2.9.3-cp37-cp37m-musllinux_1_1_i686.whl", hash = "sha256:9d29409b625a143649d03d0fd7b57e4b92e0ecad9726ba682244b73be91d2fdb"}, - {file = "psycopg2_binary-2.9.3-cp37-cp37m-musllinux_1_1_ppc64le.whl", hash = "sha256:183a517a3a63503f70f808b58bfbf962f23d73b6dccddae5aa56152ef2bcb232"}, - {file = "psycopg2_binary-2.9.3-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:15c4e4cfa45f5a60599d9cec5f46cd7b1b29d86a6390ec23e8eebaae84e64554"}, - {file = "psycopg2_binary-2.9.3-cp37-cp37m-win32.whl", hash = "sha256:adf20d9a67e0b6393eac162eb81fb10bc9130a80540f4df7e7355c2dd4af9fba"}, - {file = "psycopg2_binary-2.9.3-cp37-cp37m-win_amd64.whl", hash = "sha256:2f9ffd643bc7349eeb664eba8864d9e01f057880f510e4681ba40a6532f93c71"}, - {file = "psycopg2_binary-2.9.3-cp38-cp38-macosx_10_14_x86_64.macosx_10_9_intel.macosx_10_9_x86_64.macosx_10_10_intel.macosx_10_10_x86_64.whl", hash = "sha256:def68d7c21984b0f8218e8a15d514f714d96904265164f75f8d3a70f9c295667"}, - {file = "psycopg2_binary-2.9.3-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:e6aa71ae45f952a2205377773e76f4e3f27951df38e69a4c95440c779e013560"}, - {file = "psycopg2_binary-2.9.3-cp38-cp38-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:dffc08ca91c9ac09008870c9eb77b00a46b3378719584059c034b8945e26b272"}, - {file = "psycopg2_binary-2.9.3-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:280b0bb5cbfe8039205c7981cceb006156a675362a00fe29b16fbc264e242834"}, - {file = "psycopg2_binary-2.9.3-cp38-cp38-manylinux_2_24_aarch64.whl", hash = "sha256:af9813db73395fb1fc211bac696faea4ca9ef53f32dc0cfa27e4e7cf766dcf24"}, - {file = "psycopg2_binary-2.9.3-cp38-cp38-manylinux_2_24_ppc64le.whl", hash = "sha256:63638d875be8c2784cfc952c9ac34e2b50e43f9f0a0660b65e2a87d656b3116c"}, - {file = "psycopg2_binary-2.9.3-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:ffb7a888a047696e7f8240d649b43fb3644f14f0ee229077e7f6b9f9081635bd"}, - {file = "psycopg2_binary-2.9.3-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:0c9d5450c566c80c396b7402895c4369a410cab5a82707b11aee1e624da7d004"}, - {file = "psycopg2_binary-2.9.3-cp38-cp38-musllinux_1_1_ppc64le.whl", hash = "sha256:d1c1b569ecafe3a69380a94e6ae09a4789bbb23666f3d3a08d06bbd2451f5ef1"}, - {file = "psycopg2_binary-2.9.3-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:8fc53f9af09426a61db9ba357865c77f26076d48669f2e1bb24d85a22fb52307"}, - {file = "psycopg2_binary-2.9.3-cp38-cp38-win32.whl", hash = "sha256:6472a178e291b59e7f16ab49ec8b4f3bdada0a879c68d3817ff0963e722a82ce"}, - {file = "psycopg2_binary-2.9.3-cp38-cp38-win_amd64.whl", hash = "sha256:35168209c9d51b145e459e05c31a9eaeffa9a6b0fd61689b48e07464ffd1a83e"}, - {file = "psycopg2_binary-2.9.3-cp39-cp39-macosx_10_14_x86_64.macosx_10_9_intel.macosx_10_9_x86_64.macosx_10_10_intel.macosx_10_10_x86_64.whl", hash = "sha256:47133f3f872faf28c1e87d4357220e809dfd3fa7c64295a4a148bcd1e6e34ec9"}, - {file = "psycopg2_binary-2.9.3-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:b3a24a1982ae56461cc24f6680604fffa2c1b818e9dc55680da038792e004d18"}, - {file = "psycopg2_binary-2.9.3-cp39-cp39-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:91920527dea30175cc02a1099f331aa8c1ba39bf8b7762b7b56cbf54bc5cce42"}, - {file = "psycopg2_binary-2.9.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:887dd9aac71765ac0d0bac1d0d4b4f2c99d5f5c1382d8b770404f0f3d0ce8a39"}, - {file = "psycopg2_binary-2.9.3-cp39-cp39-manylinux_2_24_aarch64.whl", hash = "sha256:1f14c8b0942714eb3c74e1e71700cbbcb415acbc311c730370e70c578a44a25c"}, - {file = "psycopg2_binary-2.9.3-cp39-cp39-manylinux_2_24_ppc64le.whl", hash = "sha256:7af0dd86ddb2f8af5da57a976d27cd2cd15510518d582b478fbb2292428710b4"}, - {file = "psycopg2_binary-2.9.3-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:93cd1967a18aa0edd4b95b1dfd554cf15af657cb606280996d393dadc88c3c35"}, - {file = "psycopg2_binary-2.9.3-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:bda845b664bb6c91446ca9609fc69f7db6c334ec5e4adc87571c34e4f47b7ddb"}, - {file = "psycopg2_binary-2.9.3-cp39-cp39-musllinux_1_1_ppc64le.whl", hash = "sha256:01310cf4cf26db9aea5158c217caa92d291f0500051a6469ac52166e1a16f5b7"}, - {file = "psycopg2_binary-2.9.3-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:99485cab9ba0fa9b84f1f9e1fef106f44a46ef6afdeec8885e0b88d0772b49e8"}, - {file = "psycopg2_binary-2.9.3-cp39-cp39-win32.whl", hash = "sha256:46f0e0a6b5fa5851bbd9ab1bc805eef362d3a230fbdfbc209f4a236d0a7a990d"}, - {file = "psycopg2_binary-2.9.3-cp39-cp39-win_amd64.whl", hash = "sha256:accfe7e982411da3178ec690baaceaad3c278652998b2c45828aaac66cd8285f"}, + {file = "psycopg2-binary-2.9.6.tar.gz", hash = "sha256:1f64dcfb8f6e0c014c7f55e51c9759f024f70ea572fbdef123f85318c297947c"}, + {file = "psycopg2_binary-2.9.6-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:d26e0342183c762de3276cca7a530d574d4e25121ca7d6e4a98e4f05cb8e4df7"}, + {file = "psycopg2_binary-2.9.6-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:c48d8f2db17f27d41fb0e2ecd703ea41984ee19362cbce52c097963b3a1b4365"}, + {file = "psycopg2_binary-2.9.6-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ffe9dc0a884a8848075e576c1de0290d85a533a9f6e9c4e564f19adf8f6e54a7"}, + {file = "psycopg2_binary-2.9.6-cp310-cp310-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:8a76e027f87753f9bd1ab5f7c9cb8c7628d1077ef927f5e2446477153a602f2c"}, + {file = "psycopg2_binary-2.9.6-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:6460c7a99fc939b849431f1e73e013d54aa54293f30f1109019c56a0b2b2ec2f"}, + {file = "psycopg2_binary-2.9.6-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ae102a98c547ee2288637af07393dd33f440c25e5cd79556b04e3fca13325e5f"}, + {file = "psycopg2_binary-2.9.6-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:9972aad21f965599ed0106f65334230ce826e5ae69fda7cbd688d24fa922415e"}, + {file = "psycopg2_binary-2.9.6-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:7a40c00dbe17c0af5bdd55aafd6ff6679f94a9be9513a4c7e071baf3d7d22a70"}, + {file = "psycopg2_binary-2.9.6-cp310-cp310-musllinux_1_1_ppc64le.whl", hash = "sha256:cacbdc5839bdff804dfebc058fe25684cae322987f7a38b0168bc1b2df703fb1"}, + {file = "psycopg2_binary-2.9.6-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:7f0438fa20fb6c7e202863e0d5ab02c246d35efb1d164e052f2f3bfe2b152bd0"}, + {file = "psycopg2_binary-2.9.6-cp310-cp310-win32.whl", hash = "sha256:b6c8288bb8a84b47e07013bb4850f50538aa913d487579e1921724631d02ea1b"}, + {file = "psycopg2_binary-2.9.6-cp310-cp310-win_amd64.whl", hash = "sha256:61b047a0537bbc3afae10f134dc6393823882eb263088c271331602b672e52e9"}, + {file = "psycopg2_binary-2.9.6-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:964b4dfb7c1c1965ac4c1978b0f755cc4bd698e8aa2b7667c575fb5f04ebe06b"}, + {file = "psycopg2_binary-2.9.6-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:afe64e9b8ea66866a771996f6ff14447e8082ea26e675a295ad3bdbffdd72afb"}, + {file = "psycopg2_binary-2.9.6-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:15e2ee79e7cf29582ef770de7dab3d286431b01c3bb598f8e05e09601b890081"}, + {file = "psycopg2_binary-2.9.6-cp311-cp311-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:dfa74c903a3c1f0d9b1c7e7b53ed2d929a4910e272add6700c38f365a6002820"}, + {file = "psycopg2_binary-2.9.6-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:b83456c2d4979e08ff56180a76429263ea254c3f6552cd14ada95cff1dec9bb8"}, + {file = "psycopg2_binary-2.9.6-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0645376d399bfd64da57148694d78e1f431b1e1ee1054872a5713125681cf1be"}, + {file = "psycopg2_binary-2.9.6-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:e99e34c82309dd78959ba3c1590975b5d3c862d6f279f843d47d26ff89d7d7e1"}, + {file = "psycopg2_binary-2.9.6-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:4ea29fc3ad9d91162c52b578f211ff1c931d8a38e1f58e684c45aa470adf19e2"}, + {file = "psycopg2_binary-2.9.6-cp311-cp311-musllinux_1_1_ppc64le.whl", hash = "sha256:4ac30da8b4f57187dbf449294d23b808f8f53cad6b1fc3623fa8a6c11d176dd0"}, + {file = "psycopg2_binary-2.9.6-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:e78e6e2a00c223e164c417628572a90093c031ed724492c763721c2e0bc2a8df"}, + {file = "psycopg2_binary-2.9.6-cp311-cp311-win32.whl", hash = "sha256:1876843d8e31c89c399e31b97d4b9725a3575bb9c2af92038464231ec40f9edb"}, + {file = "psycopg2_binary-2.9.6-cp311-cp311-win_amd64.whl", hash = "sha256:b4b24f75d16a89cc6b4cdff0eb6a910a966ecd476d1e73f7ce5985ff1328e9a6"}, + {file = "psycopg2_binary-2.9.6-cp36-cp36m-win32.whl", hash = "sha256:498807b927ca2510baea1b05cc91d7da4718a0f53cb766c154c417a39f1820a0"}, + {file = "psycopg2_binary-2.9.6-cp36-cp36m-win_amd64.whl", hash = "sha256:0d236c2825fa656a2d98bbb0e52370a2e852e5a0ec45fc4f402977313329174d"}, + {file = "psycopg2_binary-2.9.6-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:34b9ccdf210cbbb1303c7c4db2905fa0319391bd5904d32689e6dd5c963d2ea8"}, + {file = "psycopg2_binary-2.9.6-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:84d2222e61f313c4848ff05353653bf5f5cf6ce34df540e4274516880d9c3763"}, + {file = "psycopg2_binary-2.9.6-cp37-cp37m-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:30637a20623e2a2eacc420059be11527f4458ef54352d870b8181a4c3020ae6b"}, + {file = "psycopg2_binary-2.9.6-cp37-cp37m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:8122cfc7cae0da9a3077216528b8bb3629c43b25053284cc868744bfe71eb141"}, + {file = "psycopg2_binary-2.9.6-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:38601cbbfe600362c43714482f43b7c110b20cb0f8172422c616b09b85a750c5"}, + {file = "psycopg2_binary-2.9.6-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:c7e62ab8b332147a7593a385d4f368874d5fe4ad4e341770d4983442d89603e3"}, + {file = "psycopg2_binary-2.9.6-cp37-cp37m-musllinux_1_1_i686.whl", hash = "sha256:2ab652e729ff4ad76d400df2624d223d6e265ef81bb8aa17fbd63607878ecbee"}, + {file = "psycopg2_binary-2.9.6-cp37-cp37m-musllinux_1_1_ppc64le.whl", hash = "sha256:c83a74b68270028dc8ee74d38ecfaf9c90eed23c8959fca95bd703d25b82c88e"}, + {file = "psycopg2_binary-2.9.6-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:d4e6036decf4b72d6425d5b29bbd3e8f0ff1059cda7ac7b96d6ac5ed34ffbacd"}, + {file = "psycopg2_binary-2.9.6-cp37-cp37m-win32.whl", hash = "sha256:a8c28fd40a4226b4a84bdf2d2b5b37d2c7bd49486b5adcc200e8c7ec991dfa7e"}, + {file = "psycopg2_binary-2.9.6-cp37-cp37m-win_amd64.whl", hash = "sha256:51537e3d299be0db9137b321dfb6a5022caaab275775680e0c3d281feefaca6b"}, + {file = "psycopg2_binary-2.9.6-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:cf4499e0a83b7b7edcb8dabecbd8501d0d3a5ef66457200f77bde3d210d5debb"}, + {file = "psycopg2_binary-2.9.6-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:7e13a5a2c01151f1208d5207e42f33ba86d561b7a89fca67c700b9486a06d0e2"}, + {file = "psycopg2_binary-2.9.6-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0e0f754d27fddcfd74006455b6e04e6705d6c31a612ec69ddc040a5468e44b4e"}, + {file = "psycopg2_binary-2.9.6-cp38-cp38-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:d57c3fd55d9058645d26ae37d76e61156a27722097229d32a9e73ed54819982a"}, + {file = "psycopg2_binary-2.9.6-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:71f14375d6f73b62800530b581aed3ada394039877818b2d5f7fc77e3bb6894d"}, + {file = "psycopg2_binary-2.9.6-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:441cc2f8869a4f0f4bb408475e5ae0ee1f3b55b33f350406150277f7f35384fc"}, + {file = "psycopg2_binary-2.9.6-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:65bee1e49fa6f9cf327ce0e01c4c10f39165ee76d35c846ade7cb0ec6683e303"}, + {file = "psycopg2_binary-2.9.6-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:af335bac6b666cc6aea16f11d486c3b794029d9df029967f9938a4bed59b6a19"}, + {file = "psycopg2_binary-2.9.6-cp38-cp38-musllinux_1_1_ppc64le.whl", hash = "sha256:cfec476887aa231b8548ece2e06d28edc87c1397ebd83922299af2e051cf2827"}, + {file = "psycopg2_binary-2.9.6-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:65c07febd1936d63bfde78948b76cd4c2a411572a44ac50719ead41947d0f26b"}, + {file = "psycopg2_binary-2.9.6-cp38-cp38-win32.whl", hash = "sha256:4dfb4be774c4436a4526d0c554af0cc2e02082c38303852a36f6456ece7b3503"}, + {file = "psycopg2_binary-2.9.6-cp38-cp38-win_amd64.whl", hash = "sha256:02c6e3cf3439e213e4ee930308dc122d6fb4d4bea9aef4a12535fbd605d1a2fe"}, + {file = "psycopg2_binary-2.9.6-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:e9182eb20f41417ea1dd8e8f7888c4d7c6e805f8a7c98c1081778a3da2bee3e4"}, + {file = "psycopg2_binary-2.9.6-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:8a6979cf527e2603d349a91060f428bcb135aea2be3201dff794813256c274f1"}, + {file = "psycopg2_binary-2.9.6-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8338a271cb71d8da40b023a35d9c1e919eba6cbd8fa20a54b748a332c355d896"}, + {file = "psycopg2_binary-2.9.6-cp39-cp39-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:e3ed340d2b858d6e6fb5083f87c09996506af483227735de6964a6100b4e6a54"}, + {file = "psycopg2_binary-2.9.6-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:f81e65376e52f03422e1fb475c9514185669943798ed019ac50410fb4c4df232"}, + {file = "psycopg2_binary-2.9.6-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:bfb13af3c5dd3a9588000910178de17010ebcccd37b4f9794b00595e3a8ddad3"}, + {file = "psycopg2_binary-2.9.6-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:4c727b597c6444a16e9119386b59388f8a424223302d0c06c676ec8b4bc1f963"}, + {file = "psycopg2_binary-2.9.6-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:4d67fbdaf177da06374473ef6f7ed8cc0a9dc640b01abfe9e8a2ccb1b1402c1f"}, + {file = "psycopg2_binary-2.9.6-cp39-cp39-musllinux_1_1_ppc64le.whl", hash = "sha256:0892ef645c2fabb0c75ec32d79f4252542d0caec1d5d949630e7d242ca4681a3"}, + {file = "psycopg2_binary-2.9.6-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:02c0f3757a4300cf379eb49f543fb7ac527fb00144d39246ee40e1df684ab514"}, + {file = "psycopg2_binary-2.9.6-cp39-cp39-win32.whl", hash = "sha256:c3dba7dab16709a33a847e5cd756767271697041fbe3fe97c215b1fc1f5c9848"}, + {file = "psycopg2_binary-2.9.6-cp39-cp39-win_amd64.whl", hash = "sha256:f6a88f384335bb27812293fdb11ac6aee2ca3f51d3c7820fe03de0a304ab6249"}, ] [[package]] diff --git a/proxy/Cargo.toml b/proxy/Cargo.toml index e7a4fd236e..438dd62315 100644 --- a/proxy/Cargo.toml +++ b/proxy/Cargo.toml @@ -7,7 +7,6 @@ license.workspace = true [dependencies] anyhow.workspace = true async-trait.workspace = true -atty.workspace = true base64.workspace = true bstr.workspace = true bytes = { workspace = true, features = ["serde"] } @@ -30,6 +29,7 @@ metrics.workspace = true once_cell.workspace = true opentelemetry.workspace = true parking_lot.workspace = true +pbkdf2.workspace = true pin-project-lite.workspace = true postgres_backend.workspace = true pq_proto.workspace = true @@ -38,6 +38,7 @@ rand.workspace = true regex.workspace = true reqwest = { workspace = true, features = ["json"] } reqwest-middleware.workspace = true +reqwest-retry.workspace = true reqwest-tracing.workspace = true routerify.workspace = true rustls-pemfile.workspace = true diff --git a/proxy/src/compute.rs b/proxy/src/compute.rs index 480acb88d9..70b29679b9 100644 --- a/proxy/src/compute.rs +++ b/proxy/src/compute.rs @@ -136,18 +136,17 @@ impl Default for ConnCfg { impl ConnCfg { /// Establish a raw TCP connection to the compute node. - async fn connect_raw(&self) -> io::Result<(SocketAddr, TcpStream, &str)> { + async fn connect_raw(&self, timeout: Duration) -> io::Result<(SocketAddr, TcpStream, &str)> { use tokio_postgres::config::Host; // wrap TcpStream::connect with timeout let connect_with_timeout = |host, port| { - let connection_timeout = Duration::from_millis(10000); - tokio::time::timeout(connection_timeout, TcpStream::connect((host, port))).map( + tokio::time::timeout(timeout, TcpStream::connect((host, port))).map( move |res| match res { Ok(tcpstream_connect_res) => tcpstream_connect_res, Err(_) => Err(io::Error::new( io::ErrorKind::TimedOut, - format!("exceeded connection timeout {connection_timeout:?}"), + format!("exceeded connection timeout {timeout:?}"), )), }, ) @@ -223,8 +222,9 @@ impl ConnCfg { async fn do_connect( &self, allow_self_signed_compute: bool, + timeout: Duration, ) -> Result { - let (socket_addr, stream, host) = self.connect_raw().await?; + let (socket_addr, stream, host) = self.connect_raw(timeout).await?; let tls_connector = native_tls::TlsConnector::builder() .danger_accept_invalid_certs(allow_self_signed_compute) @@ -264,8 +264,9 @@ impl ConnCfg { pub async fn connect( &self, allow_self_signed_compute: bool, + timeout: Duration, ) -> Result { - self.do_connect(allow_self_signed_compute) + self.do_connect(allow_self_signed_compute, timeout) .inspect_err(|err| { // Immediately log the error we have at our disposal. error!("couldn't connect to compute node: {err}"); diff --git a/proxy/src/config.rs b/proxy/src/config.rs index 6a26cea78e..00f561fcf2 100644 --- a/proxy/src/config.rs +++ b/proxy/src/config.rs @@ -212,7 +212,7 @@ pub struct CacheOptions { impl CacheOptions { /// Default options for [`crate::auth::caches::NodeInfoCache`]. - pub const DEFAULT_OPTIONS_NODE_INFO: &str = "size=4000,ttl=5m"; + pub const DEFAULT_OPTIONS_NODE_INFO: &str = "size=4000,ttl=4m"; /// Parse cache options passed via cmdline. /// Example: [`Self::DEFAULT_OPTIONS_NODE_INFO`]. diff --git a/proxy/src/console/mgmt.rs b/proxy/src/console/mgmt.rs index 30364be6f4..35d1ff59b7 100644 --- a/proxy/src/console/mgmt.rs +++ b/proxy/src/console/mgmt.rs @@ -8,7 +8,7 @@ use postgres_backend::{self, AuthType, PostgresBackend, PostgresBackendTCP, Quer use pq_proto::{BeMessage, SINGLE_COL_ROWDESC}; use std::future; use tokio::net::{TcpListener, TcpStream}; -use tracing::{error, info, info_span}; +use tracing::{error, info, info_span, Instrument}; static CPLANE_WAITERS: Lazy> = Lazy::new(Default::default); @@ -44,19 +44,30 @@ pub async fn task_main(listener: TcpListener) -> anyhow::Result<()> { .set_nodelay(true) .context("failed to set client socket option")?; - tokio::task::spawn(async move { - let span = info_span!("mgmt", peer = %peer_addr); - let _enter = span.enter(); + let span = info_span!("mgmt", peer = %peer_addr); - info!("started a new console management API thread"); - scopeguard::defer! { - info!("console management API thread is about to finish"); - } + tokio::task::spawn( + async move { + info!("serving a new console management API connection"); - if let Err(e) = handle_connection(socket).await { - error!("thread failed with an error: {e}"); + // these might be long running connections, have a separate logging for cancelling + // on shutdown and other ways of stopping. + let cancelled = scopeguard::guard(tracing::Span::current(), |span| { + let _e = span.entered(); + info!("console management API task cancelled"); + }); + + if let Err(e) = handle_connection(socket).await { + error!("serving failed with an error: {e}"); + } else { + info!("serving completed"); + } + + // we can no longer get dropped + scopeguard::ScopeGuard::into_inner(cancelled); } - }); + .instrument(span), + ); } } @@ -77,14 +88,14 @@ impl postgres_backend::Handler for MgmtHandler { pgb: &mut PostgresBackendTCP, query: &str, ) -> Result<(), QueryError> { - try_process_query(pgb, query).await.map_err(|e| { + try_process_query(pgb, query).map_err(|e| { error!("failed to process response: {e:?}"); e }) } } -async fn try_process_query(pgb: &mut PostgresBackendTCP, query: &str) -> Result<(), QueryError> { +fn try_process_query(pgb: &mut PostgresBackendTCP, query: &str) -> Result<(), QueryError> { let resp: KickSession = serde_json::from_str(query).context("Failed to parse query as json")?; let span = info_span!("event", session_id = resp.session_id); diff --git a/proxy/src/http.rs b/proxy/src/http.rs index 5cf49b669c..e281240380 100644 --- a/proxy/src/http.rs +++ b/proxy/src/http.rs @@ -2,12 +2,16 @@ //! Other modules should use stuff from this module instead of //! directly relying on deps like `reqwest` (think loose coupling). +pub mod conn_pool; pub mod server; pub mod sql_over_http; pub mod websocket; +use std::time::Duration; + pub use reqwest::{Request, Response, StatusCode}; pub use reqwest_middleware::{ClientWithMiddleware, Error}; +pub use reqwest_retry::{policies::ExponentialBackoff, RetryTransientMiddleware}; use crate::url::ApiUrl; use reqwest_middleware::RequestBuilder; @@ -21,6 +25,24 @@ pub fn new_client() -> ClientWithMiddleware { .build() } +pub fn new_client_with_timeout(default_timout: Duration) -> ClientWithMiddleware { + let timeout_client = reqwest::ClientBuilder::new() + .timeout(default_timout) + .build() + .expect("Failed to create http client with timeout"); + + let retry_policy = + ExponentialBackoff::builder().build_with_total_retry_duration(default_timout); + + reqwest_middleware::ClientBuilder::new(timeout_client) + .with(reqwest_tracing::TracingMiddleware::default()) + // As per docs, "This middleware always errors when given requests with streaming bodies". + // That's all right because we only use this client to send `serde_json::RawValue`, which + // is not a stream. + .with(RetryTransientMiddleware::new_with_policy(retry_policy)) + .build() +} + /// Thin convenience wrapper for an API provided by an http endpoint. #[derive(Debug, Clone)] pub struct Endpoint { diff --git a/proxy/src/http/conn_pool.rs b/proxy/src/http/conn_pool.rs new file mode 100644 index 0000000000..52c1e2f2ce --- /dev/null +++ b/proxy/src/http/conn_pool.rs @@ -0,0 +1,278 @@ +use parking_lot::Mutex; +use pq_proto::StartupMessageParams; +use std::fmt; +use std::{collections::HashMap, sync::Arc}; + +use futures::TryFutureExt; + +use crate::config; +use crate::{auth, console}; + +use super::sql_over_http::MAX_RESPONSE_SIZE; + +use crate::proxy::invalidate_cache; +use crate::proxy::NUM_RETRIES_WAKE_COMPUTE; + +use tracing::error; +use tracing::info; + +pub const APP_NAME: &str = "sql_over_http"; +const MAX_CONNS_PER_ENDPOINT: usize = 20; + +#[derive(Debug)] +pub struct ConnInfo { + pub username: String, + pub dbname: String, + pub hostname: String, + pub password: String, +} + +impl ConnInfo { + // hm, change to hasher to avoid cloning? + pub fn db_and_user(&self) -> (String, String) { + (self.dbname.clone(), self.username.clone()) + } +} + +impl fmt::Display for ConnInfo { + // use custom display to avoid logging password + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!(f, "{}@{}/{}", self.username, self.hostname, self.dbname) + } +} + +struct ConnPoolEntry { + conn: tokio_postgres::Client, + _last_access: std::time::Instant, +} + +// Per-endpoint connection pool, (dbname, username) -> Vec +// Number of open connections is limited by the `max_conns_per_endpoint`. +pub struct EndpointConnPool { + pools: HashMap<(String, String), Vec>, + total_conns: usize, +} + +pub struct GlobalConnPool { + // endpoint -> per-endpoint connection pool + // + // That should be a fairly conteded map, so return reference to the per-endpoint + // pool as early as possible and release the lock. + global_pool: Mutex>>>, + + // Maximum number of connections per one endpoint. + // Can mix different (dbname, username) connections. + // When running out of free slots for a particular endpoint, + // falls back to opening a new connection for each request. + max_conns_per_endpoint: usize, + + proxy_config: &'static crate::config::ProxyConfig, +} + +impl GlobalConnPool { + pub fn new(config: &'static crate::config::ProxyConfig) -> Arc { + Arc::new(Self { + global_pool: Mutex::new(HashMap::new()), + max_conns_per_endpoint: MAX_CONNS_PER_ENDPOINT, + proxy_config: config, + }) + } + + pub async fn get( + &self, + conn_info: &ConnInfo, + force_new: bool, + ) -> anyhow::Result { + let mut client: Option = None; + + if !force_new { + let pool = self.get_endpoint_pool(&conn_info.hostname).await; + + // find a pool entry by (dbname, username) if exists + let mut pool = pool.lock(); + let pool_entries = pool.pools.get_mut(&conn_info.db_and_user()); + if let Some(pool_entries) = pool_entries { + if let Some(entry) = pool_entries.pop() { + client = Some(entry.conn); + pool.total_conns -= 1; + } + } + } + + // ok return cached connection if found and establish a new one otherwise + if let Some(client) = client { + if client.is_closed() { + info!("pool: cached connection '{conn_info}' is closed, opening a new one"); + connect_to_compute(self.proxy_config, conn_info).await + } else { + info!("pool: reusing connection '{conn_info}'"); + Ok(client) + } + } else { + info!("pool: opening a new connection '{conn_info}'"); + connect_to_compute(self.proxy_config, conn_info).await + } + } + + pub async fn put( + &self, + conn_info: &ConnInfo, + client: tokio_postgres::Client, + ) -> anyhow::Result<()> { + let pool = self.get_endpoint_pool(&conn_info.hostname).await; + + // return connection to the pool + let mut total_conns; + let mut returned = false; + let mut per_db_size = 0; + { + let mut pool = pool.lock(); + total_conns = pool.total_conns; + + let pool_entries: &mut Vec = pool + .pools + .entry(conn_info.db_and_user()) + .or_insert_with(|| Vec::with_capacity(1)); + if total_conns < self.max_conns_per_endpoint { + pool_entries.push(ConnPoolEntry { + conn: client, + _last_access: std::time::Instant::now(), + }); + + total_conns += 1; + returned = true; + per_db_size = pool_entries.len(); + + pool.total_conns += 1; + } + } + + // do logging outside of the mutex + if returned { + info!("pool: returning connection '{conn_info}' back to the pool, total_conns={total_conns}, for this (db, user)={per_db_size}"); + } else { + info!("pool: throwing away connection '{conn_info}' because pool is full, total_conns={total_conns}"); + } + + Ok(()) + } + + async fn get_endpoint_pool(&self, endpoint: &String) -> Arc> { + // find or create a pool for this endpoint + let mut created = false; + let mut global_pool = self.global_pool.lock(); + let pool = global_pool + .entry(endpoint.clone()) + .or_insert_with(|| { + created = true; + Arc::new(Mutex::new(EndpointConnPool { + pools: HashMap::new(), + total_conns: 0, + })) + }) + .clone(); + let global_pool_size = global_pool.len(); + drop(global_pool); + + // log new global pool size + if created { + info!( + "pool: created new pool for '{endpoint}', global pool size now {global_pool_size}" + ); + } + + pool + } +} + +// +// Wake up the destination if needed. Code here is a bit involved because +// we reuse the code from the usual proxy and we need to prepare few structures +// that this code expects. +// +async fn connect_to_compute( + config: &config::ProxyConfig, + conn_info: &ConnInfo, +) -> anyhow::Result { + let tls = config.tls_config.as_ref(); + let common_names = tls.and_then(|tls| tls.common_names.clone()); + + let credential_params = StartupMessageParams::new([ + ("user", &conn_info.username), + ("database", &conn_info.dbname), + ("application_name", APP_NAME), + ]); + + let creds = config + .auth_backend + .as_ref() + .map(|_| { + auth::ClientCredentials::parse( + &credential_params, + Some(&conn_info.hostname), + common_names, + ) + }) + .transpose()?; + let extra = console::ConsoleReqExtra { + session_id: uuid::Uuid::new_v4(), + application_name: Some(APP_NAME), + }; + + let node_info = &mut creds.wake_compute(&extra).await?.expect("msg"); + + // This code is a copy of `connect_to_compute` from `src/proxy.rs` with + // the difference that it uses `tokio_postgres` for the connection. + let mut num_retries: usize = NUM_RETRIES_WAKE_COMPUTE; + loop { + match connect_to_compute_once(node_info, conn_info).await { + Err(e) if num_retries > 0 => { + info!("compute node's state has changed; requesting a wake-up"); + match creds.wake_compute(&extra).await? { + // Update `node_info` and try one more time. + Some(new) => { + *node_info = new; + } + // Link auth doesn't work that way, so we just exit. + None => return Err(e), + } + } + other => return other, + } + + num_retries -= 1; + info!("retrying after wake-up ({num_retries} attempts left)"); + } +} + +async fn connect_to_compute_once( + node_info: &console::CachedNodeInfo, + conn_info: &ConnInfo, +) -> anyhow::Result { + let mut config = (*node_info.config).clone(); + + let (client, connection) = config + .user(&conn_info.username) + .password(&conn_info.password) + .dbname(&conn_info.dbname) + .max_backend_message_size(MAX_RESPONSE_SIZE) + .connect(tokio_postgres::NoTls) + .inspect_err(|e: &tokio_postgres::Error| { + error!( + "failed to connect to compute node hosts={:?} ports={:?}: {}", + node_info.config.get_hosts(), + node_info.config.get_ports(), + e + ); + invalidate_cache(node_info) + }) + .await?; + + tokio::spawn(async move { + if let Err(e) = connection.await { + error!("connection error: {}", e); + } + }); + + Ok(client) +} diff --git a/proxy/src/http/sql_over_http.rs b/proxy/src/http/sql_over_http.rs index e8ad2d04f3..adf7252f72 100644 --- a/proxy/src/http/sql_over_http.rs +++ b/proxy/src/http/sql_over_http.rs @@ -1,25 +1,21 @@ +use std::sync::Arc; + use futures::pin_mut; use futures::StreamExt; -use futures::TryFutureExt; use hyper::body::HttpBody; use hyper::http::HeaderName; use hyper::http::HeaderValue; use hyper::{Body, HeaderMap, Request}; -use pq_proto::StartupMessageParams; use serde_json::json; use serde_json::Map; use serde_json::Value; use tokio_postgres::types::Kind; use tokio_postgres::types::Type; use tokio_postgres::Row; -use tracing::error; -use tracing::info; -use tracing::instrument; use url::Url; -use crate::proxy::invalidate_cache; -use crate::proxy::NUM_RETRIES_WAKE_COMPUTE; -use crate::{auth, config::ProxyConfig, console}; +use super::conn_pool::ConnInfo; +use super::conn_pool::GlobalConnPool; #[derive(serde::Deserialize)] struct QueryData { @@ -27,12 +23,13 @@ struct QueryData { params: Vec, } -const APP_NAME: &str = "sql_over_http"; -const MAX_RESPONSE_SIZE: usize = 1024 * 1024; // 1 MB +pub const MAX_RESPONSE_SIZE: usize = 1024 * 1024; // 1 MB const MAX_REQUEST_SIZE: u64 = 1024 * 1024; // 1 MB static RAW_TEXT_OUTPUT: HeaderName = HeaderName::from_static("neon-raw-text-output"); static ARRAY_MODE: HeaderName = HeaderName::from_static("neon-array-mode"); +static ALLOW_POOL: HeaderName = HeaderName::from_static("neon-pool-opt-in"); + static HEADER_VALUE_TRUE: HeaderValue = HeaderValue::from_static("true"); // @@ -96,13 +93,6 @@ fn json_array_to_pg_array(value: &Value) -> Result, serde_json::E } } -struct ConnInfo { - username: String, - dbname: String, - hostname: String, - password: String, -} - fn get_conn_info( headers: &HeaderMap, sni_hostname: Option, @@ -169,50 +159,23 @@ fn get_conn_info( // TODO: return different http error codes pub async fn handle( - config: &'static ProxyConfig, request: Request, sni_hostname: Option, + conn_pool: Arc, ) -> anyhow::Result { // // Determine the destination and connection params // let headers = request.headers(); let conn_info = get_conn_info(headers, sni_hostname)?; - let credential_params = StartupMessageParams::new([ - ("user", &conn_info.username), - ("database", &conn_info.dbname), - ("application_name", APP_NAME), - ]); // Determine the output options. Default behaviour is 'false'. Anything that is not // strictly 'true' assumed to be false. let raw_output = headers.get(&RAW_TEXT_OUTPUT) == Some(&HEADER_VALUE_TRUE); let array_mode = headers.get(&ARRAY_MODE) == Some(&HEADER_VALUE_TRUE); - // - // Wake up the destination if needed. Code here is a bit involved because - // we reuse the code from the usual proxy and we need to prepare few structures - // that this code expects. - // - let tls = config.tls_config.as_ref(); - let common_names = tls.and_then(|tls| tls.common_names.clone()); - let creds = config - .auth_backend - .as_ref() - .map(|_| { - auth::ClientCredentials::parse( - &credential_params, - Some(&conn_info.hostname), - common_names, - ) - }) - .transpose()?; - let extra = console::ConsoleReqExtra { - session_id: uuid::Uuid::new_v4(), - application_name: Some(APP_NAME), - }; - - let mut node_info = creds.wake_compute(&extra).await?.expect("msg"); + // Allow connection pooling only if explicitly requested + let allow_pool = headers.get(&ALLOW_POOL) == Some(&HEADER_VALUE_TRUE); let request_content_length = match request.body().size_hint().upper() { Some(v) => v, @@ -235,7 +198,8 @@ pub async fn handle( // // Now execute the query and return the result // - let client = connect_to_compute(&mut node_info, &extra, &creds, &conn_info).await?; + let client = conn_pool.get(&conn_info, !allow_pool).await?; + let row_stream = client.query_raw_txt(query, query_params).await?; // Manually drain the stream into a vector to leave row_stream hanging @@ -292,6 +256,13 @@ pub async fn handle( .map(|row| pg_text_row_to_json(row, raw_output, array_mode)) .collect::, _>>()?; + if allow_pool { + // return connection to the pool + tokio::task::spawn(async move { + let _ = conn_pool.put(&conn_info, client).await; + }); + } + // resulting JSON format is based on the format of node-postgres result Ok(json!({ "command": command_tag_name, @@ -302,70 +273,6 @@ pub async fn handle( })) } -/// This function is a copy of `connect_to_compute` from `src/proxy.rs` with -/// the difference that it uses `tokio_postgres` for the connection. -#[instrument(skip_all)] -async fn connect_to_compute( - node_info: &mut console::CachedNodeInfo, - extra: &console::ConsoleReqExtra<'_>, - creds: &auth::BackendType<'_, auth::ClientCredentials<'_>>, - conn_info: &ConnInfo, -) -> anyhow::Result { - let mut num_retries: usize = NUM_RETRIES_WAKE_COMPUTE; - - loop { - match connect_to_compute_once(node_info, conn_info).await { - Err(e) if num_retries > 0 => { - info!("compute node's state has changed; requesting a wake-up"); - match creds.wake_compute(extra).await? { - // Update `node_info` and try one more time. - Some(new) => { - *node_info = new; - } - // Link auth doesn't work that way, so we just exit. - None => return Err(e), - } - } - other => return other, - } - - num_retries -= 1; - info!("retrying after wake-up ({num_retries} attempts left)"); - } -} - -async fn connect_to_compute_once( - node_info: &console::CachedNodeInfo, - conn_info: &ConnInfo, -) -> anyhow::Result { - let mut config = (*node_info.config).clone(); - - let (client, connection) = config - .user(&conn_info.username) - .password(&conn_info.password) - .dbname(&conn_info.dbname) - .max_backend_message_size(MAX_RESPONSE_SIZE) - .connect(tokio_postgres::NoTls) - .inspect_err(|e: &tokio_postgres::Error| { - error!( - "failed to connect to compute node hosts={:?} ports={:?}: {}", - node_info.config.get_hosts(), - node_info.config.get_ports(), - e - ); - invalidate_cache(node_info) - }) - .await?; - - tokio::spawn(async move { - if let Err(e) = connection.await { - error!("connection error: {}", e); - } - }); - - Ok(client) -} - // // Convert postgres row with text-encoded values to JSON object // diff --git a/proxy/src/http/websocket.rs b/proxy/src/http/websocket.rs index 9f467aceb7..83ba034e57 100644 --- a/proxy/src/http/websocket.rs +++ b/proxy/src/http/websocket.rs @@ -35,7 +35,7 @@ use utils::http::{error::ApiError, json::json_response}; // Tracking issue: https://github.com/rust-lang/rust/issues/98407. use sync_wrapper::SyncWrapper; -use super::sql_over_http; +use super::{conn_pool::GlobalConnPool, sql_over_http}; pin_project! { /// This is a wrapper around a [`WebSocketStream`] that @@ -164,6 +164,7 @@ async fn serve_websocket( async fn ws_handler( mut request: Request, config: &'static ProxyConfig, + conn_pool: Arc, cancel_map: Arc, session_id: uuid::Uuid, sni_hostname: Option, @@ -192,7 +193,7 @@ async fn ws_handler( // TODO: that deserves a refactor as now this function also handles http json client besides websockets. // Right now I don't want to blow up sql-over-http patch with file renames and do that as a follow up instead. } else if request.uri().path() == "/sql" && request.method() == Method::POST { - let result = sql_over_http::handle(config, request, sni_hostname) + let result = sql_over_http::handle(request, sni_hostname, conn_pool) .instrument(info_span!("sql-over-http")) .await; let status_code = match result { @@ -234,6 +235,8 @@ pub async fn task_main( info!("websocket server has shut down"); } + let conn_pool: Arc = GlobalConnPool::new(config); + let tls_config = config.tls_config.as_ref().map(|cfg| cfg.to_server_config()); let tls_acceptor: tokio_rustls::TlsAcceptor = match tls_config { Some(config) => config.into(), @@ -258,15 +261,18 @@ pub async fn task_main( let make_svc = hyper::service::make_service_fn(|stream: &tokio_rustls::server::TlsStream| { let sni_name = stream.get_ref().1.sni_hostname().map(|s| s.to_string()); + let conn_pool = conn_pool.clone(); async move { Ok::<_, Infallible>(hyper::service::service_fn(move |req: Request| { let sni_name = sni_name.clone(); + let conn_pool = conn_pool.clone(); + async move { let cancel_map = Arc::new(CancelMap::default()); let session_id = uuid::Uuid::new_v4(); - ws_handler(req, config, cancel_map, session_id, sni_name) + ws_handler(req, config, conn_pool, cancel_map, session_id, sni_name) .instrument(info_span!( "ws-client", session = format_args!("{session_id}") diff --git a/proxy/src/logging.rs b/proxy/src/logging.rs index 0c8c2858b9..3405b8cbc6 100644 --- a/proxy/src/logging.rs +++ b/proxy/src/logging.rs @@ -18,7 +18,7 @@ pub async fn init() -> anyhow::Result { .from_env_lossy(); let fmt_layer = tracing_subscriber::fmt::layer() - .with_ansi(atty::is(atty::Stream::Stderr)) + .with_ansi(false) .with_writer(std::io::stderr) .with_target(false); diff --git a/proxy/src/metrics.rs b/proxy/src/metrics.rs index 6ae1e3a447..00fd7f0405 100644 --- a/proxy/src/metrics.rs +++ b/proxy/src/metrics.rs @@ -4,11 +4,13 @@ use crate::{config::MetricCollectionConfig, http}; use chrono::{DateTime, Utc}; use consumption_metrics::{idempotency_key, Event, EventChunk, EventType, CHUNK_SIZE}; use serde::Serialize; -use std::collections::HashMap; +use std::{collections::HashMap, time::Duration}; use tracing::{error, info, instrument, trace, warn}; const PROXY_IO_BYTES_PER_CLIENT: &str = "proxy_io_bytes_per_client"; +const DEFAULT_HTTP_REPORTING_TIMEOUT: Duration = Duration::from_secs(60); + /// /// Key that uniquely identifies the object, this metric describes. /// Currently, endpoint_id is enough, but this may change later, @@ -30,7 +32,7 @@ pub async fn task_main(config: &MetricCollectionConfig) -> anyhow::Result<()> { info!("metrics collector has shut down"); } - let http_client = http::new_client(); + let http_client = http::new_client_with_timeout(DEFAULT_HTTP_REPORTING_TIMEOUT); let mut cached_metrics: HashMap)> = HashMap::new(); let hostname = hostname::get()?.as_os_str().to_string_lossy().into_owned(); @@ -182,36 +184,36 @@ async fn collect_metrics_iteration( } }; - if res.status().is_success() { - // update cached metrics after they were sent successfully - for send_metric in chunk { - let stop_time = match send_metric.kind { - EventType::Incremental { stop_time, .. } => stop_time, - _ => unreachable!(), - }; - - cached_metrics - .entry(Ids { - endpoint_id: send_metric.extra.endpoint_id.clone(), - branch_id: send_metric.extra.branch_id.clone(), - }) - // update cached value (add delta) and time - .and_modify(|e| { - e.0 = e.0.saturating_add(send_metric.value); - e.1 = stop_time - }) - // cache new metric - .or_insert((send_metric.value, stop_time)); - } - } else { + if !res.status().is_success() { error!("metrics endpoint refused the sent metrics: {:?}", res); - for metric in chunk.iter() { + for metric in chunk.iter().filter(|metric| metric.value > (1u64 << 40)) { // Report if the metric value is suspiciously large - if metric.value > (1u64 << 40) { - error!("potentially abnormal metric value: {:?}", metric); - } + error!("potentially abnormal metric value: {:?}", metric); } } + // update cached metrics after they were sent + // (to avoid sending the same metrics twice) + // see the relevant discussion on why to do so even if the status is not success: + // https://github.com/neondatabase/neon/pull/4563#discussion_r1246710956 + for send_metric in chunk { + let stop_time = match send_metric.kind { + EventType::Incremental { stop_time, .. } => stop_time, + _ => unreachable!(), + }; + + cached_metrics + .entry(Ids { + endpoint_id: send_metric.extra.endpoint_id.clone(), + branch_id: send_metric.extra.branch_id.clone(), + }) + // update cached value (add delta) and time + .and_modify(|e| { + e.0 = e.0.saturating_add(send_metric.value); + e.1 = stop_time + }) + // cache new metric + .or_insert((send_metric.value, stop_time)); + } } Ok(()) } diff --git a/proxy/src/proxy.rs b/proxy/src/proxy.rs index 8efb7005c8..2433412c4c 100644 --- a/proxy/src/proxy.rs +++ b/proxy/src/proxy.rs @@ -16,7 +16,10 @@ use metrics::{register_int_counter, register_int_counter_vec, IntCounter, IntCou use once_cell::sync::Lazy; use pq_proto::{BeMessage as Be, FeStartupPacket, StartupMessageParams}; use std::sync::Arc; -use tokio::io::{AsyncRead, AsyncWrite, AsyncWriteExt}; +use tokio::{ + io::{AsyncRead, AsyncWrite, AsyncWriteExt}, + time, +}; use tokio_util::sync::CancellationToken; use tracing::{error, info, warn}; use utils::measured_stream::MeasuredStream; @@ -305,12 +308,13 @@ pub fn invalidate_cache(node_info: &console::CachedNodeInfo) { #[tracing::instrument(name = "connect_once", skip_all)] async fn connect_to_compute_once( node_info: &console::CachedNodeInfo, + timeout: time::Duration, ) -> Result { let allow_self_signed_compute = node_info.allow_self_signed_compute; node_info .config - .connect(allow_self_signed_compute) + .connect(allow_self_signed_compute, timeout) .inspect_err(|_: &compute::ConnectionError| invalidate_cache(node_info)) .await } @@ -328,7 +332,27 @@ async fn connect_to_compute( loop { // Apply startup params to the (possibly, cached) compute node info. node_info.config.set_startup_params(params); - match connect_to_compute_once(node_info).await { + + // Set a shorter timeout for the initial connection attempt. + // + // In case we try to connect to an outdated address that is no longer valid, the + // default behavior of Kubernetes is to drop the packets, causing us to wait for + // the entire timeout period. We want to fail fast in such cases. + // + // A specific case to consider is when we have cached compute node information + // with a 4-minute TTL (Time To Live), but the user has executed a `/suspend` API + // call, resulting in the nonexistence of the compute node. + // + // We only use caching in case of scram proxy backed by the console, so reduce + // the timeout only in that case. + let is_scram_proxy = matches!(creds, auth::BackendType::Console(_, _)); + let timeout = if is_scram_proxy && num_retries == NUM_RETRIES_WAKE_COMPUTE { + time::Duration::from_secs(2) + } else { + time::Duration::from_secs(10) + }; + + match connect_to_compute_once(node_info, timeout).await { Err(e) if num_retries > 0 => { info!("compute node's state has changed; requesting a wake-up"); match creds.wake_compute(extra).map_err(io_error).await? { diff --git a/proxy/src/scram.rs b/proxy/src/scram.rs index 7cc4191435..85854427ed 100644 --- a/proxy/src/scram.rs +++ b/proxy/src/scram.rs @@ -45,17 +45,74 @@ fn hmac_sha256<'a>(key: &[u8], parts: impl IntoIterator) -> [u8 let mut mac = Hmac::::new_from_slice(key).expect("bad key size"); parts.into_iter().for_each(|s| mac.update(s)); - // TODO: maybe newer `hmac` et al already migrated to regular arrays? - let mut result = [0u8; 32]; - result.copy_from_slice(mac.finalize().into_bytes().as_slice()); - result + mac.finalize().into_bytes().into() } fn sha256<'a>(parts: impl IntoIterator) -> [u8; 32] { let mut hasher = Sha256::new(); parts.into_iter().for_each(|s| hasher.update(s)); - let mut result = [0u8; 32]; - result.copy_from_slice(hasher.finalize().as_slice()); - result + hasher.finalize().into() +} + +#[cfg(test)] +mod tests { + use crate::sasl::{Mechanism, Step}; + + use super::{password::SaltedPassword, Exchange, ServerSecret}; + + #[test] + fn happy_path() { + let iterations = 4096; + let salt_base64 = "QSXCR+Q6sek8bf92"; + let pw = SaltedPassword::new( + b"pencil", + base64::decode(salt_base64).unwrap().as_slice(), + iterations, + ); + + let secret = ServerSecret { + iterations, + salt_base64: salt_base64.to_owned(), + stored_key: pw.client_key().sha256(), + server_key: pw.server_key(), + doomed: false, + }; + const NONCE: [u8; 18] = [ + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, + ]; + let mut exchange = Exchange::new(&secret, || NONCE, None); + + let client_first = "n,,n=user,r=rOprNGfwEbeRWgbNEkqO"; + let client_final = "c=biws,r=rOprNGfwEbeRWgbNEkqOAQIDBAUGBwgJCgsMDQ4PEBES,p=rw1r5Kph5ThxmaUBC2GAQ6MfXbPnNkFiTIvdb/Rear0="; + let server_first = + "r=rOprNGfwEbeRWgbNEkqOAQIDBAUGBwgJCgsMDQ4PEBES,s=QSXCR+Q6sek8bf92,i=4096"; + let server_final = "v=qtUDIofVnIhM7tKn93EQUUt5vgMOldcDVu1HC+OH0o0="; + + exchange = match exchange.exchange(client_first).unwrap() { + Step::Continue(exchange, message) => { + assert_eq!(message, server_first); + exchange + } + Step::Success(_, _) => panic!("expected continue, got success"), + Step::Failure(f) => panic!("{f}"), + }; + + let key = match exchange.exchange(client_final).unwrap() { + Step::Success(key, message) => { + assert_eq!(message, server_final); + key + } + Step::Continue(_, _) => panic!("expected success, got continue"), + Step::Failure(f) => panic!("{f}"), + }; + + assert_eq!( + key.as_bytes(), + [ + 74, 103, 1, 132, 12, 31, 200, 48, 28, 54, 82, 232, 207, 12, 138, 189, 40, 32, 134, + 27, 125, 170, 232, 35, 171, 167, 166, 41, 70, 228, 182, 112, + ] + ); + } } diff --git a/proxy/src/scram/password.rs b/proxy/src/scram/password.rs index 656780d853..022f2842dd 100644 --- a/proxy/src/scram/password.rs +++ b/proxy/src/scram/password.rs @@ -14,19 +14,7 @@ impl SaltedPassword { /// See `scram-common.c : scram_SaltedPassword` for details. /// Further reading: (see `PBKDF2`). pub fn new(password: &[u8], salt: &[u8], iterations: u32) -> SaltedPassword { - let one = 1_u32.to_be_bytes(); // magic - - let mut current = super::hmac_sha256(password, [salt, &one]); - let mut result = current; - for _ in 1..iterations { - current = super::hmac_sha256(password, [current.as_ref()]); - // TODO: result = current.zip(result).map(|(x, y)| x ^ y), issue #80094 - for (i, x) in current.iter().enumerate() { - result[i] ^= x; - } - } - - result.into() + pbkdf2::pbkdf2_hmac_array::(password, salt, iterations).into() } /// Derive `ClientKey` from a salted hashed password. @@ -46,3 +34,41 @@ impl From<[u8; SALTED_PASSWORD_LEN]> for SaltedPassword { Self { bytes } } } + +#[cfg(test)] +mod tests { + use super::SaltedPassword; + + fn legacy_pbkdf2_impl(password: &[u8], salt: &[u8], iterations: u32) -> SaltedPassword { + let one = 1_u32.to_be_bytes(); // magic + + let mut current = super::super::hmac_sha256(password, [salt, &one]); + let mut result = current; + for _ in 1..iterations { + current = super::super::hmac_sha256(password, [current.as_ref()]); + // TODO: result = current.zip(result).map(|(x, y)| x ^ y), issue #80094 + for (i, x) in current.iter().enumerate() { + result[i] ^= x; + } + } + + result.into() + } + + #[test] + fn pbkdf2() { + let password = "a-very-secure-password"; + let salt = "such-a-random-salt"; + let iterations = 4096; + let output = [ + 203, 18, 206, 81, 4, 154, 193, 100, 147, 41, 211, 217, 177, 203, 69, 210, 194, 211, + 101, 1, 248, 156, 96, 0, 8, 223, 30, 87, 158, 41, 20, 42, + ]; + + let actual = SaltedPassword::new(password.as_bytes(), salt.as_bytes(), iterations); + let expected = legacy_pbkdf2_impl(password.as_bytes(), salt.as_bytes(), iterations); + + assert_eq!(actual.bytes, output); + assert_eq!(actual.bytes, expected.bytes); + } +} diff --git a/pyproject.toml b/pyproject.toml index 2c21af6982..ac4e8fa2dd 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -7,7 +7,7 @@ authors = [] [tool.poetry.dependencies] python = "^3.9" pytest = "^7.3.1" -psycopg2-binary = "^2.9.1" +psycopg2-binary = "^2.9.6" typing-extensions = "^4.6.1" PyJWT = {version = "^2.1.0", extras = ["crypto"]} requests = "^2.31.0" diff --git a/scripts/comment-test-report.js b/scripts/comment-test-report.js index 432c78d1af..dd60d42a37 100755 --- a/scripts/comment-test-report.js +++ b/scripts/comment-test-report.js @@ -144,17 +144,24 @@ const reportSummary = async (params) => { } summary += `- \`${testName}\`: ${links.join(", ")}\n` } - - const testsToRerun = Object.values(failedTests[pgVersion]).map(x => x[0].name) - const command = `DEFAULT_PG_VERSION=${pgVersion} scripts/pytest -k "${testsToRerun.join(" or ")}"` - - summary += "```\n" - summary += `# Run failed on Postgres ${pgVersion} tests locally:\n` - summary += `${command}\n` - summary += "```\n" } } + if (failedTestsCount > 0) { + const testsToRerun = [] + for (const pgVersion of Object.keys(failedTests)) { + for (const testName of Object.keys(failedTests[pgVersion])) { + testsToRerun.push(...failedTests[pgVersion][testName].map(test => test.name)) + } + } + const command = `scripts/pytest -vv -n $(nproc) -k "${testsToRerun.join(" or ")}"` + + summary += "```\n" + summary += `# Run all failed tests locally:\n` + summary += `${command}\n` + summary += "```\n" + } + if (flakyTestsCount > 0) { summary += `
\nFlaky tests (${flakyTestsCount})\n\n` for (const pgVersion of Array.from(pgVersions).sort().reverse()) { @@ -164,8 +171,7 @@ const reportSummary = async (params) => { const links = [] for (const test of tests) { const allureLink = `${reportUrl}#suites/${test.parentUid}/${test.uid}/retries` - const status = test.status === "passed" ? ":white_check_mark:" : ":x:" - links.push(`[${status} ${test.buildType}](${allureLink})`) + links.push(`[${test.buildType}](${allureLink})`) } summary += `- \`${testName}\`: ${links.join(", ")}\n` } diff --git a/test_runner/conftest.py b/test_runner/conftest.py index 4e649e111a..1c36c1ed02 100644 --- a/test_runner/conftest.py +++ b/test_runner/conftest.py @@ -1,6 +1,6 @@ pytest_plugins = ( "fixtures.pg_version", - "fixtures.allure", + "fixtures.parametrize", "fixtures.neon_fixtures", "fixtures.benchmark_fixture", "fixtures.pg_stats", diff --git a/test_runner/fixtures/allure.py b/test_runner/fixtures/allure.py deleted file mode 100644 index 6f40bd2aa2..0000000000 --- a/test_runner/fixtures/allure.py +++ /dev/null @@ -1,25 +0,0 @@ -import os - -import pytest - -from fixtures.pg_version import DEFAULT_VERSION, PgVersion - -""" -Set of utilities to make Allure report more informative. - -- It adds BUILD_TYPE and DEFAULT_PG_VERSION to the test names (only in test_runner/regress) -to make tests distinguishable in Allure report. -""" - - -@pytest.fixture(scope="function", autouse=True) -def allure_noop(): - pass - - -def pytest_generate_tests(metafunc): - if "test_runner/regress" in metafunc.definition._nodeid: - build_type = os.environ.get("BUILD_TYPE", "DEBUG").lower() - pg_version = PgVersion(os.environ.get("DEFAULT_PG_VERSION", DEFAULT_VERSION)) - - metafunc.parametrize("allure_noop", [f"{build_type}-pg{pg_version}"]) diff --git a/test_runner/fixtures/metrics.py b/test_runner/fixtures/metrics.py index 7ee3c33f92..a2bc2e28e5 100644 --- a/test_runner/fixtures/metrics.py +++ b/test_runner/fixtures/metrics.py @@ -59,6 +59,8 @@ PAGESERVER_GLOBAL_METRICS: Tuple[str, ...] = ( "libmetrics_tracing_event_count_total", "pageserver_materialized_cache_hits_total", "pageserver_materialized_cache_hits_direct_total", + "pageserver_page_cache_read_hits_total", + "pageserver_page_cache_read_accesses_total", "pageserver_getpage_reconstruct_seconds_bucket", "pageserver_getpage_reconstruct_seconds_count", "pageserver_getpage_reconstruct_seconds_sum", diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index e56bf78019..c3e9853978 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -102,8 +102,8 @@ def base_dir() -> Iterator[Path]: yield base_dir -@pytest.fixture(scope="session") -def neon_binpath(base_dir: Path) -> Iterator[Path]: +@pytest.fixture(scope="function") +def neon_binpath(base_dir: Path, build_type: str) -> Iterator[Path]: if os.getenv("REMOTE_ENV"): # we are in remote env and do not have neon binaries locally # this is the case for benchmarks run on self-hosted runner @@ -113,7 +113,6 @@ def neon_binpath(base_dir: Path) -> Iterator[Path]: if env_neon_bin := os.environ.get("NEON_BIN"): binpath = Path(env_neon_bin) else: - build_type = os.environ.get("BUILD_TYPE", "debug") binpath = base_dir / "target" / build_type log.info(f"neon_binpath is {binpath}") @@ -123,7 +122,7 @@ def neon_binpath(base_dir: Path) -> Iterator[Path]: yield binpath -@pytest.fixture(scope="session") +@pytest.fixture(scope="function") def pg_distrib_dir(base_dir: Path) -> Iterator[Path]: if env_postgres_bin := os.environ.get("POSTGRES_DISTRIB_DIR"): distrib_dir = Path(env_postgres_bin).resolve() @@ -147,7 +146,7 @@ def top_output_dir(base_dir: Path) -> Iterator[Path]: yield output_dir -@pytest.fixture(scope="session") +@pytest.fixture(scope="function") def versioned_pg_distrib_dir(pg_distrib_dir: Path, pg_version: PgVersion) -> Iterator[Path]: versioned_dir = pg_distrib_dir / pg_version.v_prefixed @@ -174,7 +173,23 @@ def shareable_scope(fixture_name: str, config: Config) -> Literal["session", "fu def myfixture(...) ... """ - return "function" if os.environ.get("TEST_SHARED_FIXTURES") is None else "session" + scope: Literal["session", "function"] + + if os.environ.get("TEST_SHARED_FIXTURES") is None: + # Create the environment in the per-test output directory + scope = "function" + elif ( + os.environ.get("BUILD_TYPE") is not None + and os.environ.get("DEFAULT_PG_VERSION") is not None + ): + scope = "session" + else: + pytest.fail( + "Shared environment(TEST_SHARED_FIXTURES) requires BUILD_TYPE and DEFAULT_PG_VERSION to be set", + pytrace=False, + ) + + return scope @pytest.fixture(scope="session") diff --git a/test_runner/fixtures/pageserver/http.py b/test_runner/fixtures/pageserver/http.py index 5c4f5177d0..824d11cb17 100644 --- a/test_runner/fixtures/pageserver/http.py +++ b/test_runner/fixtures/pageserver/http.py @@ -21,6 +21,18 @@ class PageserverApiException(Exception): self.status_code = status_code +class TimelineCreate406(PageserverApiException): + def __init__(self, res: requests.Response): + assert res.status_code == 406 + super().__init__(res.json()["msg"], res.status_code) + + +class TimelineCreate409(PageserverApiException): + def __init__(self, res: requests.Response): + assert res.status_code == 409 + super().__init__("", res.status_code) + + @dataclass class InMemoryLayerInfo: kind: str @@ -309,9 +321,12 @@ class PageserverHttpClient(requests.Session): res = self.post( f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline", json=body, **kwargs ) - self.verbose_error(res) if res.status_code == 409: - raise Exception(f"could not create timeline: already exists for id {new_timeline_id}") + raise TimelineCreate409(res) + if res.status_code == 406: + raise TimelineCreate406(res) + + self.verbose_error(res) res_json = res.json() assert isinstance(res_json, dict) diff --git a/test_runner/fixtures/parametrize.py b/test_runner/fixtures/parametrize.py new file mode 100644 index 0000000000..53350138dd --- /dev/null +++ b/test_runner/fixtures/parametrize.py @@ -0,0 +1,50 @@ +import os +from typing import Optional + +import pytest +from _pytest.fixtures import FixtureRequest +from _pytest.python import Metafunc + +from fixtures.pg_version import PgVersion + +""" +Dynamically parametrize tests by Postgres version and build type (debug/release/remote) +""" + + +@pytest.fixture(scope="function", autouse=True) +def pg_version(request: FixtureRequest) -> Optional[PgVersion]: + # Do not parametrize performance tests yet, we need to prepare grafana charts first + if "test_runner/performance" in str(request.node.path): + v = os.environ.get("DEFAULT_PG_VERSION") + return PgVersion(v) + + return None + + +@pytest.fixture(scope="function", autouse=True) +def build_type(request: FixtureRequest) -> Optional[str]: + # Do not parametrize performance tests yet, we need to prepare grafana charts first + if "test_runner/performance" in str(request.node.path): + return os.environ.get("BUILD_TYPE", "").lower() + + return None + + +def pytest_generate_tests(metafunc: Metafunc): + # Do not parametrize performance tests yet, we need to prepare grafana charts first + if "test_runner/performance" in metafunc.definition._nodeid: + return + + if (v := os.environ.get("DEFAULT_PG_VERSION")) is None: + pg_versions = [version for version in PgVersion if version != PgVersion.NOT_SET] + else: + pg_versions = [PgVersion(v)] + + if (bt := os.environ.get("BUILD_TYPE")) is None: + build_types = ["debug", "release"] + else: + build_types = [bt.lower()] + + metafunc.parametrize("build_type", build_types) + metafunc.parametrize("pg_version", pg_versions, ids=map(lambda v: f"pg{v}", pg_versions)) diff --git a/test_runner/fixtures/pg_version.py b/test_runner/fixtures/pg_version.py index 14ae88cc2c..b61f52be3c 100644 --- a/test_runner/fixtures/pg_version.py +++ b/test_runner/fixtures/pg_version.py @@ -1,12 +1,10 @@ import enum import os -from typing import Iterator, Optional +from typing import Optional import pytest +from _pytest.config import Config from _pytest.config.argparsing import Parser -from pytest import FixtureRequest - -from fixtures.log_helper import log """ This fixture is used to determine which version of Postgres to use for tests. @@ -75,18 +73,10 @@ def pytest_addoption(parser: Parser): "--pg-version", action="store", type=PgVersion, - help="Postgres version to use for tests", + help="DEPRECATED: Postgres version to use for tests", ) -@pytest.fixture(scope="session") -def pg_version(request: FixtureRequest) -> Iterator[PgVersion]: - if v := request.config.getoption("--pg-version"): - version, source = v, "from --pg-version command-line argument" - elif v := os.environ.get("DEFAULT_PG_VERSION"): - version, source = PgVersion(v), "from DEFAULT_PG_VERSION environment variable" - else: - version, source = DEFAULT_VERSION, "default version" - - log.info(f"pg_version is {version} ({source})") - yield version +def pytest_configure(config: Config): + if config.getoption("--pg-version"): + raise Exception("--pg-version is deprecated, use DEFAULT_PG_VERSION env var instead") diff --git a/test_runner/performance/test_startup.py b/test_runner/performance/test_startup.py index 8babbbe132..4744c1ed2e 100644 --- a/test_runner/performance/test_startup.py +++ b/test_runner/performance/test_startup.py @@ -52,6 +52,7 @@ def test_startup_simple(neon_env_builder: NeonEnvBuilder, zenbenchmark: NeonBenc "wait_for_spec_ms": f"{i}_wait_for_spec", "sync_safekeepers_ms": f"{i}_sync_safekeepers", "basebackup_ms": f"{i}_basebackup", + "start_postgres_ms": f"{i}_start_postgres", "config_ms": f"{i}_config", "total_startup_ms": f"{i}_total_startup", } diff --git a/test_runner/regress/test_branch_and_gc.py b/test_runner/regress/test_branch_and_gc.py index 4a03421fcf..53e67b1592 100644 --- a/test_runner/regress/test_branch_and_gc.py +++ b/test_runner/regress/test_branch_and_gc.py @@ -4,7 +4,8 @@ import time import pytest from fixtures.log_helper import log from fixtures.neon_fixtures import NeonEnv -from fixtures.types import Lsn +from fixtures.pageserver.http import TimelineCreate406 +from fixtures.types import Lsn, TimelineId from fixtures.utils import query_scalar @@ -173,5 +174,12 @@ def test_branch_creation_before_gc(neon_simple_env: NeonEnv): # The starting LSN is invalid as the corresponding record is scheduled to be removed by in-queue GC. with pytest.raises(Exception, match="invalid branch start lsn: .*"): env.neon_cli.create_branch("b1", "b0", tenant_id=tenant, ancestor_start_lsn=lsn) + # retry the same with the HTTP API, so that we can inspect the status code + with pytest.raises(TimelineCreate406): + new_timeline_id = TimelineId.generate() + log.info( + f"Expecting failure for branch behind gc'ing LSN, new_timeline_id={new_timeline_id}" + ) + pageserver_http_client.timeline_create(env.pg_version, tenant, new_timeline_id, b0, lsn) thread.join() diff --git a/test_runner/regress/test_branch_behind.py b/test_runner/regress/test_branch_behind.py index 3f7d49ab03..a19b2862f8 100644 --- a/test_runner/regress/test_branch_behind.py +++ b/test_runner/regress/test_branch_behind.py @@ -1,6 +1,7 @@ import pytest from fixtures.log_helper import log from fixtures.neon_fixtures import NeonEnvBuilder +from fixtures.pageserver.http import TimelineCreate406 from fixtures.types import Lsn, TimelineId from fixtures.utils import print_gc_result, query_scalar @@ -17,7 +18,7 @@ def test_branch_behind(neon_env_builder: NeonEnvBuilder): env.pageserver.allowed_errors.append(".*invalid start lsn .* for ancestor timeline.*") # Branch at the point where only 100 rows were inserted - env.neon_cli.create_branch("test_branch_behind") + branch_behind_timeline_id = env.neon_cli.create_branch("test_branch_behind") endpoint_main = env.endpoints.create_start("test_branch_behind") log.info("postgres is running on 'test_branch_behind' branch") @@ -89,6 +90,7 @@ def test_branch_behind(neon_env_builder: NeonEnvBuilder): assert query_scalar(main_cur, "SELECT count(*) FROM foo") == 400100 # Check bad lsn's for branching + pageserver_http = env.pageserver.http_client() # branch at segment boundary env.neon_cli.create_branch( @@ -97,27 +99,52 @@ def test_branch_behind(neon_env_builder: NeonEnvBuilder): endpoint = env.endpoints.create_start("test_branch_segment_boundary") assert endpoint.safe_psql("SELECT 1")[0][0] == 1 - # branch at pre-initdb lsn + # branch at pre-initdb lsn (from main branch) with pytest.raises(Exception, match="invalid branch start lsn: .*"): env.neon_cli.create_branch("test_branch_preinitdb", ancestor_start_lsn=Lsn("0/42")) + # retry the same with the HTTP API, so that we can inspect the status code + with pytest.raises(TimelineCreate406): + new_timeline_id = TimelineId.generate() + log.info(f"Expecting failure for branch pre-initdb LSN, new_timeline_id={new_timeline_id}") + pageserver_http.timeline_create( + env.pg_version, env.initial_tenant, new_timeline_id, env.initial_timeline, Lsn("0/42") + ) # branch at pre-ancestor lsn with pytest.raises(Exception, match="less than timeline ancestor lsn"): env.neon_cli.create_branch( "test_branch_preinitdb", "test_branch_behind", ancestor_start_lsn=Lsn("0/42") ) + # retry the same with the HTTP API, so that we can inspect the status code + with pytest.raises(TimelineCreate406): + new_timeline_id = TimelineId.generate() + log.info( + f"Expecting failure for branch pre-ancestor LSN, new_timeline_id={new_timeline_id}" + ) + pageserver_http.timeline_create( + env.pg_version, + env.initial_tenant, + new_timeline_id, + branch_behind_timeline_id, + Lsn("0/42"), + ) # check that we cannot create branch based on garbage collected data - with env.pageserver.http_client() as pageserver_http: - pageserver_http.timeline_checkpoint(env.initial_tenant, timeline) - gc_result = pageserver_http.timeline_gc(env.initial_tenant, timeline, 0) - print_gc_result(gc_result) - + pageserver_http.timeline_checkpoint(env.initial_tenant, timeline) + gc_result = pageserver_http.timeline_gc(env.initial_tenant, timeline, 0) + print_gc_result(gc_result) with pytest.raises(Exception, match="invalid branch start lsn: .*"): # this gced_lsn is pretty random, so if gc is disabled this woudln't fail env.neon_cli.create_branch( "test_branch_create_fail", "test_branch_behind", ancestor_start_lsn=gced_lsn ) + # retry the same with the HTTP API, so that we can inspect the status code + with pytest.raises(TimelineCreate406): + new_timeline_id = TimelineId.generate() + log.info(f"Expecting failure for branch behind gc'd LSN, new_timeline_id={new_timeline_id}") + pageserver_http.timeline_create( + env.pg_version, env.initial_tenant, new_timeline_id, branch_behind_timeline_id, gced_lsn + ) # check that after gc everything is still there assert query_scalar(hundred_cur, "SELECT count(*) FROM foo") == 100 diff --git a/test_runner/regress/test_ddl_forwarding.py b/test_runner/regress/test_ddl_forwarding.py index 6bfa8fdbe7..ebd836ecbc 100644 --- a/test_runner/regress/test_ddl_forwarding.py +++ b/test_runner/regress/test_ddl_forwarding.py @@ -33,6 +33,7 @@ def handle_role(dbs, roles, operation): dbs[db] = operation["name"] if "password" in operation: roles[operation["name"]] = operation["password"] + assert "encrypted_password" in operation elif operation["op"] == "del": if "old_name" in operation: roles.pop(operation["old_name"]) diff --git a/test_runner/regress/test_disk_usage_eviction.py b/test_runner/regress/test_disk_usage_eviction.py index 0ec023b9e1..fc2d29c2c1 100644 --- a/test_runner/regress/test_disk_usage_eviction.py +++ b/test_runner/regress/test_disk_usage_eviction.py @@ -1,7 +1,6 @@ import shutil import time from dataclasses import dataclass -from pathlib import Path from typing import Dict, Tuple import pytest @@ -428,14 +427,14 @@ def poor_mans_du( largest_layer = 0 smallest_layer = None for tenant_id, timeline_id in timelines: - dir = Path(env.repo_dir) / "tenants" / str(tenant_id) / "timelines" / str(timeline_id) - assert dir.exists(), f"timeline dir does not exist: {dir}" - sum = 0 - for file in dir.iterdir(): + timeline_dir = env.timeline_dir(tenant_id, timeline_id) + assert timeline_dir.exists(), f"timeline dir does not exist: {timeline_dir}" + total = 0 + for file in timeline_dir.iterdir(): if "__" not in file.name: continue size = file.stat().st_size - sum += size + total += size largest_layer = max(largest_layer, size) if smallest_layer: smallest_layer = min(smallest_layer, size) @@ -443,8 +442,8 @@ def poor_mans_du( smallest_layer = size log.info(f"{tenant_id}/{timeline_id} => {file.name} {size}") - log.info(f"{tenant_id}/{timeline_id}: sum {sum}") - total_on_disk += sum + log.info(f"{tenant_id}/{timeline_id}: sum {total}") + total_on_disk += total assert smallest_layer is not None or total_on_disk == 0 and largest_layer == 0 return (total_on_disk, largest_layer, smallest_layer or 0) diff --git a/test_runner/regress/test_layer_eviction.py b/test_runner/regress/test_layer_eviction.py index b22e545f20..d4f0c94a29 100644 --- a/test_runner/regress/test_layer_eviction.py +++ b/test_runner/regress/test_layer_eviction.py @@ -58,7 +58,7 @@ def test_basic_eviction( for sk in env.safekeepers: sk.stop() - timeline_path = env.repo_dir / "tenants" / str(tenant_id) / "timelines" / str(timeline_id) + timeline_path = env.timeline_dir(tenant_id, timeline_id) initial_local_layers = sorted( list(filter(lambda path: path.name != "metadata", timeline_path.glob("*"))) ) diff --git a/test_runner/regress/test_ondemand_download.py b/test_runner/regress/test_ondemand_download.py index c26ec76172..a30f0f02e1 100644 --- a/test_runner/regress/test_ondemand_download.py +++ b/test_runner/regress/test_ondemand_download.py @@ -713,9 +713,7 @@ def test_ondemand_download_failure_to_replace( # error message is not useful pageserver_http.timeline_detail(tenant_id, timeline_id, True, timeout=2) - actual_message = ( - ".* ERROR .*replacing downloaded layer into layermap failed because layer was not found" - ) + actual_message = ".* ERROR .*layermap-replace-notfound" assert env.pageserver.log_contains(actual_message) is not None env.pageserver.allowed_errors.append(actual_message) diff --git a/test_runner/regress/test_remote_storage.py b/test_runner/regress/test_remote_storage.py index f2b954a822..9a70266314 100644 --- a/test_runner/regress/test_remote_storage.py +++ b/test_runner/regress/test_remote_storage.py @@ -535,7 +535,7 @@ def test_timeline_deletion_with_files_stuck_in_upload_queue( "pitr_interval": "0s", } ) - timeline_path = env.repo_dir / "tenants" / str(tenant_id) / "timelines" / str(timeline_id) + timeline_path = env.timeline_dir(tenant_id, timeline_id) client = env.pageserver.http_client() diff --git a/test_runner/regress/test_tenant_detach.py b/test_runner/regress/test_tenant_detach.py index 2a015d5d17..2ded79954e 100644 --- a/test_runner/regress/test_tenant_detach.py +++ b/test_runner/regress/test_tenant_detach.py @@ -632,14 +632,14 @@ def test_ignored_tenant_download_missing_layers( # ignore the tenant and remove its layers pageserver_http.tenant_ignore(tenant_id) - tenant_timeline_dir = env.repo_dir / "tenants" / str(tenant_id) / "timelines" / str(timeline_id) + timeline_dir = env.timeline_dir(tenant_id, timeline_id) layers_removed = False - for dir_entry in tenant_timeline_dir.iterdir(): + for dir_entry in timeline_dir.iterdir(): if dir_entry.name.startswith("00000"): # Looks like a layer file. Remove it dir_entry.unlink() layers_removed = True - assert layers_removed, f"Found no layers for tenant {tenant_timeline_dir}" + assert layers_removed, f"Found no layers for tenant {timeline_dir}" # now, load it from the local files and expect it to work due to remote storage restoration pageserver_http.tenant_load(tenant_id=tenant_id) @@ -688,14 +688,14 @@ def test_ignored_tenant_stays_broken_without_metadata( # ignore the tenant and remove its metadata pageserver_http.tenant_ignore(tenant_id) - tenant_timeline_dir = env.repo_dir / "tenants" / str(tenant_id) / "timelines" / str(timeline_id) + timeline_dir = env.timeline_dir(tenant_id, timeline_id) metadata_removed = False - for dir_entry in tenant_timeline_dir.iterdir(): + for dir_entry in timeline_dir.iterdir(): if dir_entry.name == "metadata": # Looks like a layer file. Remove it dir_entry.unlink() metadata_removed = True - assert metadata_removed, f"Failed to find metadata file in {tenant_timeline_dir}" + assert metadata_removed, f"Failed to find metadata file in {timeline_dir}" env.pageserver.allowed_errors.append( f".*{tenant_id}.*: load failed.*: failed to load metadata.*" diff --git a/test_runner/regress/test_tenant_relocation.py b/test_runner/regress/test_tenant_relocation.py index 2a5b30803b..9043c29060 100644 --- a/test_runner/regress/test_tenant_relocation.py +++ b/test_runner/regress/test_tenant_relocation.py @@ -214,9 +214,7 @@ def switch_pg_to_new_pageserver( endpoint.start() - timeline_to_detach_local_path = ( - env.repo_dir / "tenants" / str(tenant_id) / "timelines" / str(timeline_id) - ) + timeline_to_detach_local_path = env.timeline_dir(tenant_id, timeline_id) files_before_detach = os.listdir(timeline_to_detach_local_path) assert ( "metadata" in files_before_detach @@ -419,8 +417,6 @@ def test_tenant_relocation( new_pageserver_http.tenant_attach(tenant_id) # wait for tenant to finish attaching - tenant_status = new_pageserver_http.tenant_status(tenant_id=tenant_id) - assert tenant_status["state"]["slug"] in ["Attaching", "Active"] wait_until( number_of_iterations=10, interval=1, diff --git a/test_runner/regress/test_tenants_with_remote_storage.py b/test_runner/regress/test_tenants_with_remote_storage.py index dca2cd3d28..98f9e94276 100644 --- a/test_runner/regress/test_tenants_with_remote_storage.py +++ b/test_runner/regress/test_tenants_with_remote_storage.py @@ -257,7 +257,7 @@ def test_tenant_redownloads_truncated_file_on_startup( env.endpoints.stop_all() env.pageserver.stop() - timeline_dir = Path(env.repo_dir) / "tenants" / str(tenant_id) / "timelines" / str(timeline_id) + timeline_dir = env.timeline_dir(tenant_id, timeline_id) local_layer_truncated = None for path in Path.iterdir(timeline_dir): if path.name.startswith("00000"): diff --git a/test_runner/regress/test_timeline_delete.py b/test_runner/regress/test_timeline_delete.py index ddd9ffd755..7c3424cf32 100644 --- a/test_runner/regress/test_timeline_delete.py +++ b/test_runner/regress/test_timeline_delete.py @@ -463,10 +463,10 @@ def test_concurrent_timeline_delete_stuck_on( # make the second call and assert behavior log.info("second call start") - error_msg_re = "timeline deletion is already in progress" + error_msg_re = "Timeline deletion is already in progress" with pytest.raises(PageserverApiException, match=error_msg_re) as second_call_err: ps_http.timeline_delete(env.initial_tenant, child_timeline_id) - assert second_call_err.value.status_code == 500 + assert second_call_err.value.status_code == 409 env.pageserver.allowed_errors.append(f".*{child_timeline_id}.*{error_msg_re}.*") # the second call will try to transition the timeline into Stopping state as well env.pageserver.allowed_errors.append( @@ -518,9 +518,9 @@ def test_delete_timeline_client_hangup(neon_env_builder: NeonEnvBuilder): ps_http.timeline_delete(env.initial_tenant, child_timeline_id, timeout=2) env.pageserver.allowed_errors.append( - f".*{child_timeline_id}.*timeline deletion is already in progress.*" + f".*{child_timeline_id}.*Timeline deletion is already in progress.*" ) - with pytest.raises(PageserverApiException, match="timeline deletion is already in progress"): + with pytest.raises(PageserverApiException, match="Timeline deletion is already in progress"): ps_http.timeline_delete(env.initial_tenant, child_timeline_id, timeout=2) # make sure the timeout was due to the failpoint diff --git a/test_runner/regress/test_timeline_size.py b/test_runner/regress/test_timeline_size.py index 5bdbc18927..6338f4ca77 100644 --- a/test_runner/regress/test_timeline_size.py +++ b/test_runner/regress/test_timeline_size.py @@ -416,6 +416,7 @@ def test_timeline_physical_size_post_compaction( wait_for_last_flush_lsn(env, endpoint, env.initial_tenant, new_timeline_id) # shutdown safekeepers to prevent new data from coming in + endpoint.stop() # We can't gracefully stop after safekeepers die for sk in env.safekeepers: sk.stop() diff --git a/test_runner/regress/test_wal_acceptor.py b/test_runner/regress/test_wal_acceptor.py index 994858edf7..5828d4306c 100644 --- a/test_runner/regress/test_wal_acceptor.py +++ b/test_runner/regress/test_wal_acceptor.py @@ -1210,6 +1210,10 @@ def test_delete_force(neon_env_builder: NeonEnvBuilder, auth_enabled: bool): with conn.cursor() as cur: cur.execute("INSERT INTO t (key) VALUES (1)") + # Stop all computes gracefully before safekeepers stop responding to them + endpoint_1.stop_and_destroy() + endpoint_3.stop_and_destroy() + # Remove initial tenant's br1 (active) assert sk_http.timeline_delete_force(tenant_id, timeline_id_1)["dir_existed"] assert not (sk_data_dir / str(tenant_id) / str(timeline_id_1)).exists() diff --git a/vendor/postgres-v14 b/vendor/postgres-v14 index a2daebc6b4..1144aee166 160000 --- a/vendor/postgres-v14 +++ b/vendor/postgres-v14 @@ -1 +1 @@ -Subproject commit a2daebc6b445dcbcca9c18e1711f47c1db7ffb04 +Subproject commit 1144aee1661c79eec65e784a8dad8bd450d9df79 diff --git a/vendor/postgres-v15 b/vendor/postgres-v15 index 2df2ce3744..1984832c74 160000 --- a/vendor/postgres-v15 +++ b/vendor/postgres-v15 @@ -1 +1 @@ -Subproject commit 2df2ce374464a7449e15dfa46c956b73b4f4098b +Subproject commit 1984832c740a7fa0e468bb720f40c525b652835d diff --git a/workspace_hack/Cargo.toml b/workspace_hack/Cargo.toml index 677b59f453..63a65d3889 100644 --- a/workspace_hack/Cargo.toml +++ b/workspace_hack/Cargo.toml @@ -54,7 +54,7 @@ toml_edit = { version = "0.19", features = ["serde"] } tower = { version = "0.4", features = ["balance", "buffer", "limit", "retry", "timeout", "util"] } tracing = { version = "0.1", features = ["log"] } tracing-core = { version = "0.1" } -tracing-subscriber = { version = "0.3", features = ["env-filter", "json"] } +tracing-subscriber = { version = "0.3", default-features = false, features = ["env-filter", "fmt", "json", "smallvec", "tracing-log"] } url = { version = "2", features = ["serde"] } [build-dependencies]