diff --git a/.dockerignore b/.dockerignore index 9fafc2e4ba..ffa72eaf51 100644 --- a/.dockerignore +++ b/.dockerignore @@ -19,6 +19,7 @@ !pageserver/ !pgxn/ !proxy/ +!object_storage/ !storage_scrubber/ !safekeeper/ !storage_broker/ diff --git a/.github/actionlint.yml b/.github/actionlint.yml index edc456d611..1d1b50e458 100644 --- a/.github/actionlint.yml +++ b/.github/actionlint.yml @@ -6,6 +6,7 @@ self-hosted-runner: - small - small-metal - small-arm64 + - unit-perf - us-east-2 config-variables: - AWS_ECR_REGION diff --git a/.github/actions/allure-report-generate/action.yml b/.github/actions/allure-report-generate/action.yml index b85ca7874d..c27311f24e 100644 --- a/.github/actions/allure-report-generate/action.yml +++ b/.github/actions/allure-report-generate/action.yml @@ -70,6 +70,7 @@ runs: - name: Install Allure shell: bash -euxo pipefail {0} + working-directory: /tmp run: | if ! which allure; then ALLURE_ZIP=allure-${ALLURE_VERSION}.zip diff --git a/.github/scripts/generate_image_maps.py b/.github/scripts/generate_image_maps.py index d8f910271b..d3ec048409 100644 --- a/.github/scripts/generate_image_maps.py +++ b/.github/scripts/generate_image_maps.py @@ -39,12 +39,18 @@ registries = { ], } +release_branches = ["release", "release-proxy", "release-compute"] + outputs: dict[str, dict[str, list[str]]] = {} -target_tags = [target_tag, "latest"] if branch == "main" else [target_tag] -target_stages = ( - ["dev", "prod"] if branch in ["release", "release-proxy", "release-compute"] else ["dev"] +target_tags = ( + [target_tag, "latest"] + if branch == "main" + else [target_tag, "released"] + if branch in release_branches + else [target_tag] ) +target_stages = ["dev", "prod"] if branch in release_branches else ["dev"] for component_name, component_images in components.items(): for stage in target_stages: diff --git a/.github/scripts/push_with_image_map.py b/.github/scripts/push_with_image_map.py index c68f6ad407..85e2eb1937 100644 --- 
a/.github/scripts/push_with_image_map.py +++ b/.github/scripts/push_with_image_map.py @@ -2,6 +2,9 @@ import json import os import subprocess +RED = "\033[91m" +RESET = "\033[0m" + image_map = os.getenv("IMAGE_MAP") if not image_map: raise ValueError("IMAGE_MAP environment variable is not set") @@ -11,12 +14,32 @@ try: except json.JSONDecodeError as e: raise ValueError("Failed to parse IMAGE_MAP as JSON") from e -for source, targets in parsed_image_map.items(): - for target in targets: - cmd = ["docker", "buildx", "imagetools", "create", "-t", target, source] - print(f"Running: {' '.join(cmd)}") - result = subprocess.run(cmd, text=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) +failures = [] - if result.returncode != 0: - print(f"Error: {result.stdout}") - raise RuntimeError(f"Command failed: {' '.join(cmd)}") +pending = [(source, target) for source, targets in parsed_image_map.items() for target in targets] + +while len(pending) > 0: + if len(failures) > 10: + print("Error: more than 10 failures!") + for failure in failures: + print(f'"{failure[0]}" failed with the following output:') + print(failure[1]) + raise RuntimeError("Retry limit reached.") + + source, target = pending.pop(0) + cmd = ["docker", "buildx", "imagetools", "create", "-t", target, source] + print(f"Running: {' '.join(cmd)}") + result = subprocess.run(cmd, text=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) + + if result.returncode != 0: + failures.append((" ".join(cmd), result.stdout, target)) + pending.append((source, target)) + print( + f"{RED}[RETRY]{RESET} Push failed for {target}. Retrying... 
(failure count: {len(failures)})" + ) + print(result.stdout) + +if len(failures) > 0 and (github_output := os.getenv("GITHUB_OUTPUT")): + failed_targets = [target for _, _, target in failures] + with open(github_output, "a") as f: + f.write(f"push_failures={json.dumps(failed_targets)}\n") diff --git a/.github/workflows/_push-to-container-registry.yml b/.github/workflows/_push-to-container-registry.yml index 9a7da612d4..7d3a11409b 100644 --- a/.github/workflows/_push-to-container-registry.yml +++ b/.github/workflows/_push-to-container-registry.yml @@ -104,6 +104,25 @@ jobs: password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} - name: Copy docker images to target registries + id: push run: python3 .github/scripts/push_with_image_map.py env: IMAGE_MAP: ${{ inputs.image-map }} + + - name: Notify Slack if container image pushing fails + if: steps.push.outputs.push_failures || failure() + uses: slackapi/slack-github-action@485a9d42d3a73031f12ec201c457e2162c45d02d # v2.0.0 + with: + method: chat.postMessage + token: ${{ secrets.SLACK_BOT_TOKEN }} + payload: | + channel: ${{ vars.SLACK_ON_CALL_DEVPROD_STREAM }} + text: > + *Container image pushing ${{ + steps.push.outcome == 'failure' && 'failed completely' || 'succeeded with some retries' + }}* in + <${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}|GitHub Run> + + ${{ steps.push.outputs.push_failures && format( + '*Failed targets:*\n• {0}', join(fromJson(steps.push.outputs.push_failures), '\n• ') + ) || '' }} diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 7d35066616..80c4511b36 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -89,8 +89,8 @@ jobs: check-codestyle-python: needs: [ meta, check-permissions, build-build-tools-image ] - # No need to run on `main` because we this in the merge queue - if: ${{ needs.meta.outputs.run-kind == 'pr' }} + # No need to run on `main` because we this in the merge 
queue. We do need to run this in `.*-rc-pr` because of hotfixes. + if: ${{ contains(fromJSON('["pr", "storage-rc-pr", "proxy-rc-pr", "compute-rc-pr"]'), needs.meta.outputs.run-kind) }} uses: ./.github/workflows/_check-codestyle-python.yml with: build-tools-image: ${{ needs.build-build-tools-image.outputs.image }}-bookworm @@ -98,7 +98,8 @@ jobs: check-codestyle-jsonnet: needs: [ meta, check-permissions, build-build-tools-image ] - if: ${{ contains(fromJSON('["pr", "push-main"]'), needs.meta.outputs.run-kind) }} + # We do need to run this in `.*-rc-pr` because of hotfixes. + if: ${{ contains(fromJSON('["pr", "push-main", "storage-rc-pr", "proxy-rc-pr", "compute-rc-pr"]'), needs.meta.outputs.run-kind) }} runs-on: [ self-hosted, small ] container: image: ${{ needs.build-build-tools-image.outputs.image }} @@ -181,8 +182,8 @@ jobs: check-codestyle-rust: needs: [ meta, check-permissions, build-build-tools-image ] - # No need to run on `main` because we this in the merge queue - if: ${{ needs.meta.outputs.run-kind == 'pr' }} + # No need to run on `main` because we this in the merge queue. We do need to run this in `.*-rc-pr` because of hotfixes. + if: ${{ contains(fromJSON('["pr", "storage-rc-pr", "proxy-rc-pr", "compute-rc-pr"]'), needs.meta.outputs.run-kind) }} uses: ./.github/workflows/_check-codestyle-rust.yml with: build-tools-image: ${{ needs.build-build-tools-image.outputs.image }}-bookworm @@ -191,7 +192,8 @@ jobs: check-dependencies-rust: needs: [ meta, files-changed, build-build-tools-image ] - if: ${{ needs.files-changed.outputs.check-rust-dependencies == 'true' && needs.meta.outputs.run-kind == 'pr' }} + # No need to run on `main` because we this in the merge queue. We do need to run this in `.*-rc-pr` because of hotfixes. 
+ if: ${{ needs.files-changed.outputs.check-rust-dependencies == 'true' && contains(fromJSON('["pr", "storage-rc-pr", "proxy-rc-pr", "compute-rc-pr"]'), needs.meta.outputs.run-kind) }} uses: ./.github/workflows/cargo-deny.yml with: build-tools-image: ${{ needs.build-build-tools-image.outputs.image }}-bookworm @@ -199,7 +201,8 @@ jobs: build-and-test-locally: needs: [ meta, build-build-tools-image ] - if: ${{ contains(fromJSON('["pr", "push-main"]'), needs.meta.outputs.run-kind) }} + # We do need to run this in `.*-rc-pr` because of hotfixes. + if: ${{ contains(fromJSON('["pr", "push-main", "storage-rc-pr", "proxy-rc-pr", "compute-rc-pr"]'), needs.meta.outputs.run-kind) }} strategy: fail-fast: false matrix: @@ -281,7 +284,7 @@ jobs: statuses: write contents: write pull-requests: write - runs-on: [ self-hosted, small-metal ] + runs-on: [ self-hosted, unit-perf ] container: image: ${{ needs.build-build-tools-image.outputs.image }}-bookworm credentials: @@ -977,7 +980,7 @@ jobs: TEST_EXTENSIONS_TAG: >- ${{ contains(fromJSON('["storage-rc-pr", "proxy-rc-pr"]'), needs.meta.outputs.run-kind) - && 'latest' + && needs.meta.outputs.previous-compute-release || needs.meta.outputs.build-tag }} TEST_VERSION_ONLY: ${{ matrix.pg_version }} @@ -1268,7 +1271,7 @@ jobs: exit 1 deploy: - needs: [ check-permissions, push-neon-image-dev, push-compute-image-dev, push-neon-image-prod, push-compute-image-prod, meta, build-and-test-locally, trigger-custom-extensions-build-and-wait ] + needs: [ check-permissions, push-neon-image-dev, push-compute-image-dev, push-neon-image-prod, push-compute-image-prod, meta, trigger-custom-extensions-build-and-wait ] # `!failure() && !cancelled()` is required because the workflow depends on the job that can be skipped: `push-neon-image-prod` and `push-compute-image-prod` if: ${{ contains(fromJSON('["push-main", "storage-release", "proxy-release", "compute-release"]'), needs.meta.outputs.run-kind) && !failure() && !cancelled() }} permissions: @@ -1565,10 
+1568,10 @@ jobs: if: | contains(needs.*.result, 'failure') || contains(needs.*.result, 'cancelled') - || (needs.check-dependencies-rust.result == 'skipped' && needs.files-changed.outputs.check-rust-dependencies == 'true' && needs.meta.outputs.run-kind == 'pr') - || (needs.build-and-test-locally.result == 'skipped' && needs.meta.outputs.run-kind == 'pr') - || (needs.check-codestyle-python.result == 'skipped' && needs.meta.outputs.run-kind == 'pr') - || (needs.check-codestyle-rust.result == 'skipped' && needs.meta.outputs.run-kind == 'pr') + || (needs.check-dependencies-rust.result == 'skipped' && needs.files-changed.outputs.check-rust-dependencies == 'true' && contains(fromJSON('["pr", "storage-rc-pr", "proxy-rc-pr", "compute-rc-pr"]'), needs.meta.outputs.run-kind)) + || (needs.build-and-test-locally.result == 'skipped' && contains(fromJSON('["pr", "push-main", "storage-rc-pr", "proxy-rc-pr", "compute-rc-pr"]'), needs.meta.outputs.run-kind)) + || (needs.check-codestyle-python.result == 'skipped' && contains(fromJSON('["pr", "storage-rc-pr", "proxy-rc-pr", "compute-rc-pr"]'), needs.meta.outputs.run-kind)) + || (needs.check-codestyle-rust.result == 'skipped' && contains(fromJSON('["pr", "storage-rc-pr", "proxy-rc-pr", "compute-rc-pr"]'), needs.meta.outputs.run-kind)) || needs.files-changed.result == 'skipped' || (needs.push-compute-image-dev.result == 'skipped' && contains(fromJSON('["push-main", "pr", "compute-release", "compute-rc-pr"]'), needs.meta.outputs.run-kind)) || (needs.push-neon-image-dev.result == 'skipped' && contains(fromJSON('["push-main", "pr", "storage-release", "storage-rc-pr", "proxy-release", "proxy-rc-pr"]'), needs.meta.outputs.run-kind)) diff --git a/.github/workflows/force-test-extensions-upgrade.yml b/.github/workflows/force-test-extensions-upgrade.yml index 896ec4a0c1..9c9357055d 100644 --- a/.github/workflows/force-test-extensions-upgrade.yml +++ b/.github/workflows/force-test-extensions-upgrade.yml @@ -55,7 +55,7 @@ jobs: echo tag=${tag} >> 
${GITHUB_OUTPUT} - name: Test extension upgrade - timeout-minutes: 20 + timeout-minutes: 60 env: NEW_COMPUTE_TAG: latest OLD_COMPUTE_TAG: ${{ steps.get-last-compute-release-tag.outputs.tag }} diff --git a/.github/workflows/report-workflow-stats-batch.yml b/.github/workflows/report-workflow-stats-batch.yml index 0f52d24787..6e5093ebd6 100644 --- a/.github/workflows/report-workflow-stats-batch.yml +++ b/.github/workflows/report-workflow-stats-batch.yml @@ -23,7 +23,7 @@ jobs: egress-policy: audit - name: Export Workflow Run for the past 2 hours - uses: neondatabase/gh-workflow-stats-action@4c998b25ab5cc6588b52a610b749531f6a566b6b # v0.2.1 + uses: neondatabase/gh-workflow-stats-action@701b1f202666d0b82e67b4d387e909af2b920127 # v0.2.2 with: db_uri: ${{ secrets.GH_REPORT_STATS_DB_RW_CONNSTR }} db_table: "gh_workflow_stats_neon" @@ -43,7 +43,7 @@ jobs: egress-policy: audit - name: Export Workflow Run for the past 48 hours - uses: neondatabase/gh-workflow-stats-action@4c998b25ab5cc6588b52a610b749531f6a566b6b # v0.2.1 + uses: neondatabase/gh-workflow-stats-action@701b1f202666d0b82e67b4d387e909af2b920127 # v0.2.2 with: db_uri: ${{ secrets.GH_REPORT_STATS_DB_RW_CONNSTR }} db_table: "gh_workflow_stats_neon" @@ -63,7 +63,7 @@ jobs: egress-policy: audit - name: Export Workflow Run for the past 30 days - uses: neondatabase/gh-workflow-stats-action@4c998b25ab5cc6588b52a610b749531f6a566b6b # v0.2.1 + uses: neondatabase/gh-workflow-stats-action@701b1f202666d0b82e67b4d387e909af2b920127 # v0.2.2 with: db_uri: ${{ secrets.GH_REPORT_STATS_DB_RW_CONNSTR }} db_table: "gh_workflow_stats_neon" diff --git a/.gitignore b/.gitignore index a07a65ccef..45eb4dbf0e 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,4 @@ +/artifact_cache /pg_install /target /tmp_check diff --git a/Cargo.lock b/Cargo.lock index 194ad90d52..5d2cdcea27 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2837,6 +2837,7 @@ dependencies = [ "utils", "uuid", "workspace_hack", + "x509-cert", ] [[package]] @@ -3991,6 +3992,33 @@ 
dependencies = [ "memchr", ] +[[package]] +name = "object_storage" +version = "0.0.1" +dependencies = [ + "anyhow", + "axum", + "axum-extra", + "camino", + "camino-tempfile", + "futures", + "http-body-util", + "itertools 0.10.5", + "jsonwebtoken", + "prometheus", + "rand 0.8.5", + "remote_storage", + "serde", + "serde_json", + "test-log", + "tokio", + "tokio-util", + "tower 0.5.2", + "tracing", + "utils", + "workspace_hack", +] + [[package]] name = "once_cell" version = "1.20.2" @@ -4329,6 +4357,7 @@ dependencies = [ "strum", "strum_macros", "thiserror 1.0.69", + "tracing-utils", "utils", ] @@ -4692,7 +4721,7 @@ dependencies = [ [[package]] name = "postgres-protocol" version = "0.6.6" -source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#1f21e7959a96a34dcfbfce1b14b73286cdadffe9" +source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#f3cf448febde5fd298071d54d568a9c875a7a62b" dependencies = [ "base64 0.22.1", "byteorder", @@ -4726,7 +4755,7 @@ dependencies = [ [[package]] name = "postgres-types" version = "0.2.6" -source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#1f21e7959a96a34dcfbfce1b14b73286cdadffe9" +source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#f3cf448febde5fd298071d54d568a9c875a7a62b" dependencies = [ "bytes", "chrono", @@ -6924,6 +6953,28 @@ dependencies = [ "syn 2.0.100", ] +[[package]] +name = "test-log" +version = "0.2.17" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e7f46083d221181166e5b6f6b1e5f1d499f3a76888826e6cb1d057554157cd0f" +dependencies = [ + "env_logger", + "test-log-macros", + "tracing-subscriber", +] + +[[package]] +name = "test-log-macros" +version = "0.2.17" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "888d0c3c6db53c0fdab160d2ed5e12ba745383d3e85813f2ea0f2b1475ab553f" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.100", +] + [[package]] name = "thiserror" version = 
"1.0.69" @@ -7115,9 +7166,9 @@ dependencies = [ [[package]] name = "tokio" -version = "1.43.0" +version = "1.43.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3d61fa4ffa3de412bfea335c6ecff681de2b609ba3c77ef3e00e521813a9ed9e" +checksum = "492a604e2fd7f814268a378409e6c92b5525d747d10db9a229723f55a417958c" dependencies = [ "backtrace", "bytes", @@ -7171,7 +7222,7 @@ dependencies = [ [[package]] name = "tokio-postgres" version = "0.7.10" -source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#1f21e7959a96a34dcfbfce1b14b73286cdadffe9" +source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#f3cf448febde5fd298071d54d568a9c875a7a62b" dependencies = [ "async-trait", "byteorder", @@ -7603,6 +7654,7 @@ dependencies = [ "opentelemetry-otlp", "opentelemetry-semantic-conventions", "opentelemetry_sdk", + "pin-project-lite", "tokio", "tracing", "tracing-opentelemetry", diff --git a/Cargo.toml b/Cargo.toml index 3fb9229da8..d957fa9070 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -40,6 +40,7 @@ members = [ "libs/proxy/postgres-protocol2", "libs/proxy/postgres-types2", "libs/proxy/tokio-postgres2", + "object_storage", ] [workspace.package] @@ -183,7 +184,7 @@ test-context = "0.3" thiserror = "1.0" tikv-jemallocator = { version = "0.6", features = ["profiling", "stats", "unprefixed_malloc_on_supported_platforms"] } tikv-jemalloc-ctl = { version = "0.6", features = ["stats"] } -tokio = { version = "1.41", features = ["macros"] } +tokio = { version = "1.43.1", features = ["macros"] } tokio-epoll-uring = { git = "https://github.com/neondatabase/tokio-epoll-uring.git" , branch = "main" } tokio-io-timeout = "1.2.0" tokio-postgres-rustls = "0.12.0" @@ -208,6 +209,7 @@ tracing-opentelemetry = "0.28" tracing-serde = "0.2.0" tracing-subscriber = { version = "0.3", default-features = false, features = ["smallvec", "fmt", "tracing-log", "std", "env-filter", "json"] } try-lock = "0.2.5" +test-log = { version = "0.2.17", 
default-features = false, features = ["log"] } twox-hash = { version = "1.6.3", default-features = false } typed-json = "0.1" url = "2.2" diff --git a/Dockerfile b/Dockerfile index 01540e1925..848bfab921 100644 --- a/Dockerfile +++ b/Dockerfile @@ -89,6 +89,7 @@ RUN set -e \ --bin storage_broker \ --bin storage_controller \ --bin proxy \ + --bin object_storage \ --bin neon_local \ --bin storage_scrubber \ --locked --release @@ -121,6 +122,7 @@ COPY --from=build --chown=neon:neon /home/nonroot/target/release/safekeeper COPY --from=build --chown=neon:neon /home/nonroot/target/release/storage_broker /usr/local/bin COPY --from=build --chown=neon:neon /home/nonroot/target/release/storage_controller /usr/local/bin COPY --from=build --chown=neon:neon /home/nonroot/target/release/proxy /usr/local/bin +COPY --from=build --chown=neon:neon /home/nonroot/target/release/object_storage /usr/local/bin COPY --from=build --chown=neon:neon /home/nonroot/target/release/neon_local /usr/local/bin COPY --from=build --chown=neon:neon /home/nonroot/target/release/storage_scrubber /usr/local/bin diff --git a/build-tools.Dockerfile b/build-tools.Dockerfile index c103ceaea5..7766991a0a 100644 --- a/build-tools.Dockerfile +++ b/build-tools.Dockerfile @@ -292,7 +292,7 @@ WORKDIR /home/nonroot # Rust # Please keep the version of llvm (installed above) in sync with rust llvm (`rustc --version --verbose | grep LLVM`) -ENV RUSTC_VERSION=1.85.0 +ENV RUSTC_VERSION=1.86.0 ENV RUSTUP_HOME="/home/nonroot/.rustup" ENV PATH="/home/nonroot/.cargo/bin:${PATH}" ARG RUSTFILT_VERSION=0.2.1 diff --git a/compute/compute-node.Dockerfile b/compute/compute-node.Dockerfile index b1a6000a7e..261f647e8f 100644 --- a/compute/compute-node.Dockerfile +++ b/compute/compute-node.Dockerfile @@ -369,7 +369,7 @@ FROM build-deps AS plv8-src ARG PG_VERSION WORKDIR /ext-src -COPY compute/patches/plv8-3.1.10.patch . +COPY compute/patches/plv8* . 
# plv8 3.2.3 supports v17 # last release v3.2.3 - Sep 7, 2024 @@ -393,7 +393,7 @@ RUN case "${PG_VERSION:?}" in \ git clone --recurse-submodules --depth 1 --branch ${PLV8_TAG} https://github.com/plv8/plv8.git plv8-src && \ tar -czf plv8.tar.gz --exclude .git plv8-src && \ cd plv8-src && \ - if [[ "${PG_VERSION:?}" < "v17" ]]; then patch -p1 < /ext-src/plv8-3.1.10.patch; fi + if [[ "${PG_VERSION:?}" < "v17" ]]; then patch -p1 < /ext-src/plv8_v3.1.10.patch; else patch -p1 < /ext-src/plv8_v3.2.3.patch; fi # Step 1: Build the vendored V8 engine. It doesn't depend on PostgreSQL, so use # 'build-deps' as the base. This enables caching and avoids unnecessary rebuilds. @@ -1022,67 +1022,6 @@ RUN make -j $(getconf _NPROCESSORS_ONLN) && \ make -j $(getconf _NPROCESSORS_ONLN) install && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/semver.control -######################################################################################### -# -# Layer "pg_embedding-build" -# compile pg_embedding extension -# -######################################################################################### -FROM build-deps AS pg_embedding-src -ARG PG_VERSION - -# This is our extension, support stopped in favor of pgvector -# TODO: deprecate it -WORKDIR /ext-src -RUN case "${PG_VERSION:?}" in \ - "v14" | "v15") \ - export PG_EMBEDDING_VERSION=0.3.5 \ - export PG_EMBEDDING_CHECKSUM=0e95b27b8b6196e2cf0a0c9ec143fe2219b82e54c5bb4ee064e76398cbe69ae9 \ - ;; \ - *) \ - echo "pg_embedding not supported on this PostgreSQL version. Use pgvector instead." && exit 0;; \ - esac && \ - wget https://github.com/neondatabase/pg_embedding/archive/refs/tags/${PG_EMBEDDING_VERSION}.tar.gz -O pg_embedding.tar.gz && \ - echo "${PG_EMBEDDING_CHECKSUM} pg_embedding.tar.gz" | sha256sum --check && \ - mkdir pg_embedding-src && cd pg_embedding-src && tar xzf ../pg_embedding.tar.gz --strip-components=1 -C . 
- -FROM pg-build AS pg_embedding-build -COPY --from=pg_embedding-src /ext-src/ /ext-src/ -WORKDIR /ext-src/ -RUN if [ -d pg_embedding-src ]; then \ - cd pg_embedding-src && \ - make -j $(getconf _NPROCESSORS_ONLN) && \ - make -j $(getconf _NPROCESSORS_ONLN) install; \ - fi - -######################################################################################### -# -# Layer "pg_anon-build" -# compile anon extension -# -######################################################################################### -FROM build-deps AS pg_anon-src -ARG PG_VERSION - -# This is an experimental extension, never got to real production. -# !Do not remove! It can be present in shared_preload_libraries and compute will fail to start if library is not found. -WORKDIR /ext-src -RUN case "${PG_VERSION:?}" in "v17") \ - echo "postgresql_anonymizer does not yet support PG17" && exit 0;; \ - esac && \ - wget https://github.com/neondatabase/postgresql_anonymizer/archive/refs/tags/neon_1.1.1.tar.gz -O pg_anon.tar.gz && \ - echo "321ea8d5c1648880aafde850a2c576e4a9e7b9933a34ce272efc839328999fa9 pg_anon.tar.gz" | sha256sum --check && \ - mkdir pg_anon-src && cd pg_anon-src && tar xzf ../pg_anon.tar.gz --strip-components=1 -C . 
- -FROM pg-build AS pg_anon-build -COPY --from=pg_anon-src /ext-src/ /ext-src/ -WORKDIR /ext-src -RUN if [ -d pg_anon-src ]; then \ - cd pg_anon-src && \ - make -j $(getconf _NPROCESSORS_ONLN) install && \ - echo 'trusted = true' >> /usr/local/pgsql/share/extension/anon.control; \ - fi - ######################################################################################### # # Layer "pg build with nonroot user and cargo installed" @@ -1366,8 +1305,8 @@ ARG PG_VERSION # Do not update without approve from proxy team # Make sure the version is reflected in proxy/src/serverless/local_conn_pool.rs WORKDIR /ext-src -RUN wget https://github.com/neondatabase/pg_session_jwt/archive/refs/tags/v0.2.0.tar.gz -O pg_session_jwt.tar.gz && \ - echo "5ace028e591f2e000ca10afa5b1ca62203ebff014c2907c0ec3b29c36f28a1bb pg_session_jwt.tar.gz" | sha256sum --check && \ +RUN wget https://github.com/neondatabase/pg_session_jwt/archive/refs/tags/v0.3.0.tar.gz -O pg_session_jwt.tar.gz && \ + echo "19be2dc0b3834d643706ed430af998bb4c2cdf24b3c45e7b102bb3a550e8660c pg_session_jwt.tar.gz" | sha256sum --check && \ mkdir pg_session_jwt-src && cd pg_session_jwt-src && tar xzf ../pg_session_jwt.tar.gz --strip-components=1 -C . 
&& \ sed -i 's/pgrx = "0.12.6"/pgrx = { version = "0.12.9", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \ sed -i 's/version = "0.12.6"/version = "0.12.9"/g' pgrx-tests/Cargo.toml && \ @@ -1675,9 +1614,7 @@ COPY --from=rdkit-build /usr/local/pgsql/ /usr/local/pgsql/ COPY --from=pg_uuidv7-build /usr/local/pgsql/ /usr/local/pgsql/ COPY --from=pg_roaringbitmap-build /usr/local/pgsql/ /usr/local/pgsql/ COPY --from=pg_semver-build /usr/local/pgsql/ /usr/local/pgsql/ -COPY --from=pg_embedding-build /usr/local/pgsql/ /usr/local/pgsql/ COPY --from=wal2json-build /usr/local/pgsql /usr/local/pgsql -COPY --from=pg_anon-build /usr/local/pgsql/ /usr/local/pgsql/ COPY --from=pg_ivm-build /usr/local/pgsql/ /usr/local/pgsql/ COPY --from=pg_partman-build /usr/local/pgsql/ /usr/local/pgsql/ COPY --from=pg_mooncake-build /usr/local/pgsql/ /usr/local/pgsql/ @@ -1853,7 +1790,6 @@ COPY --from=pg_cron-src /ext-src/ /ext-src/ COPY --from=pg_uuidv7-src /ext-src/ /ext-src/ COPY --from=pg_roaringbitmap-src /ext-src/ /ext-src/ COPY --from=pg_semver-src /ext-src/ /ext-src/ -#COPY --from=pg_embedding-src /ext-src/ /ext-src/ #COPY --from=wal2json-src /ext-src/ /ext-src/ COPY --from=pg_ivm-src /ext-src/ /ext-src/ COPY --from=pg_partman-src /ext-src/ /ext-src/ diff --git a/compute/etc/neon_collector.jsonnet b/compute/etc/neon_collector.jsonnet index da2b86d542..449e1199d0 100644 --- a/compute/etc/neon_collector.jsonnet +++ b/compute/etc/neon_collector.jsonnet @@ -33,6 +33,7 @@ import 'sql_exporter/lfc_hits.libsonnet', import 'sql_exporter/lfc_misses.libsonnet', import 'sql_exporter/lfc_used.libsonnet', + import 'sql_exporter/lfc_used_pages.libsonnet', import 'sql_exporter/lfc_writes.libsonnet', import 'sql_exporter/logical_slot_restart_lsn.libsonnet', import 'sql_exporter/max_cluster_size.libsonnet', diff --git a/compute/etc/sql_exporter/lfc_used_pages.libsonnet b/compute/etc/sql_exporter/lfc_used_pages.libsonnet new file mode 100644 index 0000000000..1e39a93482 --- /dev/null +++ 
b/compute/etc/sql_exporter/lfc_used_pages.libsonnet @@ -0,0 +1,10 @@ +{ + metric_name: 'lfc_used_pages', + type: 'gauge', + help: 'LFC pages used', + key_labels: null, + values: [ + 'lfc_used_pages', + ], + query: importstr 'sql_exporter/lfc_used_pages.sql', +} diff --git a/compute/etc/sql_exporter/lfc_used_pages.sql b/compute/etc/sql_exporter/lfc_used_pages.sql new file mode 100644 index 0000000000..56d92f8514 --- /dev/null +++ b/compute/etc/sql_exporter/lfc_used_pages.sql @@ -0,0 +1 @@ +SELECT lfc_value AS lfc_used_pages FROM neon.neon_lfc_stats WHERE lfc_key = 'file_cache_used_pages'; diff --git a/compute/patches/cloud_regress_pg16.patch b/compute/patches/cloud_regress_pg16.patch index 3f0bb84ae7..ae415a5412 100644 --- a/compute/patches/cloud_regress_pg16.patch +++ b/compute/patches/cloud_regress_pg16.patch @@ -202,10 +202,10 @@ index cf0b80d616..e8e2a14a4a 100644 COMMENT ON CONSTRAINT the_constraint ON constraint_comments_tbl IS 'no, the comment'; ERROR: must be owner of relation constraint_comments_tbl diff --git a/src/test/regress/expected/conversion.out b/src/test/regress/expected/conversion.out -index 442e7aff2b..525f732b03 100644 +index d785f92561..16377e5ac9 100644 --- a/src/test/regress/expected/conversion.out +++ b/src/test/regress/expected/conversion.out -@@ -8,7 +8,7 @@ +@@ -15,7 +15,7 @@ SELECT FROM test_enc_setup(); CREATE FUNCTION test_enc_conversion(bytea, name, name, bool, validlen OUT int, result OUT bytea) AS :'regresslib', 'test_enc_conversion' LANGUAGE C STRICT; @@ -587,16 +587,15 @@ index f551624afb..57f1e432d4 100644 SELECT * INTO TABLE ramp diff --git a/src/test/regress/expected/database.out b/src/test/regress/expected/database.out -index 454db91ec0..01378d7081 100644 +index 4cbdbdf84d..573362850e 100644 --- a/src/test/regress/expected/database.out +++ b/src/test/regress/expected/database.out -@@ -1,8 +1,7 @@ +@@ -1,8 +1,6 @@ CREATE DATABASE regression_tbd ENCODING utf8 LC_COLLATE "C" LC_CTYPE "C" TEMPLATE template0; ALTER DATABASE 
regression_tbd RENAME TO regression_utf8; -ALTER DATABASE regression_utf8 SET TABLESPACE regress_tblspace; -ALTER DATABASE regression_utf8 RESET TABLESPACE; -+WARNING: you need to manually restart any running background workers after this command ALTER DATABASE regression_utf8 CONNECTION_LIMIT 123; -- Test PgDatabaseToastTable. Doing this with GRANT would be slow. BEGIN; @@ -700,7 +699,7 @@ index 6ed50fdcfa..caa00a345d 100644 COMMENT ON FOREIGN DATA WRAPPER dummy IS 'useless'; CREATE FOREIGN DATA WRAPPER postgresql VALIDATOR postgresql_fdw_validator; diff --git a/src/test/regress/expected/foreign_key.out b/src/test/regress/expected/foreign_key.out -index 6b8c2f2414..8e13b7fa46 100644 +index 84745b9f60..4883c12351 100644 --- a/src/test/regress/expected/foreign_key.out +++ b/src/test/regress/expected/foreign_key.out @@ -1985,7 +1985,7 @@ ALTER TABLE fk_partitioned_fk_6 ATTACH PARTITION fk_partitioned_pk_6 FOR VALUES @@ -1112,7 +1111,7 @@ index 8475231735..0653946337 100644 DROP ROLE regress_passwd_sha_len1; DROP ROLE regress_passwd_sha_len2; diff --git a/src/test/regress/expected/privileges.out b/src/test/regress/expected/privileges.out -index 5b9dba7b32..cc408dad42 100644 +index 620fbe8c52..0570102357 100644 --- a/src/test/regress/expected/privileges.out +++ b/src/test/regress/expected/privileges.out @@ -20,19 +20,19 @@ SELECT lo_unlink(oid) FROM pg_largeobject_metadata WHERE oid >= 1000 AND oid < 3 @@ -1174,8 +1173,8 @@ index 5b9dba7b32..cc408dad42 100644 +CREATE GROUP regress_priv_group2 WITH ADMIN regress_priv_user1 PASSWORD NEON_PASSWORD_PLACEHOLDER USER regress_priv_user2; ALTER GROUP regress_priv_group1 ADD USER regress_priv_user4; GRANT regress_priv_group2 TO regress_priv_user2 GRANTED BY regress_priv_user1; - SET SESSION AUTHORIZATION regress_priv_user1; -@@ -239,12 +239,16 @@ GRANT regress_priv_role TO regress_priv_user1 WITH ADMIN OPTION GRANTED BY regre + SET SESSION AUTHORIZATION regress_priv_user3; +@@ -246,12 +246,16 @@ GRANT regress_priv_role TO 
regress_priv_user1 WITH ADMIN OPTION GRANTED BY regre ERROR: permission denied to grant privileges as role "regress_priv_role" DETAIL: The grantor must have the ADMIN option on role "regress_priv_role". GRANT regress_priv_role TO regress_priv_user1 WITH ADMIN OPTION GRANTED BY CURRENT_ROLE; @@ -1192,7 +1191,7 @@ index 5b9dba7b32..cc408dad42 100644 DROP ROLE regress_priv_role; SET SESSION AUTHORIZATION regress_priv_user1; SELECT session_user, current_user; -@@ -1776,7 +1780,7 @@ SELECT has_table_privilege('regress_priv_user1', 'atest4', 'SELECT WITH GRANT OP +@@ -1783,7 +1787,7 @@ SELECT has_table_privilege('regress_priv_user1', 'atest4', 'SELECT WITH GRANT OP -- security-restricted operations \c - @@ -1201,7 +1200,7 @@ index 5b9dba7b32..cc408dad42 100644 -- Check that index expressions and predicates are run as the table's owner -- A dummy index function checking current_user CREATE FUNCTION sro_ifun(int) RETURNS int AS $$ -@@ -2668,8 +2672,8 @@ drop cascades to function testns.priv_testagg(integer) +@@ -2675,8 +2679,8 @@ drop cascades to function testns.priv_testagg(integer) drop cascades to function testns.priv_testproc(integer) -- Change owner of the schema & and rename of new schema owner \c - @@ -1212,7 +1211,7 @@ index 5b9dba7b32..cc408dad42 100644 SET SESSION ROLE regress_schemauser1; CREATE SCHEMA testns; SELECT nspname, rolname FROM pg_namespace, pg_roles WHERE pg_namespace.nspname = 'testns' AND pg_namespace.nspowner = pg_roles.oid; -@@ -2792,7 +2796,7 @@ DROP USER regress_priv_user7; +@@ -2799,7 +2803,7 @@ DROP USER regress_priv_user7; DROP USER regress_priv_user8; -- does not exist ERROR: role "regress_priv_user8" does not exist -- permissions with LOCK TABLE @@ -1221,7 +1220,7 @@ index 5b9dba7b32..cc408dad42 100644 CREATE TABLE lock_table (a int); -- LOCK TABLE and SELECT permission GRANT SELECT ON lock_table TO regress_locktable_user; -@@ -2874,7 +2878,7 @@ DROP USER regress_locktable_user; +@@ -2881,7 +2885,7 @@ DROP USER regress_locktable_user; -- 
pg_backend_memory_contexts. -- switch to superuser \c - @@ -1230,7 +1229,7 @@ index 5b9dba7b32..cc408dad42 100644 SELECT has_table_privilege('regress_readallstats','pg_backend_memory_contexts','SELECT'); -- no has_table_privilege --------------------- -@@ -2918,10 +2922,10 @@ RESET ROLE; +@@ -2925,10 +2929,10 @@ RESET ROLE; -- clean up DROP ROLE regress_readallstats; -- test role grantor machinery @@ -1245,7 +1244,7 @@ index 5b9dba7b32..cc408dad42 100644 GRANT regress_group TO regress_group_direct_manager WITH INHERIT FALSE, ADMIN TRUE; GRANT regress_group_direct_manager TO regress_group_indirect_manager; SET SESSION AUTHORIZATION regress_group_direct_manager; -@@ -2950,9 +2954,9 @@ DROP ROLE regress_group_direct_manager; +@@ -2957,9 +2961,9 @@ DROP ROLE regress_group_direct_manager; DROP ROLE regress_group_indirect_manager; DROP ROLE regress_group_member; -- test SET and INHERIT options with object ownership changes @@ -1841,7 +1840,7 @@ index 09a255649b..15895f0c53 100644 CREATE TABLE ruletest_t2 (x int); CREATE VIEW ruletest_v1 WITH (security_invoker=true) AS diff --git a/src/test/regress/expected/security_label.out b/src/test/regress/expected/security_label.out -index a8e01a6220..5a9cef4ede 100644 +index a8e01a6220..83543b250a 100644 --- a/src/test/regress/expected/security_label.out +++ b/src/test/regress/expected/security_label.out @@ -6,8 +6,8 @@ SET client_min_messages TO 'warning'; @@ -1855,34 +1854,6 @@ index a8e01a6220..5a9cef4ede 100644 CREATE TABLE seclabel_tbl1 (a int, b text); CREATE TABLE seclabel_tbl2 (x int, y text); CREATE VIEW seclabel_view1 AS SELECT * FROM seclabel_tbl2; -@@ -19,21 +19,21 @@ ALTER TABLE seclabel_tbl2 OWNER TO regress_seclabel_user2; - -- Test of SECURITY LABEL statement without a plugin - -- - SECURITY LABEL ON TABLE seclabel_tbl1 IS 'classified'; -- fail --ERROR: no security label providers have been loaded -+ERROR: must specify provider when multiple security label providers have been loaded - SECURITY LABEL FOR 'dummy' ON 
TABLE seclabel_tbl1 IS 'classified'; -- fail - ERROR: security label provider "dummy" is not loaded - SECURITY LABEL ON TABLE seclabel_tbl1 IS '...invalid label...'; -- fail --ERROR: no security label providers have been loaded -+ERROR: must specify provider when multiple security label providers have been loaded - SECURITY LABEL ON TABLE seclabel_tbl3 IS 'unclassified'; -- fail --ERROR: no security label providers have been loaded -+ERROR: must specify provider when multiple security label providers have been loaded - SECURITY LABEL ON ROLE regress_seclabel_user1 IS 'classified'; -- fail --ERROR: no security label providers have been loaded -+ERROR: must specify provider when multiple security label providers have been loaded - SECURITY LABEL FOR 'dummy' ON ROLE regress_seclabel_user1 IS 'classified'; -- fail - ERROR: security label provider "dummy" is not loaded - SECURITY LABEL ON ROLE regress_seclabel_user1 IS '...invalid label...'; -- fail --ERROR: no security label providers have been loaded -+ERROR: must specify provider when multiple security label providers have been loaded - SECURITY LABEL ON ROLE regress_seclabel_user3 IS 'unclassified'; -- fail --ERROR: no security label providers have been loaded -+ERROR: must specify provider when multiple security label providers have been loaded - -- clean up objects - DROP FUNCTION seclabel_four(); - DROP DOMAIN seclabel_domain; diff --git a/src/test/regress/expected/select_into.out b/src/test/regress/expected/select_into.out index b79fe9a1c0..e29fab88ab 100644 --- a/src/test/regress/expected/select_into.out @@ -2413,10 +2384,10 @@ index e3e3bea709..fa86ddc326 100644 COMMENT ON CONSTRAINT the_constraint ON constraint_comments_tbl IS 'no, the comment'; COMMENT ON CONSTRAINT the_constraint ON DOMAIN constraint_comments_dom IS 'no, another comment'; diff --git a/src/test/regress/sql/conversion.sql b/src/test/regress/sql/conversion.sql -index 9a65fca91f..58431a3056 100644 +index b567a1a572..4d1ac2e631 100644 --- 
a/src/test/regress/sql/conversion.sql +++ b/src/test/regress/sql/conversion.sql -@@ -12,7 +12,7 @@ CREATE FUNCTION test_enc_conversion(bytea, name, name, bool, validlen OUT int, r +@@ -17,7 +17,7 @@ CREATE FUNCTION test_enc_conversion(bytea, name, name, bool, validlen OUT int, r AS :'regresslib', 'test_enc_conversion' LANGUAGE C STRICT; @@ -2780,7 +2751,7 @@ index ae6841308b..47bc792e30 100644 SELECT * diff --git a/src/test/regress/sql/database.sql b/src/test/regress/sql/database.sql -index 0367c0e37a..a23b98c4bd 100644 +index 46ad263478..eb05584ed5 100644 --- a/src/test/regress/sql/database.sql +++ b/src/test/regress/sql/database.sql @@ -1,8 +1,6 @@ @@ -2893,7 +2864,7 @@ index aa147b14a9..370e0dd570 100644 CREATE FOREIGN DATA WRAPPER dummy; COMMENT ON FOREIGN DATA WRAPPER dummy IS 'useless'; diff --git a/src/test/regress/sql/foreign_key.sql b/src/test/regress/sql/foreign_key.sql -index 45c7a534cb..32dd26b8cd 100644 +index 9f4210b26e..620d3fc87e 100644 --- a/src/test/regress/sql/foreign_key.sql +++ b/src/test/regress/sql/foreign_key.sql @@ -1435,7 +1435,7 @@ ALTER TABLE fk_partitioned_fk_6 ATTACH PARTITION fk_partitioned_pk_6 FOR VALUES @@ -3246,7 +3217,7 @@ index 53e86b0b6c..0303fdfe96 100644 -- Check that the invalid secrets were re-hashed. A re-hashed secret -- should not contain the original salt. 
diff --git a/src/test/regress/sql/privileges.sql b/src/test/regress/sql/privileges.sql -index 249df17a58..b258e7f26a 100644 +index 259f1aedd1..6e1a3d17b7 100644 --- a/src/test/regress/sql/privileges.sql +++ b/src/test/regress/sql/privileges.sql @@ -24,18 +24,18 @@ RESET client_min_messages; @@ -3308,7 +3279,7 @@ index 249df17a58..b258e7f26a 100644 ALTER GROUP regress_priv_group1 ADD USER regress_priv_user4; -@@ -1157,7 +1157,7 @@ SELECT has_table_privilege('regress_priv_user1', 'atest4', 'SELECT WITH GRANT OP +@@ -1160,7 +1160,7 @@ SELECT has_table_privilege('regress_priv_user1', 'atest4', 'SELECT WITH GRANT OP -- security-restricted operations \c - @@ -3317,7 +3288,7 @@ index 249df17a58..b258e7f26a 100644 -- Check that index expressions and predicates are run as the table's owner -@@ -1653,8 +1653,8 @@ DROP SCHEMA testns CASCADE; +@@ -1656,8 +1656,8 @@ DROP SCHEMA testns CASCADE; -- Change owner of the schema & and rename of new schema owner \c - @@ -3328,7 +3299,7 @@ index 249df17a58..b258e7f26a 100644 SET SESSION ROLE regress_schemauser1; CREATE SCHEMA testns; -@@ -1748,7 +1748,7 @@ DROP USER regress_priv_user8; -- does not exist +@@ -1751,7 +1751,7 @@ DROP USER regress_priv_user8; -- does not exist -- permissions with LOCK TABLE @@ -3337,7 +3308,7 @@ index 249df17a58..b258e7f26a 100644 CREATE TABLE lock_table (a int); -- LOCK TABLE and SELECT permission -@@ -1836,7 +1836,7 @@ DROP USER regress_locktable_user; +@@ -1839,7 +1839,7 @@ DROP USER regress_locktable_user; -- switch to superuser \c - @@ -3346,7 +3317,7 @@ index 249df17a58..b258e7f26a 100644 SELECT has_table_privilege('regress_readallstats','pg_backend_memory_contexts','SELECT'); -- no SELECT has_table_privilege('regress_readallstats','pg_shmem_allocations','SELECT'); -- no -@@ -1856,10 +1856,10 @@ RESET ROLE; +@@ -1859,10 +1859,10 @@ RESET ROLE; DROP ROLE regress_readallstats; -- test role grantor machinery @@ -3361,7 +3332,7 @@ index 249df17a58..b258e7f26a 100644 GRANT regress_group TO 
regress_group_direct_manager WITH INHERIT FALSE, ADMIN TRUE; GRANT regress_group_direct_manager TO regress_group_indirect_manager; -@@ -1881,9 +1881,9 @@ DROP ROLE regress_group_indirect_manager; +@@ -1884,9 +1884,9 @@ DROP ROLE regress_group_indirect_manager; DROP ROLE regress_group_member; -- test SET and INHERIT options with object ownership changes diff --git a/compute/patches/cloud_regress_pg17.patch b/compute/patches/cloud_regress_pg17.patch index e57447a2c6..4f10f8563a 100644 --- a/compute/patches/cloud_regress_pg17.patch +++ b/compute/patches/cloud_regress_pg17.patch @@ -202,10 +202,10 @@ index cf0b80d616..e8e2a14a4a 100644 COMMENT ON CONSTRAINT the_constraint ON constraint_comments_tbl IS 'no, the comment'; ERROR: must be owner of relation constraint_comments_tbl diff --git a/src/test/regress/expected/conversion.out b/src/test/regress/expected/conversion.out -index 442e7aff2b..525f732b03 100644 +index d785f92561..16377e5ac9 100644 --- a/src/test/regress/expected/conversion.out +++ b/src/test/regress/expected/conversion.out -@@ -8,7 +8,7 @@ +@@ -15,7 +15,7 @@ SELECT FROM test_enc_setup(); CREATE FUNCTION test_enc_conversion(bytea, name, name, bool, validlen OUT int, result OUT bytea) AS :'regresslib', 'test_enc_conversion' LANGUAGE C STRICT; @@ -587,16 +587,15 @@ index f551624afb..57f1e432d4 100644 SELECT * INTO TABLE ramp diff --git a/src/test/regress/expected/database.out b/src/test/regress/expected/database.out -index 454db91ec0..01378d7081 100644 +index 4cbdbdf84d..573362850e 100644 --- a/src/test/regress/expected/database.out +++ b/src/test/regress/expected/database.out -@@ -1,8 +1,7 @@ +@@ -1,8 +1,6 @@ CREATE DATABASE regression_tbd ENCODING utf8 LC_COLLATE "C" LC_CTYPE "C" TEMPLATE template0; ALTER DATABASE regression_tbd RENAME TO regression_utf8; -ALTER DATABASE regression_utf8 SET TABLESPACE regress_tblspace; -ALTER DATABASE regression_utf8 RESET TABLESPACE; -+WARNING: you need to manually restart any running background workers after this command 
ALTER DATABASE regression_utf8 CONNECTION_LIMIT 123; -- Test PgDatabaseToastTable. Doing this with GRANT would be slow. BEGIN; @@ -700,7 +699,7 @@ index 6ed50fdcfa..caa00a345d 100644 COMMENT ON FOREIGN DATA WRAPPER dummy IS 'useless'; CREATE FOREIGN DATA WRAPPER postgresql VALIDATOR postgresql_fdw_validator; diff --git a/src/test/regress/expected/foreign_key.out b/src/test/regress/expected/foreign_key.out -index 69994c98e3..129abcfbe8 100644 +index fe6a1015f2..614b387b7d 100644 --- a/src/test/regress/expected/foreign_key.out +++ b/src/test/regress/expected/foreign_key.out @@ -1985,7 +1985,7 @@ ALTER TABLE fk_partitioned_fk_6 ATTACH PARTITION fk_partitioned_pk_6 FOR VALUES @@ -1147,7 +1146,7 @@ index 924d6e001d..7fdda73439 100644 DROP ROLE regress_passwd_sha_len1; DROP ROLE regress_passwd_sha_len2; diff --git a/src/test/regress/expected/privileges.out b/src/test/regress/expected/privileges.out -index 1296da0d57..f43fffa44c 100644 +index e8c668e0a1..03be5c2120 100644 --- a/src/test/regress/expected/privileges.out +++ b/src/test/regress/expected/privileges.out @@ -20,19 +20,19 @@ SELECT lo_unlink(oid) FROM pg_largeobject_metadata WHERE oid >= 1000 AND oid < 3 @@ -1209,8 +1208,8 @@ index 1296da0d57..f43fffa44c 100644 +CREATE GROUP regress_priv_group2 WITH ADMIN regress_priv_user1 PASSWORD NEON_PASSWORD_PLACEHOLDER USER regress_priv_user2; ALTER GROUP regress_priv_group1 ADD USER regress_priv_user4; GRANT regress_priv_group2 TO regress_priv_user2 GRANTED BY regress_priv_user1; - SET SESSION AUTHORIZATION regress_priv_user1; -@@ -239,12 +239,16 @@ GRANT regress_priv_role TO regress_priv_user1 WITH ADMIN OPTION GRANTED BY regre + SET SESSION AUTHORIZATION regress_priv_user3; +@@ -246,12 +246,16 @@ GRANT regress_priv_role TO regress_priv_user1 WITH ADMIN OPTION GRANTED BY regre ERROR: permission denied to grant privileges as role "regress_priv_role" DETAIL: The grantor must have the ADMIN option on role "regress_priv_role". 
GRANT regress_priv_role TO regress_priv_user1 WITH ADMIN OPTION GRANTED BY CURRENT_ROLE; @@ -1227,7 +1226,7 @@ index 1296da0d57..f43fffa44c 100644 DROP ROLE regress_priv_role; SET SESSION AUTHORIZATION regress_priv_user1; SELECT session_user, current_user; -@@ -1776,7 +1780,7 @@ SELECT has_table_privilege('regress_priv_user1', 'atest4', 'SELECT WITH GRANT OP +@@ -1783,7 +1787,7 @@ SELECT has_table_privilege('regress_priv_user1', 'atest4', 'SELECT WITH GRANT OP -- security-restricted operations \c - @@ -1236,7 +1235,7 @@ index 1296da0d57..f43fffa44c 100644 -- Check that index expressions and predicates are run as the table's owner -- A dummy index function checking current_user CREATE FUNCTION sro_ifun(int) RETURNS int AS $$ -@@ -2668,8 +2672,8 @@ drop cascades to function testns.priv_testagg(integer) +@@ -2675,8 +2679,8 @@ drop cascades to function testns.priv_testagg(integer) drop cascades to function testns.priv_testproc(integer) -- Change owner of the schema & and rename of new schema owner \c - @@ -1247,7 +1246,7 @@ index 1296da0d57..f43fffa44c 100644 SET SESSION ROLE regress_schemauser1; CREATE SCHEMA testns; SELECT nspname, rolname FROM pg_namespace, pg_roles WHERE pg_namespace.nspname = 'testns' AND pg_namespace.nspowner = pg_roles.oid; -@@ -2792,7 +2796,7 @@ DROP USER regress_priv_user7; +@@ -2799,7 +2803,7 @@ DROP USER regress_priv_user7; DROP USER regress_priv_user8; -- does not exist ERROR: role "regress_priv_user8" does not exist -- permissions with LOCK TABLE @@ -1256,7 +1255,7 @@ index 1296da0d57..f43fffa44c 100644 CREATE TABLE lock_table (a int); -- LOCK TABLE and SELECT permission GRANT SELECT ON lock_table TO regress_locktable_user; -@@ -2888,7 +2892,7 @@ DROP USER regress_locktable_user; +@@ -2895,7 +2899,7 @@ DROP USER regress_locktable_user; -- pg_backend_memory_contexts. 
-- switch to superuser \c - @@ -1265,7 +1264,7 @@ index 1296da0d57..f43fffa44c 100644 SELECT has_table_privilege('regress_readallstats','pg_backend_memory_contexts','SELECT'); -- no has_table_privilege --------------------- -@@ -2932,10 +2936,10 @@ RESET ROLE; +@@ -2939,10 +2943,10 @@ RESET ROLE; -- clean up DROP ROLE regress_readallstats; -- test role grantor machinery @@ -1280,7 +1279,7 @@ index 1296da0d57..f43fffa44c 100644 GRANT regress_group TO regress_group_direct_manager WITH INHERIT FALSE, ADMIN TRUE; GRANT regress_group_direct_manager TO regress_group_indirect_manager; SET SESSION AUTHORIZATION regress_group_direct_manager; -@@ -2964,9 +2968,9 @@ DROP ROLE regress_group_direct_manager; +@@ -2971,9 +2975,9 @@ DROP ROLE regress_group_direct_manager; DROP ROLE regress_group_indirect_manager; DROP ROLE regress_group_member; -- test SET and INHERIT options with object ownership changes @@ -1293,7 +1292,7 @@ index 1296da0d57..f43fffa44c 100644 CREATE SCHEMA regress_roleoption; GRANT CREATE, USAGE ON SCHEMA regress_roleoption TO PUBLIC; GRANT regress_roleoption_donor TO regress_roleoption_protagonist WITH INHERIT TRUE, SET FALSE; -@@ -2995,9 +2999,9 @@ DROP ROLE regress_roleoption_protagonist; +@@ -3002,9 +3006,9 @@ DROP ROLE regress_roleoption_protagonist; DROP ROLE regress_roleoption_donor; DROP ROLE regress_roleoption_recipient; -- MAINTAIN @@ -2433,10 +2432,10 @@ index e3e3bea709..fa86ddc326 100644 COMMENT ON CONSTRAINT the_constraint ON constraint_comments_tbl IS 'no, the comment'; COMMENT ON CONSTRAINT the_constraint ON DOMAIN constraint_comments_dom IS 'no, another comment'; diff --git a/src/test/regress/sql/conversion.sql b/src/test/regress/sql/conversion.sql -index 9a65fca91f..58431a3056 100644 +index b567a1a572..4d1ac2e631 100644 --- a/src/test/regress/sql/conversion.sql +++ b/src/test/regress/sql/conversion.sql -@@ -12,7 +12,7 @@ CREATE FUNCTION test_enc_conversion(bytea, name, name, bool, validlen OUT int, r +@@ -17,7 +17,7 @@ CREATE FUNCTION 
test_enc_conversion(bytea, name, name, bool, validlen OUT int, r AS :'regresslib', 'test_enc_conversion' LANGUAGE C STRICT; @@ -2800,7 +2799,7 @@ index ae6841308b..47bc792e30 100644 SELECT * diff --git a/src/test/regress/sql/database.sql b/src/test/regress/sql/database.sql -index 0367c0e37a..a23b98c4bd 100644 +index 46ad263478..eb05584ed5 100644 --- a/src/test/regress/sql/database.sql +++ b/src/test/regress/sql/database.sql @@ -1,8 +1,6 @@ @@ -2913,7 +2912,7 @@ index aa147b14a9..370e0dd570 100644 CREATE FOREIGN DATA WRAPPER dummy; COMMENT ON FOREIGN DATA WRAPPER dummy IS 'useless'; diff --git a/src/test/regress/sql/foreign_key.sql b/src/test/regress/sql/foreign_key.sql -index 2e710e419c..89cd481a54 100644 +index 8c4e4c7c83..e946cd2119 100644 --- a/src/test/regress/sql/foreign_key.sql +++ b/src/test/regress/sql/foreign_key.sql @@ -1435,7 +1435,7 @@ ALTER TABLE fk_partitioned_fk_6 ATTACH PARTITION fk_partitioned_pk_6 FOR VALUES @@ -3301,7 +3300,7 @@ index bb82aa4aa2..dd8a05e24d 100644 -- Check that the invalid secrets were re-hashed. A re-hashed secret -- should not contain the original salt. 
diff --git a/src/test/regress/sql/privileges.sql b/src/test/regress/sql/privileges.sql -index 5880bc018d..27aa952b18 100644 +index b7e1cb6cdd..6e5a2217f1 100644 --- a/src/test/regress/sql/privileges.sql +++ b/src/test/regress/sql/privileges.sql @@ -24,18 +24,18 @@ RESET client_min_messages; @@ -3363,7 +3362,7 @@ index 5880bc018d..27aa952b18 100644 ALTER GROUP regress_priv_group1 ADD USER regress_priv_user4; -@@ -1157,7 +1157,7 @@ SELECT has_table_privilege('regress_priv_user1', 'atest4', 'SELECT WITH GRANT OP +@@ -1160,7 +1160,7 @@ SELECT has_table_privilege('regress_priv_user1', 'atest4', 'SELECT WITH GRANT OP -- security-restricted operations \c - @@ -3372,7 +3371,7 @@ index 5880bc018d..27aa952b18 100644 -- Check that index expressions and predicates are run as the table's owner -@@ -1653,8 +1653,8 @@ DROP SCHEMA testns CASCADE; +@@ -1656,8 +1656,8 @@ DROP SCHEMA testns CASCADE; -- Change owner of the schema & and rename of new schema owner \c - @@ -3383,7 +3382,7 @@ index 5880bc018d..27aa952b18 100644 SET SESSION ROLE regress_schemauser1; CREATE SCHEMA testns; -@@ -1748,7 +1748,7 @@ DROP USER regress_priv_user8; -- does not exist +@@ -1751,7 +1751,7 @@ DROP USER regress_priv_user8; -- does not exist -- permissions with LOCK TABLE @@ -3392,7 +3391,7 @@ index 5880bc018d..27aa952b18 100644 CREATE TABLE lock_table (a int); -- LOCK TABLE and SELECT permission -@@ -1851,7 +1851,7 @@ DROP USER regress_locktable_user; +@@ -1854,7 +1854,7 @@ DROP USER regress_locktable_user; -- switch to superuser \c - @@ -3401,7 +3400,7 @@ index 5880bc018d..27aa952b18 100644 SELECT has_table_privilege('regress_readallstats','pg_backend_memory_contexts','SELECT'); -- no SELECT has_table_privilege('regress_readallstats','pg_shmem_allocations','SELECT'); -- no -@@ -1871,10 +1871,10 @@ RESET ROLE; +@@ -1874,10 +1874,10 @@ RESET ROLE; DROP ROLE regress_readallstats; -- test role grantor machinery @@ -3416,7 +3415,7 @@ index 5880bc018d..27aa952b18 100644 GRANT regress_group TO 
regress_group_direct_manager WITH INHERIT FALSE, ADMIN TRUE; GRANT regress_group_direct_manager TO regress_group_indirect_manager; -@@ -1896,9 +1896,9 @@ DROP ROLE regress_group_indirect_manager; +@@ -1899,9 +1899,9 @@ DROP ROLE regress_group_indirect_manager; DROP ROLE regress_group_member; -- test SET and INHERIT options with object ownership changes @@ -3429,7 +3428,7 @@ index 5880bc018d..27aa952b18 100644 CREATE SCHEMA regress_roleoption; GRANT CREATE, USAGE ON SCHEMA regress_roleoption TO PUBLIC; GRANT regress_roleoption_donor TO regress_roleoption_protagonist WITH INHERIT TRUE, SET FALSE; -@@ -1926,9 +1926,9 @@ DROP ROLE regress_roleoption_donor; +@@ -1929,9 +1929,9 @@ DROP ROLE regress_roleoption_donor; DROP ROLE regress_roleoption_recipient; -- MAINTAIN diff --git a/compute/patches/pg_hint_plan_v16.patch b/compute/patches/pg_hint_plan_v16.patch index 1fc3ffa609..e9df2a3446 100644 --- a/compute/patches/pg_hint_plan_v16.patch +++ b/compute/patches/pg_hint_plan_v16.patch @@ -2,23 +2,6 @@ diff --git a/expected/ut-A.out b/expected/ut-A.out index da723b8..5328114 100644 --- a/expected/ut-A.out +++ b/expected/ut-A.out -@@ -9,13 +9,16 @@ SET search_path TO public; - ---- - -- No.A-1-1-3 - CREATE EXTENSION pg_hint_plan; -+LOG: Sending request to compute_ctl: http://localhost:3081/extension_server/pg_hint_plan - -- No.A-1-2-3 - DROP EXTENSION pg_hint_plan; - -- No.A-1-1-4 - CREATE SCHEMA other_schema; - CREATE EXTENSION pg_hint_plan SCHEMA other_schema; -+LOG: Sending request to compute_ctl: http://localhost:3081/extension_server/pg_hint_plan - ERROR: extension "pg_hint_plan" must be installed in schema "hint_plan" - CREATE EXTENSION pg_hint_plan; -+LOG: Sending request to compute_ctl: http://localhost:3081/extension_server/pg_hint_plan - DROP SCHEMA other_schema; - ---- - ---- No. 
A-5-1 comment pattern @@ -3175,6 +3178,7 @@ SELECT s.query, s.calls FROM public.pg_stat_statements s JOIN pg_catalog.pg_database d @@ -27,18 +10,6 @@ index da723b8..5328114 100644 ORDER BY 1; query | calls --------------------------------------+------- -diff --git a/expected/ut-fdw.out b/expected/ut-fdw.out -index d372459..6282afe 100644 ---- a/expected/ut-fdw.out -+++ b/expected/ut-fdw.out -@@ -7,6 +7,7 @@ SET pg_hint_plan.debug_print TO on; - SET client_min_messages TO LOG; - SET pg_hint_plan.enable_hint TO on; - CREATE EXTENSION file_fdw; -+LOG: Sending request to compute_ctl: http://localhost:3081/extension_server/file_fdw - CREATE SERVER file_server FOREIGN DATA WRAPPER file_fdw; - CREATE USER MAPPING FOR PUBLIC SERVER file_server; - CREATE FOREIGN TABLE ft1 (id int, val int) SERVER file_server OPTIONS (format 'csv', filename :'filename'); diff --git a/sql/ut-A.sql b/sql/ut-A.sql index 7c7d58a..4fd1a07 100644 --- a/sql/ut-A.sql diff --git a/compute/patches/pg_hint_plan_v17.patch b/compute/patches/pg_hint_plan_v17.patch index 3442a094eb..a244452cfe 100644 --- a/compute/patches/pg_hint_plan_v17.patch +++ b/compute/patches/pg_hint_plan_v17.patch @@ -1,24 +1,3 @@ -diff --git a/expected/ut-A.out b/expected/ut-A.out -index e7d68a1..65a056c 100644 ---- a/expected/ut-A.out -+++ b/expected/ut-A.out -@@ -9,13 +9,16 @@ SET search_path TO public; - ---- - -- No.A-1-1-3 - CREATE EXTENSION pg_hint_plan; -+LOG: Sending request to compute_ctl: http://localhost:3081/extension_server/pg_hint_plan - -- No.A-1-2-3 - DROP EXTENSION pg_hint_plan; - -- No.A-1-1-4 - CREATE SCHEMA other_schema; - CREATE EXTENSION pg_hint_plan SCHEMA other_schema; -+LOG: Sending request to compute_ctl: http://localhost:3081/extension_server/pg_hint_plan - ERROR: extension "pg_hint_plan" must be installed in schema "hint_plan" - CREATE EXTENSION pg_hint_plan; -+LOG: Sending request to compute_ctl: http://localhost:3081/extension_server/pg_hint_plan - DROP SCHEMA other_schema; - ---- - ---- No. 
A-5-1 comment pattern diff --git a/expected/ut-J.out b/expected/ut-J.out index 2fa3c70..314e929 100644 --- a/expected/ut-J.out @@ -160,15 +139,3 @@ index a09bd34..0ad227c 100644 error hint: explain_filter -diff --git a/expected/ut-fdw.out b/expected/ut-fdw.out -index 017fa4b..98d989b 100644 ---- a/expected/ut-fdw.out -+++ b/expected/ut-fdw.out -@@ -7,6 +7,7 @@ SET pg_hint_plan.debug_print TO on; - SET client_min_messages TO LOG; - SET pg_hint_plan.enable_hint TO on; - CREATE EXTENSION file_fdw; -+LOG: Sending request to compute_ctl: http://localhost:3081/extension_server/file_fdw - CREATE SERVER file_server FOREIGN DATA WRAPPER file_fdw; - CREATE USER MAPPING FOR PUBLIC SERVER file_server; - CREATE FOREIGN TABLE ft1 (id int, val int) SERVER file_server OPTIONS (format 'csv', filename :'filename'); diff --git a/compute/patches/plv8-3.1.10.patch b/compute/patches/plv8_v3.1.10.patch similarity index 80% rename from compute/patches/plv8-3.1.10.patch rename to compute/patches/plv8_v3.1.10.patch index 43cdb479f7..5cf96426d0 100644 --- a/compute/patches/plv8-3.1.10.patch +++ b/compute/patches/plv8_v3.1.10.patch @@ -1,12 +1,6 @@ -commit 46b38d3e46f9cd6c70d9b189dd6ff4abaa17cf5e -Author: Alexander Bayandin -Date: Sat Nov 30 18:29:32 2024 +0000 - - Fix v8 9.7.37 compilation on Debian 12 - diff --git a/patches/code/84cf3230a9680aac3b73c410c2b758760b6d3066.patch b/patches/code/84cf3230a9680aac3b73c410c2b758760b6d3066.patch new file mode 100644 -index 0000000..f0a5dc7 +index 0000000..fae1cb3 --- /dev/null +++ b/patches/code/84cf3230a9680aac3b73c410c2b758760b6d3066.patch @@ -0,0 +1,30 @@ @@ -35,8 +29,21 @@ index 0000000..f0a5dc7 +@@ -5,6 +5,7 @@ + #ifndef V8_HEAP_CPPGC_PREFINALIZER_HANDLER_H_ + #define V8_HEAP_CPPGC_PREFINALIZER_HANDLER_H_ -+ ++ ++#include + #include -+ ++ + #include "include/cppgc/prefinalizer.h" +diff --git a/plv8.cc b/plv8.cc +index c1ce883..6e47e94 100644 +--- a/plv8.cc ++++ b/plv8.cc +@@ -379,7 +379,7 @@ _PG_init(void) + NULL, + &plv8_v8_flags, + NULL, +- 
PGC_USERSET, 0, ++ PGC_SUSET, 0, + #if PG_VERSION_NUM >= 90100 + NULL, + #endif diff --git a/compute/patches/plv8_v3.2.3.patch b/compute/patches/plv8_v3.2.3.patch new file mode 100644 index 0000000000..5cf4ae2fa2 --- /dev/null +++ b/compute/patches/plv8_v3.2.3.patch @@ -0,0 +1,13 @@ +diff --git a/plv8.cc b/plv8.cc +index edfa2aa..623e7f2 100644 +--- a/plv8.cc ++++ b/plv8.cc +@@ -385,7 +385,7 @@ _PG_init(void) + NULL, + &plv8_v8_flags, + NULL, +- PGC_USERSET, 0, ++ PGC_SUSET, 0, + #if PG_VERSION_NUM >= 90100 + NULL, + #endif diff --git a/compute_tools/src/bin/compute_ctl.rs b/compute_tools/src/bin/compute_ctl.rs index fc7a3e2827..16fd51d79a 100644 --- a/compute_tools/src/bin/compute_ctl.rs +++ b/compute_tools/src/bin/compute_ctl.rs @@ -29,13 +29,12 @@ //! ```sh //! compute_ctl -D /var/db/postgres/compute \ //! -C 'postgresql://cloud_admin@localhost/postgres' \ -//! -S /var/db/postgres/specs/current.json \ +//! -c /var/db/postgres/configs/config.json \ //! -b /usr/local/bin/postgres \ //! -r http://pg-ext-s3-gateway \ //! ``` use std::ffi::OsString; use std::fs::File; -use std::path::Path; use std::process::exit; use std::sync::mpsc; use std::thread; @@ -43,9 +42,10 @@ use std::time::Duration; use anyhow::{Context, Result}; use clap::Parser; -use compute_api::responses::ComputeCtlConfig; -use compute_api::spec::ComputeSpec; -use compute_tools::compute::{ComputeNode, ComputeNodeParams, forward_termination_signal}; +use compute_api::responses::ComputeConfig; +use compute_tools::compute::{ + BUILD_TAG, ComputeNode, ComputeNodeParams, forward_termination_signal, +}; use compute_tools::extension_server::get_pg_version_string; use compute_tools::logger::*; use compute_tools::params::*; @@ -57,10 +57,6 @@ use tracing::{error, info}; use url::Url; use utils::failpoint_support; -// this is an arbitrary build tag. 
Fine as a default / for testing purposes -// in-case of not-set environment var -const BUILD_TAG_DEFAULT: &str = "latest"; - // Compatibility hack: if the control plane specified any remote-ext-config // use the default value for extension storage proxy gateway. // Remove this once the control plane is updated to pass the gateway URL @@ -120,16 +116,21 @@ struct Cli { #[arg(long)] pub set_disk_quota_for_fs: Option, - #[arg(short = 's', long = "spec", group = "spec")] - pub spec_json: Option, - - #[arg(short = 'S', long, group = "spec-path")] - pub spec_path: Option, + // TODO(tristan957): remove alias after compatibility tests are no longer + // an issue + #[arg(short = 'c', long, alias = "spec-path")] + pub config: Option, #[arg(short = 'i', long, group = "compute-id")] pub compute_id: String, - #[arg(short = 'p', long, conflicts_with_all = ["spec", "spec-path"], value_name = "CONTROL_PLANE_API_BASE_URL")] + #[arg( + short = 'p', + long, + conflicts_with = "config", + value_name = "CONTROL_PLANE_API_BASE_URL", + requires = "compute-id" + )] pub control_plane_uri: Option, } @@ -138,7 +139,7 @@ fn main() -> Result<()> { let scenario = failpoint_support::init(); - // For historical reasons, the main thread that processes the spec and launches postgres + // For historical reasons, the main thread that processes the config and launches postgres // is synchronous, but we always have this tokio runtime available and we "enter" it so // that you can use tokio::spawn() and tokio::runtime::Handle::current().block_on(...) // from all parts of compute_ctl. 
@@ -147,14 +148,14 @@ fn main() -> Result<()> { .build()?; let _rt_guard = runtime.enter(); - let build_tag = runtime.block_on(init())?; + runtime.block_on(init())?; // enable core dumping for all child processes setrlimit(Resource::CORE, rlimit::INFINITY, rlimit::INFINITY)?; let connstr = Url::parse(&cli.connstr).context("cannot parse connstr as a URL")?; - let cli_spec = try_spec_from_cli(&cli)?; + let config = get_config(&cli)?; let compute_node = ComputeNode::new( ComputeNodeParams { @@ -174,12 +175,8 @@ fn main() -> Result<()> { cgroup: cli.cgroup, #[cfg(target_os = "linux")] vm_monitor_addr: cli.vm_monitor_addr, - build_tag, - - live_config_allowed: cli_spec.live_config_allowed, }, - cli_spec.spec, - cli_spec.compute_ctl_config, + config, )?; let exit_code = compute_node.run()?; @@ -189,7 +186,7 @@ fn main() -> Result<()> { deinit_and_exit(exit_code); } -async fn init() -> Result { +async fn init() -> Result<()> { init_tracing_and_logging(DEFAULT_LOG_LEVEL).await?; let mut signals = Signals::new([SIGINT, SIGTERM, SIGQUIT])?; @@ -199,45 +196,22 @@ async fn init() -> Result { } }); - let build_tag = option_env!("BUILD_TAG") - .unwrap_or(BUILD_TAG_DEFAULT) - .to_string(); - info!("build_tag: {build_tag}"); + info!("compute build_tag: {}", &BUILD_TAG.to_string()); - Ok(build_tag) + Ok(()) } -fn try_spec_from_cli(cli: &Cli) -> Result { - // First, try to get cluster spec from the cli argument - if let Some(ref spec_json) = cli.spec_json { - info!("got spec from cli argument {}", spec_json); - return Ok(CliSpecParams { - spec: Some(serde_json::from_str(spec_json)?), - compute_ctl_config: ComputeCtlConfig::default(), - live_config_allowed: false, - }); +fn get_config(cli: &Cli) -> Result { + // First, read the config from the path if provided + if let Some(ref config) = cli.config { + let file = File::open(config)?; + return Ok(serde_json::from_reader(&file)?); } - // Second, try to read it from the file if path is provided - if let Some(ref spec_path) = 
cli.spec_path { - let file = File::open(Path::new(spec_path))?; - return Ok(CliSpecParams { - spec: Some(serde_json::from_reader(file)?), - compute_ctl_config: ComputeCtlConfig::default(), - live_config_allowed: true, - }); - } - - if cli.control_plane_uri.is_none() { - panic!("must specify --control-plane-uri"); - }; - - match get_spec_from_control_plane(cli.control_plane_uri.as_ref().unwrap(), &cli.compute_id) { - Ok(resp) => Ok(CliSpecParams { - spec: resp.0, - compute_ctl_config: resp.1, - live_config_allowed: true, - }), + // If the config wasn't provided in the CLI arguments, then retrieve it from + // the control plane + match get_config_from_control_plane(cli.control_plane_uri.as_ref().unwrap(), &cli.compute_id) { + Ok(config) => Ok(config), Err(e) => { error!( "cannot get response from control plane: {}\n\ @@ -249,14 +223,6 @@ fn try_spec_from_cli(cli: &Cli) -> Result { } } -struct CliSpecParams { - /// If a spec was provided via CLI or file, the [`ComputeSpec`] - spec: Option, - #[allow(dead_code)] - compute_ctl_config: ComputeCtlConfig, - live_config_allowed: bool, -} - fn deinit_and_exit(exit_code: Option) -> ! { // Shutdown trace pipeline gracefully, so that it has a chance to send any // pending traces before we exit. 
Shutting down OTEL tracing provider may diff --git a/compute_tools/src/catalog.rs b/compute_tools/src/catalog.rs index db3e07e086..082ba62b8e 100644 --- a/compute_tools/src/catalog.rs +++ b/compute_tools/src/catalog.rs @@ -98,13 +98,15 @@ pub async fn get_database_schema( .kill_on_drop(true) .spawn()?; - let stdout = cmd.stdout.take().ok_or_else(|| { - std::io::Error::new(std::io::ErrorKind::Other, "Failed to capture stdout.") - })?; + let stdout = cmd + .stdout + .take() + .ok_or_else(|| std::io::Error::other("Failed to capture stdout."))?; - let stderr = cmd.stderr.take().ok_or_else(|| { - std::io::Error::new(std::io::ErrorKind::Other, "Failed to capture stderr.") - })?; + let stderr = cmd + .stderr + .take() + .ok_or_else(|| std::io::Error::other("Failed to capture stderr."))?; let mut stdout_reader = FramedRead::new(stdout, BytesCodec::new()); let stderr_reader = BufReader::new(stderr); @@ -128,8 +130,7 @@ pub async fn get_database_schema( } }); - return Err(SchemaDumpError::IO(std::io::Error::new( - std::io::ErrorKind::Other, + return Err(SchemaDumpError::IO(std::io::Error::other( "failed to start pg_dump", ))); } diff --git a/compute_tools/src/compute.rs b/compute_tools/src/compute.rs index 4126835c1a..c7b4bdd240 100644 --- a/compute_tools/src/compute.rs +++ b/compute_tools/src/compute.rs @@ -11,7 +11,7 @@ use std::{env, fs}; use anyhow::{Context, Result}; use chrono::{DateTime, Utc}; use compute_api::privilege::Privilege; -use compute_api::responses::{ComputeCtlConfig, ComputeMetrics, ComputeStatus}; +use compute_api::responses::{ComputeConfig, ComputeCtlConfig, ComputeMetrics, ComputeStatus}; use compute_api::spec::{ ComputeAudit, ComputeFeature, ComputeMode, ComputeSpec, ExtVersion, PgIdent, }; @@ -20,6 +20,7 @@ use futures::future::join_all; use futures::stream::FuturesUnordered; use nix::sys::signal::{Signal, kill}; use nix::unistd::Pid; +use once_cell::sync::Lazy; use postgres; use postgres::NoTls; use postgres::error::SqlState; @@ -35,6 +36,7 @@ use 
crate::disk_quota::set_disk_quota; use crate::installed_extensions::get_installed_extensions; use crate::logger::startup_context_from_env; use crate::lsn_lease::launch_lsn_lease_bg_task_for_static; +use crate::metrics::COMPUTE_CTL_UP; use crate::monitor::launch_monitor; use crate::pg_helpers::*; use crate::rsyslog::{ @@ -49,6 +51,17 @@ use crate::{config, extension_server, local_proxy}; pub static SYNC_SAFEKEEPERS_PID: AtomicU32 = AtomicU32::new(0); pub static PG_PID: AtomicU32 = AtomicU32::new(0); +// This is an arbitrary build tag. Fine as a default / for testing purposes +// in-case of not-set environment var +const BUILD_TAG_DEFAULT: &str = "latest"; +/// Build tag/version of the compute node binaries/image. It's tricky and ugly +/// to pass it everywhere as a part of `ComputeNodeParams`, so we use a +/// global static variable. +pub static BUILD_TAG: Lazy = Lazy::new(|| { + option_env!("BUILD_TAG") + .unwrap_or(BUILD_TAG_DEFAULT) + .to_string() +}); /// Static configuration params that don't change after startup. These mostly /// come from the CLI args, or are derived from them. @@ -72,7 +85,6 @@ pub struct ComputeNodeParams { pub pgdata: String, pub pgbin: String, pub pgversion: String, - pub build_tag: String, /// The port that the compute's external HTTP server listens on pub external_http_port: u16, @@ -81,20 +93,6 @@ pub struct ComputeNodeParams { /// the address of extension storage proxy gateway pub ext_remote_storage: Option, - - /// We should only allow live re- / configuration of the compute node if - /// it uses 'pull model', i.e. it can go to control-plane and fetch - /// the latest configuration. 
Otherwise, there could be a case: - /// - we start compute with some spec provided as argument - /// - we push new spec and it does reconfiguration - /// - but then something happens and compute pod / VM is destroyed, - /// so k8s controller starts it again with the **old** spec - /// - /// and the same for empty computes: - /// - we started compute without any spec - /// - we push spec and it does configuration - /// - but then it is restarted without any spec again - pub live_config_allowed: bool, } /// Compute node info shared across several `compute_ctl` threads. @@ -173,6 +171,11 @@ impl ComputeState { info!("Changing compute status from {} to {}", prev, status); self.status = status; state_changed.notify_all(); + + COMPUTE_CTL_UP.reset(); + COMPUTE_CTL_UP + .with_label_values(&[&BUILD_TAG, status.to_string().as_str()]) + .set(1); } pub fn set_failed_status(&mut self, err: anyhow::Error, state_changed: &Condvar) { @@ -300,11 +303,7 @@ struct StartVmMonitorResult { } impl ComputeNode { - pub fn new( - params: ComputeNodeParams, - cli_spec: Option, - compute_ctl_config: ComputeCtlConfig, - ) -> Result { + pub fn new(params: ComputeNodeParams, config: ComputeConfig) -> Result { let connstr = params.connstr.as_str(); let conn_conf = postgres::config::Config::from_str(connstr) .context("cannot build postgres config from connstr")?; @@ -312,8 +311,8 @@ impl ComputeNode { .context("cannot build tokio postgres config from connstr")?; let mut new_state = ComputeState::new(); - if let Some(cli_spec) = cli_spec { - let pspec = ParsedSpec::try_from(cli_spec).map_err(|msg| anyhow::anyhow!(msg))?; + if let Some(spec) = config.spec { + let pspec = ParsedSpec::try_from(spec).map_err(|msg| anyhow::anyhow!(msg))?; new_state.pspec = Some(pspec); } @@ -324,7 +323,7 @@ impl ComputeNode { state: Mutex::new(new_state), state_changed: Condvar::new(), ext_download_progress: RwLock::new(HashMap::new()), - compute_ctl_config, + compute_ctl_config: config.compute_ctl_config, }) } @@ 
-343,6 +342,14 @@ impl ComputeNode { this.prewarm_postgres()?; } + // Set the up metric with Empty status before starting the HTTP server. + // That way on the first metric scrape, an external observer will see us + // as 'up' and 'empty' (unless the compute was started with a spec or + // already configured by control plane). + COMPUTE_CTL_UP + .with_label_values(&[&BUILD_TAG, ComputeStatus::Empty.to_string().as_str()]) + .set(1); + // Launch the external HTTP server first, so that we can serve control plane // requests while configuration is still in progress. crate::http::server::Server::External { @@ -512,11 +519,14 @@ impl ComputeNode { let pspec = compute_state.pspec.as_ref().expect("spec must be set"); info!( - "starting compute for project {}, operation {}, tenant {}, timeline {}, features {:?}, spec.remote_extensions {:?}", + "starting compute for project {}, operation {}, tenant {}, timeline {}, project {}, branch {}, endpoint {}, features {:?}, spec.remote_extensions {:?}", pspec.spec.cluster.cluster_id.as_deref().unwrap_or("None"), pspec.spec.operation_uuid.as_deref().unwrap_or("None"), pspec.tenant_id, pspec.timeline_id, + pspec.spec.project_id.as_deref().unwrap_or("None"), + pspec.spec.branch_id.as_deref().unwrap_or("None"), + pspec.spec.endpoint_id.as_deref().unwrap_or("None"), pspec.spec.features, pspec.spec.remote_extensions, ); @@ -620,31 +630,28 @@ impl ComputeNode { }); } - // Configure and start rsyslog for HIPAA if necessary - if let ComputeAudit::Hipaa = pspec.spec.audit_log_level { - let remote_endpoint = std::env::var("AUDIT_LOGGING_ENDPOINT").unwrap_or("".to_string()); - if remote_endpoint.is_empty() { - anyhow::bail!("AUDIT_LOGGING_ENDPOINT is empty"); + // Configure and start rsyslog for compliance audit logging + match pspec.spec.audit_log_level { + ComputeAudit::Hipaa | ComputeAudit::Extended | ComputeAudit::Full => { + let remote_endpoint = + std::env::var("AUDIT_LOGGING_ENDPOINT").unwrap_or("".to_string()); + if 
remote_endpoint.is_empty() { + anyhow::bail!("AUDIT_LOGGING_ENDPOINT is empty"); + } + + let log_directory_path = Path::new(&self.params.pgdata).join("log"); + let log_directory_path = log_directory_path.to_string_lossy().to_string(); + configure_audit_rsyslog(log_directory_path.clone(), "hipaa", &remote_endpoint)?; + + // Launch a background task to clean up the audit logs + launch_pgaudit_gc(log_directory_path); } - - let log_directory_path = Path::new(&self.params.pgdata).join("log"); - let log_directory_path = log_directory_path.to_string_lossy().to_string(); - configure_audit_rsyslog(log_directory_path.clone(), "hipaa", &remote_endpoint)?; - - // Launch a background task to clean up the audit logs - launch_pgaudit_gc(log_directory_path); + _ => {} } // Configure and start rsyslog for Postgres logs export - if self.has_feature(ComputeFeature::PostgresLogsExport) { - if let Some(ref project_id) = pspec.spec.cluster.cluster_id { - let host = PostgresLogsRsyslogConfig::default_host(project_id); - let conf = PostgresLogsRsyslogConfig::new(Some(&host)); - configure_postgres_logs_export(conf)?; - } else { - warn!("not configuring rsyslog for Postgres logs export: project ID is missing") - } - } + let conf = PostgresLogsRsyslogConfig::new(pspec.spec.logs_export_host.as_deref()); + configure_postgres_logs_export(conf)?; // Launch remaining service threads let _monitor_handle = launch_monitor(self); @@ -1548,6 +1555,10 @@ impl ComputeNode { }); } + // Reconfigure rsyslog for Postgres logs export + let conf = PostgresLogsRsyslogConfig::new(spec.logs_export_host.as_deref()); + configure_postgres_logs_export(conf)?; + // Write new config let pgdata_path = Path::new(&self.params.pgdata); config::write_postgres_conf( @@ -2032,12 +2043,8 @@ LIMIT 100", let mut download_tasks = Vec::new(); for library in &libs_vec { - let (ext_name, ext_path) = remote_extensions.get_ext( - library, - true, - &self.params.build_tag, - &self.params.pgversion, - )?; + let (ext_name, ext_path) = + 
remote_extensions.get_ext(library, true, &BUILD_TAG, &self.params.pgversion)?; download_tasks.push(self.download_extension(ext_name, ext_path)); } let results = join_all(download_tasks).await; diff --git a/compute_tools/src/config.rs b/compute_tools/src/config.rs index 614ab076ff..71c6123c3b 100644 --- a/compute_tools/src/config.rs +++ b/compute_tools/src/config.rs @@ -7,7 +7,7 @@ use std::io::prelude::*; use std::path::Path; use compute_api::responses::TlsConfig; -use compute_api::spec::{ComputeAudit, ComputeFeature, ComputeMode, ComputeSpec, GenericOption}; +use compute_api::spec::{ComputeAudit, ComputeMode, ComputeSpec, GenericOption}; use crate::pg_helpers::{ GenericOptionExt, GenericOptionsSearch, PgOptionsSerialize, escape_conf_value, @@ -89,6 +89,15 @@ pub fn write_postgres_conf( escape_conf_value(&s.to_string()) )?; } + if let Some(s) = &spec.project_id { + writeln!(file, "neon.project_id={}", escape_conf_value(s))?; + } + if let Some(s) = &spec.branch_id { + writeln!(file, "neon.branch_id={}", escape_conf_value(s))?; + } + if let Some(s) = &spec.endpoint_id { + writeln!(file, "neon.endpoint_id={}", escape_conf_value(s))?; + } // tls if let Some(tls_config) = tls_config { @@ -169,7 +178,7 @@ pub fn write_postgres_conf( // and don't allow the user or the control plane admin to change them. 
match spec.audit_log_level { ComputeAudit::Disabled => {} - ComputeAudit::Log => { + ComputeAudit::Log | ComputeAudit::Base => { writeln!(file, "# Managed by compute_ctl base audit settings: start")?; writeln!(file, "pgaudit.log='ddl,role'")?; // Disable logging of catalog queries to reduce the noise @@ -193,16 +202,20 @@ pub fn write_postgres_conf( } writeln!(file, "# Managed by compute_ctl base audit settings: end")?; } - ComputeAudit::Hipaa => { + ComputeAudit::Hipaa | ComputeAudit::Extended | ComputeAudit::Full => { writeln!( file, "# Managed by compute_ctl compliance audit settings: begin" )?; - // This log level is very verbose - // but this is necessary for HIPAA compliance. - // Exclude 'misc' category, because it doesn't contain anythig relevant. - writeln!(file, "pgaudit.log='all, -misc'")?; - writeln!(file, "pgaudit.log_parameter=on")?; + // Enable logging of parameters. + // This is very verbose and may contain sensitive data. + if spec.audit_log_level == ComputeAudit::Full { + writeln!(file, "pgaudit.log_parameter=on")?; + writeln!(file, "pgaudit.log='all'")?; + } else { + writeln!(file, "pgaudit.log_parameter=off")?; + writeln!(file, "pgaudit.log='all, -misc'")?; + } // Disable logging of catalog queries // The catalog doesn't contain sensitive data, so we don't need to audit it. writeln!(file, "pgaudit.log_catalog=off")?; @@ -255,7 +268,7 @@ pub fn write_postgres_conf( // We need Postgres to send logs to rsyslog so that we can forward them // further to customers' log aggregation systems. 
- if spec.features.contains(&ComputeFeature::PostgresLogsExport) { + if spec.logs_export_host.is_some() { writeln!(file, "log_destination='stderr,syslog'")?; } diff --git a/compute_tools/src/http/middleware/authorize.rs b/compute_tools/src/http/middleware/authorize.rs index 798dd1179b..f221752c38 100644 --- a/compute_tools/src/http/middleware/authorize.rs +++ b/compute_tools/src/http/middleware/authorize.rs @@ -6,20 +6,15 @@ use axum_extra::{ TypedHeader, headers::{Authorization, authorization::Bearer}, }; +use compute_api::requests::ComputeClaims; use futures::future::BoxFuture; use http::{Request, Response, StatusCode}; use jsonwebtoken::{Algorithm, DecodingKey, TokenData, Validation, jwk::JwkSet}; -use serde::Deserialize; use tower_http::auth::AsyncAuthorizeRequest; -use tracing::warn; +use tracing::{debug, warn}; use crate::http::{JsonResponse, extract::RequestId}; -#[derive(Clone, Debug, Deserialize)] -pub(in crate::http) struct Claims { - compute_id: String, -} - #[derive(Clone, Debug)] pub(in crate::http) struct Authorize { compute_id: String, @@ -59,9 +54,12 @@ impl AsyncAuthorizeRequest for Authorize { Box::pin(async move { let request_id = request.extract_parts::().await.unwrap(); - // TODO: Remove this check after a successful rollout - if jwks.keys.is_empty() { - warn!(%request_id, "Authorization has not been configured"); + // TODO: Remove this stanza after teaching neon_local and the + // regression tests to use a JWT + JWKS. 
+ // + // https://github.com/neondatabase/neon/issues/11316 + if cfg!(feature = "testing") { + warn!(%request_id, "Skipping compute_ctl authorization check"); return Ok(request); } @@ -94,7 +92,7 @@ impl AsyncAuthorizeRequest for Authorize { if data.claims.compute_id != compute_id { return Err(JsonResponse::error( StatusCode::UNAUTHORIZED, - "invalid claims in authorization token", + "invalid compute ID in authorization token claims", )); } @@ -109,15 +107,19 @@ impl AsyncAuthorizeRequest for Authorize { impl Authorize { /// Verify the token using the JSON Web Key set and return the token data. - fn verify(jwks: &JwkSet, token: &str, validation: &Validation) -> Result> { - debug_assert!(!jwks.keys.is_empty()); + fn verify( + jwks: &JwkSet, + token: &str, + validation: &Validation, + ) -> Result> { + debug!("verifying token {}", token); for jwk in jwks.keys.iter() { let decoding_key = match DecodingKey::from_jwk(jwk) { Ok(key) => key, Err(e) => { warn!( - "Failed to construct decoding key from {}: {}", + "failed to construct decoding key from {}: {}", jwk.common.key_id.as_ref().unwrap(), e ); @@ -126,11 +128,11 @@ impl Authorize { } }; - match jsonwebtoken::decode::(token, &decoding_key, validation) { + match jsonwebtoken::decode::(token, &decoding_key, validation) { Ok(data) => return Ok(data), Err(e) => { warn!( - "Failed to decode authorization token using {}: {}", + "failed to decode authorization token using {}: {}", jwk.common.key_id.as_ref().unwrap(), e ); @@ -140,6 +142,6 @@ impl Authorize { } } - Err(anyhow!("Failed to verify authorization token")) + Err(anyhow!("failed to verify authorization token")) } } diff --git a/compute_tools/src/http/openapi_spec.yaml b/compute_tools/src/http/openapi_spec.yaml index 7c8f72440f..bbdb7d0917 100644 --- a/compute_tools/src/http/openapi_spec.yaml +++ b/compute_tools/src/http/openapi_spec.yaml @@ -306,36 +306,6 @@ paths: schema: $ref: "#/components/schemas/GenericError" - /configure_telemetry: - post: - tags: - - 
Configure - summary: Configure rsyslog - description: | - This API endpoint configures rsyslog to forward Postgres logs - to a specified otel collector. - operationId: configureTelemetry - requestBody: - required: true - content: - application/json: - schema: - type: object - properties: - logs_export_host: - type: string - description: | - Hostname and the port of the otel collector. Leave empty to disable logs forwarding. - Example: config-shy-breeze-123-collector-monitoring.neon-telemetry.svc.cluster.local:54526 - responses: - 204: - description: "Telemetry configured successfully" - 500: - content: - application/json: - schema: - $ref: "#/components/schemas/GenericError" - components: securitySchemes: JWT: diff --git a/compute_tools/src/http/routes/configure.rs b/compute_tools/src/http/routes/configure.rs index 5c9dd22c3d..f7a19da611 100644 --- a/compute_tools/src/http/routes/configure.rs +++ b/compute_tools/src/http/routes/configure.rs @@ -1,11 +1,9 @@ use std::sync::Arc; -use axum::body::Body; use axum::extract::State; use axum::response::Response; -use compute_api::requests::{ConfigurationRequest, ConfigureTelemetryRequest}; +use compute_api::requests::ConfigurationRequest; use compute_api::responses::{ComputeStatus, ComputeStatusResponse}; -use compute_api::spec::ComputeFeature; use http::StatusCode; use tokio::task; use tracing::info; @@ -13,7 +11,6 @@ use tracing::info; use crate::compute::{ComputeNode, ParsedSpec}; use crate::http::JsonResponse; use crate::http::extract::Json; -use crate::rsyslog::{PostgresLogsRsyslogConfig, configure_postgres_logs_export}; // Accept spec in JSON format and request compute configuration. 
If anything // goes wrong after we set the compute status to `ConfigurationPending` and @@ -25,13 +22,6 @@ pub(in crate::http) async fn configure( State(compute): State>, request: Json, ) -> Response { - if !compute.params.live_config_allowed { - return JsonResponse::error( - StatusCode::PRECONDITION_FAILED, - "live configuration is not allowed for this compute node".to_string(), - ); - } - let pspec = match ParsedSpec::try_from(request.spec.clone()) { Ok(p) => p, Err(e) => return JsonResponse::error(StatusCode::BAD_REQUEST, e), @@ -95,25 +85,3 @@ pub(in crate::http) async fn configure( JsonResponse::success(StatusCode::OK, body) } - -pub(in crate::http) async fn configure_telemetry( - State(compute): State>, - request: Json, -) -> Response { - if !compute.has_feature(ComputeFeature::PostgresLogsExport) { - return JsonResponse::error( - StatusCode::PRECONDITION_FAILED, - "Postgres logs export feature is not enabled".to_string(), - ); - } - - let conf = PostgresLogsRsyslogConfig::new(request.logs_export_host.as_deref()); - if let Err(err) = configure_postgres_logs_export(conf) { - return JsonResponse::error(StatusCode::INTERNAL_SERVER_ERROR, err.to_string()); - } - - Response::builder() - .status(StatusCode::NO_CONTENT) - .body(Body::from("")) - .unwrap() -} diff --git a/compute_tools/src/http/routes/extension_server.rs b/compute_tools/src/http/routes/extension_server.rs index 563b73ae65..6508de6eee 100644 --- a/compute_tools/src/http/routes/extension_server.rs +++ b/compute_tools/src/http/routes/extension_server.rs @@ -5,7 +5,7 @@ use axum::response::{IntoResponse, Response}; use http::StatusCode; use serde::Deserialize; -use crate::compute::ComputeNode; +use crate::compute::{BUILD_TAG, ComputeNode}; use crate::http::JsonResponse; use crate::http::extract::{Path, Query}; @@ -47,7 +47,7 @@ pub(in crate::http) async fn download_extension( remote_extensions.get_ext( &filename, ext_server_params.is_library, - &compute.params.build_tag, + &BUILD_TAG, 
&compute.params.pgversion, ) }; diff --git a/compute_tools/src/http/server.rs b/compute_tools/src/http/server.rs index 179369e3ef..10f767e97c 100644 --- a/compute_tools/src/http/server.rs +++ b/compute_tools/src/http/server.rs @@ -87,7 +87,6 @@ impl From<&Server> for Router> { let authenticated_router = Router::>::new() .route("/check_writability", post(check_writability::is_writable)) .route("/configure", post(configure::configure)) - .route("/configure_telemetry", post(configure::configure_telemetry)) .route("/database_schema", get(database_schema::get_schema_dump)) .route("/dbs_and_roles", get(dbs_and_roles::get_catalog_objects)) .route("/insights", get(insights::get_insights)) diff --git a/compute_tools/src/metrics.rs b/compute_tools/src/metrics.rs index 4caa48307e..fa00476fd2 100644 --- a/compute_tools/src/metrics.rs +++ b/compute_tools/src/metrics.rs @@ -1,7 +1,8 @@ use metrics::core::{AtomicF64, Collector, GenericGauge}; use metrics::proto::MetricFamily; use metrics::{ - IntCounterVec, UIntGaugeVec, register_gauge, register_int_counter_vec, register_uint_gauge_vec, + IntCounterVec, IntGaugeVec, UIntGaugeVec, register_gauge, register_int_counter_vec, + register_int_gauge_vec, register_uint_gauge_vec, }; use once_cell::sync::Lazy; @@ -18,13 +19,13 @@ pub(crate) static INSTALLED_EXTENSIONS: Lazy = Lazy::new(|| { // but for all our APIs we defined a 'slug'/method/operationId in the OpenAPI spec. // And it's fair to call it a 'RPC' (Remote Procedure Call). pub enum CPlaneRequestRPC { - GetSpec, + GetConfig, } impl CPlaneRequestRPC { pub fn as_str(&self) -> &str { match self { - CPlaneRequestRPC::GetSpec => "GetSpec", + CPlaneRequestRPC::GetConfig => "GetConfig", } } } @@ -70,8 +71,19 @@ pub(crate) static AUDIT_LOG_DIR_SIZE: Lazy> = Lazy::new( .expect("failed to define a metric") }); +// Report that `compute_ctl` is up and what's the current compute status. 
+pub(crate) static COMPUTE_CTL_UP: Lazy = Lazy::new(|| { + register_int_gauge_vec!( + "compute_ctl_up", + "Whether compute_ctl is running", + &["build_tag", "status"] + ) + .expect("failed to define a metric") +}); + pub fn collect() -> Vec { - let mut metrics = INSTALLED_EXTENSIONS.collect(); + let mut metrics = COMPUTE_CTL_UP.collect(); + metrics.extend(INSTALLED_EXTENSIONS.collect()); metrics.extend(CPLANE_REQUESTS_TOTAL.collect()); metrics.extend(REMOTE_EXT_REQUESTS_TOTAL.collect()); metrics.extend(DB_MIGRATION_FAILED.collect()); diff --git a/compute_tools/src/rsyslog.rs b/compute_tools/src/rsyslog.rs index 80594db3f1..ba08302df2 100644 --- a/compute_tools/src/rsyslog.rs +++ b/compute_tools/src/rsyslog.rs @@ -119,16 +119,9 @@ impl<'a> PostgresLogsRsyslogConfig<'a> { }; Ok(config_content) } - - /// Returns the default host for otel collector that receives Postgres logs - pub fn default_host(project_id: &str) -> String { - format!( - "config-{}-collector.neon-telemetry.svc.cluster.local:10514", - project_id - ) - } } +/// Writes rsyslogd configuration for Postgres logs export and restarts rsyslog. 
pub fn configure_postgres_logs_export(conf: PostgresLogsRsyslogConfig) -> Result<()> { let new_config = conf.build()?; let current_config = PostgresLogsRsyslogConfig::current_config()?; @@ -261,16 +254,5 @@ mod tests { let res = conf.build(); assert!(res.is_err()); } - - { - // Verify config with default host - let host = PostgresLogsRsyslogConfig::default_host("shy-breeze-123"); - let conf = PostgresLogsRsyslogConfig::new(Some(&host)); - let res = conf.build(); - assert!(res.is_ok()); - let conf_str = res.unwrap(); - assert!(conf_str.contains(r#"shy-breeze-123"#)); - assert!(conf_str.contains(r#"port="10514""#)); - } } } diff --git a/compute_tools/src/spec.rs b/compute_tools/src/spec.rs index a76af21e9f..4b38e6e29c 100644 --- a/compute_tools/src/spec.rs +++ b/compute_tools/src/spec.rs @@ -3,9 +3,8 @@ use std::path::Path; use anyhow::{Result, anyhow, bail}; use compute_api::responses::{ - ComputeCtlConfig, ControlPlaneComputeStatus, ControlPlaneSpecResponse, + ComputeConfig, ControlPlaneComputeStatus, ControlPlaneConfigResponse, }; -use compute_api::spec::ComputeSpec; use reqwest::StatusCode; use tokio_postgres::Client; use tracing::{error, info, instrument}; @@ -21,7 +20,7 @@ use crate::params::PG_HBA_ALL_MD5; fn do_control_plane_request( uri: &str, jwt: &str, -) -> Result { +) -> Result { let resp = reqwest::blocking::Client::new() .get(uri) .header("Authorization", format!("Bearer {}", jwt)) @@ -29,14 +28,14 @@ fn do_control_plane_request( .map_err(|e| { ( true, - format!("could not perform spec request to control plane: {:?}", e), + format!("could not perform request to control plane: {:?}", e), UNKNOWN_HTTP_STATUS.to_string(), ) })?; let status = resp.status(); match status { - StatusCode::OK => match resp.json::() { + StatusCode::OK => match resp.json::() { Ok(spec_resp) => Ok(spec_resp), Err(e) => Err(( true, @@ -69,40 +68,35 @@ fn do_control_plane_request( } } -/// Request spec from the control-plane by compute_id. 
If `NEON_CONTROL_PLANE_TOKEN` -/// env variable is set, it will be used for authorization. -pub fn get_spec_from_control_plane( - base_uri: &str, - compute_id: &str, -) -> Result<(Option, ComputeCtlConfig)> { +/// Request config from the control-plane by compute_id. If +/// `NEON_CONTROL_PLANE_TOKEN` env variable is set, it will be used for +/// authorization. +pub fn get_config_from_control_plane(base_uri: &str, compute_id: &str) -> Result { let cp_uri = format!("{base_uri}/compute/api/v2/computes/{compute_id}/spec"); - let jwt: String = match std::env::var("NEON_CONTROL_PLANE_TOKEN") { - Ok(v) => v, - Err(_) => "".to_string(), - }; + let jwt: String = std::env::var("NEON_CONTROL_PLANE_TOKEN").unwrap_or_default(); let mut attempt = 1; - info!("getting spec from control plane: {}", cp_uri); + info!("getting config from control plane: {}", cp_uri); // Do 3 attempts to get spec from the control plane using the following logic: // - network error -> then retry // - compute id is unknown or any other error -> bail out // - no spec for compute yet (Empty state) -> return Ok(None) - // - got spec -> return Ok(Some(spec)) + // - got config -> return Ok(Some(config)) while attempt < 4 { let result = match do_control_plane_request(&cp_uri, &jwt) { - Ok(spec_resp) => { + Ok(config_resp) => { CPLANE_REQUESTS_TOTAL .with_label_values(&[ - CPlaneRequestRPC::GetSpec.as_str(), + CPlaneRequestRPC::GetConfig.as_str(), &StatusCode::OK.to_string(), ]) .inc(); - match spec_resp.status { - ControlPlaneComputeStatus::Empty => Ok((None, spec_resp.compute_ctl_config)), + match config_resp.status { + ControlPlaneComputeStatus::Empty => Ok(config_resp.into()), ControlPlaneComputeStatus::Attached => { - if let Some(spec) = spec_resp.spec { - Ok((Some(spec), spec_resp.compute_ctl_config)) + if config_resp.spec.is_some() { + Ok(config_resp.into()) } else { bail!("compute is attached, but spec is empty") } @@ -111,7 +105,7 @@ pub fn get_spec_from_control_plane( } Err((retry, msg, status)) => { 
CPLANE_REQUESTS_TOTAL - .with_label_values(&[CPlaneRequestRPC::GetSpec.as_str(), &status]) + .with_label_values(&[CPlaneRequestRPC::GetConfig.as_str(), &status]) .inc(); if retry { Err(anyhow!(msg)) @@ -122,7 +116,7 @@ pub fn get_spec_from_control_plane( }; if let Err(e) = &result { - error!("attempt {} to get spec failed with: {}", attempt, e); + error!("attempt {} to get config failed with: {}", attempt, e); } else { return result; } @@ -133,13 +127,13 @@ pub fn get_spec_from_control_plane( // All attempts failed, return error. Err(anyhow::anyhow!( - "Exhausted all attempts to retrieve the spec from the control plane" + "Exhausted all attempts to retrieve the config from the control plane" )) } /// Check `pg_hba.conf` and update if needed to allow external connections. pub fn update_pg_hba(pgdata_path: &Path) -> Result<()> { - // XXX: consider making it a part of spec.json + // XXX: consider making it a part of config.json let pghba_path = pgdata_path.join("pg_hba.conf"); if config::line_in_file(&pghba_path, PG_HBA_ALL_MD5)? { @@ -153,7 +147,7 @@ pub fn update_pg_hba(pgdata_path: &Path) -> Result<()> { /// Create a standby.signal file pub fn add_standby_signal(pgdata_path: &Path) -> Result<()> { - // XXX: consider making it a part of spec.json + // XXX: consider making it a part of config.json let signalfile = pgdata_path.join("standby.signal"); if !signalfile.exists() { diff --git a/compute_tools/src/spec_apply.rs b/compute_tools/src/spec_apply.rs index 2be6458fb4..0d1389dbad 100644 --- a/compute_tools/src/spec_apply.rs +++ b/compute_tools/src/spec_apply.rs @@ -278,12 +278,12 @@ impl ComputeNode { // so that all config operations are audit logged. 
match spec.audit_log_level { - ComputeAudit::Hipaa => { + ComputeAudit::Hipaa | ComputeAudit::Extended | ComputeAudit::Full => { phases.push(CreatePgauditExtension); phases.push(CreatePgauditlogtofileExtension); phases.push(DisablePostgresDBPgAudit); } - ComputeAudit::Log => { + ComputeAudit::Log | ComputeAudit::Base => { phases.push(CreatePgauditExtension); phases.push(DisablePostgresDBPgAudit); } @@ -419,7 +419,7 @@ impl ComputeNode { .iter() .filter_map(|val| val.parse::().ok()) .map(|val| if val > 1 { val - 1 } else { 1 }) - .last() + .next_back() .unwrap_or(3) } } diff --git a/control_plane/src/bin/neon_local.rs b/control_plane/src/bin/neon_local.rs index 747268f80b..db9715dc62 100644 --- a/control_plane/src/bin/neon_local.rs +++ b/control_plane/src/bin/neon_local.rs @@ -20,8 +20,10 @@ use compute_api::spec::ComputeMode; use control_plane::endpoint::ComputeControlPlane; use control_plane::local_env::{ InitForceMode, LocalEnv, NeonBroker, NeonLocalInitConf, NeonLocalInitPageserverConf, - SafekeeperConf, + ObjectStorageConf, SafekeeperConf, }; +use control_plane::object_storage::OBJECT_STORAGE_DEFAULT_PORT; +use control_plane::object_storage::ObjectStorage; use control_plane::pageserver::PageServerNode; use control_plane::safekeeper::SafekeeperNode; use control_plane::storage_controller::{ @@ -39,7 +41,7 @@ use pageserver_api::controller_api::{ use pageserver_api::models::{ ShardParameters, TenantConfigRequest, TimelineCreateRequest, TimelineInfo, }; -use pageserver_api::shard::{ShardCount, ShardStripeSize, TenantShardId}; +use pageserver_api::shard::{DEFAULT_STRIPE_SIZE, ShardCount, ShardStripeSize, TenantShardId}; use postgres_backend::AuthType; use postgres_connection::parse_host_port; use safekeeper_api::membership::SafekeeperGeneration; @@ -91,6 +93,8 @@ enum NeonLocalCmd { #[command(subcommand)] Safekeeper(SafekeeperCmd), #[command(subcommand)] + ObjectStorage(ObjectStorageCmd), + #[command(subcommand)] Endpoint(EndpointCmd), #[command(subcommand)] 
Mappings(MappingsCmd), @@ -454,6 +458,32 @@ enum SafekeeperCmd { Restart(SafekeeperRestartCmdArgs), } +#[derive(clap::Subcommand)] +#[clap(about = "Manage object storage")] +enum ObjectStorageCmd { + Start(ObjectStorageStartCmd), + Stop(ObjectStorageStopCmd), +} + +#[derive(clap::Args)] +#[clap(about = "Start object storage")] +struct ObjectStorageStartCmd { + #[clap(short = 't', long, help = "timeout until we fail the command")] + #[arg(default_value = "10s")] + start_timeout: humantime::Duration, +} + +#[derive(clap::Args)] +#[clap(about = "Stop object storage")] +struct ObjectStorageStopCmd { + #[arg(value_enum, default_value = "fast")] + #[clap( + short = 'm', + help = "If 'immediate', don't flush repository data at shutdown" + )] + stop_mode: StopMode, +} + #[derive(clap::Args)] #[clap(about = "Start local safekeeper")] struct SafekeeperStartCmdArgs { @@ -759,6 +789,7 @@ fn main() -> Result<()> { } NeonLocalCmd::StorageBroker(subcmd) => rt.block_on(handle_storage_broker(&subcmd, env)), NeonLocalCmd::Safekeeper(subcmd) => rt.block_on(handle_safekeeper(&subcmd, env)), + NeonLocalCmd::ObjectStorage(subcmd) => rt.block_on(handle_object_storage(&subcmd, env)), NeonLocalCmd::Endpoint(subcmd) => rt.block_on(handle_endpoint(&subcmd, env)), NeonLocalCmd::Mappings(subcmd) => handle_mappings(&subcmd, env), }; @@ -975,6 +1006,9 @@ fn handle_init(args: &InitCmdArgs) -> anyhow::Result { } }) .collect(), + object_storage: ObjectStorageConf { + port: OBJECT_STORAGE_DEFAULT_PORT, + }, pg_distrib_dir: None, neon_distrib_dir: None, default_tenant_id: TenantId::from_array(std::array::from_fn(|_| 0)), @@ -1083,7 +1117,7 @@ async fn handle_tenant(subcmd: &TenantCmd, env: &mut local_env::LocalEnv) -> any stripe_size: args .shard_stripe_size .map(ShardStripeSize) - .unwrap_or(ShardParameters::DEFAULT_STRIPE_SIZE), + .unwrap_or(DEFAULT_STRIPE_SIZE), }, placement_policy: args.placement_policy.clone(), config: tenant_conf, @@ -1396,7 +1430,7 @@ async fn handle_endpoint(subcmd: 
&EndpointCmd, env: &local_env::LocalEnv) -> Res vec![(parsed.0, parsed.1.unwrap_or(5432))], // If caller is telling us what pageserver to use, this is not a tenant which is // full managed by storage controller, therefore not sharded. - ShardParameters::DEFAULT_STRIPE_SIZE, + DEFAULT_STRIPE_SIZE, ) } else { // Look up the currently attached location of the tenant, and its striping metadata, @@ -1683,6 +1717,41 @@ async fn handle_safekeeper(subcmd: &SafekeeperCmd, env: &local_env::LocalEnv) -> Ok(()) } +async fn handle_object_storage(subcmd: &ObjectStorageCmd, env: &local_env::LocalEnv) -> Result<()> { + use ObjectStorageCmd::*; + let storage = ObjectStorage::from_env(env); + + // In tests like test_forward_compatibility or test_graceful_cluster_restart + // old neon binaries (without object_storage) are present + if !storage.bin.exists() { + eprintln!( + "{} binary not found. Ignore if this is a compatibility test", + storage.bin + ); + return Ok(()); + } + + match subcmd { + Start(ObjectStorageStartCmd { start_timeout }) => { + if let Err(e) = storage.start(start_timeout).await { + eprintln!("object_storage start failed: {e}"); + exit(1); + } + } + Stop(ObjectStorageStopCmd { stop_mode }) => { + let immediate = match stop_mode { + StopMode::Fast => false, + StopMode::Immediate => true, + }; + if let Err(e) = storage.stop(immediate) { + eprintln!("proxy stop failed: {e}"); + exit(1); + } + } + }; + Ok(()) +} + async fn handle_storage_broker(subcmd: &StorageBrokerCmd, env: &local_env::LocalEnv) -> Result<()> { match subcmd { StorageBrokerCmd::Start(args) => { @@ -1777,6 +1846,13 @@ async fn handle_start_all_impl( .map_err(|e| e.context(format!("start safekeeper {}", safekeeper.id))) }); } + + js.spawn(async move { + ObjectStorage::from_env(env) + .start(&retry_timeout) + .await + .map_err(|e| e.context("start object_storage")) + }); })(); let mut errors = Vec::new(); @@ -1874,6 +1950,11 @@ async fn try_stop_all(env: &local_env::LocalEnv, immediate: bool) { } } + let 
storage = ObjectStorage::from_env(env); + if let Err(e) = storage.stop(immediate) { + eprintln!("object_storage stop failed: {:#}", e); + } + for ps_conf in &env.pageservers { let pageserver = PageServerNode::from_env(env, ps_conf); if let Err(e) = pageserver.stop(immediate) { diff --git a/control_plane/src/endpoint.rs b/control_plane/src/endpoint.rs index b46d616827..2fa7a62f8f 100644 --- a/control_plane/src/endpoint.rs +++ b/control_plane/src/endpoint.rs @@ -29,7 +29,7 @@ //! compute.log - log output of `compute_ctl` and `postgres` //! endpoint.json - serialized `EndpointConf` struct //! postgresql.conf - postgresql settings -//! spec.json - passed to `compute_ctl` +//! config.json - passed to `compute_ctl` //! pgdata/ //! postgresql.conf - copy of postgresql.conf created by `compute_ctl` //! zenith.signal @@ -46,7 +46,9 @@ use std::time::{Duration, Instant, SystemTime, UNIX_EPOCH}; use anyhow::{Context, Result, anyhow, bail}; use compute_api::requests::ConfigurationRequest; -use compute_api::responses::{ComputeCtlConfig, ComputeStatus, ComputeStatusResponse}; +use compute_api::responses::{ + ComputeConfig, ComputeCtlConfig, ComputeStatus, ComputeStatusResponse, +}; use compute_api::spec::{ Cluster, ComputeAudit, ComputeFeature, ComputeMode, ComputeSpec, Database, PgIdent, RemoteExtSpec, Role, @@ -619,86 +621,101 @@ impl Endpoint { remote_extensions = None; }; - // Create spec file - let mut spec = ComputeSpec { - skip_pg_catalog_updates: self.skip_pg_catalog_updates, - format_version: 1.0, - operation_uuid: None, - features: self.features.clone(), - swap_size_bytes: None, - disk_quota_bytes: None, - disable_lfc_resizing: None, - cluster: Cluster { - cluster_id: None, // project ID: not used - name: None, // project name: not used - state: None, - roles: if create_test_user { - vec![Role { + // Create config file + let config = { + let mut spec = ComputeSpec { + skip_pg_catalog_updates: self.skip_pg_catalog_updates, + format_version: 1.0, + operation_uuid: None, 
+ features: self.features.clone(), + swap_size_bytes: None, + disk_quota_bytes: None, + disable_lfc_resizing: None, + cluster: Cluster { + cluster_id: None, // project ID: not used + name: None, // project name: not used + state: None, + roles: if create_test_user { + vec![Role { + name: PgIdent::from_str("test").unwrap(), + encrypted_password: None, + options: None, + }] + } else { + Vec::new() + }, + databases: if create_test_user { + vec![Database { + name: PgIdent::from_str("neondb").unwrap(), + owner: PgIdent::from_str("test").unwrap(), + options: None, + restrict_conn: false, + invalid: false, + }] + } else { + Vec::new() + }, + settings: None, + postgresql_conf: Some(postgresql_conf.clone()), + }, + delta_operations: None, + tenant_id: Some(self.tenant_id), + timeline_id: Some(self.timeline_id), + project_id: None, + branch_id: None, + endpoint_id: Some(self.endpoint_id.clone()), + mode: self.mode, + pageserver_connstring: Some(pageserver_connstring), + safekeepers_generation: safekeepers_generation.map(|g| g.into_inner()), + safekeeper_connstrings, + storage_auth_token: auth_token.clone(), + remote_extensions, + pgbouncer_settings: None, + shard_stripe_size: Some(shard_stripe_size), + local_proxy_config: None, + reconfigure_concurrency: self.reconfigure_concurrency, + drop_subscriptions_before_start: self.drop_subscriptions_before_start, + audit_log_level: ComputeAudit::Disabled, + logs_export_host: None::, + }; + + // this strange code is needed to support respec() in tests + if self.cluster.is_some() { + debug!("Cluster is already set in the endpoint spec, using it"); + spec.cluster = self.cluster.clone().unwrap(); + + debug!("spec.cluster {:?}", spec.cluster); + + // fill missing fields again + if create_test_user { + spec.cluster.roles.push(Role { name: PgIdent::from_str("test").unwrap(), encrypted_password: None, options: None, - }] - } else { - Vec::new() - }, - databases: if create_test_user { - vec![Database { + }); + 
spec.cluster.databases.push(Database { name: PgIdent::from_str("neondb").unwrap(), owner: PgIdent::from_str("test").unwrap(), options: None, restrict_conn: false, invalid: false, - }] - } else { - Vec::new() - }, - settings: None, - postgresql_conf: Some(postgresql_conf.clone()), - }, - delta_operations: None, - tenant_id: Some(self.tenant_id), - timeline_id: Some(self.timeline_id), - mode: self.mode, - pageserver_connstring: Some(pageserver_connstring), - safekeepers_generation: safekeepers_generation.map(|g| g.into_inner()), - safekeeper_connstrings, - storage_auth_token: auth_token.clone(), - remote_extensions, - pgbouncer_settings: None, - shard_stripe_size: Some(shard_stripe_size), - local_proxy_config: None, - reconfigure_concurrency: self.reconfigure_concurrency, - drop_subscriptions_before_start: self.drop_subscriptions_before_start, - audit_log_level: ComputeAudit::Disabled, + }); + } + spec.cluster.postgresql_conf = Some(postgresql_conf); + } + + ComputeConfig { + spec: Some(spec), + compute_ctl_config: ComputeCtlConfig::default(), + } }; - // this strange code is needed to support respec() in tests - if self.cluster.is_some() { - debug!("Cluster is already set in the endpoint spec, using it"); - spec.cluster = self.cluster.clone().unwrap(); - - debug!("spec.cluster {:?}", spec.cluster); - - // fill missing fields again - if create_test_user { - spec.cluster.roles.push(Role { - name: PgIdent::from_str("test").unwrap(), - encrypted_password: None, - options: None, - }); - spec.cluster.databases.push(Database { - name: PgIdent::from_str("neondb").unwrap(), - owner: PgIdent::from_str("test").unwrap(), - options: None, - restrict_conn: false, - invalid: false, - }); - } - spec.cluster.postgresql_conf = Some(postgresql_conf); - } - + // TODO(tristan957): Remove the write to spec.json after compatibility + // tests work themselves out let spec_path = self.endpoint_path().join("spec.json"); - std::fs::write(spec_path, serde_json::to_string_pretty(&spec)?)?; + 
std::fs::write(spec_path, serde_json::to_string_pretty(&config.spec)?)?; + let config_path = self.endpoint_path().join("config.json"); + std::fs::write(config_path, serde_json::to_string_pretty(&config)?)?; // Open log file. We'll redirect the stdout and stderr of `compute_ctl` to it. let logfile = std::fs::OpenOptions::new() @@ -706,6 +723,16 @@ impl Endpoint { .append(true) .open(self.endpoint_path().join("compute.log"))?; + // TODO(tristan957): Remove when compatibility tests are no longer an + // issue + let old_compute_ctl = { + let mut cmd = Command::new(self.env.neon_distrib_dir.join("compute_ctl")); + let help_output = cmd.arg("--help").output()?; + let help_output = String::from_utf8_lossy(&help_output.stdout); + + !help_output.contains("--config") + }; + // Launch compute_ctl let conn_str = self.connstr("cloud_admin", "postgres"); println!("Starting postgres node at '{}'", conn_str); @@ -724,9 +751,18 @@ impl Endpoint { ]) .args(["--pgdata", self.pgdata().to_str().unwrap()]) .args(["--connstr", &conn_str]) + // TODO(tristan957): Change this to --config when compatibility tests + // are no longer an issue .args([ "--spec-path", - self.endpoint_path().join("spec.json").to_str().unwrap(), + self.endpoint_path() + .join(if old_compute_ctl { + "spec.json" + } else { + "config.json" + }) + .to_str() + .unwrap(), ]) .args([ "--pgbin", @@ -869,10 +905,12 @@ impl Endpoint { stripe_size: Option, safekeepers: Option>, ) -> Result<()> { - let mut spec: ComputeSpec = { - let spec_path = self.endpoint_path().join("spec.json"); - let file = std::fs::File::open(spec_path)?; - serde_json::from_reader(file)? 
+ let (mut spec, compute_ctl_config) = { + let config_path = self.endpoint_path().join("config.json"); + let file = std::fs::File::open(config_path)?; + let config: ComputeConfig = serde_json::from_reader(file)?; + + (config.spec.unwrap(), config.compute_ctl_config) }; let postgresql_conf = self.read_postgresql_conf()?; @@ -922,7 +960,7 @@ impl Endpoint { .body( serde_json::to_string(&ConfigurationRequest { spec, - compute_ctl_config: ComputeCtlConfig::default(), + compute_ctl_config, }) .unwrap(), ) diff --git a/control_plane/src/lib.rs b/control_plane/src/lib.rs index 2af272f388..2d9fe2c807 100644 --- a/control_plane/src/lib.rs +++ b/control_plane/src/lib.rs @@ -10,6 +10,7 @@ mod background_process; pub mod broker; pub mod endpoint; pub mod local_env; +pub mod object_storage; pub mod pageserver; pub mod postgresql_conf; pub mod safekeeper; diff --git a/control_plane/src/local_env.rs b/control_plane/src/local_env.rs index 3f3794c0ee..fa10abe91a 100644 --- a/control_plane/src/local_env.rs +++ b/control_plane/src/local_env.rs @@ -15,9 +15,10 @@ use clap::ValueEnum; use postgres_backend::AuthType; use reqwest::Url; use serde::{Deserialize, Serialize}; -use utils::auth::{Claims, encode_from_key_file}; +use utils::auth::encode_from_key_file; use utils::id::{NodeId, TenantId, TenantTimelineId, TimelineId}; +use crate::object_storage::{OBJECT_STORAGE_REMOTE_STORAGE_DIR, ObjectStorage}; use crate::pageserver::{PAGESERVER_REMOTE_STORAGE_DIR, PageServerNode}; use crate::safekeeper::SafekeeperNode; @@ -55,6 +56,7 @@ pub struct LocalEnv { // used to issue tokens during e.g pg start pub private_key_path: PathBuf, + pub public_key_path: PathBuf, pub broker: NeonBroker, @@ -68,6 +70,8 @@ pub struct LocalEnv { pub safekeepers: Vec, + pub object_storage: ObjectStorageConf, + // Control plane upcall API for pageserver: if None, we will not run storage_controller If set, this will // be propagated into each pageserver's configuration. 
pub control_plane_api: Url, @@ -95,6 +99,7 @@ pub struct OnDiskConfig { pub neon_distrib_dir: PathBuf, pub default_tenant_id: Option, pub private_key_path: PathBuf, + pub public_key_path: PathBuf, pub broker: NeonBroker, pub storage_controller: NeonStorageControllerConf, #[serde( @@ -103,6 +108,7 @@ pub struct OnDiskConfig { )] pub pageservers: Vec, pub safekeepers: Vec, + pub object_storage: ObjectStorageConf, pub control_plane_api: Option, pub control_plane_hooks_api: Option, pub control_plane_compute_hook_api: Option, @@ -136,11 +142,18 @@ pub struct NeonLocalInitConf { pub storage_controller: Option, pub pageservers: Vec, pub safekeepers: Vec, + pub object_storage: ObjectStorageConf, pub control_plane_api: Option, pub control_plane_hooks_api: Option, pub generate_local_ssl_certs: bool, } +#[derive(Serialize, Default, Deserialize, PartialEq, Eq, Clone, Debug)] +#[serde(default)] +pub struct ObjectStorageConf { + pub port: u16, +} + /// Broker config for cluster internal communication. 
#[derive(Serialize, Deserialize, PartialEq, Eq, Clone, Debug)] #[serde(default)] @@ -398,6 +411,10 @@ impl LocalEnv { self.pg_dir(pg_version, "lib") } + pub fn object_storage_bin(&self) -> PathBuf { + self.neon_distrib_dir.join("object_storage") + } + pub fn pageserver_bin(&self) -> PathBuf { self.neon_distrib_dir.join("pageserver") } @@ -431,6 +448,10 @@ impl LocalEnv { self.base_data_dir.join("safekeepers").join(data_dir_name) } + pub fn object_storage_data_dir(&self) -> PathBuf { + self.base_data_dir.join("object_storage") + } + pub fn get_pageserver_conf(&self, id: NodeId) -> anyhow::Result<&PageServerConf> { if let Some(conf) = self.pageservers.iter().find(|node| node.id == id) { Ok(conf) @@ -582,6 +603,7 @@ impl LocalEnv { neon_distrib_dir, default_tenant_id, private_key_path, + public_key_path, broker, storage_controller, pageservers, @@ -591,6 +613,7 @@ impl LocalEnv { control_plane_compute_hook_api: _, branch_name_mappings, generate_local_ssl_certs, + object_storage, } = on_disk_config; LocalEnv { base_data_dir: repopath.to_owned(), @@ -598,6 +621,7 @@ impl LocalEnv { neon_distrib_dir, default_tenant_id, private_key_path, + public_key_path, broker, storage_controller, pageservers, @@ -606,6 +630,7 @@ impl LocalEnv { control_plane_hooks_api, branch_name_mappings, generate_local_ssl_certs, + object_storage, } }; @@ -705,6 +730,7 @@ impl LocalEnv { neon_distrib_dir: self.neon_distrib_dir.clone(), default_tenant_id: self.default_tenant_id, private_key_path: self.private_key_path.clone(), + public_key_path: self.public_key_path.clone(), broker: self.broker.clone(), storage_controller: self.storage_controller.clone(), pageservers: vec![], // it's skip_serializing anyway @@ -714,6 +740,7 @@ impl LocalEnv { control_plane_compute_hook_api: None, branch_name_mappings: self.branch_name_mappings.clone(), generate_local_ssl_certs: self.generate_local_ssl_certs, + object_storage: self.object_storage.clone(), }, ) } @@ -730,7 +757,7 @@ impl LocalEnv { } // this function 
is used only for testing purposes in CLI e g generate tokens during init - pub fn generate_auth_token(&self, claims: &Claims) -> anyhow::Result { + pub fn generate_auth_token(&self, claims: &S) -> anyhow::Result { let private_key_path = self.get_private_key_path(); let key_data = fs::read(private_key_path)?; encode_from_key_file(claims, &key_data) @@ -797,6 +824,7 @@ impl LocalEnv { control_plane_api, generate_local_ssl_certs, control_plane_hooks_api, + object_storage, } = conf; // Find postgres binaries. @@ -828,6 +856,7 @@ impl LocalEnv { ) .context("generate auth keys")?; let private_key_path = PathBuf::from("auth_private_key.pem"); + let public_key_path = PathBuf::from("auth_public_key.pem"); // create the runtime type because the remaining initialization code below needs // a LocalEnv instance op operation @@ -838,6 +867,7 @@ impl LocalEnv { neon_distrib_dir, default_tenant_id: Some(default_tenant_id), private_key_path, + public_key_path, broker, storage_controller: storage_controller.unwrap_or_default(), pageservers: pageservers.iter().map(Into::into).collect(), @@ -846,6 +876,7 @@ impl LocalEnv { control_plane_hooks_api, branch_name_mappings: Default::default(), generate_local_ssl_certs, + object_storage, }; if generate_local_ssl_certs { @@ -873,8 +904,13 @@ impl LocalEnv { .context("pageserver init failed")?; } + ObjectStorage::from_env(&env) + .init() + .context("object storage init failed")?; + // setup remote remote location for default LocalFs remote storage std::fs::create_dir_all(env.base_data_dir.join(PAGESERVER_REMOTE_STORAGE_DIR))?; + std::fs::create_dir_all(env.base_data_dir.join(OBJECT_STORAGE_REMOTE_STORAGE_DIR))?; env.persist_config() } @@ -944,7 +980,7 @@ fn generate_ssl_ca_cert(cert_path: &Path, key_path: &Path) -> anyhow::Result<()> // -out rootCA.crt -keyout rootCA.key let keygen_output = Command::new("openssl") .args([ - "req", "-x509", "-newkey", "rsa:2048", "-nodes", "-days", "36500", + "req", "-x509", "-newkey", "ed25519", "-nodes", 
"-days", "36500", ]) .args(["-subj", "/CN=Neon Local CA"]) .args(["-out", cert_path.to_str().unwrap()]) @@ -974,7 +1010,7 @@ fn generate_ssl_cert( // -subj "/CN=localhost" -addext "subjectAltName=DNS:localhost,IP:127.0.0.1" let keygen_output = Command::new("openssl") .args(["req", "-new", "-nodes"]) - .args(["-newkey", "rsa:2048"]) + .args(["-newkey", "ed25519"]) .args(["-subj", "/CN=localhost"]) .args(["-addext", "subjectAltName=DNS:localhost,IP:127.0.0.1"]) .args(["-keyout", key_path.to_str().unwrap()]) diff --git a/control_plane/src/object_storage.rs b/control_plane/src/object_storage.rs new file mode 100644 index 0000000000..1a595b7809 --- /dev/null +++ b/control_plane/src/object_storage.rs @@ -0,0 +1,107 @@ +use crate::background_process::{self, start_process, stop_process}; +use crate::local_env::LocalEnv; +use anyhow::anyhow; +use anyhow::{Context, Result}; +use camino::Utf8PathBuf; +use std::io::Write; +use std::time::Duration; + +/// Directory within .neon which will be used by default for LocalFs remote storage. 
+pub const OBJECT_STORAGE_REMOTE_STORAGE_DIR: &str = "local_fs_remote_storage/object_storage"; +pub const OBJECT_STORAGE_DEFAULT_PORT: u16 = 9993; + +pub struct ObjectStorage { + pub bin: Utf8PathBuf, + pub data_dir: Utf8PathBuf, + pub pemfile: Utf8PathBuf, + pub port: u16, +} + +impl ObjectStorage { + pub fn from_env(env: &LocalEnv) -> ObjectStorage { + ObjectStorage { + bin: Utf8PathBuf::from_path_buf(env.object_storage_bin()).unwrap(), + data_dir: Utf8PathBuf::from_path_buf(env.object_storage_data_dir()).unwrap(), + pemfile: Utf8PathBuf::from_path_buf(env.public_key_path.clone()).unwrap(), + port: env.object_storage.port, + } + } + + fn config_path(&self) -> Utf8PathBuf { + self.data_dir.join("object_storage.json") + } + + fn listen_addr(&self) -> Utf8PathBuf { + format!("127.0.0.1:{}", self.port).into() + } + + pub fn init(&self) -> Result<()> { + println!("Initializing object storage in {:?}", self.data_dir); + let parent = self.data_dir.parent().unwrap(); + + #[derive(serde::Serialize)] + struct Cfg { + listen: Utf8PathBuf, + pemfile: Utf8PathBuf, + local_path: Utf8PathBuf, + r#type: String, + } + let cfg = Cfg { + listen: self.listen_addr(), + pemfile: parent.join(self.pemfile.clone()), + local_path: parent.join(OBJECT_STORAGE_REMOTE_STORAGE_DIR), + r#type: "LocalFs".to_string(), + }; + std::fs::create_dir_all(self.config_path().parent().unwrap())?; + std::fs::write(self.config_path(), serde_json::to_string(&cfg)?) 
+ .context("write object storage config")?; + Ok(()) + } + + pub async fn start(&self, retry_timeout: &Duration) -> Result<()> { + println!("Starting s3 proxy at {}", self.listen_addr()); + std::io::stdout().flush().context("flush stdout")?; + + let process_status_check = || async { + tokio::time::sleep(Duration::from_millis(500)).await; + let res = reqwest::Client::new() + .get(format!("http://{}/metrics", self.listen_addr())) + .send() + .await; + match res { + Ok(response) if response.status().is_success() => Ok(true), + Ok(_) => Err(anyhow!("Failed to query /metrics")), + Err(e) => Err(anyhow!("Failed to check node status: {e}")), + } + }; + + let res = start_process( + "object_storage", + &self.data_dir.clone().into_std_path_buf(), + &self.bin.clone().into_std_path_buf(), + vec![self.config_path().to_string()], + vec![("RUST_LOG".into(), "debug".into())], + background_process::InitialPidFile::Create(self.pid_file()), + retry_timeout, + process_status_check, + ) + .await; + if res.is_err() { + eprintln!("Logs:\n{}", std::fs::read_to_string(self.log_file())?); + } + + res + } + + pub fn stop(&self, immediate: bool) -> anyhow::Result<()> { + stop_process(immediate, "object_storage", &self.pid_file()) + } + + fn log_file(&self) -> Utf8PathBuf { + self.data_dir.join("object_storage.log") + } + + fn pid_file(&self) -> Utf8PathBuf { + self.data_dir.join("object_storage.pid") + } +} diff --git a/control_plane/src/pageserver.rs b/control_plane/src/pageserver.rs index b39acbca4d..5c985e6dc8 100644 --- a/control_plane/src/pageserver.rs +++ b/control_plane/src/pageserver.rs @@ -535,6 +535,11 @@ impl PageServerNode { .map(|x| x.parse::()) .transpose() .context("Failed to parse 'gc_compaction_enabled' as bool")?, + gc_compaction_verification: settings + .remove("gc_compaction_verification") + .map(|x| x.parse::()) + .transpose() + .context("Failed to parse 'gc_compaction_verification' as bool")?, gc_compaction_initial_threshold_kb: settings 
.remove("gc_compaction_initial_threshold_kb") .map(|x| x.parse::()) @@ -545,6 +550,11 @@ impl PageServerNode { .map(|x| x.parse::()) .transpose() .context("Failed to parse 'gc_compaction_ratio_percent' as integer")?, + sampling_ratio: settings + .remove("sampling_ratio") + .map(serde_json::from_str) + .transpose() + .context("Falied to parse 'sampling_ratio'")?, }; if !settings.is_empty() { bail!("Unrecognized tenant settings: {settings:?}") diff --git a/control_plane/src/storage_controller.rs b/control_plane/src/storage_controller.rs index 8000576e87..a4b56ae5c0 100644 --- a/control_plane/src/storage_controller.rs +++ b/control_plane/src/storage_controller.rs @@ -13,7 +13,9 @@ use pageserver_api::controller_api::{ NodeConfigureRequest, NodeDescribeResponse, NodeRegisterRequest, TenantCreateRequest, TenantCreateResponse, TenantLocateResponse, }; -use pageserver_api::models::{TenantConfigRequest, TimelineCreateRequest, TimelineInfo}; +use pageserver_api::models::{ + TenantConfig, TenantConfigRequest, TimelineCreateRequest, TimelineInfo, +}; use pageserver_api::shard::TenantShardId; use pageserver_client::mgmt_api::ResponseErrorMessageExt; use postgres_backend::AuthType; @@ -82,7 +84,8 @@ impl NeonStorageControllerStopArgs { pub struct AttachHookRequest { pub tenant_shard_id: TenantShardId, pub node_id: Option, - pub generation_override: Option, + pub generation_override: Option, // only new tenants + pub config: Option, // only new tenants } #[derive(Serialize, Deserialize)] @@ -805,6 +808,7 @@ impl StorageController { tenant_shard_id, node_id: Some(pageserver_id), generation_override: None, + config: None, }; let response = self diff --git a/control_plane/storcon_cli/src/main.rs b/control_plane/storcon_cli/src/main.rs index c503697acc..19c686dcfd 100644 --- a/control_plane/storcon_cli/src/main.rs +++ b/control_plane/storcon_cli/src/main.rs @@ -385,8 +385,6 @@ where async fn main() -> anyhow::Result<()> { let cli = Cli::parse(); - let storcon_client = 
Client::new(cli.api.clone(), cli.jwt.clone()); - let ssl_ca_certs = match &cli.ssl_ca_file { Some(ssl_ca_file) => { let buf = tokio::fs::read(ssl_ca_file).await?; @@ -401,9 +399,11 @@ async fn main() -> anyhow::Result<()> { } let http_client = http_client.build()?; + let storcon_client = Client::new(http_client.clone(), cli.api.clone(), cli.jwt.clone()); + let mut trimmed = cli.api.to_string(); trimmed.pop(); - let vps_client = mgmt_api::Client::new(http_client, trimmed, cli.jwt.as_deref()); + let vps_client = mgmt_api::Client::new(http_client.clone(), trimmed, cli.jwt.as_deref()); match cli.command { Command::NodeRegister { @@ -941,7 +941,7 @@ async fn main() -> anyhow::Result<()> { let mut node_to_fill_descs = Vec::new(); for desc in node_descs { - let to_drain = nodes.iter().any(|id| *id == desc.id); + let to_drain = nodes.contains(&desc.id); if to_drain { node_to_drain_descs.push(desc); } else { @@ -1056,7 +1056,7 @@ async fn main() -> anyhow::Result<()> { const DEFAULT_MIGRATE_CONCURRENCY: usize = 8; let mut stream = futures::stream::iter(moves) .map(|mv| { - let client = Client::new(cli.api.clone(), cli.jwt.clone()); + let client = Client::new(http_client.clone(), cli.api.clone(), cli.jwt.clone()); async move { client .dispatch::( diff --git a/docker-compose/compute_wrapper/shell/compute.sh b/docker-compose/compute_wrapper/shell/compute.sh index 418aaf876d..9409e9d055 100755 --- a/docker-compose/compute_wrapper/shell/compute.sh +++ b/docker-compose/compute_wrapper/shell/compute.sh @@ -11,8 +11,8 @@ generate_id() { PG_VERSION=${PG_VERSION:-14} -SPEC_FILE_ORG=/var/db/postgres/specs/spec.json -SPEC_FILE=/tmp/spec.json +CONFIG_FILE_ORG=/var/db/postgres/configs/config.json +CONFIG_FILE=/tmp/config.json echo "Waiting pageserver become ready." while ! nc -z pageserver 6400; do @@ -20,7 +20,7 @@ while ! nc -z pageserver 6400; do done echo "Page server is ready." 
-cp ${SPEC_FILE_ORG} ${SPEC_FILE} +cp ${CONFIG_FILE_ORG} ${CONFIG_FILE} if [ -n "${TENANT_ID:-}" ] && [ -n "${TIMELINE_ID:-}" ]; then tenant_id=${TENANT_ID} @@ -73,17 +73,27 @@ else ulid_extension=ulid fi echo "Adding pgx_ulid" -shared_libraries=$(jq -r '.cluster.settings[] | select(.name=="shared_preload_libraries").value' ${SPEC_FILE}) -sed -i "s/${shared_libraries}/${shared_libraries},${ulid_extension}/" ${SPEC_FILE} +shared_libraries=$(jq -r '.spec.cluster.settings[] | select(.name=="shared_preload_libraries").value' ${CONFIG_FILE}) +sed -i "s/${shared_libraries}/${shared_libraries},${ulid_extension}/" ${CONFIG_FILE} echo "Overwrite tenant id and timeline id in spec file" -sed -i "s/TENANT_ID/${tenant_id}/" ${SPEC_FILE} -sed -i "s/TIMELINE_ID/${timeline_id}/" ${SPEC_FILE} +sed -i "s/TENANT_ID/${tenant_id}/" ${CONFIG_FILE} +sed -i "s/TIMELINE_ID/${timeline_id}/" ${CONFIG_FILE} -cat ${SPEC_FILE} +cat ${CONFIG_FILE} + +# TODO(tristan957): Remove these workarounds for backwards compatibility after +# the next compute release. That includes these next few lines and the +# --spec-path in the compute_ctl invocation. 
+if compute_ctl --help | grep --quiet -- '--config'; then + SPEC_PATH="$CONFIG_FILE" +else + jq '.spec' < "$CONFIG_FILE" > /tmp/spec.json + SPEC_PATH=/tmp/spec.json +fi echo "Start compute node" /usr/local/bin/compute_ctl --pgdata /var/db/postgres/compute \ -C "postgresql://cloud_admin@localhost:55433/postgres" \ -b /usr/local/bin/postgres \ --compute-id "compute-$RANDOM" \ - -S ${SPEC_FILE} + --spec-path "$SPEC_PATH" diff --git a/docker-compose/compute_wrapper/var/db/postgres/configs/config.json b/docker-compose/compute_wrapper/var/db/postgres/configs/config.json new file mode 100644 index 0000000000..3ddf96512a --- /dev/null +++ b/docker-compose/compute_wrapper/var/db/postgres/configs/config.json @@ -0,0 +1,148 @@ +{ + "spec": { + "format_version": 1.0, + + "timestamp": "2022-10-12T18:00:00.000Z", + "operation_uuid": "0f657b36-4b0f-4a2d-9c2e-1dcd615e7d8c", + + "cluster": { + "cluster_id": "docker_compose", + "name": "docker_compose_test", + "state": "restarted", + "roles": [ + { + "name": "cloud_admin", + "encrypted_password": "b093c0d3b281ba6da1eacc608620abd8", + "options": null + } + ], + "databases": [ + ], + "settings": [ + { + "name": "fsync", + "value": "off", + "vartype": "bool" + }, + { + "name": "wal_level", + "value": "logical", + "vartype": "enum" + }, + { + "name": "wal_log_hints", + "value": "on", + "vartype": "bool" + }, + { + "name": "log_connections", + "value": "on", + "vartype": "bool" + }, + { + "name": "port", + "value": "55433", + "vartype": "integer" + }, + { + "name": "shared_buffers", + "value": "1MB", + "vartype": "string" + }, + { + "name": "max_connections", + "value": "100", + "vartype": "integer" + }, + { + "name": "listen_addresses", + "value": "0.0.0.0", + "vartype": "string" + }, + { + "name": "max_wal_senders", + "value": "10", + "vartype": "integer" + }, + { + "name": "max_replication_slots", + "value": "10", + "vartype": "integer" + }, + { + "name": "wal_sender_timeout", + "value": "5s", + "vartype": "string" + }, + { + "name": 
"wal_keep_size", + "value": "0", + "vartype": "integer" + }, + { + "name": "password_encryption", + "value": "md5", + "vartype": "enum" + }, + { + "name": "restart_after_crash", + "value": "off", + "vartype": "bool" + }, + { + "name": "synchronous_standby_names", + "value": "walproposer", + "vartype": "string" + }, + { + "name": "shared_preload_libraries", + "value": "neon,pg_cron,timescaledb,pg_stat_statements", + "vartype": "string" + }, + { + "name": "neon.safekeepers", + "value": "safekeeper1:5454,safekeeper2:5454,safekeeper3:5454", + "vartype": "string" + }, + { + "name": "neon.timeline_id", + "value": "TIMELINE_ID", + "vartype": "string" + }, + { + "name": "neon.tenant_id", + "value": "TENANT_ID", + "vartype": "string" + }, + { + "name": "neon.pageserver_connstring", + "value": "host=pageserver port=6400", + "vartype": "string" + }, + { + "name": "max_replication_write_lag", + "value": "500MB", + "vartype": "string" + }, + { + "name": "max_replication_flush_lag", + "value": "10GB", + "vartype": "string" + }, + { + "name": "cron.database", + "value": "postgres", + "vartype": "string" + } + ] + }, + + "delta_operations": [ + ] + }, + "compute_ctl_config": { + "jwks": { + "keys": [] + } + } +} diff --git a/docker-compose/compute_wrapper/var/db/postgres/specs/spec.json b/docker-compose/compute_wrapper/var/db/postgres/specs/spec.json deleted file mode 100644 index 0308cab451..0000000000 --- a/docker-compose/compute_wrapper/var/db/postgres/specs/spec.json +++ /dev/null @@ -1,141 +0,0 @@ -{ - "format_version": 1.0, - - "timestamp": "2022-10-12T18:00:00.000Z", - "operation_uuid": "0f657b36-4b0f-4a2d-9c2e-1dcd615e7d8c", - - "cluster": { - "cluster_id": "docker_compose", - "name": "docker_compose_test", - "state": "restarted", - "roles": [ - { - "name": "cloud_admin", - "encrypted_password": "b093c0d3b281ba6da1eacc608620abd8", - "options": null - } - ], - "databases": [ - ], - "settings": [ - { - "name": "fsync", - "value": "off", - "vartype": "bool" - }, - { - "name": 
"wal_level", - "value": "logical", - "vartype": "enum" - }, - { - "name": "wal_log_hints", - "value": "on", - "vartype": "bool" - }, - { - "name": "log_connections", - "value": "on", - "vartype": "bool" - }, - { - "name": "port", - "value": "55433", - "vartype": "integer" - }, - { - "name": "shared_buffers", - "value": "1MB", - "vartype": "string" - }, - { - "name": "max_connections", - "value": "100", - "vartype": "integer" - }, - { - "name": "listen_addresses", - "value": "0.0.0.0", - "vartype": "string" - }, - { - "name": "max_wal_senders", - "value": "10", - "vartype": "integer" - }, - { - "name": "max_replication_slots", - "value": "10", - "vartype": "integer" - }, - { - "name": "wal_sender_timeout", - "value": "5s", - "vartype": "string" - }, - { - "name": "wal_keep_size", - "value": "0", - "vartype": "integer" - }, - { - "name": "password_encryption", - "value": "md5", - "vartype": "enum" - }, - { - "name": "restart_after_crash", - "value": "off", - "vartype": "bool" - }, - { - "name": "synchronous_standby_names", - "value": "walproposer", - "vartype": "string" - }, - { - "name": "shared_preload_libraries", - "value": "neon,pg_cron,timescaledb,pg_stat_statements", - "vartype": "string" - }, - { - "name": "neon.safekeepers", - "value": "safekeeper1:5454,safekeeper2:5454,safekeeper3:5454", - "vartype": "string" - }, - { - "name": "neon.timeline_id", - "value": "TIMELINE_ID", - "vartype": "string" - }, - { - "name": "neon.tenant_id", - "value": "TENANT_ID", - "vartype": "string" - }, - { - "name": "neon.pageserver_connstring", - "value": "host=pageserver port=6400", - "vartype": "string" - }, - { - "name": "max_replication_write_lag", - "value": "500MB", - "vartype": "string" - }, - { - "name": "max_replication_flush_lag", - "value": "10GB", - "vartype": "string" - }, - { - "name": "cron.database", - "value": "postgres", - "vartype": "string" - } - ] - }, - - "delta_operations": [ - ] -} diff --git a/docker-compose/docker-compose.yml 
b/docker-compose/docker-compose.yml index 493a0a5523..fd3ad1fffc 100644 --- a/docker-compose/docker-compose.yml +++ b/docker-compose/docker-compose.yml @@ -159,7 +159,7 @@ services: #- RUST_BACKTRACE=1 # Mount the test files directly, for faster editing cycle. volumes: - - ./compute_wrapper/var/db/postgres/specs/:/var/db/postgres/specs/ + - ./compute_wrapper/var/db/postgres/configs/:/var/db/postgres/configs/ - ./compute_wrapper/shell/:/shell/ ports: - 55433:55433 # pg protocol handler diff --git a/docker-compose/ext-src/pg_jsonschema-src/Makefile b/docker-compose/ext-src/pg_jsonschema-src/Makefile new file mode 100644 index 0000000000..d79364d8b5 --- /dev/null +++ b/docker-compose/ext-src/pg_jsonschema-src/Makefile @@ -0,0 +1,8 @@ +EXTENSION = pg_jsonschema +DATA = pg_jsonschema--1.0.sql +REGRESS = jsonschema_valid_api jsonschema_edge_cases +REGRESS_OPTS = --load-extension=pg_jsonschema + +PG_CONFIG ?= pg_config +PGXS := $(shell $(PG_CONFIG) --pgxs) +include $(PGXS) diff --git a/docker-compose/ext-src/pg_jsonschema-src/expected/jsonschema_edge_cases.out b/docker-compose/ext-src/pg_jsonschema-src/expected/jsonschema_edge_cases.out new file mode 100644 index 0000000000..f4089bfb13 --- /dev/null +++ b/docker-compose/ext-src/pg_jsonschema-src/expected/jsonschema_edge_cases.out @@ -0,0 +1,87 @@ +-- Schema with enums, nulls, extra properties disallowed +SELECT jsonschema_is_valid('{ + "type": "object", + "properties": { + "status": { "type": "string", "enum": ["active", "inactive", "pending"] }, + "email": { "type": ["string", "null"], "format": "email" } + }, + "required": ["status"], + "additionalProperties": false +}'::json); + jsonschema_is_valid +--------------------- + t +(1 row) + +-- Valid enum and null email +SELECT jsonschema_validation_errors( + '{ + "type": "object", + "properties": { + "status": { "type": "string", "enum": ["active", "inactive", "pending"] }, + "email": { "type": ["string", "null"], "format": "email" } + }, + "required": ["status"], + 
"additionalProperties": false + }'::json, + '{"status": "active", "email": null}'::json +); + jsonschema_validation_errors +------------------------------ + {} +(1 row) + +-- Invalid enum value +SELECT jsonschema_validation_errors( + '{ + "type": "object", + "properties": { + "status": { "type": "string", "enum": ["active", "inactive", "pending"] }, + "email": { "type": ["string", "null"], "format": "email" } + }, + "required": ["status"], + "additionalProperties": false + }'::json, + '{"status": "disabled", "email": null}'::json +); + jsonschema_validation_errors +---------------------------------------------------------------------- + {"\"disabled\" is not one of [\"active\",\"inactive\",\"pending\"]"} +(1 row) + +-- Invalid email format (assuming format is validated) +SELECT jsonschema_validation_errors( + '{ + "type": "object", + "properties": { + "status": { "type": "string", "enum": ["active", "inactive", "pending"] }, + "email": { "type": ["string", "null"], "format": "email" } + }, + "required": ["status"], + "additionalProperties": false + }'::json, + '{"status": "active", "email": "not-an-email"}'::json +); + jsonschema_validation_errors +----------------------------------------- + {"\"not-an-email\" is not a \"email\""} +(1 row) + +-- Extra property not allowed +SELECT jsonschema_validation_errors( + '{ + "type": "object", + "properties": { + "status": { "type": "string", "enum": ["active", "inactive", "pending"] }, + "email": { "type": ["string", "null"], "format": "email" } + }, + "required": ["status"], + "additionalProperties": false + }'::json, + '{"status": "active", "extra": "should not be here"}'::json +); + jsonschema_validation_errors +-------------------------------------------------------------------- + {"Additional properties are not allowed ('extra' was unexpected)"} +(1 row) + diff --git a/docker-compose/ext-src/pg_jsonschema-src/expected/jsonschema_valid_api.out b/docker-compose/ext-src/pg_jsonschema-src/expected/jsonschema_valid_api.out 
new file mode 100644 index 0000000000..73f0a562e7 --- /dev/null +++ b/docker-compose/ext-src/pg_jsonschema-src/expected/jsonschema_valid_api.out @@ -0,0 +1,65 @@ +-- Define schema +SELECT jsonschema_is_valid('{ + "type": "object", + "properties": { + "username": { "type": "string" }, + "age": { "type": "integer" } + }, + "required": ["username"] +}'::json); + jsonschema_is_valid +--------------------- + t +(1 row) + +-- Valid instance +SELECT jsonschema_validation_errors( + '{ + "type": "object", + "properties": { + "username": { "type": "string" }, + "age": { "type": "integer" } + }, + "required": ["username"] + }'::json, + '{"username": "alice", "age": 25}'::json +); + jsonschema_validation_errors +------------------------------ + {} +(1 row) + +-- Invalid instance: missing required "username" +SELECT jsonschema_validation_errors( + '{ + "type": "object", + "properties": { + "username": { "type": "string" }, + "age": { "type": "integer" } + }, + "required": ["username"] + }'::json, + '{"age": 25}'::json +); + jsonschema_validation_errors +----------------------------------------- + {"\"username\" is a required property"} +(1 row) + +-- Invalid instance: wrong type for "age" +SELECT jsonschema_validation_errors( + '{ + "type": "object", + "properties": { + "username": { "type": "string" }, + "age": { "type": "integer" } + }, + "required": ["username"] + }'::json, + '{"username": "bob", "age": "twenty"}'::json +); + jsonschema_validation_errors +------------------------------------------- + {"\"twenty\" is not of type \"integer\""} +(1 row) + diff --git a/docker-compose/ext-src/pg_jsonschema-src/sql/jsonschema_edge_cases.sql b/docker-compose/ext-src/pg_jsonschema-src/sql/jsonschema_edge_cases.sql new file mode 100644 index 0000000000..edad8cca16 --- /dev/null +++ b/docker-compose/ext-src/pg_jsonschema-src/sql/jsonschema_edge_cases.sql @@ -0,0 +1,66 @@ +-- Schema with enums, nulls, extra properties disallowed +SELECT jsonschema_is_valid('{ + "type": "object", + 
"properties": { + "status": { "type": "string", "enum": ["active", "inactive", "pending"] }, + "email": { "type": ["string", "null"], "format": "email" } + }, + "required": ["status"], + "additionalProperties": false +}'::json); + +-- Valid enum and null email +SELECT jsonschema_validation_errors( + '{ + "type": "object", + "properties": { + "status": { "type": "string", "enum": ["active", "inactive", "pending"] }, + "email": { "type": ["string", "null"], "format": "email" } + }, + "required": ["status"], + "additionalProperties": false + }'::json, + '{"status": "active", "email": null}'::json +); + +-- Invalid enum value +SELECT jsonschema_validation_errors( + '{ + "type": "object", + "properties": { + "status": { "type": "string", "enum": ["active", "inactive", "pending"] }, + "email": { "type": ["string", "null"], "format": "email" } + }, + "required": ["status"], + "additionalProperties": false + }'::json, + '{"status": "disabled", "email": null}'::json +); + +-- Invalid email format (assuming format is validated) +SELECT jsonschema_validation_errors( + '{ + "type": "object", + "properties": { + "status": { "type": "string", "enum": ["active", "inactive", "pending"] }, + "email": { "type": ["string", "null"], "format": "email" } + }, + "required": ["status"], + "additionalProperties": false + }'::json, + '{"status": "active", "email": "not-an-email"}'::json +); + +-- Extra property not allowed +SELECT jsonschema_validation_errors( + '{ + "type": "object", + "properties": { + "status": { "type": "string", "enum": ["active", "inactive", "pending"] }, + "email": { "type": ["string", "null"], "format": "email" } + }, + "required": ["status"], + "additionalProperties": false + }'::json, + '{"status": "active", "extra": "should not be here"}'::json +); diff --git a/docker-compose/ext-src/pg_jsonschema-src/sql/jsonschema_valid_api.sql b/docker-compose/ext-src/pg_jsonschema-src/sql/jsonschema_valid_api.sql new file mode 100644 index 0000000000..44539ed6ce --- /dev/null 
+++ b/docker-compose/ext-src/pg_jsonschema-src/sql/jsonschema_valid_api.sql @@ -0,0 +1,48 @@ +-- Define schema +SELECT jsonschema_is_valid('{ + "type": "object", + "properties": { + "username": { "type": "string" }, + "age": { "type": "integer" } + }, + "required": ["username"] +}'::json); + +-- Valid instance +SELECT jsonschema_validation_errors( + '{ + "type": "object", + "properties": { + "username": { "type": "string" }, + "age": { "type": "integer" } + }, + "required": ["username"] + }'::json, + '{"username": "alice", "age": 25}'::json +); + +-- Invalid instance: missing required "username" +SELECT jsonschema_validation_errors( + '{ + "type": "object", + "properties": { + "username": { "type": "string" }, + "age": { "type": "integer" } + }, + "required": ["username"] + }'::json, + '{"age": 25}'::json +); + +-- Invalid instance: wrong type for "age" +SELECT jsonschema_validation_errors( + '{ + "type": "object", + "properties": { + "username": { "type": "string" }, + "age": { "type": "integer" } + }, + "required": ["username"] + }'::json, + '{"username": "bob", "age": "twenty"}'::json +); diff --git a/docker-compose/ext-src/pg_session_jwt-src/Makefile b/docker-compose/ext-src/pg_session_jwt-src/Makefile new file mode 100644 index 0000000000..c61c9777ad --- /dev/null +++ b/docker-compose/ext-src/pg_session_jwt-src/Makefile @@ -0,0 +1,9 @@ +EXTENSION = pg_session_jwt + +REGRESS = basic_functions +REGRESS_OPTS = --load-extension=$(EXTENSION) +export PGOPTIONS = -c pg_session_jwt.jwk={"crv":"Ed25519","kty":"OKP","x":"R_Abz-63zJ00l-IraL5fQhwkhGVZCSooQFV5ntC3C7M"} + +PG_CONFIG ?= pg_config +PGXS := $(shell $(PG_CONFIG) --pgxs) +include $(PGXS) \ No newline at end of file diff --git a/docker-compose/ext-src/pg_session_jwt-src/expected/basic_functions.out b/docker-compose/ext-src/pg_session_jwt-src/expected/basic_functions.out new file mode 100644 index 0000000000..ca54864ecd --- /dev/null +++ b/docker-compose/ext-src/pg_session_jwt-src/expected/basic_functions.out @@ 
-0,0 +1,35 @@ +-- Basic functionality tests for pg_session_jwt +-- Test auth.init() function +SELECT auth.init(); + init +------ + +(1 row) + +-- Test an invalid JWT +SELECT auth.jwt_session_init('INVALID-JWT'); +ERROR: invalid JWT encoding +-- Test creating a session with an expired JWT +SELECT auth.jwt_session_init('eyJhbGciOiJFZERTQSJ9.eyJleHAiOjE3NDI1NjQ0MzIsImlhdCI6MTc0MjU2NDI1MiwianRpIjo0MjQyNDIsInN1YiI6InVzZXIxMjMifQ.A6FwKuaSduHB9O7Gz37g0uoD_U9qVS0JNtT7YABGVgB7HUD1AMFc9DeyhNntWBqncg8k5brv-hrNTuUh5JYMAw'); +ERROR: Token used after it has expired +-- Test creating a session with a valid JWT +SELECT auth.jwt_session_init('eyJhbGciOiJFZERTQSJ9.eyJleHAiOjQ4OTYxNjQyNTIsImlhdCI6MTc0MjU2NDI1MiwianRpIjo0MzQzNDMsInN1YiI6InVzZXIxMjMifQ.2TXVgjb6JSUq6_adlvp-m_SdOxZSyGS30RS9TLB0xu2N83dMSs2NybwE1NMU8Fb0tcAZR_ET7M2rSxbTrphfCg'); + jwt_session_init +------------------ + +(1 row) + +-- Test auth.session() function +SELECT auth.session(); + session +------------------------------------------------------------------------- + {"exp": 4896164252, "iat": 1742564252, "jti": 434343, "sub": "user123"} +(1 row) + +-- Test auth.user_id() function +SELECT auth.user_id() AS user_id; + user_id +--------- + user123 +(1 row) + diff --git a/docker-compose/ext-src/pg_session_jwt-src/sql/basic_functions.sql b/docker-compose/ext-src/pg_session_jwt-src/sql/basic_functions.sql new file mode 100644 index 0000000000..6c1ab90c0c --- /dev/null +++ b/docker-compose/ext-src/pg_session_jwt-src/sql/basic_functions.sql @@ -0,0 +1,19 @@ +-- Basic functionality tests for pg_session_jwt + +-- Test auth.init() function +SELECT auth.init(); + +-- Test an invalid JWT +SELECT auth.jwt_session_init('INVALID-JWT'); + +-- Test creating a session with an expired JWT +SELECT auth.jwt_session_init('eyJhbGciOiJFZERTQSJ9.eyJleHAiOjE3NDI1NjQ0MzIsImlhdCI6MTc0MjU2NDI1MiwianRpIjo0MjQyNDIsInN1YiI6InVzZXIxMjMifQ.A6FwKuaSduHB9O7Gz37g0uoD_U9qVS0JNtT7YABGVgB7HUD1AMFc9DeyhNntWBqncg8k5brv-hrNTuUh5JYMAw'); + +-- Test creating a 
session with a valid JWT +SELECT auth.jwt_session_init('eyJhbGciOiJFZERTQSJ9.eyJleHAiOjQ4OTYxNjQyNTIsImlhdCI6MTc0MjU2NDI1MiwianRpIjo0MzQzNDMsInN1YiI6InVzZXIxMjMifQ.2TXVgjb6JSUq6_adlvp-m_SdOxZSyGS30RS9TLB0xu2N83dMSs2NybwE1NMU8Fb0tcAZR_ET7M2rSxbTrphfCg'); + +-- Test auth.session() function +SELECT auth.session(); + +-- Test auth.user_id() function +SELECT auth.user_id() AS user_id; \ No newline at end of file diff --git a/docs/SUMMARY.md b/docs/SUMMARY.md index 5fd4080c28..a6e2ac0f34 100644 --- a/docs/SUMMARY.md +++ b/docs/SUMMARY.md @@ -21,6 +21,7 @@ in this repository. - [WAL Redo](./pageserver-walredo.md) - [Page cache](./pageserver-pagecache.md) - [Storage](./pageserver-storage.md) + - [Compaction](./pageserver-compaction.md) - [Processing a GetPage request](./pageserver-processing-getpage.md) - [Processing WAL](./pageserver-processing-wal.md) diff --git a/docs/pageserver-compaction.md b/docs/pageserver-compaction.md new file mode 100644 index 0000000000..6cacb10c9c --- /dev/null +++ b/docs/pageserver-compaction.md @@ -0,0 +1,110 @@ +# Pageserver Compaction + +Lifted from . + +Updated 2025-03-26. + +## Pages and WAL + +Postgres stores data in 8 KB pages, identified by a page number. + +The WAL contains a sequence of page writes: either images (complete page contents) or deltas (patches applied to images). Each write is identified by its byte position in the WAL, aka LSN. + +Each page version is thus identified by page@LSN. Postgres may read pages at past LSNs. + +Pageservers ingest WAL by writing WAL records into a key/value store keyed by page@LSN. + +Pageservers materialize pages for Postgres reads by finding the most recent page image and applying all subsequent page deltas, up to the read LSN. + +## Compaction: Why? + +Pageservers store page@LSN keys in a key/value store using a custom variant of an LSM tree. Each timeline on each tenant shard has its own LSM tree. 
+ +When Pageservers write new page@LSN entries, they are appended unordered to an ephemeral layer file. When the ephemeral layer file exceeds `checkpoint_distance` (default 256 MB), the key/value pairs are sorted by key and written out to a layer file (for efficient lookups). + +As WAL writes continue, more layer files accumulate. + +Reads must search through the layer files to find the page’s image and deltas. The more layer files accumulate, the more layer files reads must search through before they find a page image, aka read amplification. + +Compaction’s job is to: + +- Reduce read amplification by reorganizing and combining layer files. +- Remove old garbage from layer files. + +As part of this, it may combine several page deltas into a single page image where possible. + +## Compaction: How? + +Neon uses a non-standard variant of an LSM tree made up of two levels of layer files: L0 and L1. + +Compaction runs in two phases: L0→L1 compaction, and L1 image compaction. + +L0 contains a stack of L0 layers at decreasing LSN ranges. These have been flushed sequentially from ephemeral layers. Each L0 layer covers the entire page space (page 0 to ~infinity) and the LSN range that was ingested into it. L0 layers are therefore particularly bad for read amp, since every read must search all L0 layers below the read LSN. For example: + +``` +| Page 0-99 @ LSN 0400-04ff | +| Page 0-99 @ LSN 0300-03ff | +| Page 0-99 @ LSN 0200-02ff | +| Page 0-99 @ LSN 0100-01ff | +| Page 0-99 @ LSN 0000-00ff | +``` + +L0→L1 compaction takes the bottom-most chunk of L0 layer files of between `compaction_threshold` (default 10) and `compaction_upper_limit` (default 20) layers. It uses merge-sort to write out sorted L1 delta layers of size `compaction_target_size` (default 128 MB). + +L1 typically consists of a “bed” of image layers with materialized page images at a specific LSN, and then delta layers of various page/LSN ranges above them with page deltas. 
For example: + +``` +Delta layers: | 30-84@0310-04ff | +Delta layers: | 10-42@0200-02ff | | 65-92@0174-02aa | +Image layers: | 0-39@0100 | 40-79@0100 | 80-99@0100 | +``` + +L1 image compaction scans across the L1 keyspace at some LSN, materializes page images by reading the image and delta layers below the LSN (via vectored reads), and writes out new sorted image layers of roughly size `compaction_target_size` (default 128 MB) at that LSN. + +Layer files below the new image files’ LSN can be garbage collected when they are no longer needed for PITR. + +Even though the old layer files are not immediately garbage collected, the new image layers help with read amp because reads can stop traversing the layer stack as soon as they encounter a page image. + +## Compaction: When? + +Pageservers run a `compaction_loop` background task for each tenant shard. Every `compaction_period` (default 20 seconds) it will wake up and check if any of the shard’s timelines need compaction. Additionally, L0 layer flushes will eagerly wake the compaction loop if the L0 count exceeds `compaction_threshold` (default 10). + +L0 compaction runs if the number of L0 layers exceeds `compaction_threshold` (default 10). + +L1 image compaction runs across sections of the L1 keyspace that have at least `image_creation_threshold` (default 3) delta layers overlapping image layers. + +At most `CONCURRENT_BACKGROUND_TASKS` (default 3 / 4 * CPUs = 6) background tasks can run concurrently on a Pageserver, including compaction. Further compaction tasks must wait. + +Because L0 layers cause the most read amp (they overlap the entire keyspace and only contain page deltas), they are aggressively compacted down: + +- L0 is compacted down across all tenant timelines before L1 compaction is attempted (`compaction_l0_first`). +- L0 compaction uses a separate concurrency limit of `CONCURRENT_L0_COMPACTION_TASKS` (default 3 / 4 * CPUs = 6) to avoid waiting for other tasks (`compaction_l0_semaphore`). 
+- If L0 compaction is needed on any tenant timeline, L1 image compaction will yield to start an immediate L0 compaction run (except for compaction run via admin APIs). + +## Backpressure + +With sustained heavy write loads, new L0 layers may be flushed faster than they can be compacted down. This can cause an unbounded buildup of read amplification and compaction debt, which can take hours to resolve even after the writes stop. + +To avoid this and allow compaction to keep up, layer flushes will slow writes down to apply backpressure on the workload: + +- At `l0_flush_delay_threshold` (default 30) L0 layers, layer flushes are delayed by the flush duration, such that they take 2x as long. +- At `l0_flush_stall_threshold` (default disabled) L0 layers, layer flushes stall entirely until the L0 count falls back below the threshold. This is currently disabled because we don’t trust L0 compaction to be responsive enough. + +This backpressure is propagated to the compute by waiting for layer flushes when WAL ingestion rolls the ephemeral layer. The compute will significantly slow down WAL writes at: + +- `max_replication_write_lag` (default 500 MB), when Pageserver WAL ingestion lags +- `max_replication_flush_lag` (default 10 GB), when Pageserver L0 flushes lag + +Combined, this means that the compute will backpressure when there are 30 L0 layers (30 * 256 MB = 7.7 GB) and the Pageserver WAL ingestion lags the compute by 500 MB, for a total of ~8 GB L0+ephemeral compaction debt on a single shard. + +Since we only delay L0 flushes by 2x when backpressuring, and haven’t enabled stalls, it is still possible for read amp to increase unbounded if compaction is too slow (although we haven’t seen this in practice). But this is considered better than stalling flushes and causing unavailability for as long as it takes L0 compaction to react, since we don’t trust it to be fast enough — at the expense of continually increasing read latency and CPU usage for this tenant. 
We should either enable stalls when we have enough confidence in L0 compaction, or scale the flush delay by the number of L0 layers to apply increasing backpressure. + +## Circuit Breaker + +Compaction can fail, often repeatedly. This can happen e.g. due to data corruption, faulty hardware, S3 outages, etc. + +If compaction fails, the compaction loop will naïvely try and fail again almost immediately. It may only fail after doing a significant amount of wasted work, while holding onto the background task semaphore. + +To avoid repeatedly doing wasted work and starving out other compaction jobs, each tenant has a compaction circuit breaker. After 5 repeated compaction failures, the circuit breaker trips and disables compaction for the next 24 hours, before resetting the breaker and trying again. This disables compaction across all tenant timelines (faulty or not). + +Disabling compaction for a long time is dangerous, since it can lead to unbounded read amp and compaction debt, and continuous workload backpressure. However, continually failing would not help either. Tripped circuit breakers trigger an alert and must be investigated promptly. 
\ No newline at end of file diff --git a/docs/storage_controller.md b/docs/storage_controller.md index ac4aca4219..d761210033 100644 --- a/docs/storage_controller.md +++ b/docs/storage_controller.md @@ -151,7 +151,7 @@ Example body: ``` { "tenant_id": "1f359dd625e519a1a4e8d7509690f6fc", - "stripe_size": 32768, + "stripe_size": 2048, "shards": [ {"node_id": 344, "shard_number": 0}, {"node_id": 722, "shard_number": 1}, diff --git a/explained_queries.sql b/explained_queries.sql new file mode 100644 index 0000000000..e69de29bb2 diff --git a/libs/compute_api/src/requests.rs b/libs/compute_api/src/requests.rs index d88451c549..98f2fc297c 100644 --- a/libs/compute_api/src/requests.rs +++ b/libs/compute_api/src/requests.rs @@ -5,6 +5,14 @@ use crate::privilege::Privilege; use crate::responses::ComputeCtlConfig; use crate::spec::{ComputeSpec, ExtVersion, PgIdent}; +/// When making requests to the `compute_ctl` external HTTP server, the client +/// must specify a set of claims in `Authorization` header JWTs such that +/// `compute_ctl` can authorize the request. +#[derive(Clone, Debug, Deserialize, Serialize)] +pub struct ComputeClaims { + pub compute_id: String, +} + /// Request of the /configure API /// /// We now pass only `spec` in the configuration request, but later we can @@ -30,9 +38,3 @@ pub struct SetRoleGrantsRequest { pub privileges: Vec, pub role: PgIdent, } - -/// Request of the /configure_telemetry API -#[derive(Debug, Deserialize, Serialize)] -pub struct ConfigureTelemetryRequest { - pub logs_export_host: Option, -} diff --git a/libs/compute_api/src/responses.rs b/libs/compute_api/src/responses.rs index c8f6019c5c..353949736b 100644 --- a/libs/compute_api/src/responses.rs +++ b/libs/compute_api/src/responses.rs @@ -14,6 +14,32 @@ pub struct GenericAPIError { pub error: String, } +/// All configuration parameters necessary for a compute. When +/// [`ComputeConfig::spec`] is provided, it means that the compute is attached +/// to a tenant. 
[`ComputeConfig::compute_ctl_config`] will always be provided +/// and contains parameters necessary for operating `compute_ctl` independently +/// of whether a tenant is attached to the compute or not. +/// +/// This also happens to be the body of `compute_ctl`'s /configure request. +#[derive(Debug, Deserialize, Serialize)] +pub struct ComputeConfig { + /// The compute spec + pub spec: Option, + + /// The compute_ctl configuration + #[allow(dead_code)] + pub compute_ctl_config: ComputeCtlConfig, +} + +impl From for ComputeConfig { + fn from(value: ControlPlaneConfigResponse) -> Self { + Self { + spec: value.spec, + compute_ctl_config: value.compute_ctl_config, + } + } +} + #[derive(Debug, Clone, Serialize)] pub struct ExtensionInstallResponse { pub extension: PgIdent, @@ -161,7 +187,7 @@ pub struct TlsConfig { /// Response of the `/computes/{compute_id}/spec` control-plane API. #[derive(Deserialize, Debug)] -pub struct ControlPlaneSpecResponse { +pub struct ControlPlaneConfigResponse { pub spec: Option, pub status: ControlPlaneComputeStatus, pub compute_ctl_config: ComputeCtlConfig, diff --git a/libs/compute_api/src/spec.rs b/libs/compute_api/src/spec.rs index cff1f4c89a..5e67ccce00 100644 --- a/libs/compute_api/src/spec.rs +++ b/libs/compute_api/src/spec.rs @@ -1,8 +1,8 @@ -//! `ComputeSpec` represents the contents of the spec.json file. -//! -//! The spec.json file is used to pass information to 'compute_ctl'. It contains -//! all the information needed to start up the right version of PostgreSQL, -//! and connect it to the storage nodes. +//! The ComputeSpec contains all the information needed to start up +//! the right version of PostgreSQL, and connect it to the storage nodes. +//! It can be passed as part of the `config.json`, or the control plane can +//! provide it by calling the compute_ctl's `/compute_ctl` endpoint, or +//! compute_ctl can fetch it by calling the control plane's API. 
use std::collections::HashMap; use indexmap::IndexMap; @@ -104,6 +104,12 @@ pub struct ComputeSpec { pub timeline_id: Option, pub pageserver_connstring: Option, + // More neon ids that we expose to the compute_ctl + // and to postgres as neon extension GUCs. + pub project_id: Option, + pub branch_id: Option, + pub endpoint_id: Option, + /// Safekeeper membership config generation. It is put in /// neon.safekeepers GUC and serves two purposes: /// 1) Non zero value forces walproposer to use membership configurations. @@ -159,15 +165,13 @@ pub struct ComputeSpec { #[serde(default)] // Default false pub drop_subscriptions_before_start: bool, - /// Log level for audit logging: - /// - /// Disabled - no audit logging. This is the default. - /// log - log masked statements to the postgres log using pgaudit extension - /// hipaa - log unmasked statements to the file using pgaudit and pgauditlogtofile extension - /// - /// Extensions should be present in shared_preload_libraries + /// Log level for compute audit logging #[serde(default)] pub audit_log_level: ComputeAudit, + + /// Hostname and the port of the otel collector. Leave empty to disable Postgres logs forwarding. + /// Example: config-shy-breeze-123-collector-monitoring.neon-telemetry.svc.cluster.local:10514 + pub logs_export_host: Option, } /// Feature flag to signal `compute_ctl` to enable certain experimental functionality. @@ -179,9 +183,6 @@ pub enum ComputeFeature { /// track short-lived connections as user activity. ActivityMonitorExperimental, - /// Allow to configure rsyslog for Postgres logs export - PostgresLogsExport, - /// This is a special feature flag that is used to represent unknown feature flags. /// Basically all unknown to enum flags are represented as this one. See unit test /// `parse_unknown_features()` for more details. 
@@ -288,14 +289,25 @@ impl ComputeMode { } /// Log level for audit logging -/// Disabled, log, hipaa -/// Default is Disabled #[derive(Clone, Debug, Default, Eq, PartialEq, Deserialize, Serialize)] pub enum ComputeAudit { #[default] Disabled, + // Deprecated, use Base instead Log, + // (pgaudit.log = 'ddl', pgaudit.log_parameter='off') + // logged to the standard postgresql log stream + Base, + // Deprecated, use Full or Extended instead Hipaa, + // (pgaudit.log = 'all, -misc', pgaudit.log_parameter='off') + // logged to separate files collected by rsyslog + // into dedicated log storage with strict access + Extended, + // (pgaudit.log='all', pgaudit.log_parameter='on'), + // logged to separate files collected by rsyslog + // into dedicated log storage with strict access. + Full, } #[derive(Clone, Debug, Default, Deserialize, Serialize, PartialEq, Eq)] diff --git a/libs/http-utils/Cargo.toml b/libs/http-utils/Cargo.toml index 6d24ee352a..5f6578f76e 100644 --- a/libs/http-utils/Cargo.toml +++ b/libs/http-utils/Cargo.toml @@ -30,6 +30,7 @@ tokio.workspace = true tracing.workspace = true url.workspace = true uuid.workspace = true +x509-cert.workspace = true # to use tokio channels as streams, this is faster to compile than async_stream # why is it only here? no other crate should use it, streams are rarely needed. 
diff --git a/libs/http-utils/src/server.rs b/libs/http-utils/src/server.rs index 33e4915e99..f93f71c962 100644 --- a/libs/http-utils/src/server.rs +++ b/libs/http-utils/src/server.rs @@ -4,6 +4,8 @@ use futures::StreamExt; use futures::stream::FuturesUnordered; use hyper0::Body; use hyper0::server::conn::Http; +use metrics::{IntCounterVec, register_int_counter_vec}; +use once_cell::sync::Lazy; use routerify::{RequestService, RequestServiceBuilder}; use tokio::io::{AsyncRead, AsyncWrite}; use tokio_rustls::TlsAcceptor; @@ -26,6 +28,24 @@ pub struct Server { tls_acceptor: Option, } +static CONNECTION_STARTED_COUNT: Lazy = Lazy::new(|| { + register_int_counter_vec!( + "http_server_connection_started_total", + "Number of established http/https connections", + &["scheme"] + ) + .expect("failed to define a metric") +}); + +static CONNECTION_ERROR_COUNT: Lazy = Lazy::new(|| { + register_int_counter_vec!( + "http_server_connection_errors_total", + "Number of occured connection errors by type", + &["type"] + ) + .expect("failed to define a metric") +}); + impl Server { pub fn new( request_service: Arc>, @@ -60,6 +80,15 @@ impl Server { false } + let tcp_error_cnt = CONNECTION_ERROR_COUNT.with_label_values(&["tcp"]); + let tls_error_cnt = CONNECTION_ERROR_COUNT.with_label_values(&["tls"]); + let http_error_cnt = CONNECTION_ERROR_COUNT.with_label_values(&["http"]); + let https_error_cnt = CONNECTION_ERROR_COUNT.with_label_values(&["https"]); + let panic_error_cnt = CONNECTION_ERROR_COUNT.with_label_values(&["panic"]); + + let http_connection_cnt = CONNECTION_STARTED_COUNT.with_label_values(&["http"]); + let https_connection_cnt = CONNECTION_STARTED_COUNT.with_label_values(&["https"]); + let mut connections = FuturesUnordered::new(); loop { tokio::select! 
{ @@ -67,6 +96,7 @@ impl Server { let (tcp_stream, remote_addr) = match stream { Ok(stream) => stream, Err(err) => { + tcp_error_cnt.inc(); if !suppress_io_error(&err) { info!("Failed to accept TCP connection: {err:#}"); } @@ -78,11 +108,18 @@ impl Server { let tls_acceptor = self.tls_acceptor.clone(); let cancel = cancel.clone(); + let tls_error_cnt = tls_error_cnt.clone(); + let http_error_cnt = http_error_cnt.clone(); + let https_error_cnt = https_error_cnt.clone(); + let http_connection_cnt = http_connection_cnt.clone(); + let https_connection_cnt = https_connection_cnt.clone(); + connections.push(tokio::spawn( async move { match tls_acceptor { Some(tls_acceptor) => { // Handle HTTPS connection. + https_connection_cnt.inc(); let tls_stream = tokio::select! { tls_stream = tls_acceptor.accept(tcp_stream) => tls_stream, _ = cancel.cancelled() => return, @@ -90,23 +127,27 @@ impl Server { let tls_stream = match tls_stream { Ok(tls_stream) => tls_stream, Err(err) => { + tls_error_cnt.inc(); if !suppress_io_error(&err) { - info!("Failed to accept TLS connection: {err:#}"); + info!(%remote_addr, "Failed to accept TLS connection: {err:#}"); } return; } }; if let Err(err) = Self::serve_connection(tls_stream, service, cancel).await { + https_error_cnt.inc(); if !suppress_hyper_error(&err) { - info!("Failed to serve HTTPS connection: {err:#}"); + info!(%remote_addr, "Failed to serve HTTPS connection: {err:#}"); } } } None => { // Handle HTTP connection. 
+ http_connection_cnt.inc(); if let Err(err) = Self::serve_connection(tcp_stream, service, cancel).await { + http_error_cnt.inc(); if !suppress_hyper_error(&err) { - info!("Failed to serve HTTP connection: {err:#}"); + info!(%remote_addr, "Failed to serve HTTP connection: {err:#}"); } } } @@ -115,6 +156,7 @@ impl Server { } Some(conn) = connections.next() => { if let Err(err) = conn { + panic_error_cnt.inc(); error!("Connection panicked: {err:#}"); } } @@ -122,6 +164,7 @@ impl Server { // Wait for graceful shutdown of all connections. while let Some(conn) = connections.next().await { if let Err(err) = conn { + panic_error_cnt.inc(); error!("Connection panicked: {err:#}"); } } diff --git a/libs/http-utils/src/tls_certs.rs b/libs/http-utils/src/tls_certs.rs index 0c18d84d98..2799db78a6 100644 --- a/libs/http-utils/src/tls_certs.rs +++ b/libs/http-utils/src/tls_certs.rs @@ -3,11 +3,14 @@ use std::{sync::Arc, time::Duration}; use anyhow::Context; use arc_swap::ArcSwap; use camino::Utf8Path; +use metrics::{IntCounterVec, UIntGaugeVec, register_int_counter_vec, register_uint_gauge_vec}; +use once_cell::sync::Lazy; use rustls::{ - pki_types::{CertificateDer, PrivateKeyDer}, + pki_types::{CertificateDer, PrivateKeyDer, UnixTime}, server::{ClientHello, ResolvesServerCert}, sign::CertifiedKey, }; +use x509_cert::der::Reader; pub async fn load_cert_chain(filename: &Utf8Path) -> anyhow::Result>> { let cert_data = tokio::fs::read(filename) @@ -53,6 +56,76 @@ pub async fn load_certified_key( Ok(certified_key) } +/// rustls's CertifiedKey with extra parsed fields used for metrics. +struct ParsedCertifiedKey { + certified_key: CertifiedKey, + expiration_time: UnixTime, +} + +/// Parse expiration time from an X509 certificate. +fn parse_expiration_time(cert: &CertificateDer<'_>) -> anyhow::Result { + let parsed_cert = x509_cert::der::SliceReader::new(cert) + .context("Failed to parse cerficiate")? 
+ .decode::() + .context("Failed to parse cerficiate")?; + + Ok(UnixTime::since_unix_epoch( + parsed_cert + .tbs_certificate + .validity + .not_after + .to_unix_duration(), + )) +} + +async fn load_and_parse_certified_key( + key_filename: &Utf8Path, + cert_filename: &Utf8Path, +) -> anyhow::Result { + let certified_key = load_certified_key(key_filename, cert_filename).await?; + let expiration_time = parse_expiration_time(certified_key.end_entity_cert()?)?; + Ok(ParsedCertifiedKey { + certified_key, + expiration_time, + }) +} + +static CERT_EXPIRATION_TIME: Lazy = Lazy::new(|| { + register_uint_gauge_vec!( + "tls_certs_expiration_time_seconds", + "Expiration time of the loaded certificate since unix epoch in seconds", + &["resolver_name"] + ) + .expect("failed to define a metric") +}); + +static CERT_RELOAD_STARTED_COUNTER: Lazy = Lazy::new(|| { + register_int_counter_vec!( + "tls_certs_reload_started_total", + "Number of certificate reload loop iterations started", + &["resolver_name"] + ) + .expect("failed to define a metric") +}); + +static CERT_RELOAD_UPDATED_COUNTER: Lazy = Lazy::new(|| { + register_int_counter_vec!( + "tls_certs_reload_updated_total", + "Number of times the certificate was updated to the new one", + &["resolver_name"] + ) + .expect("failed to define a metric") +}); + +static CERT_RELOAD_FAILED_COUNTER: Lazy = Lazy::new(|| { + register_int_counter_vec!( + "tls_certs_reload_failed_total", + "Number of times the certificate reload failed", + &["resolver_name"] + ) + .expect("failed to define a metric") +}); + /// Implementation of [`rustls::server::ResolvesServerCert`] which reloads certificates from /// the disk periodically. #[derive(Debug)] @@ -63,16 +136,28 @@ pub struct ReloadingCertificateResolver { impl ReloadingCertificateResolver { /// Creates a new Resolver by loading certificate and private key from FS and /// creating tokio::task to reload them with provided reload_period. + /// resolver_name is used as metric's label. 
pub async fn new( + resolver_name: &str, key_filename: &Utf8Path, cert_filename: &Utf8Path, reload_period: Duration, ) -> anyhow::Result> { + // Create metrics for current resolver. + let cert_expiration_time = CERT_EXPIRATION_TIME.with_label_values(&[resolver_name]); + let cert_reload_started_counter = + CERT_RELOAD_STARTED_COUNTER.with_label_values(&[resolver_name]); + let cert_reload_updated_counter = + CERT_RELOAD_UPDATED_COUNTER.with_label_values(&[resolver_name]); + let cert_reload_failed_counter = + CERT_RELOAD_FAILED_COUNTER.with_label_values(&[resolver_name]); + + let parsed_key = load_and_parse_certified_key(key_filename, cert_filename).await?; + let this = Arc::new(Self { - certified_key: ArcSwap::from_pointee( - load_certified_key(key_filename, cert_filename).await?, - ), + certified_key: ArcSwap::from_pointee(parsed_key.certified_key), }); + cert_expiration_time.set(parsed_key.expiration_time.as_secs()); tokio::spawn({ let weak_this = Arc::downgrade(&this); @@ -88,17 +173,22 @@ impl ReloadingCertificateResolver { Some(this) => this, None => break, // Resolver has been destroyed, exit. 
}; - match load_certified_key(&key_filename, &cert_filename).await { - Ok(new_certified_key) => { - if new_certified_key.cert == this.certified_key.load().cert { + cert_reload_started_counter.inc(); + + match load_and_parse_certified_key(&key_filename, &cert_filename).await { + Ok(parsed_key) => { + if parsed_key.certified_key.cert == this.certified_key.load().cert { tracing::debug!("Certificate has not changed since last reloading"); } else { tracing::info!("Certificate has been reloaded"); - this.certified_key.store(Arc::new(new_certified_key)); + this.certified_key.store(Arc::new(parsed_key.certified_key)); + cert_expiration_time.set(parsed_key.expiration_time.as_secs()); + cert_reload_updated_counter.inc(); } last_reload_failed = false; } Err(err) => { + cert_reload_failed_counter.inc(); // Note: Reloading certs may fail if it conflicts with the script updating // the files at the same time. Warn only if the error is persistent. if last_reload_failed { diff --git a/libs/pageserver_api/Cargo.toml b/libs/pageserver_api/Cargo.toml index 87dfdfb5ec..688e9de6e7 100644 --- a/libs/pageserver_api/Cargo.toml +++ b/libs/pageserver_api/Cargo.toml @@ -34,6 +34,7 @@ postgres_backend.workspace = true nix = {workspace = true, optional = true} reqwest.workspace = true rand.workspace = true +tracing-utils.workspace = true [dev-dependencies] bincode.workspace = true diff --git a/libs/pageserver_api/src/config.rs b/libs/pageserver_api/src/config.rs index 47c3136113..53b68afb0f 100644 --- a/libs/pageserver_api/src/config.rs +++ b/libs/pageserver_api/src/config.rs @@ -51,9 +51,54 @@ pub struct NodeMetadata { /// If there cannot be a static default value because we need to make runtime /// checks to determine the default, make it an `Option` (which defaults to None). /// The runtime check should be done in the consuming crate, i.e., `pageserver`. +/// +/// Unknown fields are silently ignored during deserialization. 
+/// The alternative, which we used in the past, was to set `deny_unknown_fields`, +/// which fails deserialization, and hence pageserver startup, if there is an unknown field. +/// The reason we don't do that anymore is that it complicates +/// usage of config fields for feature flagging, which we commonly do for +/// region-by-region rollouts. +/// The complications mainly arise because the `pageserver.toml` contents on a +/// prod server have a separate lifecycle from the pageserver binary. +/// For instance, `pageserver.toml` contents today are defined in the internal +/// infra repo, and thus introducing a new config field to pageserver and +/// rolling it out to prod servers are separate commits in separate repos +/// that can't be made or rolled back atomically. +/// Rollbacks in particular pose a risk with deny_unknown_fields because +/// the old pageserver binary may reject a new config field, resulting in +/// an outage unless the person doing the pageserver rollback remembers +/// to also revert the commit that added the config field into the +/// `pageserver.toml` templates in the internal infra repo. +/// (A pre-deploy config check would eliminate this risk during rollbacks, +/// cf [here](https://github.com/neondatabase/cloud/issues/24349).) +/// In addition to this compatibility problem during emergency rollbacks, +/// deny_unknown_fields adds further complications when decommissioning a feature +/// flag: with deny_unknown_fields, we can't remove a flag from the [`ConfigToml`] +/// until all prod servers' `pageserver.toml` files have been updated to a version +/// that doesn't specify the flag. Otherwise new software would fail to start up. +/// This adds the requirement for an intermediate step where the new config field +/// is accepted but ignored, prolonging the decommissioning process by an entire +/// release cycle. 
+/// By contrast, with unknown fields silently ignored, decommissioning a feature +/// flag is a one-step process: we can skip the intermediate step and directly +/// remove the field from the [`ConfigToml`]. We leave the field in the +/// `pageserver.toml` files on prod servers until we reach certainty that we +/// will not roll back to old software whose behavior was dependent on config. +/// Then we can remove the field from the templates in the internal infra repo. +/// This process is [documented internally]( +/// https://docs.neon.build/storage/pageserver_configuration.html). +/// +/// Note that the above relaxed compatibility for the config format does NOT APPLY +/// TO THE STORAGE FORMAT. As general guidance, when introducing storage format +/// changes, ensure that the potential rollback target version will be compatible +/// with the new format. This must hold regardless of what flags are set in the `pageserver.toml`: +/// any format version that exists in an environment must be compatible with the software that runs there. +/// Use a pageserver.toml flag only to gate whether software _writes_ the new format. 
+/// For more compatibility considerations, refer to [internal docs]( +/// https://docs.neon.build/storage/compat.html?highlight=compat#format-versions--compatibility) #[serde_as] #[derive(Clone, Debug, serde::Deserialize, serde::Serialize)] -#[serde(default, deny_unknown_fields)] +#[serde(default)] pub struct ConfigToml { // types mapped 1:1 into the runtime PageServerConfig type pub listen_pg_addr: String, @@ -134,10 +179,11 @@ pub struct ConfigToml { pub load_previous_heatmap: Option, #[serde(skip_serializing_if = "Option::is_none")] pub generate_unarchival_heatmap: Option, + pub tracing: Option, + pub enable_tls_page_service_api: bool, } #[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)] -#[serde(deny_unknown_fields)] pub struct DiskUsageEvictionTaskConfig { pub max_usage_pct: utils::serde_percent::Percent, pub min_avail_bytes: u64, @@ -152,17 +198,19 @@ pub struct DiskUsageEvictionTaskConfig { #[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)] #[serde(tag = "mode", rename_all = "kebab-case")] -#[serde(deny_unknown_fields)] pub enum PageServicePipeliningConfig { Serial, Pipelined(PageServicePipeliningConfigPipelined), } #[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)] -#[serde(deny_unknown_fields)] pub struct PageServicePipeliningConfigPipelined { /// Causes runtime errors if larger than max get_vectored batch size. pub max_batch_size: NonZeroUsize, pub execution: PageServiceProtocolPipelinedExecutionStrategy, + // The default below is such that new versions of the software can start + // with the old configuration. 
+ #[serde(default)] + pub batching: PageServiceProtocolPipelinedBatchingStrategy, } #[derive(Debug, Clone, Copy, PartialEq, Eq, serde::Serialize, serde::Deserialize)] @@ -172,9 +220,21 @@ pub enum PageServiceProtocolPipelinedExecutionStrategy { Tasks, } +#[derive(Default, Debug, Clone, Copy, PartialEq, Eq, serde::Serialize, serde::Deserialize)] +#[serde(rename_all = "kebab-case")] +pub enum PageServiceProtocolPipelinedBatchingStrategy { + /// All get page requests in a batch will be at the same LSN + #[default] + UniformLsn, + /// Get page requests in a batch may be at different LSN + /// + /// One key cannot be present more than once at different LSNs in + /// the same batch. + ScatteredLsn, +} + #[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)] #[serde(tag = "mode", rename_all = "kebab-case")] -#[serde(deny_unknown_fields)] pub enum GetVectoredConcurrentIo { /// The read path is fully sequential: layers are visited /// one after the other and IOs are issued and waited upon @@ -191,6 +251,54 @@ pub enum GetVectoredConcurrentIo { SidecarTask, } +#[derive(Debug, Copy, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)] +pub struct Ratio { + pub numerator: usize, + pub denominator: usize, +} + +#[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)] +pub struct OtelExporterConfig { + pub endpoint: String, + pub protocol: OtelExporterProtocol, + #[serde(with = "humantime_serde")] + pub timeout: Duration, +} + +#[derive(Debug, Copy, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)] +#[serde(rename_all = "kebab-case")] +pub enum OtelExporterProtocol { + Grpc, + HttpBinary, + HttpJson, +} + +#[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)] +pub struct Tracing { + pub sampling_ratio: Ratio, + pub export_config: OtelExporterConfig, +} + +impl From<&OtelExporterConfig> for tracing_utils::ExportConfig { + fn from(val: &OtelExporterConfig) -> Self { + tracing_utils::ExportConfig { 
+ endpoint: Some(val.endpoint.clone()), + protocol: val.protocol.into(), + timeout: val.timeout, + } + } +} + +impl From for tracing_utils::Protocol { + fn from(val: OtelExporterProtocol) -> Self { + match val { + OtelExporterProtocol::Grpc => tracing_utils::Protocol::Grpc, + OtelExporterProtocol::HttpJson => tracing_utils::Protocol::HttpJson, + OtelExporterProtocol::HttpBinary => tracing_utils::Protocol::HttpBinary, + } + } +} + pub mod statvfs { pub mod mock { #[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)] @@ -245,7 +353,7 @@ pub struct MaxVectoredReadBytes(pub NonZeroUsize); /// Tenant-level configuration values, used for various purposes. #[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)] -#[serde(deny_unknown_fields, default)] +#[serde(default)] pub struct TenantConfigToml { // Flush out an inmemory layer, if it's holding WAL older than this // This puts a backstop on how much WAL needs to be re-digested if the @@ -361,12 +469,17 @@ pub struct TenantConfigToml { // gc-compaction related configs /// Enable automatic gc-compaction trigger on this tenant. pub gc_compaction_enabled: bool, + /// Enable verification of gc-compaction results. + pub gc_compaction_verification: bool, /// The initial threshold for gc-compaction in KB. Once the total size of layers below the gc-horizon is above this threshold, /// gc-compaction will be triggered. pub gc_compaction_initial_threshold_kb: u64, /// The ratio that triggers the auto gc-compaction. If (the total size of layers between L2 LSN and gc-horizon) / (size below the L2 LSN) /// is above this ratio, gc-compaction will be triggered. pub gc_compaction_ratio_percent: u64, + /// Tenant level performance sampling ratio override. Controls the ratio of get page requests + /// that will get perf sampling for the tenant. 
+ pub sampling_ratio: Option, } pub mod defaults { @@ -519,9 +632,12 @@ impl Default for ConfigToml { page_service_pipelining: if !cfg!(test) { PageServicePipeliningConfig::Serial } else { + // Do not turn this into the default until scattered reads have been + // validated and rolled-out fully. PageServicePipeliningConfig::Pipelined(PageServicePipeliningConfigPipelined { max_batch_size: NonZeroUsize::new(32).unwrap(), execution: PageServiceProtocolPipelinedExecutionStrategy::ConcurrentFutures, + batching: PageServiceProtocolPipelinedBatchingStrategy::ScatteredLsn, }) }, get_vectored_concurrent_io: if !cfg!(test) { @@ -537,6 +653,8 @@ impl Default for ConfigToml { validate_wal_contiguity: None, load_previous_heatmap: None, generate_unarchival_heatmap: None, + tracing: None, + enable_tls_page_service_api: false, } } } @@ -596,6 +714,7 @@ pub mod tenant_conf_defaults { // image layers should be created. pub const DEFAULT_IMAGE_LAYER_CREATION_CHECK_THRESHOLD: u8 = 2; pub const DEFAULT_GC_COMPACTION_ENABLED: bool = false; + pub const DEFAULT_GC_COMPACTION_VERIFICATION: bool = true; pub const DEFAULT_GC_COMPACTION_INITIAL_THRESHOLD_KB: u64 = 5 * 1024 * 1024; // 5GB pub const DEFAULT_GC_COMPACTION_RATIO_PERCENT: u64 = 100; } @@ -650,8 +769,10 @@ impl Default for TenantConfigToml { wal_receiver_protocol_override: None, rel_size_v2_enabled: false, gc_compaction_enabled: DEFAULT_GC_COMPACTION_ENABLED, + gc_compaction_verification: DEFAULT_GC_COMPACTION_VERIFICATION, gc_compaction_initial_threshold_kb: DEFAULT_GC_COMPACTION_INITIAL_THRESHOLD_KB, gc_compaction_ratio_percent: DEFAULT_GC_COMPACTION_RATIO_PERCENT, + sampling_ratio: None, } } } diff --git a/libs/pageserver_api/src/controller_api.rs b/libs/pageserver_api/src/controller_api.rs index 3cb62f9d18..91f9c03ba4 100644 --- a/libs/pageserver_api/src/controller_api.rs +++ b/libs/pageserver_api/src/controller_api.rs @@ -7,7 +7,8 @@ use std::time::{Duration, Instant}; /// API (`/control/v1` prefix). 
Implemented by the server /// in [`storage_controller::http`] use serde::{Deserialize, Serialize}; -use utils::id::{NodeId, TenantId}; +use utils::id::{NodeId, TenantId, TimelineId}; +use utils::lsn::Lsn; use crate::models::{PageserverUtilization, ShardParameters, TenantConfig}; use crate::shard::{ShardStripeSize, TenantShardId}; @@ -499,6 +500,15 @@ pub struct SafekeeperSchedulingPolicyRequest { pub scheduling_policy: SkSchedulingPolicy, } +/// Import request for safekeeper timelines. +#[derive(Serialize, Deserialize, Clone)] +pub struct TimelineImportRequest { + pub tenant_id: TenantId, + pub timeline_id: TimelineId, + pub start_lsn: Lsn, + pub sk_set: Vec, +} + #[cfg(test)] mod test { use serde_json; diff --git a/libs/pageserver_api/src/key.rs b/libs/pageserver_api/src/key.rs index 8836e7ec87..0c4d7fd4cb 100644 --- a/libs/pageserver_api/src/key.rs +++ b/libs/pageserver_api/src/key.rs @@ -927,7 +927,7 @@ impl Key { /// Guaranteed to return `Ok()` if [`Self::is_rel_block_key`] returns `true` for `key`. 
#[inline(always)] - pub fn to_rel_block(self) -> anyhow::Result<(RelTag, BlockNumber)> { + pub fn to_rel_block(self) -> Result<(RelTag, BlockNumber), ToRelBlockError> { Ok(match self.field1 { 0x00 => ( RelTag { @@ -938,7 +938,7 @@ impl Key { }, self.field6, ), - _ => anyhow::bail!("unexpected value kind 0x{:02x}", self.field1), + _ => return Err(ToRelBlockError(self.field1)), }) } } @@ -951,6 +951,17 @@ impl std::str::FromStr for Key { } } +#[derive(Debug)] +pub struct ToRelBlockError(u8); + +impl fmt::Display for ToRelBlockError { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!(f, "unexpected value kind 0x{:02x}", self.0) + } +} + +impl std::error::Error for ToRelBlockError {} + #[cfg(test)] mod tests { use std::str::FromStr; diff --git a/libs/pageserver_api/src/keyspace.rs b/libs/pageserver_api/src/keyspace.rs index e505f23e49..79e3ef553b 100644 --- a/libs/pageserver_api/src/keyspace.rs +++ b/libs/pageserver_api/src/keyspace.rs @@ -613,8 +613,7 @@ mod tests { use rand::{RngCore, SeedableRng}; use super::*; - use crate::models::ShardParameters; - use crate::shard::{ShardCount, ShardNumber}; + use crate::shard::{DEFAULT_STRIPE_SIZE, ShardCount, ShardNumber, ShardStripeSize}; // Helper function to create a key range. 
// @@ -964,12 +963,8 @@ mod tests { } #[test] fn sharded_range_relation_gap() { - let shard_identity = ShardIdentity::new( - ShardNumber(0), - ShardCount::new(4), - ShardParameters::DEFAULT_STRIPE_SIZE, - ) - .unwrap(); + let shard_identity = + ShardIdentity::new(ShardNumber(0), ShardCount::new(4), DEFAULT_STRIPE_SIZE).unwrap(); let range = ShardedRange::new( Range { @@ -985,12 +980,8 @@ mod tests { #[test] fn shard_identity_keyspaces_single_key() { - let shard_identity = ShardIdentity::new( - ShardNumber(1), - ShardCount::new(4), - ShardParameters::DEFAULT_STRIPE_SIZE, - ) - .unwrap(); + let shard_identity = + ShardIdentity::new(ShardNumber(1), ShardCount::new(4), DEFAULT_STRIPE_SIZE).unwrap(); let range = ShardedRange::new( Range { @@ -1034,12 +1025,8 @@ mod tests { #[test] fn shard_identity_keyspaces_forkno_gap() { - let shard_identity = ShardIdentity::new( - ShardNumber(1), - ShardCount::new(4), - ShardParameters::DEFAULT_STRIPE_SIZE, - ) - .unwrap(); + let shard_identity = + ShardIdentity::new(ShardNumber(1), ShardCount::new(4), DEFAULT_STRIPE_SIZE).unwrap(); let range = ShardedRange::new( Range { @@ -1061,7 +1048,7 @@ mod tests { let shard_identity = ShardIdentity::new( ShardNumber(shard_number), ShardCount::new(4), - ShardParameters::DEFAULT_STRIPE_SIZE, + DEFAULT_STRIPE_SIZE, ) .unwrap(); @@ -1144,37 +1131,44 @@ mod tests { /// for a single tenant. 
#[test] fn sharded_range_fragment_simple() { + const SHARD_COUNT: u8 = 4; + const STRIPE_SIZE: u32 = DEFAULT_STRIPE_SIZE.0; + let shard_identity = ShardIdentity::new( ShardNumber(0), - ShardCount::new(4), - ShardParameters::DEFAULT_STRIPE_SIZE, + ShardCount::new(SHARD_COUNT), + ShardStripeSize(STRIPE_SIZE), ) .unwrap(); // A range which we happen to know covers exactly one stripe which belongs to this shard let input_start = Key::from_hex("000000067f00000001000000ae0000000000").unwrap(); - let input_end = Key::from_hex("000000067f00000001000000ae0000008000").unwrap(); + let mut input_end = input_start; + input_end.field6 += STRIPE_SIZE; // field6 is block number // Ask for stripe_size blocks, we get the whole stripe assert_eq!( - do_fragment(input_start, input_end, &shard_identity, 32768), - (32768, vec![(32768, input_start..input_end)]) + do_fragment(input_start, input_end, &shard_identity, STRIPE_SIZE), + (STRIPE_SIZE, vec![(STRIPE_SIZE, input_start..input_end)]) ); // Ask for more, we still get the whole stripe assert_eq!( - do_fragment(input_start, input_end, &shard_identity, 10000000), - (32768, vec![(32768, input_start..input_end)]) + do_fragment(input_start, input_end, &shard_identity, 10 * STRIPE_SIZE), + (STRIPE_SIZE, vec![(STRIPE_SIZE, input_start..input_end)]) ); // Ask for target_nblocks of half the stripe size, we get two halves assert_eq!( - do_fragment(input_start, input_end, &shard_identity, 16384), + do_fragment(input_start, input_end, &shard_identity, STRIPE_SIZE / 2), ( - 32768, + STRIPE_SIZE, vec![ - (16384, input_start..input_start.add(16384)), - (16384, input_start.add(16384)..input_end) + ( + STRIPE_SIZE / 2, + input_start..input_start.add(STRIPE_SIZE / 2) + ), + (STRIPE_SIZE / 2, input_start.add(STRIPE_SIZE / 2)..input_end) ] ) ); @@ -1182,40 +1176,53 @@ mod tests { #[test] fn sharded_range_fragment_multi_stripe() { + const SHARD_COUNT: u8 = 4; + const STRIPE_SIZE: u32 = DEFAULT_STRIPE_SIZE.0; + const RANGE_SIZE: u32 = SHARD_COUNT as u32 * 
STRIPE_SIZE; + let shard_identity = ShardIdentity::new( ShardNumber(0), - ShardCount::new(4), - ShardParameters::DEFAULT_STRIPE_SIZE, + ShardCount::new(SHARD_COUNT), + ShardStripeSize(STRIPE_SIZE), ) .unwrap(); // A range which covers multiple stripes, exactly one of which belongs to the current shard. let input_start = Key::from_hex("000000067f00000001000000ae0000000000").unwrap(); - let input_end = Key::from_hex("000000067f00000001000000ae0000020000").unwrap(); + let mut input_end = input_start; + input_end.field6 += RANGE_SIZE; // field6 is block number + // Ask for all the blocks, get a fragment that covers the whole range but reports // its size to be just the blocks belonging to our shard. assert_eq!( - do_fragment(input_start, input_end, &shard_identity, 131072), - (32768, vec![(32768, input_start..input_end)]) + do_fragment(input_start, input_end, &shard_identity, RANGE_SIZE), + (STRIPE_SIZE, vec![(STRIPE_SIZE, input_start..input_end)]) ); - // Ask for a sub-stripe quantity + // Ask for a sub-stripe quantity that results in 3 fragments. 
+ let limit = STRIPE_SIZE / 3 + 1; assert_eq!( - do_fragment(input_start, input_end, &shard_identity, 16000), + do_fragment(input_start, input_end, &shard_identity, limit), ( - 32768, + STRIPE_SIZE, vec![ - (16000, input_start..input_start.add(16000)), - (16000, input_start.add(16000)..input_start.add(32000)), - (768, input_start.add(32000)..input_end), + (limit, input_start..input_start.add(limit)), + (limit, input_start.add(limit)..input_start.add(2 * limit)), + ( + STRIPE_SIZE - 2 * limit, + input_start.add(2 * limit)..input_end + ), ] ) ); // Try on a range that starts slightly after our owned stripe assert_eq!( - do_fragment(input_start.add(1), input_end, &shard_identity, 131072), - (32767, vec![(32767, input_start.add(1)..input_end)]) + do_fragment(input_start.add(1), input_end, &shard_identity, RANGE_SIZE), + ( + STRIPE_SIZE - 1, + vec![(STRIPE_SIZE - 1, input_start.add(1)..input_end)] + ) ); } @@ -1223,32 +1230,40 @@ mod tests { /// a previous relation. #[test] fn sharded_range_fragment_starting_from_logical_size() { + const SHARD_COUNT: u8 = 4; + const STRIPE_SIZE: u32 = DEFAULT_STRIPE_SIZE.0; + const RANGE_SIZE: u32 = SHARD_COUNT as u32 * STRIPE_SIZE; + let input_start = Key::from_hex("000000067f00000001000000ae00ffffffff").unwrap(); - let input_end = Key::from_hex("000000067f00000001000000ae0100008000").unwrap(); + let mut input_end = Key::from_hex("000000067f00000001000000ae0100000000").unwrap(); + input_end.field6 += RANGE_SIZE; // field6 is block number // Shard 0 owns the first stripe in the relation, and the preceding logical size is shard local too let shard_identity = ShardIdentity::new( ShardNumber(0), - ShardCount::new(4), - ShardParameters::DEFAULT_STRIPE_SIZE, + ShardCount::new(SHARD_COUNT), + ShardStripeSize(STRIPE_SIZE), ) .unwrap(); assert_eq!( - do_fragment(input_start, input_end, &shard_identity, 0x10000), - (0x8001, vec![(0x8001, input_start..input_end)]) + do_fragment(input_start, input_end, &shard_identity, 2 * STRIPE_SIZE), + ( + 
STRIPE_SIZE + 1, + vec![(STRIPE_SIZE + 1, input_start..input_end)] + ) ); // Shard 1 does not own the first stripe in the relation, but it does own the logical size (all shards // store all logical sizes) let shard_identity = ShardIdentity::new( ShardNumber(1), - ShardCount::new(4), - ShardParameters::DEFAULT_STRIPE_SIZE, + ShardCount::new(SHARD_COUNT), + ShardStripeSize(STRIPE_SIZE), ) .unwrap(); assert_eq!( - do_fragment(input_start, input_end, &shard_identity, 0x10000), - (0x1, vec![(0x1, input_start..input_end)]) + do_fragment(input_start, input_end, &shard_identity, 2 * STRIPE_SIZE), + (1, vec![(1, input_start..input_end)]) ); } @@ -1284,12 +1299,8 @@ mod tests { ); // Same, but using a sharded identity - let shard_identity = ShardIdentity::new( - ShardNumber(0), - ShardCount::new(4), - ShardParameters::DEFAULT_STRIPE_SIZE, - ) - .unwrap(); + let shard_identity = + ShardIdentity::new(ShardNumber(0), ShardCount::new(4), DEFAULT_STRIPE_SIZE).unwrap(); assert_eq!( do_fragment(input_start, input_end, &shard_identity, 0x8000), (u32::MAX, vec![(u32::MAX, input_start..input_end),]) @@ -1331,7 +1342,7 @@ mod tests { ShardIdentity::new( ShardNumber((prng.next_u32() % shard_count) as u8), ShardCount::new(shard_count as u8), - ShardParameters::DEFAULT_STRIPE_SIZE, + DEFAULT_STRIPE_SIZE, ) .unwrap() }; diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs index 0a7c9717ca..f491ed10e1 100644 --- a/libs/pageserver_api/src/models.rs +++ b/libs/pageserver_api/src/models.rs @@ -23,9 +23,10 @@ use utils::lsn::Lsn; use utils::postgres_client::PostgresClientProtocol; use utils::{completion, serde_system_time}; +use crate::config::Ratio; use crate::key::{CompactKey, Key}; use crate::reltag::RelTag; -use crate::shard::{ShardCount, ShardStripeSize, TenantShardId}; +use crate::shard::{DEFAULT_STRIPE_SIZE, ShardCount, ShardStripeSize, TenantShardId}; /// The state of a tenant in this pageserver. 
/// @@ -79,10 +80,22 @@ pub enum TenantState { /// /// Transitions out of this state are possible through `set_broken()`. Stopping { + /// The barrier can be used to wait for shutdown to complete. The first caller to set + /// Some(Barrier) is responsible for driving shutdown to completion. Subsequent callers + /// will wait for the first caller's existing barrier. + /// + /// None is set when an attach is cancelled, to signal to shutdown that the attach has in + /// fact cancelled: + /// + /// 1. `shutdown` sees `TenantState::Attaching`, and cancels the tenant. + /// 2. `attach` sets `TenantState::Stopping(None)` and exits. + /// 3. `set_stopping` waits for `TenantState::Stopping(None)` and sets + /// `TenantState::Stopping(Some)` to claim the barrier as the shutdown owner. + // // Because of https://github.com/serde-rs/serde/issues/2105 this has to be a named field, // otherwise it will not be skipped during deserialization #[serde(skip)] - progress: completion::Barrier, + progress: Option, }, /// The tenant is recognized by the pageserver, but can no longer be used for /// any operations. 
@@ -425,8 +438,6 @@ pub struct ShardParameters { } impl ShardParameters { - pub const DEFAULT_STRIPE_SIZE: ShardStripeSize = ShardStripeSize(256 * 1024 / 8); - pub fn is_unsharded(&self) -> bool { self.count.is_unsharded() } @@ -436,7 +447,7 @@ impl Default for ShardParameters { fn default() -> Self { Self { count: ShardCount::new(0), - stripe_size: Self::DEFAULT_STRIPE_SIZE, + stripe_size: DEFAULT_STRIPE_SIZE, } } } @@ -565,9 +576,13 @@ pub struct TenantConfigPatch { #[serde(skip_serializing_if = "FieldPatch::is_noop")] pub gc_compaction_enabled: FieldPatch, #[serde(skip_serializing_if = "FieldPatch::is_noop")] + pub gc_compaction_verification: FieldPatch, + #[serde(skip_serializing_if = "FieldPatch::is_noop")] pub gc_compaction_initial_threshold_kb: FieldPatch, #[serde(skip_serializing_if = "FieldPatch::is_noop")] pub gc_compaction_ratio_percent: FieldPatch, + #[serde(skip_serializing_if = "FieldPatch::is_noop")] + pub sampling_ratio: FieldPatch>, } /// Like [`crate::config::TenantConfigToml`], but preserves the information @@ -683,11 +698,17 @@ pub struct TenantConfig { #[serde(skip_serializing_if = "Option::is_none")] pub gc_compaction_enabled: Option, + #[serde(skip_serializing_if = "Option::is_none")] + pub gc_compaction_verification: Option, + #[serde(skip_serializing_if = "Option::is_none")] pub gc_compaction_initial_threshold_kb: Option, #[serde(skip_serializing_if = "Option::is_none")] pub gc_compaction_ratio_percent: Option, + + #[serde(skip_serializing_if = "Option::is_none")] + pub sampling_ratio: Option>, } impl TenantConfig { @@ -728,8 +749,10 @@ impl TenantConfig { mut wal_receiver_protocol_override, mut rel_size_v2_enabled, mut gc_compaction_enabled, + mut gc_compaction_verification, mut gc_compaction_initial_threshold_kb, mut gc_compaction_ratio_percent, + mut sampling_ratio, } = self; patch.checkpoint_distance.apply(&mut checkpoint_distance); @@ -818,12 +841,16 @@ impl TenantConfig { patch .gc_compaction_enabled .apply(&mut 
gc_compaction_enabled); + patch + .gc_compaction_verification + .apply(&mut gc_compaction_verification); patch .gc_compaction_initial_threshold_kb .apply(&mut gc_compaction_initial_threshold_kb); patch .gc_compaction_ratio_percent .apply(&mut gc_compaction_ratio_percent); + patch.sampling_ratio.apply(&mut sampling_ratio); Ok(Self { checkpoint_distance, @@ -858,8 +885,10 @@ impl TenantConfig { wal_receiver_protocol_override, rel_size_v2_enabled, gc_compaction_enabled, + gc_compaction_verification, gc_compaction_initial_threshold_kb, gc_compaction_ratio_percent, + sampling_ratio, }) } @@ -955,12 +984,16 @@ impl TenantConfig { gc_compaction_enabled: self .gc_compaction_enabled .unwrap_or(global_conf.gc_compaction_enabled), + gc_compaction_verification: self + .gc_compaction_verification + .unwrap_or(global_conf.gc_compaction_verification), gc_compaction_initial_threshold_kb: self .gc_compaction_initial_threshold_kb .unwrap_or(global_conf.gc_compaction_initial_threshold_kb), gc_compaction_ratio_percent: self .gc_compaction_ratio_percent .unwrap_or(global_conf.gc_compaction_ratio_percent), + sampling_ratio: self.sampling_ratio.unwrap_or(global_conf.sampling_ratio), } } } @@ -1094,7 +1127,7 @@ pub struct CompactionAlgorithmSettings { } #[derive(Debug, PartialEq, Eq, Clone, Deserialize, Serialize)] -#[serde(tag = "mode", rename_all = "kebab-case", deny_unknown_fields)] +#[serde(tag = "mode", rename_all = "kebab-case")] pub enum L0FlushConfig { #[serde(rename_all = "snake_case")] Direct { max_concurrency: NonZeroUsize }, @@ -1418,11 +1451,6 @@ pub struct TimelineInfo { pub last_record_lsn: Lsn, pub prev_record_lsn: Option, - /// Legacy field, retained for one version to enable old storage controller to - /// decode (it was a mandatory field). - #[serde(default, rename = "latest_gc_cutoff_lsn")] - pub _unused: Lsn, - /// The LSN up to which GC has advanced: older data may still exist but it is not available for clients. 
/// This LSN is not suitable for deciding where to create branches etc: use [`TimelineInfo::min_readable_lsn`] instead, /// as it is easier to reason about. @@ -1663,6 +1691,7 @@ pub struct SecondaryProgress { pub struct TenantScanRemoteStorageShard { pub tenant_shard_id: TenantShardId, pub generation: Option, + pub stripe_size: Option, } #[derive(Serialize, Deserialize, Debug, Default)] @@ -2714,10 +2743,15 @@ mod tests { "Activating", ), (line!(), TenantState::Active, "Active"), + ( + line!(), + TenantState::Stopping { progress: None }, + "Stopping", + ), ( line!(), TenantState::Stopping { - progress: utils::completion::Barrier::default(), + progress: Some(completion::Barrier::default()), }, "Stopping", ), diff --git a/libs/pageserver_api/src/record.rs b/libs/pageserver_api/src/record.rs index fda504a26e..73516c5220 100644 --- a/libs/pageserver_api/src/record.rs +++ b/libs/pageserver_api/src/record.rs @@ -58,6 +58,8 @@ pub enum NeonWalRecord { /// to true. This record does not need the history WALs to reconstruct. See [`NeonWalRecord::will_init`] and /// its references in `timeline.rs`. will_init: bool, + /// Only append the record if the current image is the same as the one specified in this field. 
+ only_if: Option, }, } @@ -81,6 +83,17 @@ impl NeonWalRecord { append: s.as_ref().to_string(), clear: false, will_init: false, + only_if: None, + } + } + + #[cfg(feature = "testing")] + pub fn wal_append_conditional(s: impl AsRef, only_if: impl AsRef) -> Self { + Self::Test { + append: s.as_ref().to_string(), + clear: false, + will_init: false, + only_if: Some(only_if.as_ref().to_string()), } } @@ -90,6 +103,7 @@ impl NeonWalRecord { append: s.as_ref().to_string(), clear: true, will_init: false, + only_if: None, } } @@ -99,6 +113,7 @@ impl NeonWalRecord { append: s.as_ref().to_string(), clear: true, will_init: true, + only_if: None, } } } diff --git a/libs/pageserver_api/src/shard.rs b/libs/pageserver_api/src/shard.rs index 8386d6e586..feb59f5070 100644 --- a/libs/pageserver_api/src/shard.rs +++ b/libs/pageserver_api/src/shard.rs @@ -78,6 +78,12 @@ impl Default for ShardStripeSize { } } +impl std::fmt::Display for ShardStripeSize { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + self.0.fmt(f) + } +} + /// Layout version: for future upgrades where we might change how the key->shard mapping works #[derive(Clone, Copy, Serialize, Deserialize, Eq, PartialEq, Hash, Debug)] pub struct ShardLayout(u8); @@ -86,8 +92,11 @@ const LAYOUT_V1: ShardLayout = ShardLayout(1); /// ShardIdentity uses a magic layout value to indicate if it is unusable const LAYOUT_BROKEN: ShardLayout = ShardLayout(255); -/// Default stripe size in pages: 256MiB divided by 8kiB page size. -const DEFAULT_STRIPE_SIZE: ShardStripeSize = ShardStripeSize(256 * 1024 / 8); +/// The default stripe size in pages. 16 MiB divided by 8 kiB page size. +/// +/// A lower stripe size distributes ingest load better across shards, but reduces IO amortization. +/// 16 MiB appears to be a reasonable balance: . 
+pub const DEFAULT_STRIPE_SIZE: ShardStripeSize = ShardStripeSize(16 * 1024 / 8); #[derive(thiserror::Error, Debug, PartialEq, Eq)] pub enum ShardConfigError { @@ -537,7 +546,7 @@ mod tests { field6: 0x7d06, }; - let shard = key_to_shard_number(ShardCount(10), DEFAULT_STRIPE_SIZE, &key); + let shard = key_to_shard_number(ShardCount(10), ShardStripeSize(32768), &key); assert_eq!(shard, ShardNumber(8)); } diff --git a/libs/postgres_backend/src/lib.rs b/libs/postgres_backend/src/lib.rs index a0a891f0dc..654dde8da6 100644 --- a/libs/postgres_backend/src/lib.rs +++ b/libs/postgres_backend/src/lib.rs @@ -5,7 +5,6 @@ #![deny(unsafe_code)] #![deny(clippy::undocumented_unsafe_blocks)] use std::future::Future; -use std::io::ErrorKind; use std::net::SocketAddr; use std::os::fd::{AsRawFd, RawFd}; use std::pin::Pin; @@ -227,7 +226,7 @@ impl MaybeWriteOnly { match self { MaybeWriteOnly::Full(framed) => framed.read_startup_message().await, MaybeWriteOnly::WriteOnly(_) => { - Err(io::Error::new(ErrorKind::Other, "reading from write only half").into()) + Err(io::Error::other("reading from write only half").into()) } MaybeWriteOnly::Broken => panic!("IO on invalid MaybeWriteOnly"), } @@ -237,7 +236,7 @@ impl MaybeWriteOnly { match self { MaybeWriteOnly::Full(framed) => framed.read_message().await, MaybeWriteOnly::WriteOnly(_) => { - Err(io::Error::new(ErrorKind::Other, "reading from write only half").into()) + Err(io::Error::other("reading from write only half").into()) } MaybeWriteOnly::Broken => panic!("IO on invalid MaybeWriteOnly"), } @@ -975,7 +974,7 @@ impl AsyncWrite for CopyDataWriter<'_, IO> { .write_message_noflush(&BeMessage::CopyData(buf)) // write_message only writes to the buffer, so it can fail iff the // message is invaid, but CopyData can't be invalid. 
- .map_err(|_| io::Error::new(ErrorKind::Other, "failed to serialize CopyData"))?; + .map_err(|_| io::Error::other("failed to serialize CopyData"))?; Poll::Ready(Ok(buf.len())) } diff --git a/libs/postgres_backend/tests/simple_select.rs b/libs/postgres_backend/tests/simple_select.rs index 907ef9eed3..75ca123014 100644 --- a/libs/postgres_backend/tests/simple_select.rs +++ b/libs/postgres_backend/tests/simple_select.rs @@ -85,8 +85,8 @@ static KEY: Lazy> = Lazy::new(|| { static CERT: Lazy> = Lazy::new(|| { let mut cursor = Cursor::new(include_bytes!("cert.pem")); - let cert = rustls_pemfile::certs(&mut cursor).next().unwrap().unwrap(); - cert + + rustls_pemfile::certs(&mut cursor).next().unwrap().unwrap() }); // test that basic select with ssl works diff --git a/libs/pq_proto/src/framed.rs b/libs/pq_proto/src/framed.rs index 8e216d0f44..4e5e48ecf5 100644 --- a/libs/pq_proto/src/framed.rs +++ b/libs/pq_proto/src/framed.rs @@ -35,7 +35,7 @@ impl ConnectionError { pub fn into_io_error(self) -> io::Error { match self { ConnectionError::Io(io) => io, - ConnectionError::Protocol(pe) => io::Error::new(io::ErrorKind::Other, pe.to_string()), + ConnectionError::Protocol(pe) => io::Error::other(pe.to_string()), } } } diff --git a/libs/pq_proto/src/lib.rs b/libs/pq_proto/src/lib.rs index e435ffbf7e..e7afc64564 100644 --- a/libs/pq_proto/src/lib.rs +++ b/libs/pq_proto/src/lib.rs @@ -257,7 +257,7 @@ pub enum ProtocolError { impl ProtocolError { /// Proxy stream.rs uses only io::Error; provide it. 
pub fn into_io_error(self) -> io::Error { - io::Error::new(io::ErrorKind::Other, self.to_string()) + io::Error::other(self.to_string()) } } diff --git a/libs/proxy/postgres-protocol2/src/authentication/sasl.rs b/libs/proxy/postgres-protocol2/src/authentication/sasl.rs index 27e05e24ec..2daf9a80d4 100644 --- a/libs/proxy/postgres-protocol2/src/authentication/sasl.rs +++ b/libs/proxy/postgres-protocol2/src/authentication/sasl.rs @@ -212,7 +212,7 @@ impl ScramSha256 { password, channel_binding, } => (nonce, password, channel_binding), - _ => return Err(io::Error::new(io::ErrorKind::Other, "invalid SCRAM state")), + _ => return Err(io::Error::other("invalid SCRAM state")), }; let message = @@ -291,7 +291,7 @@ impl ScramSha256 { server_key, auth_message, } => (server_key, auth_message), - _ => return Err(io::Error::new(io::ErrorKind::Other, "invalid SCRAM state")), + _ => return Err(io::Error::other("invalid SCRAM state")), }; let message = @@ -301,10 +301,7 @@ impl ScramSha256 { let verifier = match parsed { ServerFinalMessage::Error(e) => { - return Err(io::Error::new( - io::ErrorKind::Other, - format!("SCRAM error: {}", e), - )); + return Err(io::Error::other(format!("SCRAM error: {}", e))); } ServerFinalMessage::Verifier(verifier) => verifier, }; diff --git a/libs/remote_storage/Cargo.toml b/libs/remote_storage/Cargo.toml index 7bdf340f74..bd18d80915 100644 --- a/libs/remote_storage/Cargo.toml +++ b/libs/remote_storage/Cargo.toml @@ -28,7 +28,7 @@ toml_edit.workspace = true tracing.workspace = true scopeguard.workspace = true metrics.workspace = true -utils.workspace = true +utils = { path = "../utils", default-features = false } pin-project-lite.workspace = true azure_core.workspace = true diff --git a/libs/remote_storage/src/azure_blob.rs b/libs/remote_storage/src/azure_blob.rs index dee61a410d..18146c5464 100644 --- a/libs/remote_storage/src/azure_blob.rs +++ b/libs/remote_storage/src/azure_blob.rs @@ -801,8 +801,7 @@ where // that support needs to be hacked in. 
// // including {self:?} into the message would be useful, but unsure how to unproject. - _ => std::task::Poll::Ready(Err(std::io::Error::new( - std::io::ErrorKind::Other, + _ => std::task::Poll::Ready(Err(std::io::Error::other( "cloned or initial values cannot be read", ))), } @@ -855,7 +854,7 @@ where }; Err(azure_core::error::Error::new( azure_core::error::ErrorKind::Io, - std::io::Error::new(std::io::ErrorKind::Other, msg), + std::io::Error::other(msg), )) } diff --git a/libs/remote_storage/tests/test_real_s3.rs b/libs/remote_storage/tests/test_real_s3.rs index 6996bb27ae..d38e13fd05 100644 --- a/libs/remote_storage/tests/test_real_s3.rs +++ b/libs/remote_storage/tests/test_real_s3.rs @@ -558,7 +558,7 @@ async fn upload_large_enough_file( ) -> usize { let header = bytes::Bytes::from_static("remote blob data content".as_bytes()); let body = bytes::Bytes::from(vec![0u8; 1024]); - let contents = std::iter::once(header).chain(std::iter::repeat(body).take(128)); + let contents = std::iter::once(header).chain(std::iter::repeat_n(body, 128)); let len = contents.clone().fold(0, |acc, next| acc + next.len()); diff --git a/libs/safekeeper_api/src/models.rs b/libs/safekeeper_api/src/models.rs index 33ff636a79..51f88625da 100644 --- a/libs/safekeeper_api/src/models.rs +++ b/libs/safekeeper_api/src/models.rs @@ -71,6 +71,7 @@ pub struct PeerInfo { pub ts: Instant, pub pg_connstr: String, pub http_connstr: String, + pub https_connstr: Option, } pub type FullTransactionId = u64; @@ -227,6 +228,8 @@ pub struct TimelineDeleteResult { pub dir_existed: bool, } +pub type TenantDeleteResult = std::collections::HashMap; + fn lsn_invalid() -> Lsn { Lsn::INVALID } @@ -259,6 +262,8 @@ pub struct SkTimelineInfo { pub safekeeper_connstr: Option, #[serde(default)] pub http_connstr: Option, + #[serde(default)] + pub https_connstr: Option, // Minimum of all active RO replicas flush LSN #[serde(default = "lsn_invalid")] pub standby_horizon: Lsn, diff --git a/libs/tracing-utils/Cargo.toml 
b/libs/tracing-utils/Cargo.toml index 60637d5b24..49a6055b1e 100644 --- a/libs/tracing-utils/Cargo.toml +++ b/libs/tracing-utils/Cargo.toml @@ -14,6 +14,7 @@ tokio = { workspace = true, features = ["rt", "rt-multi-thread"] } tracing.workspace = true tracing-opentelemetry.workspace = true tracing-subscriber.workspace = true +pin-project-lite.workspace = true [dev-dependencies] tracing-subscriber.workspace = true # For examples in docs diff --git a/libs/tracing-utils/src/lib.rs b/libs/tracing-utils/src/lib.rs index 74992a7d03..0893aa173b 100644 --- a/libs/tracing-utils/src/lib.rs +++ b/libs/tracing-utils/src/lib.rs @@ -31,10 +31,10 @@ //! .init(); //! } //! ``` -#![deny(unsafe_code)] #![deny(clippy::undocumented_unsafe_blocks)] pub mod http; +pub mod perf_span; use opentelemetry::KeyValue; use opentelemetry::trace::TracerProvider; diff --git a/libs/tracing-utils/src/perf_span.rs b/libs/tracing-utils/src/perf_span.rs new file mode 100644 index 0000000000..16f713c67e --- /dev/null +++ b/libs/tracing-utils/src/perf_span.rs @@ -0,0 +1,144 @@ +//! Crutch module to work around tracing infrastructure deficiencies +//! +//! We wish to collect granular request spans without impacting performance +//! by much. Ideally, we should have zero overhead for a sampling rate of 0. +//! +//! The approach taken by the pageserver crate is to use a completely different +//! span hierarchy for the performance spans. Spans are explicitly stored in +//! the request context and use a different [`tracing::Subscriber`] in order +//! to avoid expensive filtering. +//! +//! [`tracing::Span`] instances record their [`tracing::Dispatch`] and, implicitly, +//! their [`tracing::Subscriber`] at creation time. However, upon exiting the span, +//! the global default [`tracing::Dispatch`] is used. This is problematic if one +//! wishes to juggle different subscribers. +//! +//! In order to work around this, this module provides a [`PerfSpan`] type which +//!
wraps a [`Span`] and sets the default subscriber when exiting the span. This +//! achieves the correct routing. +//! +//! There's also a modified version of [`tracing::Instrument`] which works with +//! [`PerfSpan`]. + +use core::{ + future::Future, + marker::Sized, + mem::ManuallyDrop, + pin::Pin, + task::{Context, Poll}, +}; +use pin_project_lite::pin_project; +use tracing::{Dispatch, span::Span}; + +#[derive(Debug, Clone)] +pub struct PerfSpan { + inner: ManuallyDrop, + dispatch: Dispatch, +} + +#[must_use = "once a span has been entered, it should be exited"] +pub struct PerfSpanEntered<'a> { + span: &'a PerfSpan, +} + +impl PerfSpan { + pub fn new(span: Span, dispatch: Dispatch) -> Self { + Self { + inner: ManuallyDrop::new(span), + dispatch, + } + } + + pub fn enter(&self) -> PerfSpanEntered { + if let Some(ref id) = self.inner.id() { + self.dispatch.enter(id); + } + + PerfSpanEntered { span: self } + } + + pub fn inner(&self) -> &Span { + &self.inner + } +} + +impl Drop for PerfSpan { + fn drop(&mut self) { + // Bring the desired dispatch into scope before explicitly calling + // the span destructor. This routes the span exit to the correct + // [`tracing::Subscriber`]. + let _dispatch_guard = tracing::dispatcher::set_default(&self.dispatch); + // SAFETY: ManuallyDrop in Drop implementation + unsafe { ManuallyDrop::drop(&mut self.inner) } + } +} + +impl Drop for PerfSpanEntered<'_> { + fn drop(&mut self) { + assert!(self.span.inner.id().is_some()); + + let _dispatch_guard = tracing::dispatcher::set_default(&self.span.dispatch); + self.span.dispatch.exit(&self.span.inner.id().unwrap()); + } +} + +pub trait PerfInstrument: Sized { + fn instrument(self, span: PerfSpan) -> PerfInstrumented { + PerfInstrumented { + inner: ManuallyDrop::new(self), + span, + } + } +} + +pin_project! 
{ + #[project = PerfInstrumentedProj] + #[derive(Debug, Clone)] + #[must_use = "futures do nothing unless you `.await` or poll them"] + pub struct PerfInstrumented { + // `ManuallyDrop` is used here to instrument `Drop` by entering + // `PerfSpan` and executing `ManuallyDrop::drop`. + #[pin] + inner: ManuallyDrop, + span: PerfSpan, + } + + impl PinnedDrop for PerfInstrumented { + fn drop(this: Pin<&mut Self>) { + let this = this.project(); + let _enter = this.span.enter(); + // SAFETY: 1. `Pin::get_unchecked_mut()` is safe, because this isn't + // different from wrapping `T` in `Option` and calling + // `Pin::set(&mut this.inner, None)`, except avoiding + // additional memory overhead. + // 2. `ManuallyDrop::drop()` is safe, because + // `PinnedDrop::drop()` is guaranteed to be called only + // once. + unsafe { ManuallyDrop::drop(this.inner.get_unchecked_mut()) } + } + } +} + +impl<'a, T> PerfInstrumentedProj<'a, T> { + /// Get a mutable reference to the [`PerfSpan`] and a pinned mutable reference to + /// the wrapped type. + fn span_and_inner_pin_mut(self) -> (&'a mut PerfSpan, Pin<&'a mut T>) { + // SAFETY: As long as `ManuallyDrop` does not move, `T` won't move + // and `inner` is valid, because `ManuallyDrop::drop` is called + // only inside `Drop` of the `PerfInstrumented`.
+ let inner = unsafe { self.inner.map_unchecked_mut(|v| &mut **v) }; + (self.span, inner) + } +} + +impl Future for PerfInstrumented { + type Output = T::Output; + + fn poll(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll { + let (span, inner) = self.project().span_and_inner_pin_mut(); + let _enter = span.enter(); + inner.poll(cx) + } +} + +impl PerfInstrument for T {} diff --git a/libs/utils/Cargo.toml b/libs/utils/Cargo.toml index 4180602ac7..fd2fa63fd0 100644 --- a/libs/utils/Cargo.toml +++ b/libs/utils/Cargo.toml @@ -5,7 +5,8 @@ edition.workspace = true license.workspace = true [features] -default = [] +default = ["rename_noreplace"] +rename_noreplace = [] # Enables test-only APIs, incuding failpoints. In particular, enables the `fail_point!` macro, # which adds some runtime cost to run tests on outage conditions testing = ["fail/failpoints"] @@ -35,7 +36,7 @@ serde_with.workspace = true serde_json.workspace = true signal-hook.workspace = true thiserror.workspace = true -tokio.workspace = true +tokio = { workspace = true, features = ["signal"] } tokio-tar.workspace = true tokio-util.workspace = true toml_edit = { workspace = true, features = ["serde"] } diff --git a/libs/utils/src/auth.rs b/libs/utils/src/auth.rs index cc5b0b1d13..db4fc5685c 100644 --- a/libs/utils/src/auth.rs +++ b/libs/utils/src/auth.rs @@ -173,7 +173,7 @@ impl std::fmt::Debug for JwtAuth { } // this function is used only for testing purposes in CLI e g generate tokens during init -pub fn encode_from_key_file(claims: &Claims, key_data: &[u8]) -> Result { +pub fn encode_from_key_file(claims: &S, key_data: &[u8]) -> Result { let key = EncodingKey::from_ed_pem(key_data)?; Ok(encode(&Header::new(STORAGE_TOKEN_ALGORITHM), claims, &key)?) 
} diff --git a/libs/utils/src/crashsafe.rs b/libs/utils/src/crashsafe.rs index 290a5b2686..215fa36df4 100644 --- a/libs/utils/src/crashsafe.rs +++ b/libs/utils/src/crashsafe.rs @@ -81,12 +81,9 @@ pub fn path_with_suffix_extension( } pub fn fsync_file_and_parent(file_path: &Utf8Path) -> io::Result<()> { - let parent = file_path.parent().ok_or_else(|| { - io::Error::new( - io::ErrorKind::Other, - format!("File {file_path:?} has no parent"), - ) - })?; + let parent = file_path + .parent() + .ok_or_else(|| io::Error::other(format!("File {file_path:?} has no parent")))?; fsync(file_path)?; fsync(parent)?; diff --git a/libs/utils/src/elapsed_accum.rs b/libs/utils/src/elapsed_accum.rs new file mode 100644 index 0000000000..efb2a34a95 --- /dev/null +++ b/libs/utils/src/elapsed_accum.rs @@ -0,0 +1,26 @@ +use std::time::{Duration, Instant}; + +#[derive(Default)] +pub struct ElapsedAccum { + accum: Duration, +} + +impl ElapsedAccum { + pub fn get(&self) -> Duration { + self.accum + } + pub fn guard(&mut self) -> impl Drop + '_ { + let start = Instant::now(); + scopeguard::guard(start, |last_wait_at| { + self.accum += Instant::now() - last_wait_at; + }) + } + + pub async fn measure(&mut self, fut: Fut) -> O + where + Fut: Future, + { + let _guard = self.guard(); + fut.await + } +} diff --git a/libs/utils/src/fs_ext.rs b/libs/utils/src/fs_ext.rs index a406ab0378..e16edaaa9a 100644 --- a/libs/utils/src/fs_ext.rs +++ b/libs/utils/src/fs_ext.rs @@ -3,7 +3,9 @@ use std::{fs, io, path::Path}; use anyhow::Context; +#[cfg(feature = "rename_noreplace")] mod rename_noreplace; +#[cfg(feature = "rename_noreplace")] pub use rename_noreplace::rename_noreplace; pub trait PathExt { diff --git a/libs/utils/src/fs_ext/rename_noreplace.rs b/libs/utils/src/fs_ext/rename_noreplace.rs index fc6f794b57..d0c07353d0 100644 --- a/libs/utils/src/fs_ext/rename_noreplace.rs +++ b/libs/utils/src/fs_ext/rename_noreplace.rs @@ -8,7 +8,7 @@ pub fn rename_noreplace( dst: &P2, ) -> nix::Result<()> { { - 
#[cfg(target_os = "linux")] + #[cfg(all(target_os = "linux", target_env = "gnu"))] { nix::fcntl::renameat2( None, @@ -29,7 +29,7 @@ pub fn rename_noreplace( })??; nix::errno::Errno::result(res).map(drop) } - #[cfg(not(any(target_os = "linux", target_os = "macos")))] + #[cfg(not(any(all(target_os = "linux", target_env = "gnu"), target_os = "macos")))] { std::compile_error!("OS does not support no-replace renames"); } diff --git a/libs/utils/src/lib.rs b/libs/utils/src/lib.rs index 9389a27bf3..206b8bbd8f 100644 --- a/libs/utils/src/lib.rs +++ b/libs/utils/src/lib.rs @@ -93,6 +93,8 @@ pub mod try_rcu; pub mod guard_arc_swap; +pub mod elapsed_accum; + #[cfg(target_os = "linux")] pub mod linux_socket_ioctl; diff --git a/libs/utils/src/signals.rs b/libs/utils/src/signals.rs index f2be1957c4..426bb65916 100644 --- a/libs/utils/src/signals.rs +++ b/libs/utils/src/signals.rs @@ -1,6 +1,8 @@ pub use signal_hook::consts::TERM_SIGNALS; pub use signal_hook::consts::signal::*; use signal_hook::iterator::Signals; +use tokio::signal::unix::{SignalKind, signal}; +use tracing::info; pub enum Signal { Quit, @@ -36,3 +38,30 @@ impl ShutdownSignals { Ok(()) } } + +/// Runs in a loop since we want to be responsive to multiple signals +/// even after triggering shutdown (e.g. a SIGQUIT after a slow SIGTERM shutdown) +/// +pub async fn signal_handler(token: tokio_util::sync::CancellationToken) { + let mut sigint = signal(SignalKind::interrupt()).unwrap(); + let mut sigterm = signal(SignalKind::terminate()).unwrap(); + let mut sigquit = signal(SignalKind::quit()).unwrap(); + + loop { + let signal = tokio::select! { + _ = sigquit.recv() => { + info!("Got signal SIGQUIT. Terminating in immediate shutdown mode."); + std::process::exit(111); + } + _ = sigint.recv() => "SIGINT", + _ = sigterm.recv() => "SIGTERM", + }; + + if !token.is_cancelled() { + info!("Got signal {signal}. Terminating gracefully in fast shutdown mode."); + token.cancel(); + } else { + info!("Got signal {signal}. 
Already shutting down."); + } + } +} diff --git a/libs/utils/src/sync/heavier_once_cell.rs b/libs/utils/src/sync/heavier_once_cell.rs index 8f8401b35d..5fb4c5b460 100644 --- a/libs/utils/src/sync/heavier_once_cell.rs +++ b/libs/utils/src/sync/heavier_once_cell.rs @@ -111,9 +111,17 @@ impl OnceCell { } } + /// Like [`Self::get_or_init_detached_measured`], but without out parameter for time spent waiting. + pub async fn get_or_init_detached(&self) -> Result, InitPermit> { + self.get_or_init_detached_measured(None).await + } + /// Returns a guard to an existing initialized value, or returns an unique initialization /// permit which can be used to initialize this `OnceCell` using `OnceCell::set`. - pub async fn get_or_init_detached(&self) -> Result, InitPermit> { + pub async fn get_or_init_detached_measured( + &self, + mut wait_time: Option<&mut crate::elapsed_accum::ElapsedAccum>, + ) -> Result, InitPermit> { // It looks like OnceCell::get_or_init could be implemented using this method instead of // duplication. However, that makes the future be !Send due to possibly holding on to the // MutexGuard over an await point. 
@@ -125,12 +133,16 @@ impl OnceCell { } guard.init_semaphore.clone() }; - { let permit = { // increment the count for the duration of queued let _guard = CountWaitingInitializers::start(self); - sem.acquire().await + let fut = sem.acquire(); + if let Some(wait_time) = wait_time.as_mut() { + wait_time.measure(fut).await + } else { + fut.await + } }; let Ok(permit) = permit else { diff --git a/object_storage/Cargo.toml b/object_storage/Cargo.toml new file mode 100644 index 0000000000..17fbaefe6f --- /dev/null +++ b/object_storage/Cargo.toml @@ -0,0 +1,28 @@ +[package] +name = "object_storage" +version = "0.0.1" +edition.workspace = true +license.workspace = true +[dependencies] +anyhow.workspace = true +axum-extra.workspace = true +axum.workspace = true +camino.workspace = true +futures.workspace = true +jsonwebtoken.workspace = true +prometheus.workspace = true +remote_storage.workspace = true +serde.workspace = true +serde_json.workspace = true +tokio-util.workspace = true +tokio.workspace = true +tracing.workspace = true +utils = { path = "../libs/utils", default-features = false } +workspace_hack.workspace = true +[dev-dependencies] +camino-tempfile.workspace = true +http-body-util.workspace = true +itertools.workspace = true +rand.workspace = true +test-log.workspace = true +tower.workspace = true diff --git a/object_storage/src/app.rs b/object_storage/src/app.rs new file mode 100644 index 0000000000..7b5627f0db --- /dev/null +++ b/object_storage/src/app.rs @@ -0,0 +1,561 @@ +use anyhow::anyhow; +use axum::body::{Body, Bytes}; +use axum::response::{IntoResponse, Response}; +use axum::{Router, http::StatusCode}; +use object_storage::{PrefixS3Path, S3Path, Storage, bad_request, internal_error, not_found, ok}; +use remote_storage::TimeoutOrCancel; +use remote_storage::{DownloadError, DownloadOpts, GenericRemoteStorage, RemotePath}; +use std::{sync::Arc, time::SystemTime, time::UNIX_EPOCH}; +use tokio_util::sync::CancellationToken; +use tracing::{error, info}; +use 
utils::backoff::retry; + +pub fn app(state: Arc) -> Router<()> { + use axum::routing::{delete as _delete, get as _get}; + let delete_prefix = _delete(delete_prefix); + Router::new() + .route( + "/{tenant_id}/{timeline_id}/{endpoint_id}/{*path}", + _get(get).put(set).delete(delete), + ) + .route( + "/{tenant_id}/{timeline_id}/{endpoint_id}", + delete_prefix.clone(), + ) + .route("/{tenant_id}/{timeline_id}", delete_prefix.clone()) + .route("/{tenant_id}", delete_prefix) + .route("/metrics", _get(metrics)) + .route("/status", _get(async || StatusCode::OK.into_response())) + .with_state(state) +} + +type Result = anyhow::Result; +type State = axum::extract::State>; + +const CONTENT_TYPE: &str = "content-type"; +const APPLICATION_OCTET_STREAM: &str = "application/octet-stream"; +const WARN_THRESHOLD: u32 = 3; +const MAX_RETRIES: u32 = 10; + +async fn metrics() -> Result { + prometheus::TextEncoder::new() + .encode_to_string(&prometheus::gather()) + .map(|s| s.into_response()) + .map_err(|e| internal_error(e, "/metrics", "collecting metrics")) +} + +async fn get(S3Path { path }: S3Path, state: State) -> Result { + info!(%path, "downloading"); + let download_err = |e| { + if let DownloadError::NotFound = e { + info!(%path, %e, "downloading"); // 404 is not an issue of _this_ service + return not_found(&path); + } + internal_error(e, &path, "downloading") + }; + let cancel = state.cancel.clone(); + let opts = &DownloadOpts::default(); + + let stream = retry( + async || state.storage.download(&path, opts, &cancel).await, + DownloadError::is_permanent, + WARN_THRESHOLD, + MAX_RETRIES, + "downloading", + &cancel, + ) + .await + .unwrap_or(Err(DownloadError::Cancelled)) + .map_err(download_err)? 
+ .download_stream; + + Response::builder() + .status(StatusCode::OK) + .header(CONTENT_TYPE, APPLICATION_OCTET_STREAM) + .body(Body::from_stream(stream)) + .map_err(|e| internal_error(e, path, "reading response")) +} + +// Best solution for files is multipart upload, but remote_storage doesn't support it, +// so we can either read Bytes in memory and push at once or forward BodyDataStream to +// remote_storage. The latter may seem more performant, but BodyDataStream doesn't have a +// guaranteed size() which may produce issues while uploading to s3. +// So, currently we're going with an in-memory copy plus a boundary to prevent uploading +// very large files. +async fn set(S3Path { path }: S3Path, state: State, bytes: Bytes) -> Result { + info!(%path, "uploading"); + let request_len = bytes.len(); + let max_len = state.max_upload_file_limit; + if request_len > max_len { + return Err(bad_request( + anyhow!("File size {request_len} exceeds max {max_len}"), + "uploading", + )); + } + + let cancel = state.cancel.clone(); + let fun = async || { + let stream = bytes_to_stream(bytes.clone()); + state + .storage + .upload(stream, request_len, &path, None, &cancel) + .await + }; + retry( + fun, + TimeoutOrCancel::caused_by_cancel, + WARN_THRESHOLD, + MAX_RETRIES, + "uploading", + &cancel, + ) + .await + .unwrap_or(Err(anyhow!("uploading cancelled"))) + .map_err(|e| internal_error(e, path, "reading response"))?; + Ok(ok()) +} + +async fn delete(S3Path { path }: S3Path, state: State) -> Result { + info!(%path, "deleting"); + let cancel = state.cancel.clone(); + retry( + async || state.storage.delete(&path, &cancel).await, + TimeoutOrCancel::caused_by_cancel, + WARN_THRESHOLD, + MAX_RETRIES, + "deleting", + &cancel, + ) + .await + .unwrap_or(Err(anyhow!("deleting cancelled"))) + .map_err(|e| internal_error(e, path, "deleting"))?; + Ok(ok()) +} + +async fn delete_prefix(PrefixS3Path { path }: PrefixS3Path, state: State) -> Result { + info!(%path, "deleting prefix"); + let
cancel = state.cancel.clone(); + retry( + async || state.storage.delete_prefix(&path, &cancel).await, + TimeoutOrCancel::caused_by_cancel, + WARN_THRESHOLD, + MAX_RETRIES, + "deleting prefix", + &cancel, + ) + .await + .unwrap_or(Err(anyhow!("deleting prefix cancelled"))) + .map_err(|e| internal_error(e, path, "deleting prefix"))?; + Ok(ok()) +} + +pub async fn check_storage_permissions( + client: &GenericRemoteStorage, + cancel: CancellationToken, +) -> anyhow::Result<()> { + info!("storage permissions check"); + + // as_nanos() as multiple instances proxying same bucket may be started at once + let now = SystemTime::now() + .duration_since(UNIX_EPOCH)? + .as_nanos() + .to_string(); + + let path = RemotePath::from_string(&format!("write_access_{now}"))?; + info!(%path, "uploading"); + + let body = now.to_string(); + let stream = bytes_to_stream(Bytes::from(body.clone())); + client + .upload(stream, body.len(), &path, None, &cancel) + .await?; + + use tokio::io::AsyncReadExt; + info!(%path, "downloading"); + let download_opts = DownloadOpts { + kind: remote_storage::DownloadKind::Small, + ..Default::default() + }; + let mut body_read_buf = Vec::new(); + let stream = client + .download(&path, &download_opts, &cancel) + .await? 
+ .download_stream; + tokio_util::io::StreamReader::new(stream) + .read_to_end(&mut body_read_buf) + .await?; + let body_read = String::from_utf8(body_read_buf)?; + if body != body_read { + error!(%body, %body_read, "File contents do not match"); + anyhow::bail!("Read back file doesn't match original") + } + + info!(%path, "removing"); + client.delete(&path, &cancel).await +} + +fn bytes_to_stream(bytes: Bytes) -> impl futures::Stream> { + futures::stream::once(futures::future::ready(Ok(bytes))) +} + +#[cfg(test)] +mod tests { + use super::*; + use axum::{body::Body, extract::Request, response::Response}; + use http_body_util::BodyExt; + use itertools::iproduct; + use std::env::var; + use std::sync::Arc; + use std::time::Duration; + use test_log::test as testlog; + use tower::{Service, util::ServiceExt}; + use utils::id::{TenantId, TimelineId}; + + // see libs/remote_storage/tests/test_real_s3.rs + const REAL_S3_ENV: &str = "ENABLE_REAL_S3_REMOTE_STORAGE"; + const REAL_S3_BUCKET: &str = "REMOTE_STORAGE_S3_BUCKET"; + const REAL_S3_REGION: &str = "REMOTE_STORAGE_S3_REGION"; + + async fn proxy() -> (Storage, Option) { + let cancel = CancellationToken::new(); + let (dir, storage) = if var(REAL_S3_ENV).is_err() { + // tests execute in parallel and we need a new directory for each of them + let dir = camino_tempfile::tempdir().unwrap(); + let fs = + remote_storage::LocalFs::new(dir.path().into(), Duration::from_secs(5)).unwrap(); + (Some(dir), GenericRemoteStorage::LocalFs(fs)) + } else { + // test_real_s3::create_s3_client is hard to reference, reimplementing here + let millis = SystemTime::now() + .duration_since(UNIX_EPOCH) + .unwrap() + .as_millis(); + use rand::Rng; + let random = rand::thread_rng().r#gen::(); + + let s3_config = remote_storage::S3Config { + bucket_name: var(REAL_S3_BUCKET).unwrap(), + bucket_region: var(REAL_S3_REGION).unwrap(), + prefix_in_bucket: Some(format!("test_{millis}_{random:08x}/")), + endpoint: None, + concurrency_limit: 
std::num::NonZeroUsize::new(100).unwrap(), + max_keys_per_list_response: None, + upload_storage_class: None, + }; + let bucket = remote_storage::S3Bucket::new(&s3_config, Duration::from_secs(1)) + .await + .unwrap(); + (None, GenericRemoteStorage::AwsS3(Arc::new(bucket))) + }; + + let proxy = Storage { + auth: object_storage::JwtAuth::new(TEST_PUB_KEY_ED25519).unwrap(), + storage, + cancel: cancel.clone(), + max_upload_file_limit: usize::MAX, + }; + check_storage_permissions(&proxy.storage, cancel) + .await + .unwrap(); + (proxy, dir) + } + + // see libs/utils/src/auth.rs + const TEST_PUB_KEY_ED25519: &[u8] = b" +-----BEGIN PUBLIC KEY----- +MCowBQYDK2VwAyEARYwaNBayR+eGI0iXB4s3QxE3Nl2g1iWbr6KtLWeVD/w= +-----END PUBLIC KEY----- +"; + + const TEST_PRIV_KEY_ED25519: &[u8] = br#" +-----BEGIN PRIVATE KEY----- +MC4CAQAwBQYDK2VwBCIEID/Drmc1AA6U/znNRWpF3zEGegOATQxfkdWxitcOMsIH +-----END PRIVATE KEY----- +"#; + + async fn request(req: Request) -> Response { + let (proxy, _) = proxy().await; + app(Arc::new(proxy)) + .into_service() + .oneshot(req) + .await + .unwrap() + } + + #[testlog(tokio::test)] + async fn status() { + let res = Request::builder() + .uri("/status") + .body(Body::empty()) + .map(request) + .unwrap() + .await; + assert_eq!(res.status(), StatusCode::OK); + } + + fn routes() -> impl Iterator { + iproduct!( + vec!["/1", "/1/2", "/1/2/3", "/1/2/3/4"], + vec!["GET", "PUT", "DELETE"] + ) + } + + #[testlog(tokio::test)] + async fn no_token() { + for (uri, method) in routes() { + info!(%uri, %method); + let res = Request::builder() + .uri(uri) + .method(method) + .body(Body::empty()) + .map(request) + .unwrap() + .await; + assert!(matches!( + res.status(), + StatusCode::METHOD_NOT_ALLOWED | StatusCode::BAD_REQUEST + )); + } + } + + #[testlog(tokio::test)] + async fn invalid_token() { + for (uri, method) in routes() { + info!(%uri, %method); + let status = Request::builder() + .uri(uri) + .header("Authorization", "Bearer 123") + .method(method) + 
.body(Body::empty()) + .map(request) + .unwrap() + .await; + assert!(matches!( + status.status(), + StatusCode::METHOD_NOT_ALLOWED | StatusCode::BAD_REQUEST + )); + } + } + + const TENANT_ID: TenantId = + TenantId::from_array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 1, 2, 3, 4, 5, 6]); + const TIMELINE_ID: TimelineId = + TimelineId::from_array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 1, 2, 3, 4, 5, 7]); + const ENDPOINT_ID: &str = "ep-winter-frost-a662z3vg"; + fn token() -> String { + let claims = object_storage::Claims { + tenant_id: TENANT_ID, + timeline_id: TIMELINE_ID, + endpoint_id: ENDPOINT_ID.into(), + exp: u64::MAX, + }; + let key = jsonwebtoken::EncodingKey::from_ed_pem(TEST_PRIV_KEY_ED25519).unwrap(); + let header = jsonwebtoken::Header::new(object_storage::VALIDATION_ALGO); + jsonwebtoken::encode(&header, &claims, &key).unwrap() + } + + #[testlog(tokio::test)] + async fn unauthorized() { + let (proxy, _) = proxy().await; + let mut app = app(Arc::new(proxy)).into_service(); + let token = token(); + let args = itertools::iproduct!( + vec![TENANT_ID.to_string(), TenantId::generate().to_string()], + vec![TIMELINE_ID.to_string(), TimelineId::generate().to_string()], + vec![ENDPOINT_ID, "ep-ololo"] + ) + .skip(1); + + for ((uri, method), (tenant, timeline, endpoint)) in iproduct!(routes(), args) { + info!(%uri, %method, %tenant, %timeline, %endpoint); + let request = Request::builder() + .uri(format!("/{tenant}/{timeline}/{endpoint}/sub/path/key")) + .method(method) + .header("Authorization", format!("Bearer {}", token)) + .body(Body::empty()) + .unwrap(); + let status = ServiceExt::ready(&mut app) + .await + .unwrap() + .call(request) + .await + .unwrap() + .status(); + assert_eq!(status, StatusCode::UNAUTHORIZED); + } + } + + #[testlog(tokio::test)] + async fn method_not_allowed() { + let token = token(); + let iter = iproduct!(vec!["", "/.."], vec!["GET", "PUT"]); + for (key, method) in iter { + let status = Request::builder() + 
.uri(format!("/{TENANT_ID}/{TIMELINE_ID}/{ENDPOINT_ID}{key}")) + .method(method) + .header("Authorization", format!("Bearer {token}")) + .body(Body::empty()) + .map(request) + .unwrap() + .await + .status(); + assert!(matches!( + status, + StatusCode::BAD_REQUEST | StatusCode::METHOD_NOT_ALLOWED + )); + } + } + + async fn requests_chain( + chain: impl Iterator, + token: impl Fn(&str) -> String, + ) { + let (proxy, _) = proxy().await; + let mut app = app(Arc::new(proxy)).into_service(); + for (uri, method, body, expected_status, compare_body) in chain { + info!(%uri, %method, %body, %expected_status); + let bearer = format!("Bearer {}", token(&uri)); + let request = Request::builder() + .uri(uri) + .method(method) + .header("Authorization", &bearer) + .body(Body::from(body)) + .unwrap(); + let response = ServiceExt::ready(&mut app) + .await + .unwrap() + .call(request) + .await + .unwrap(); + assert_eq!(response.status(), expected_status); + if !compare_body { + continue; + } + let read_body = response.into_body().collect().await.unwrap().to_bytes(); + assert_eq!(body, read_body); + } + } + + #[testlog(tokio::test)] + async fn metrics() { + let uri = format!("/{TENANT_ID}/{TIMELINE_ID}/{ENDPOINT_ID}/key"); + let req = vec![ + (uri.clone(), "PUT", "body", StatusCode::OK, false), + (uri.clone(), "DELETE", "", StatusCode::OK, false), + ]; + requests_chain(req.into_iter(), |_| token()).await; + + let res = Request::builder() + .uri("/metrics") + .body(Body::empty()) + .map(request) + .unwrap() + .await; + assert_eq!(res.status(), StatusCode::OK); + let body = res.into_body().collect().await.unwrap().to_bytes(); + let body = String::from_utf8_lossy(&body); + tracing::debug!(%body); + // Storage metrics are not gathered for LocalFs + if var(REAL_S3_ENV).is_ok() { + assert!(body.contains("remote_storage_s3_deleted_objects_total")); + } + assert!(body.contains("process_threads")); + } + + #[testlog(tokio::test)] + async fn insert_retrieve_remove() { + let uri = 
format!("/{TENANT_ID}/{TIMELINE_ID}/{ENDPOINT_ID}/key"); + let chain = vec![ + (uri.clone(), "GET", "", StatusCode::NOT_FOUND, false), + (uri.clone(), "PUT", "пыщьпыщь", StatusCode::OK, false), + (uri.clone(), "GET", "пыщьпыщь", StatusCode::OK, true), + (uri.clone(), "DELETE", "", StatusCode::OK, false), + (uri, "GET", "", StatusCode::NOT_FOUND, false), + ]; + requests_chain(chain.into_iter(), |_| token()).await; + } + + fn delete_prefix_token(uri: &str) -> String { + use serde::Serialize; + let parts = uri.split("/").collect::>(); + #[derive(Serialize)] + struct PrefixClaims { + tenant_id: TenantId, + timeline_id: Option, + endpoint_id: Option, + exp: u64, + } + let claims = PrefixClaims { + tenant_id: parts.get(1).map(|c| c.parse().unwrap()).unwrap(), + timeline_id: parts.get(2).map(|c| c.parse().unwrap()), + endpoint_id: parts.get(3).map(ToString::to_string), + exp: u64::MAX, + }; + let key = jsonwebtoken::EncodingKey::from_ed_pem(TEST_PRIV_KEY_ED25519).unwrap(); + let header = jsonwebtoken::Header::new(object_storage::VALIDATION_ALGO); + jsonwebtoken::encode(&header, &claims, &key).unwrap() + } + + // Can't use single digit numbers as they won't be validated as TimelineId and EndpointId + #[testlog(tokio::test)] + async fn delete_prefix() { + let tenant_id = + TenantId::from_array([1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]).to_string(); + let t2 = TimelineId::from_array([2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]); + let t3 = TimelineId::from_array([3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]); + let t4 = TimelineId::from_array([4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]); + let f = |timeline, path| format!("/{tenant_id}/{timeline}{path}"); + // Why extra slash in string literals? 
Axum is weird with URIs: + // /1/2 and 1/2/ match different routes, thus first yields OK and second NOT_FOUND + // as it matches /tenant/timeline/endpoint, see https://stackoverflow.com/a/75355932 + // Removing the trailing slash is surprisingly hard: + // * Add tower dependency with NormalizePath layer + // * wrap Router<()> in this layer https://github.com/tokio-rs/axum/discussions/2377 + // * Rewrite make_service() -> into_make_service() + // * Rewrite oneshot() (not available for NormalizePath) + // I didn't manage to get it working correctly + let chain = vec![ + // create 1/2/3/4, 1/2/3/5, delete prefix 1/2/3 -> empty + (f(t2, "/3/4"), "PUT", "", StatusCode::OK, false), + (f(t2, "/3/4"), "PUT", "", StatusCode::OK, false), // we can override file contents + (f(t2, "/3/5"), "PUT", "", StatusCode::OK, false), + (f(t2, "/3"), "DELETE", "", StatusCode::OK, false), + (f(t2, "/3/4"), "GET", "", StatusCode::NOT_FOUND, false), + (f(t2, "/3/5"), "GET", "", StatusCode::NOT_FOUND, false), + // create 1/2/3/4, 1/2/5/6, delete prefix 1/2/3 -> 1/2/5/6 + (f(t2, "/3/4"), "PUT", "", StatusCode::OK, false), + (f(t2, "/5/6"), "PUT", "", StatusCode::OK, false), + (f(t2, "/3"), "DELETE", "", StatusCode::OK, false), + (f(t2, "/3/4"), "GET", "", StatusCode::NOT_FOUND, false), + (f(t2, "/5/6"), "GET", "", StatusCode::OK, false), + // create 1/2/3/4, 1/2/7/8, delete prefix 1/2 -> empty + (f(t2, "/3/4"), "PUT", "", StatusCode::OK, false), + (f(t2, "/7/8"), "PUT", "", StatusCode::OK, false), + (f(t2, ""), "DELETE", "", StatusCode::OK, false), + (f(t2, "/3/4"), "GET", "", StatusCode::NOT_FOUND, false), + (f(t2, "/7/8"), "GET", "", StatusCode::NOT_FOUND, false), + // create 1/2/3/4, 1/2/5/6, 1/3/8/9, delete prefix 1/2/3 -> 1/2/5/6, 1/3/8/9 + (f(t2, "/3/4"), "PUT", "", StatusCode::OK, false), + (f(t2, "/5/6"), "PUT", "", StatusCode::OK, false), + (f(t3, "/8/9"), "PUT", "", StatusCode::OK, false), + (f(t2, "/3"), "DELETE", "", StatusCode::OK, false), + (f(t2, "/3/4"), "GET", "",
StatusCode::NOT_FOUND, false), + (f(t2, "/5/6"), "GET", "", StatusCode::OK, false), + (f(t3, "/8/9"), "GET", "", StatusCode::OK, false), + // create 1/4/5/6, delete prefix 1/2 -> 1/3/8/9, 1/4/5/6 + (f(t4, "/5/6"), "PUT", "", StatusCode::OK, false), + (f(t2, ""), "DELETE", "", StatusCode::OK, false), + (f(t2, "/3/4"), "GET", "", StatusCode::NOT_FOUND, false), + (f(t2, "/5/6"), "GET", "", StatusCode::NOT_FOUND, false), + (f(t3, "/8/9"), "GET", "", StatusCode::OK, false), + (f(t4, "/5/6"), "GET", "", StatusCode::OK, false), + // delete prefix 1 -> empty + (format!("/{tenant_id}"), "DELETE", "", StatusCode::OK, false), + (f(t2, "/3/4"), "GET", "", StatusCode::NOT_FOUND, false), + (f(t2, "/5/6"), "GET", "", StatusCode::NOT_FOUND, false), + (f(t3, "/8/9"), "GET", "", StatusCode::NOT_FOUND, false), + (f(t4, "/5/6"), "GET", "", StatusCode::NOT_FOUND, false), + ]; + requests_chain(chain.into_iter(), delete_prefix_token).await; + } +} diff --git a/object_storage/src/lib.rs b/object_storage/src/lib.rs new file mode 100644 index 0000000000..989afd4c25 --- /dev/null +++ b/object_storage/src/lib.rs @@ -0,0 +1,344 @@ +use anyhow::Result; +use axum::extract::{FromRequestParts, Path}; +use axum::response::{IntoResponse, Response}; +use axum::{RequestPartsExt, http::StatusCode, http::request::Parts}; +use axum_extra::TypedHeader; +use axum_extra::headers::{Authorization, authorization::Bearer}; +use camino::Utf8PathBuf; +use jsonwebtoken::{DecodingKey, Validation}; +use remote_storage::{GenericRemoteStorage, RemotePath}; +use serde::{Deserialize, Serialize}; +use std::fmt::Display; +use std::result::Result as StdResult; +use std::sync::Arc; +use tokio_util::sync::CancellationToken; +use tracing::{debug, error}; +use utils::id::{TenantId, TimelineId}; + +// simplified version of utils::auth::JwtAuth +pub struct JwtAuth { + decoding_key: DecodingKey, + validation: Validation, +} + +pub const VALIDATION_ALGO: jsonwebtoken::Algorithm = jsonwebtoken::Algorithm::EdDSA; +impl JwtAuth { + 
pub fn new(key: &[u8]) -> Result { + Ok(Self { + decoding_key: DecodingKey::from_ed_pem(key)?, + validation: Validation::new(VALIDATION_ALGO), + }) + } + + pub fn decode(&self, token: &str) -> Result { + Ok(jsonwebtoken::decode(token, &self.decoding_key, &self.validation).map(|t| t.claims)?) + } +} + +fn normalize_key(key: &str) -> StdResult { + let key = clean_utf8(&Utf8PathBuf::from(key)); + if key.starts_with("..") || key == "." || key == "/" { + return Err(format!("invalid key {key}")); + } + match key.strip_prefix("/").map(Utf8PathBuf::from) { + Ok(p) => Ok(p), + _ => Ok(key), + } +} + +// Copied from path_clean crate with PathBuf->Utf8PathBuf +fn clean_utf8(path: &camino::Utf8Path) -> Utf8PathBuf { + use camino::Utf8Component as Comp; + let mut out = Vec::new(); + for comp in path.components() { + match comp { + Comp::CurDir => (), + Comp::ParentDir => match out.last() { + Some(Comp::RootDir) => (), + Some(Comp::Normal(_)) => { + out.pop(); + } + None | Some(Comp::CurDir) | Some(Comp::ParentDir) | Some(Comp::Prefix(_)) => { + out.push(comp) + } + }, + comp => out.push(comp), + } + } + if !out.is_empty() { + out.iter().collect() + } else { + Utf8PathBuf::from(".") + } +} + +pub struct Storage { + pub auth: JwtAuth, + pub storage: GenericRemoteStorage, + pub cancel: CancellationToken, + pub max_upload_file_limit: usize, +} + +pub type EndpointId = String; // If needed, reuse small string from proxy/src/types.rs + +#[derive(Deserialize, Serialize, PartialEq)] +pub struct Claims { + pub tenant_id: TenantId, + pub timeline_id: TimelineId, + pub endpoint_id: EndpointId, + pub exp: u64, +} + +impl Display for Claims { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!( + f, + "Claims(tenant_id {} timeline_id {} endpoint_id {} exp {})", + self.tenant_id, self.timeline_id, self.endpoint_id, self.exp + ) + } +} + +#[derive(Deserialize, Serialize)] +struct KeyRequest { + tenant_id: TenantId, + timeline_id: TimelineId, + endpoint_id:
EndpointId, + path: String, +} + +#[derive(Debug, PartialEq)] +pub struct S3Path { + pub path: RemotePath, +} + +impl TryFrom<&KeyRequest> for S3Path { + type Error = String; + fn try_from(req: &KeyRequest) -> StdResult { + let KeyRequest { + tenant_id, + timeline_id, + endpoint_id, + path, + } = &req; + let prefix = format!("{tenant_id}/{timeline_id}/{endpoint_id}",); + let path = Utf8PathBuf::from(prefix).join(normalize_key(path)?); + let path = RemotePath::new(&path).unwrap(); // unwrap() because the path is already relative + Ok(S3Path { path }) + } +} + +fn unauthorized(route: impl Display, claims: impl Display) -> Response { + debug!(%route, %claims, "route doesn't match claims"); + StatusCode::UNAUTHORIZED.into_response() +} + +pub fn bad_request(err: impl Display, desc: &'static str) -> Response { + debug!(%err, desc); + (StatusCode::BAD_REQUEST, err.to_string()).into_response() +} + +pub fn ok() -> Response { + StatusCode::OK.into_response() +} + +pub fn internal_error(err: impl Display, path: impl Display, desc: &'static str) -> Response { + error!(%err, %path, desc); + StatusCode::INTERNAL_SERVER_ERROR.into_response() +} + +pub fn not_found(key: impl ToString) -> Response { + (StatusCode::NOT_FOUND, key.to_string()).into_response() +} + +impl FromRequestParts> for S3Path { + type Rejection = Response; + async fn from_request_parts( + parts: &mut Parts, + state: &Arc, + ) -> Result { + let Path(path): Path = parts + .extract() + .await + .map_err(|e| bad_request(e, "invalid route"))?; + let TypedHeader(Authorization(bearer)) = parts + .extract::>>() + .await + .map_err(|e| bad_request(e, "invalid token"))?; + let claims: Claims = state + .auth + .decode(bearer.token()) + .map_err(|e| bad_request(e, "decoding token"))?; + let route = Claims { + tenant_id: path.tenant_id, + timeline_id: path.timeline_id, + endpoint_id: path.endpoint_id.clone(), + exp: claims.exp, + }; + if route != claims { + return Err(unauthorized(route, claims)); + } + (&path) + 
.try_into() + .map_err(|e| bad_request(e, "invalid route")) + } +} + +#[derive(Deserialize, Serialize, PartialEq)] +pub struct PrefixKeyPath { + pub tenant_id: TenantId, + pub timeline_id: Option, + pub endpoint_id: Option, +} + +impl Display for PrefixKeyPath { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!( + f, + "PrefixKeyPath(tenant_id {} timeline_id {} endpoint_id {})", + self.tenant_id, + self.timeline_id + .as_ref() + .map(ToString::to_string) + .unwrap_or("".to_string()), + self.endpoint_id + .as_ref() + .map(ToString::to_string) + .unwrap_or("".to_string()) + ) + } +} + +#[derive(Debug, PartialEq)] +pub struct PrefixS3Path { + pub path: RemotePath, +} + +impl From<&PrefixKeyPath> for PrefixS3Path { + fn from(path: &PrefixKeyPath) -> Self { + let timeline_id = path + .timeline_id + .as_ref() + .map(ToString::to_string) + .unwrap_or("".to_string()); + let endpoint_id = path + .endpoint_id + .as_ref() + .map(ToString::to_string) + .unwrap_or("".to_string()); + let path = Utf8PathBuf::from(path.tenant_id.to_string()) + .join(timeline_id) + .join(endpoint_id); + let path = RemotePath::new(&path).unwrap(); // unwrap() because the path is already relative + PrefixS3Path { path } + } +} + +impl FromRequestParts> for PrefixS3Path { + type Rejection = Response; + async fn from_request_parts( + parts: &mut Parts, + state: &Arc, + ) -> Result { + let Path(path) = parts + .extract::>() + .await + .map_err(|e| bad_request(e, "invalid route"))?; + let TypedHeader(Authorization(bearer)) = parts + .extract::>>() + .await + .map_err(|e| bad_request(e, "invalid token"))?; + let claims: PrefixKeyPath = state + .auth + .decode(bearer.token()) + .map_err(|e| bad_request(e, "invalid token"))?; + if path != claims { + return Err(unauthorized(path, claims)); + } + Ok((&path).into()) + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn normalize_key() { + let f = super::normalize_key; + assert_eq!(f("hello/world/..").unwrap(), 
Utf8PathBuf::from("hello")); + assert_eq!( + f("ololo/1/../../not_ololo").unwrap(), + Utf8PathBuf::from("not_ololo") + ); + assert!(f("ololo/1/../../../").is_err()); + assert!(f(".").is_err()); + assert!(f("../").is_err()); + assert!(f("").is_err()); + assert_eq!(f("/1/2/3").unwrap(), Utf8PathBuf::from("1/2/3")); + assert!(f("/1/2/3/../../../").is_err()); + assert!(f("/1/2/3/../../../../").is_err()); + } + + const TENANT_ID: TenantId = + TenantId::from_array([1, 1, 2, 3, 4, 5, 6, 7, 8, 9, 1, 2, 3, 4, 5, 6]); + const TIMELINE_ID: TimelineId = + TimelineId::from_array([1, 1, 2, 3, 4, 5, 6, 7, 8, 9, 1, 2, 3, 4, 5, 7]); + const ENDPOINT_ID: &str = "ep-winter-frost-a662z3vg"; + + #[test] + fn s3_path() { + let auth = Claims { + tenant_id: TENANT_ID, + timeline_id: TIMELINE_ID, + endpoint_id: ENDPOINT_ID.into(), + exp: u64::MAX, + }; + let s3_path = |key| { + let path = &format!("{TENANT_ID}/{TIMELINE_ID}/{ENDPOINT_ID}/{key}"); + let path = RemotePath::from_string(path).unwrap(); + S3Path { path } + }; + + let path = "cache_key".to_string(); + let mut key_path = KeyRequest { + path, + tenant_id: auth.tenant_id, + timeline_id: auth.timeline_id, + endpoint_id: auth.endpoint_id, + }; + assert_eq!(S3Path::try_from(&key_path).unwrap(), s3_path(key_path.path)); + + key_path.path = "we/can/have/nested/paths".to_string(); + assert_eq!(S3Path::try_from(&key_path).unwrap(), s3_path(key_path.path)); + + key_path.path = "../error/hello/../".to_string(); + assert!(S3Path::try_from(&key_path).is_err()); + } + + #[test] + fn prefix_s3_path() { + let mut path = PrefixKeyPath { + tenant_id: TENANT_ID, + timeline_id: None, + endpoint_id: None, + }; + let prefix_path = |s: String| RemotePath::from_string(&s).unwrap(); + assert_eq!( + PrefixS3Path::from(&path).path, + prefix_path(format!("{TENANT_ID}")) + ); + + path.timeline_id = Some(TIMELINE_ID); + assert_eq!( + PrefixS3Path::from(&path).path, + prefix_path(format!("{TENANT_ID}/{TIMELINE_ID}")) + ); + + path.endpoint_id = 
Some(ENDPOINT_ID.into()); + assert_eq!( + PrefixS3Path::from(&path).path, + prefix_path(format!("{TENANT_ID}/{TIMELINE_ID}/{ENDPOINT_ID}")) + ); + } +} diff --git a/object_storage/src/main.rs b/object_storage/src/main.rs new file mode 100644 index 0000000000..40325db19d --- /dev/null +++ b/object_storage/src/main.rs @@ -0,0 +1,65 @@ +//! `object_storage` is a service which provides API for uploading and downloading +//! files. It is used by compute and control plane for accessing LFC prewarm data. +//! This service is deployed either as a separate component or as part of compute image +//! for large computes. +mod app; +use anyhow::Context; +use tracing::info; +use utils::logging; + +//see set() +const fn max_upload_file_limit() -> usize { + 100 * 1024 * 1024 +} + +#[derive(serde::Deserialize)] +#[serde(tag = "type")] +struct Config { + listen: std::net::SocketAddr, + pemfile: camino::Utf8PathBuf, + #[serde(flatten)] + storage_config: remote_storage::RemoteStorageConfig, + #[serde(default = "max_upload_file_limit")] + max_upload_file_limit: usize, +} + +#[tokio::main] +async fn main() -> anyhow::Result<()> { + logging::init( + logging::LogFormat::Plain, + logging::TracingErrorLayerEnablement::EnableWithRustLogFilter, + logging::Output::Stdout, + )?; + + let config: String = std::env::args().skip(1).take(1).collect(); + if config.is_empty() { + anyhow::bail!("Usage: object_storage config.json") + } + info!("Reading config from {config}"); + let config = std::fs::read_to_string(config.clone())?; + let config: Config = serde_json::from_str(&config).context("parsing config")?; + info!("Reading pemfile from {}", config.pemfile.clone()); + let pemfile = std::fs::read(config.pemfile.clone())?; + info!("Loading public key from {}", config.pemfile.clone()); + let auth = object_storage::JwtAuth::new(&pemfile)?; + + let listener = tokio::net::TcpListener::bind(config.listen).await.unwrap(); + info!("listening on {}", listener.local_addr().unwrap()); + + let storage = 
remote_storage::GenericRemoteStorage::from_config(&config.storage_config).await?; + let cancel = tokio_util::sync::CancellationToken::new(); + app::check_storage_permissions(&storage, cancel.clone()).await?; + + let proxy = std::sync::Arc::new(object_storage::Storage { + auth, + storage, + cancel: cancel.clone(), + max_upload_file_limit: config.max_upload_file_limit, + }); + + tokio::spawn(utils::signals::signal_handler(cancel.clone())); + axum::serve(listener, app::app(proxy)) + .with_graceful_shutdown(async move { cancel.cancelled().await }) + .await?; + Ok(()) +} diff --git a/pageserver/Cargo.toml b/pageserver/Cargo.toml index 56d97bf8a9..74f3fce6e5 100644 --- a/pageserver/Cargo.toml +++ b/pageserver/Cargo.toml @@ -10,6 +10,8 @@ default = [] # which adds some runtime cost to run tests on outage conditions testing = ["fail/failpoints", "pageserver_api/testing", "wal_decoder/testing", "pageserver_client/testing"] +fuzz-read-path = ["testing"] + [dependencies] anyhow.workspace = true arc-swap.workspace = true diff --git a/pageserver/benches/bench_ingest.rs b/pageserver/benches/bench_ingest.rs index 000938b189..3108b5351f 100644 --- a/pageserver/benches/bench_ingest.rs +++ b/pageserver/benches/bench_ingest.rs @@ -126,7 +126,7 @@ async fn ingest( max_concurrency: NonZeroUsize::new(1).unwrap(), }); let (_desc, path) = layer - .write_to_disk(&ctx, None, l0_flush_state.inner()) + .write_to_disk(&ctx, None, l0_flush_state.inner(), &gate, cancel.clone()) .await? 
.unwrap(); tokio::fs::remove_file(path).await?; diff --git a/pageserver/benches/bench_walredo.rs b/pageserver/benches/bench_walredo.rs index 77b3f90b3e..215682d90c 100644 --- a/pageserver/benches/bench_walredo.rs +++ b/pageserver/benches/bench_walredo.rs @@ -65,7 +65,7 @@ use bytes::{Buf, Bytes}; use criterion::{BenchmarkId, Criterion}; use once_cell::sync::Lazy; use pageserver::config::PageServerConf; -use pageserver::walredo::PostgresRedoManager; +use pageserver::walredo::{PostgresRedoManager, RedoAttemptType}; use pageserver_api::key::Key; use pageserver_api::record::NeonWalRecord; use pageserver_api::shard::TenantShardId; @@ -223,7 +223,14 @@ impl Request { // TODO: avoid these clones manager - .request_redo(*key, *lsn, base_img.clone(), records.clone(), *pg_version) + .request_redo( + *key, + *lsn, + base_img.clone(), + records.clone(), + *pg_version, + RedoAttemptType::ReadPage, + ) .await .context("request_redo") } diff --git a/pageserver/client/src/mgmt_api.rs b/pageserver/client/src/mgmt_api.rs index 224208034b..e0cd19817d 100644 --- a/pageserver/client/src/mgmt_api.rs +++ b/pageserver/client/src/mgmt_api.rs @@ -86,17 +86,17 @@ impl Client { resp.json().await.map_err(Error::ReceiveBody) } - /// Get an arbitrary path and returning a streaming Response. This function is suitable - /// for pass-through/proxy use cases where we don't care what the response content looks - /// like. + /// Send an HTTP request to an arbitrary path with a desired HTTP method and returning a streaming + /// Response. This function is suitable for pass-through/proxy use cases where we don't care + /// what the response content looks like. /// /// Use/add one of the properly typed methods below if you know aren't proxying, and /// know what kind of response you expect. 
- pub async fn get_raw(&self, path: String) -> Result { + pub async fn op_raw(&self, method: Method, path: String) -> Result { debug_assert!(path.starts_with('/')); let uri = format!("{}{}", self.mgmt_api_endpoint, path); - let mut req = self.client.request(Method::GET, uri); + let mut req = self.client.request(method, uri); if let Some(value) = &self.authorization_header { req = req.header(reqwest::header::AUTHORIZATION, value); } diff --git a/pageserver/src/basebackup.rs b/pageserver/src/basebackup.rs index de527e307b..3510ccb529 100644 --- a/pageserver/src/basebackup.rs +++ b/pageserver/src/basebackup.rs @@ -34,7 +34,7 @@ use utils::lsn::Lsn; use crate::context::RequestContext; use crate::pgdatadir_mapping::Version; use crate::tenant::storage_layer::IoConcurrency; -use crate::tenant::timeline::GetVectoredError; +use crate::tenant::timeline::{GetVectoredError, VersionedKeySpaceQuery}; use crate::tenant::{PageReconstructError, Timeline}; #[derive(Debug, thiserror::Error)] @@ -353,9 +353,10 @@ where let mut slru_builder = SlruSegmentsBuilder::new(&mut self.ar); for part in slru_partitions.parts { + let query = VersionedKeySpaceQuery::uniform(part, self.lsn); let blocks = self .timeline - .get_vectored(part, self.lsn, self.io_concurrency.clone(), self.ctx) + .get_vectored(query, self.io_concurrency.clone(), self.ctx) .await?; for (key, block) in blocks { diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs index 4cfc0c24f8..250d4180f5 100644 --- a/pageserver/src/bin/pageserver.rs +++ b/pageserver/src/bin/pageserver.rs @@ -16,7 +16,7 @@ use http_utils::tls_certs::ReloadingCertificateResolver; use metrics::launch_timestamp::{LaunchTimestamp, set_launch_timestamp_metric}; use metrics::set_build_info_metric; use nix::sys::socket::{setsockopt, sockopt}; -use pageserver::config::{PageServerConf, PageserverIdentity}; +use pageserver::config::{PageServerConf, PageserverIdentity, ignored_fields}; use 
pageserver::controller_upcall_client::StorageControllerUpcallClient; use pageserver::deletion_queue::DeletionQueue; use pageserver::disk_usage_eviction_task::{self, launch_disk_usage_global_eviction_task}; @@ -31,10 +31,10 @@ use pageserver::{ }; use postgres_backend::AuthType; use remote_storage::GenericRemoteStorage; -use tokio::signal::unix::SignalKind; use tokio::time::Instant; use tokio_util::sync::CancellationToken; use tracing::*; +use tracing_utils::OtelGuard; use utils::auth::{JwtAuth, SwappableJwtAuth}; use utils::crashsafe::syncfs; use utils::logging::TracingErrorLayerEnablement; @@ -97,7 +97,7 @@ fn main() -> anyhow::Result<()> { env::set_current_dir(&workdir) .with_context(|| format!("Failed to set application's current dir to '{workdir}'"))?; - let conf = initialize_config(&identity_file_path, &cfg_file_path, &workdir)?; + let (conf, ignored) = initialize_config(&identity_file_path, &cfg_file_path, &workdir)?; // Initialize logging. // @@ -118,6 +118,21 @@ fn main() -> anyhow::Result<()> { logging::Output::Stdout, )?; + let otel_enablement = match &conf.tracing { + Some(cfg) => tracing_utils::OtelEnablement::Enabled { + service_name: "pageserver".to_string(), + export_config: (&cfg.export_config).into(), + runtime: *COMPUTE_REQUEST_RUNTIME, + }, + None => tracing_utils::OtelEnablement::Disabled, + }; + + let otel_guard = tracing_utils::init_performance_tracing(otel_enablement); + + if otel_guard.is_some() { + info!(?conf.tracing, "starting with OTEL tracing enabled"); + } + // mind the order required here: 1. logging, 2. panic_hook, 3. sentry. // disarming this hook on pageserver, because we never tear down tracing. 
logging::replace_panic_hook_with_tracing_panic_hook().forget(); @@ -128,7 +143,17 @@ fn main() -> anyhow::Result<()> { &[("node_id", &conf.id.to_string())], ); - // after setting up logging, log the effective IO engine choice and read path implementations + // Warn about ignored config items; see pageserver_api::config::ConfigToml + // doc comment for rationale why we prefer this over serde(deny_unknown_fields). + { + let ignored_fields::Paths { paths } = &ignored; + for path in paths { + warn!(?path, "ignoring unknown configuration item"); + } + } + + // Log configuration items for feature-flag-like config + // (maybe we should automate this with a visitor?). info!(?conf.virtual_file_io_engine, "starting with virtual_file IO engine"); info!(?conf.virtual_file_io_mode, "starting with virtual_file IO mode"); info!(?conf.wal_receiver_protocol, "starting with WAL receiver protocol"); @@ -191,7 +216,7 @@ fn main() -> anyhow::Result<()> { tracing::info!("Initializing page_cache..."); page_cache::init(conf.page_cache_size); - start_pageserver(launch_ts, conf).context("Failed to start pageserver")?; + start_pageserver(launch_ts, conf, ignored, otel_guard).context("Failed to start pageserver")?; scenario.teardown(); Ok(()) @@ -201,7 +226,7 @@ fn initialize_config( identity_file_path: &Utf8Path, cfg_file_path: &Utf8Path, workdir: &Utf8Path, -) -> anyhow::Result<&'static PageServerConf> { +) -> anyhow::Result<(&'static PageServerConf, ignored_fields::Paths)> { // The deployment orchestrator writes out an indentity file containing the node id // for all pageservers. This file is the source of truth for the node id. 
In order // to allow for rolling back pageserver releases, the node id is also included in @@ -230,16 +255,36 @@ fn initialize_config( let config_file_contents = std::fs::read_to_string(cfg_file_path).context("read config file from filesystem")?; - let config_toml = serde_path_to_error::deserialize( - toml_edit::de::Deserializer::from_str(&config_file_contents) - .context("build toml deserializer")?, - ) - .context("deserialize config toml")?; + // Deserialize the config file contents into a ConfigToml. + let config_toml: pageserver_api::config::ConfigToml = { + let deserializer = toml_edit::de::Deserializer::from_str(&config_file_contents) + .context("build toml deserializer")?; + let mut path_to_error_track = serde_path_to_error::Track::new(); + let deserializer = + serde_path_to_error::Deserializer::new(deserializer, &mut path_to_error_track); + serde::Deserialize::deserialize(deserializer).context("deserialize config toml")? + }; + + // Find unknown fields by re-serializing the parsed ConfigToml and comparing it to the on-disk file. + // Any fields that are only in the on-disk version are unknown. + // (The assumption here is that the ConfigToml doesn't use skip_serializing_if.) + // (Make sure to read the ConfigToml doc comment on why we only want to warn about, but not fail startup, on unknown fields). + let ignored = { + let ondisk_toml = config_file_contents + .parse::() + .context("parse original config as toml document")?; + let parsed_toml = toml_edit::ser::to_document(&config_toml) + .context("re-serialize config to toml document")?; + pageserver::config::ignored_fields::find(ondisk_toml, parsed_toml) + }; + + // Construct the runtime god object (it's called PageServerConf but actually is just global shared state).
let conf = PageServerConf::parse_and_validate(identity.id, config_toml, workdir) .context("runtime-validation of config toml")?; + let conf = Box::leak(Box::new(conf)); - Ok(Box::leak(Box::new(conf))) + Ok((conf, ignored)) } struct WaitForPhaseResult { @@ -290,6 +335,8 @@ fn startup_checkpoint(started_at: Instant, phase: &str, human_phase: &str) { fn start_pageserver( launch_ts: &'static LaunchTimestamp, conf: &'static PageServerConf, + ignored: ignored_fields::Paths, + otel_guard: Option, ) -> anyhow::Result<()> { // Monotonic time for later calculating startup duration let started_startup_at = Instant::now(); @@ -312,7 +359,7 @@ fn start_pageserver( pageserver::metrics::tokio_epoll_uring::Collector::new(), )) .unwrap(); - pageserver::preinitialize_metrics(conf); + pageserver::preinitialize_metrics(conf, ignored); // If any failpoints were set from FAILPOINTS environment variable, // print them to the log for debugging purposes @@ -405,6 +452,24 @@ fn start_pageserver( info!("Using auth for http API: {:#?}", conf.http_auth_type); info!("Using auth for pg connections: {:#?}", conf.pg_auth_type); + let tls_server_config = if conf.listen_https_addr.is_some() || conf.enable_tls_page_service_api + { + let resolver = BACKGROUND_RUNTIME.block_on(ReloadingCertificateResolver::new( + "main", + &conf.ssl_key_file, + &conf.ssl_cert_file, + conf.ssl_cert_reload_period, + ))?; + + let server_config = rustls::ServerConfig::builder() + .with_no_client_auth() + .with_cert_resolver(resolver); + + Some(Arc::new(server_config)) + } else { + None + }; + match var("NEON_AUTH_TOKEN") { Ok(v) => { info!("Loaded JWT token for authentication with Safekeeper"); @@ -623,17 +688,11 @@ fn start_pageserver( let https_task = match https_listener { Some(https_listener) => { - let resolver = MGMT_REQUEST_RUNTIME.block_on(ReloadingCertificateResolver::new( - &conf.ssl_key_file, - &conf.ssl_cert_file, - conf.ssl_cert_reload_period, - ))?; + let tls_server_config = tls_server_config + .clone() + 
.expect("tls_server_config is set earlier if https is enabled"); - let server_config = rustls::ServerConfig::builder() - .with_no_client_auth() - .with_cert_resolver(resolver); - - let tls_acceptor = tokio_rustls::TlsAcceptor::from(Arc::new(server_config)); + let tls_acceptor = tokio_rustls::TlsAcceptor::from(tls_server_config); let server = http_utils::server::Server::new(service, https_listener, Some(tls_acceptor))?; @@ -675,45 +734,33 @@ fn start_pageserver( // Spawn a task to listen for libpq connections. It will spawn further tasks // for each connection. We created the listener earlier already. - let page_service = page_service::spawn(conf, tenant_manager.clone(), pg_auth, { - let _entered = COMPUTE_REQUEST_RUNTIME.enter(); // TcpListener::from_std requires it - pageserver_listener - .set_nonblocking(true) - .context("set listener to nonblocking")?; - tokio::net::TcpListener::from_std(pageserver_listener).context("create tokio listener")? - }); + let perf_trace_dispatch = otel_guard.as_ref().map(|g| g.dispatch.clone()); + let page_service = page_service::spawn( + conf, + tenant_manager.clone(), + pg_auth, + perf_trace_dispatch, + { + let _entered = COMPUTE_REQUEST_RUNTIME.enter(); // TcpListener::from_std requires it + pageserver_listener + .set_nonblocking(true) + .context("set listener to nonblocking")?; + tokio::net::TcpListener::from_std(pageserver_listener) + .context("create tokio listener")? + }, + if conf.enable_tls_page_service_api { + tls_server_config + } else { + None + }, + ); // All started up! Now just sit and wait for shutdown signal. BACKGROUND_RUNTIME.block_on(async move { let signal_token = CancellationToken::new(); let signal_cancel = signal_token.child_token(); - // Spawn signal handlers. Runs in a loop since we want to be responsive to multiple signals - // even after triggering shutdown (e.g. a SIGQUIT after a slow SIGTERM shutdown). See: - // https://github.com/neondatabase/neon/issues/9740. 
- tokio::spawn(async move { - let mut sigint = tokio::signal::unix::signal(SignalKind::interrupt()).unwrap(); - let mut sigterm = tokio::signal::unix::signal(SignalKind::terminate()).unwrap(); - let mut sigquit = tokio::signal::unix::signal(SignalKind::quit()).unwrap(); - - loop { - let signal = tokio::select! { - _ = sigquit.recv() => { - info!("Got signal SIGQUIT. Terminating in immediate shutdown mode."); - std::process::exit(111); - } - _ = sigint.recv() => "SIGINT", - _ = sigterm.recv() => "SIGTERM", - }; - - if !signal_token.is_cancelled() { - info!("Got signal {signal}. Terminating gracefully in fast shutdown mode."); - signal_token.cancel(); - } else { - info!("Got signal {signal}. Already shutting down."); - } - } - }); + tokio::spawn(utils::signals::signal_handler(signal_token)); // Wait for cancellation signal and shut down the pageserver. // diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs index c336f22f8e..26ae6af70e 100644 --- a/pageserver/src/config.rs +++ b/pageserver/src/config.rs @@ -4,6 +4,8 @@ //! file, or on the command line. //! See also `settings.md` for better description on every parameter. +pub mod ignored_fields; + use std::env; use std::num::NonZeroUsize; use std::sync::Arc; @@ -215,6 +217,13 @@ pub struct PageServerConf { /// When set, include visible layers in the next uploaded heatmaps of an unarchived timeline. pub generate_unarchival_heatmap: bool, + + pub tracing: Option, + + /// Enable TLS in page service API. + /// Does not force TLS: the client negotiates TLS usage during the handshake. + /// Uses key and certificate from ssl_key_file/ssl_cert_file. 
+ pub enable_tls_page_service_api: bool, } /// Token for authentication to safekeepers @@ -386,6 +395,8 @@ impl PageServerConf { validate_wal_contiguity, load_previous_heatmap, generate_unarchival_heatmap, + tracing, + enable_tls_page_service_api, } = config_toml; let mut conf = PageServerConf { @@ -435,6 +446,8 @@ impl PageServerConf { wal_receiver_protocol, page_service_pipelining, get_vectored_concurrent_io, + tracing, + enable_tls_page_service_api, // ------------------------------------------------------------ // fields that require additional validation or custom handling @@ -506,6 +519,17 @@ impl PageServerConf { ); } + if let Some(tracing_config) = conf.tracing.as_ref() { + let ratio = &tracing_config.sampling_ratio; + ensure!( + ratio.denominator != 0 && ratio.denominator >= ratio.numerator, + format!( + "Invalid sampling ratio: {}/{}", + ratio.numerator, ratio.denominator + ) + ); + } + IndexEntry::validate_checkpoint_distance(conf.default_tenant_conf.checkpoint_distance) .map_err(anyhow::Error::msg) .with_context(|| { @@ -545,7 +569,6 @@ impl PageServerConf { } #[derive(serde::Deserialize, serde::Serialize)] -#[serde(deny_unknown_fields)] pub struct PageserverIdentity { pub id: NodeId, } @@ -617,82 +640,4 @@ mod tests { PageServerConf::parse_and_validate(NodeId(0), config_toml, &workdir) .expect("parse_and_validate"); } - - /// If there's a typo in the pageserver config, we'd rather catch that typo - /// and fail pageserver startup than silently ignoring the typo, leaving whoever - /// made it in the believe that their config change is effective. - /// - /// The default in serde is to allow unknown fields, so, we rely - /// on developer+review discipline to add `deny_unknown_fields` when adding - /// new structs to the config, and these tests here as a regression test. - /// - /// The alternative to all of this would be to allow unknown fields in the config. 
- /// To catch them, we could have a config check tool or mgmt API endpoint that - /// compares the effective config with the TOML on disk and makes sure that - /// the on-disk TOML is a strict subset of the effective config. - mod unknown_fields_handling { - macro_rules! test { - ($short_name:ident, $input:expr) => { - #[test] - fn $short_name() { - let input = $input; - let err = toml_edit::de::from_str::(&input) - .expect_err("some_invalid_field is an invalid field"); - dbg!(&err); - assert!(err.to_string().contains("some_invalid_field")); - } - }; - } - use indoc::indoc; - - test!( - toplevel, - indoc! {r#" - some_invalid_field = 23 - "#} - ); - - test!( - toplevel_nested, - indoc! {r#" - [some_invalid_field] - foo = 23 - "#} - ); - - test!( - disk_usage_based_eviction, - indoc! {r#" - [disk_usage_based_eviction] - some_invalid_field = 23 - "#} - ); - - test!( - tenant_config, - indoc! {r#" - [tenant_config] - some_invalid_field = 23 - "#} - ); - - test!( - l0_flush, - indoc! {r#" - [l0_flush] - mode = "direct" - some_invalid_field = 23 - "#} - ); - - // TODO: fix this => https://github.com/neondatabase/neon/issues/8915 - // test!( - // remote_storage_config, - // indoc! {r#" - // [remote_storage_config] - // local_path = "/nonexistent" - // some_invalid_field = 23 - // "#} - // ); - } } diff --git a/pageserver/src/config/ignored_fields.rs b/pageserver/src/config/ignored_fields.rs new file mode 100644 index 0000000000..68d0823604 --- /dev/null +++ b/pageserver/src/config/ignored_fields.rs @@ -0,0 +1,179 @@ +//! Check for fields in the on-disk config file that were ignored when +//! deserializing [`pageserver_api::config::ConfigToml`]. +//! +//! This could have been part of the [`pageserver_api::config`] module, +//! but the way we identify unused fields in this module +//! is specific to the format (TOML) and the implementation of the +//! deserialization for that format ([`toml_edit`]). 
+ +use std::collections::HashSet; + +use itertools::Itertools; + +/// Pass in the user-specified config and the re-serialized [`pageserver_api::config::ConfigToml`]. +/// The returned [`Paths`] contains the paths to the fields that were ignored by deserialization +/// of the [`pageserver_api::config::ConfigToml`]. +pub fn find(user_specified: toml_edit::DocumentMut, reserialized: toml_edit::DocumentMut) -> Paths { + let user_specified = paths(user_specified); + let reserialized = paths(reserialized); + fn paths(doc: toml_edit::DocumentMut) -> HashSet { + let mut out = Vec::new(); + let mut visitor = PathsVisitor::new(&mut out); + visitor.visit_table_like(doc.as_table()); + HashSet::from_iter(out) + } + + let mut ignored = HashSet::new(); + + // O(n) because of HashSet + for path in user_specified { + if !reserialized.contains(&path) { + ignored.insert(path); + } + } + + Paths { + paths: ignored + .into_iter() + // sort lexicographically for deterministic output + .sorted() + .collect(), + } +} + +pub struct Paths { + pub paths: Vec, +} + +struct PathsVisitor<'a> { + stack: Vec, + out: &'a mut Vec, +} + +impl<'a> PathsVisitor<'a> { + fn new(out: &'a mut Vec) -> Self { + Self { + stack: Vec::new(), + out, + } + } + + fn visit_table_like(&mut self, table_like: &dyn toml_edit::TableLike) { + for (entry, item) in table_like.iter() { + self.stack.push(entry.to_string()); + self.visit_item(item); + self.stack.pop(); + } + } + + fn visit_item(&mut self, item: &toml_edit::Item) { + match item { + toml_edit::Item::None => (), + toml_edit::Item::Value(value) => self.visit_value(value), + toml_edit::Item::Table(table) => { + self.visit_table_like(table); + } + toml_edit::Item::ArrayOfTables(array_of_tables) => { + for (i, table) in array_of_tables.iter().enumerate() { + self.stack.push(format!("[{i}]")); + self.visit_table_like(table); + self.stack.pop(); + } + } + } + } + + fn visit_value(&mut self, value: &toml_edit::Value) { + match value { + toml_edit::Value::String(_) + | 
toml_edit::Value::Integer(_) + | toml_edit::Value::Float(_) + | toml_edit::Value::Boolean(_) + | toml_edit::Value::Datetime(_) => self.out.push(self.stack.join(".")), + toml_edit::Value::Array(array) => { + for (i, value) in array.iter().enumerate() { + self.stack.push(format!("[{i}]")); + self.visit_value(value); + self.stack.pop(); + } + } + toml_edit::Value::InlineTable(inline_table) => self.visit_table_like(inline_table), + } + } +} + +#[cfg(test)] +pub(crate) mod tests { + + fn test_impl(original: &str, parsed: &str, expect: [&str; 1]) { + let original: toml_edit::DocumentMut = original.parse().expect("parse original config"); + let parsed: toml_edit::DocumentMut = parsed.parse().expect("parse re-serialized config"); + + let super::Paths { paths: actual } = super::find(original, parsed); + assert_eq!(actual, &expect); + } + + #[test] + fn top_level() { + test_impl( + r#" + [a] + b = 1 + c = 2 + d = 3 + "#, + r#" + [a] + b = 1 + c = 2 + "#, + ["a.d"], + ); + } + + #[test] + fn nested() { + test_impl( + r#" + [a.b.c] + d = 23 + "#, + r#" + [a] + e = 42 + "#, + ["a.b.c.d"], + ); + } + + #[test] + fn array_of_tables() { + test_impl( + r#" + [[a]] + b = 1 + c = 2 + d = 3 + "#, + r#" + [[a]] + b = 1 + c = 2 + "#, + ["a.[0].d"], + ); + } + + #[test] + fn array() { + test_impl( + r#" + foo = [ {bar = 23} ] + "#, + r#" + foo = [ { blup = 42 }] + "#, + ["foo.[0].bar"], + ); + } +} diff --git a/pageserver/src/context.rs b/pageserver/src/context.rs index d2caf030df..04dcca4299 100644 --- a/pageserver/src/context.rs +++ b/pageserver/src/context.rs @@ -89,7 +89,7 @@ //! [`RequestContext`] argument. Functions in the middle of the call chain //! only need to pass it on. 
-use std::sync::Arc; +use std::{sync::Arc, time::Duration}; use once_cell::sync::Lazy; use tracing::warn; @@ -100,6 +100,12 @@ use crate::{ task_mgr::TaskKind, tenant::Timeline, }; +use futures::FutureExt; +use futures::future::BoxFuture; +use std::future::Future; +use tracing_utils::perf_span::{PerfInstrument, PerfSpan}; + +use tracing::{Dispatch, Span}; // The main structure of this module, see module-level comment. pub struct RequestContext { @@ -109,6 +115,8 @@ pub struct RequestContext { page_content_kind: PageContentKind, read_path_debug: bool, scope: Scope, + perf_span: Option, + perf_span_dispatch: Option, } #[derive(Clone)] @@ -263,22 +271,15 @@ impl RequestContextBuilder { page_content_kind: PageContentKind::Unknown, read_path_debug: false, scope: Scope::new_global(), + perf_span: None, + perf_span_dispatch: None, }, } } - pub fn extend(original: &RequestContext) -> Self { + pub fn from(original: &RequestContext) -> Self { Self { - // This is like a Copy, but avoid implementing Copy because ordinary users of - // RequestContext should always move or ref it. 
- inner: RequestContext { - task_kind: original.task_kind, - download_behavior: original.download_behavior, - access_stats_behavior: original.access_stats_behavior, - page_content_kind: original.page_content_kind, - read_path_debug: original.read_path_debug, - scope: original.scope.clone(), - }, + inner: original.clone(), } } @@ -316,12 +317,74 @@ impl RequestContextBuilder { self } - pub fn build(self) -> RequestContext { + pub(crate) fn perf_span_dispatch(mut self, dispatch: Option) -> Self { + self.inner.perf_span_dispatch = dispatch; + self + } + + pub fn root_perf_span(mut self, make_span: Fn) -> Self + where + Fn: FnOnce() -> Span, + { + assert!(self.inner.perf_span.is_none()); + assert!(self.inner.perf_span_dispatch.is_some()); + + let dispatcher = self.inner.perf_span_dispatch.as_ref().unwrap(); + let new_span = tracing::dispatcher::with_default(dispatcher, make_span); + + self.inner.perf_span = Some(PerfSpan::new(new_span, dispatcher.clone())); + + self + } + + pub fn perf_span(mut self, make_span: Fn) -> Self + where + Fn: FnOnce(&Span) -> Span, + { + if let Some(ref perf_span) = self.inner.perf_span { + assert!(self.inner.perf_span_dispatch.is_some()); + let dispatcher = self.inner.perf_span_dispatch.as_ref().unwrap(); + + let new_span = + tracing::dispatcher::with_default(dispatcher, || make_span(perf_span.inner())); + + self.inner.perf_span = Some(PerfSpan::new(new_span, dispatcher.clone())); + } + + self + } + + pub fn root(self) -> RequestContext { + self.inner + } + + pub fn attached_child(self) -> RequestContext { + self.inner + } + + pub fn detached_child(self) -> RequestContext { self.inner } } impl RequestContext { + /// Private clone implementation + /// + /// Callers should use the [`RequestContextBuilder`] or child spawning APIs of + /// [`RequestContext`]. 
+ fn clone(&self) -> Self { + Self { + task_kind: self.task_kind, + download_behavior: self.download_behavior, + access_stats_behavior: self.access_stats_behavior, + page_content_kind: self.page_content_kind, + read_path_debug: self.read_path_debug, + scope: self.scope.clone(), + perf_span: self.perf_span.clone(), + perf_span_dispatch: self.perf_span_dispatch.clone(), + } + } + /// Create a new RequestContext that has no parent. /// /// The function is called `new` because, once we add children @@ -337,7 +400,7 @@ impl RequestContext { pub fn new(task_kind: TaskKind, download_behavior: DownloadBehavior) -> Self { RequestContextBuilder::new(task_kind) .download_behavior(download_behavior) - .build() + .root() } /// Create a detached child context for a task that may outlive `self`. @@ -358,7 +421,10 @@ impl RequestContext { /// /// We could make new calls to this function fail if `self` is already canceled. pub fn detached_child(&self, task_kind: TaskKind, download_behavior: DownloadBehavior) -> Self { - self.child_impl(task_kind, download_behavior) + RequestContextBuilder::from(self) + .task_kind(task_kind) + .download_behavior(download_behavior) + .detached_child() } /// Create a child of context `self` for a task that shall not outlive `self`. @@ -382,7 +448,7 @@ impl RequestContext { /// The method to wait for child tasks would return an error, indicating /// that the child task was not started because the context was canceled. 
pub fn attached_child(&self) -> Self { - self.child_impl(self.task_kind(), self.download_behavior()) + RequestContextBuilder::from(self).attached_child() } /// Use this function when you should be creating a child context using @@ -397,17 +463,10 @@ impl RequestContext { Self::new(task_kind, download_behavior) } - fn child_impl(&self, task_kind: TaskKind, download_behavior: DownloadBehavior) -> Self { - RequestContextBuilder::extend(self) - .task_kind(task_kind) - .download_behavior(download_behavior) - .build() - } - pub fn with_scope_timeline(&self, timeline: &Arc) -> Self { - RequestContextBuilder::extend(self) + RequestContextBuilder::from(self) .scope(Scope::new_timeline(timeline)) - .build() + .attached_child() } pub(crate) fn with_scope_page_service_pagestream( @@ -416,9 +475,9 @@ impl RequestContext { crate::page_service::TenantManagerTypes, >, ) -> Self { - RequestContextBuilder::extend(self) + RequestContextBuilder::from(self) .scope(Scope::new_page_service_pagestream(timeline_handle)) - .build() + .attached_child() } pub fn with_scope_secondary_timeline( @@ -426,28 +485,30 @@ impl RequestContext { tenant_shard_id: &TenantShardId, timeline_id: &TimelineId, ) -> Self { - RequestContextBuilder::extend(self) + RequestContextBuilder::from(self) .scope(Scope::new_secondary_timeline(tenant_shard_id, timeline_id)) - .build() + .attached_child() } pub fn with_scope_secondary_tenant(&self, tenant_shard_id: &TenantShardId) -> Self { - RequestContextBuilder::extend(self) + RequestContextBuilder::from(self) .scope(Scope::new_secondary_tenant(tenant_shard_id)) - .build() + .attached_child() } #[cfg(test)] pub fn with_scope_unit_test(&self) -> Self { - RequestContextBuilder::new(TaskKind::UnitTest) + RequestContextBuilder::from(self) + .task_kind(TaskKind::UnitTest) .scope(Scope::new_unit_test()) - .build() + .attached_child() } pub fn with_scope_debug_tools(&self) -> Self { - RequestContextBuilder::new(TaskKind::DebugTool) + RequestContextBuilder::from(self) + 
.task_kind(TaskKind::DebugTool) .scope(Scope::new_debug_tools()) - .build() + .attached_child() } pub fn task_kind(&self) -> TaskKind { @@ -504,4 +565,76 @@ impl RequestContext { Scope::DebugTools { io_size_metrics } => io_size_metrics, } } + + pub(crate) fn ondemand_download_wait_observe(&self, duration: Duration) { + if duration == Duration::ZERO { + return; + } + + match &self.scope { + Scope::Timeline { arc_arc } => arc_arc + .wait_ondemand_download_time + .observe(self.task_kind, duration), + _ => { + use once_cell::sync::Lazy; + use std::sync::Mutex; + use std::time::Duration; + use utils::rate_limit::RateLimit; + static LIMIT: Lazy> = + Lazy::new(|| Mutex::new(RateLimit::new(Duration::from_secs(1)))); + let mut guard = LIMIT.lock().unwrap(); + guard.call2(|rate_limit_stats| { + warn!( + %rate_limit_stats, + backtrace=%std::backtrace::Backtrace::force_capture(), + "ondemand downloads should always happen within timeline scope", + ); + }); + } + } + } + + pub(crate) fn perf_follows_from(&self, from: &RequestContext) { + if let (Some(span), Some(from_span)) = (&self.perf_span, &from.perf_span) { + span.inner().follows_from(from_span.inner()); + } + } + + pub(crate) fn has_perf_span(&self) -> bool { + self.perf_span.is_some() + } } + +/// [`Future`] extension trait that allows for creating performance +/// spans on sampled requests +pub(crate) trait PerfInstrumentFutureExt<'a>: Future + Send { + /// Instrument this future with a new performance span when the + /// provided request context indicates the originator request + /// was sampled. Otherwise, just box the future and return it as is. 
+ fn maybe_perf_instrument( + self, + ctx: &RequestContext, + make_span: Fn, + ) -> BoxFuture<'a, Self::Output> + where + Self: Sized + 'a, + Fn: FnOnce(&Span) -> Span, + { + match &ctx.perf_span { + Some(perf_span) => { + assert!(ctx.perf_span_dispatch.is_some()); + let dispatcher = ctx.perf_span_dispatch.as_ref().unwrap(); + + let new_span = + tracing::dispatcher::with_default(dispatcher, || make_span(perf_span.inner())); + + let new_perf_span = PerfSpan::new(new_span, dispatcher.clone()); + self.instrument(new_perf_span).boxed() + } + None => self.boxed(), + } + } +} + +// Implement the trait for all types that satisfy the trait bounds +impl<'a, T: Future + Send + 'a> PerfInstrumentFutureExt<'a> for T {} diff --git a/pageserver/src/http/openapi_spec.yml b/pageserver/src/http/openapi_spec.yml index 8b839b454a..7ea148971f 100644 --- a/pageserver/src/http/openapi_spec.yml +++ b/pageserver/src/http/openapi_spec.yml @@ -212,6 +212,12 @@ paths: schema: type: string format: date-time + "412": + description: No timestamp is found for given LSN, e.g. 
if there had been no commits till LSN + content: + application/json: + schema: + $ref: "#/components/schemas/PreconditionFailedError" /v1/tenant/{tenant_id}/timeline/{timeline_id}/get_lsn_by_timestamp: parameters: @@ -1133,6 +1139,40 @@ components: applied_gc_cutoff_lsn: type: string format: hex + safekeepers: + $ref: "#/components/schemas/TimelineSafekeepersInfo" + + TimelineSafekeepersInfo: + type: object + required: + - tenant_id + - timeline_id + - generation + - safekeepers + properties: + tenant_id: + type: string + format: hex + timeline_id: + type: string + format: hex + generation: + type: integer + safekeepers: + type: array + items: + $ref: "#/components/schemas/TimelineSafekeeperInfo" + + TimelineSafekeeperInfo: + type: object + required: + - id + - hostname + properties: + id: + type: integer + hostname: + type: string SyntheticSizeResponse: type: object diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index 5a13fb1387..bbc4bfae1b 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -67,15 +67,15 @@ use crate::tenant::mgr::{ }; use crate::tenant::remote_timeline_client::index::GcCompactionState; use crate::tenant::remote_timeline_client::{ - download_index_part, list_remote_tenant_shards, list_remote_timelines, + download_index_part, download_tenant_manifest, list_remote_tenant_shards, list_remote_timelines, }; use crate::tenant::secondary::SecondaryController; use crate::tenant::size::ModelInputs; use crate::tenant::storage_layer::{IoConcurrency, LayerAccessStatsReset, LayerName}; use crate::tenant::timeline::offload::{OffloadError, offload_timeline}; use crate::tenant::timeline::{ - CompactFlags, CompactOptions, CompactRequest, CompactionError, Timeline, WaitLsnTimeout, - WaitLsnWaiter, import_pgdata, + CompactFlags, CompactOptions, CompactRequest, CompactionError, MarkInvisibleRequest, Timeline, + WaitLsnTimeout, WaitLsnWaiter, import_pgdata, }; use crate::tenant::{ GetTimelineError, 
LogicalSizeCalculationCause, OffloadedTimeline, PageReconstructError, @@ -445,6 +445,9 @@ async fn build_timeline_info_common( let (pitr_history_size, within_ancestor_pitr) = timeline.get_pitr_history_stats(); + // Externally, expose the lowest LSN that can be used to create a branch. + // Internally we distinguish between the planned GC cutoff (PITR point) and the "applied" GC cutoff (where we + // actually trimmed data to), which can pass each other when PITR is changed. let min_readable_lsn = std::cmp::max( timeline.get_gc_cutoff_lsn(), *timeline.get_applied_gc_cutoff_lsn(), @@ -461,7 +464,6 @@ async fn build_timeline_info_common( initdb_lsn, last_record_lsn, prev_record_lsn: Some(timeline.get_prev_record_lsn()), - _unused: Default::default(), // Unused, for legacy decode only min_readable_lsn, applied_gc_cutoff_lsn: *timeline.get_applied_gc_cutoff_lsn(), current_logical_size: current_logical_size.size_dont_care_about_accuracy(), @@ -987,7 +989,7 @@ async fn get_lsn_by_timestamp_handler( if !tenant_shard_id.is_shard_zero() { // Requires SLRU contents, which are only stored on shard zero return Err(ApiError::BadRequest(anyhow!( - "Size calculations are only available on shard zero" + "Lsn calculations by timestamp are only available on shard zero" ))); } @@ -1062,7 +1064,7 @@ async fn get_timestamp_of_lsn_handler( if !tenant_shard_id.is_shard_zero() { // Requires SLRU contents, which are only stored on shard zero return Err(ApiError::BadRequest(anyhow!( - "Size calculations are only available on shard zero" + "Timestamp calculations by lsn are only available on shard zero" ))); } @@ -1088,8 +1090,8 @@ async fn get_timestamp_of_lsn_handler( .to_string(); json_response(StatusCode::OK, time) } - None => Err(ApiError::NotFound( - anyhow::anyhow!("Timestamp for lsn {} not found", lsn).into(), + None => Err(ApiError::PreconditionFailed( + format!("Timestamp for lsn {} not found", lsn).into(), )), } } @@ -2256,7 +2258,6 @@ async fn timeline_compact_handler( let state = 
get_state(&request); let mut flags = EnumSet::empty(); - flags |= CompactFlags::NoYield; // run compaction to completion if Some(true) == parse_query_param::<_, bool>(&request, "force_l0_compaction")? { flags |= CompactFlags::ForceL0Compaction; @@ -2273,6 +2274,7 @@ async fn timeline_compact_handler( if Some(true) == parse_query_param::<_, bool>(&request, "dry_run")? { flags |= CompactFlags::DryRun; } + // Manual compaction does not yield for L0. let wait_until_uploaded = parse_query_param::<_, bool>(&request, "wait_until_uploaded")?.unwrap_or(false); @@ -2336,21 +2338,31 @@ async fn timeline_compact_handler( } async fn timeline_mark_invisible_handler( - request: Request, + mut request: Request, _cancel: CancellationToken, ) -> Result, ApiError> { let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?; let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?; check_permission(&request, Some(tenant_shard_id.tenant_id))?; + let compact_request = json_request_maybe::>(&mut request).await?; + let state = get_state(&request); + let visibility = match compact_request { + Some(req) => match req.is_visible { + Some(true) => TimelineVisibilityState::Visible, + Some(false) | None => TimelineVisibilityState::Invisible, + }, + None => TimelineVisibilityState::Invisible, + }; + async { let tenant = state .tenant_manager .get_attached_tenant_shard(tenant_shard_id)?; let timeline = tenant.get_timeline(timeline_id, true)?; - timeline.remote_client.schedule_index_upload_for_timeline_invisible_state(TimelineVisibilityState::Invisible).map_err(ApiError::InternalServerError)?; + timeline.remote_client.schedule_index_upload_for_timeline_invisible_state(visibility).map_err(ApiError::InternalServerError)?; json_response(StatusCode::OK, ()) } .instrument(info_span!("manual_timeline_mark_invisible", tenant_id = %tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug(), %timeline_id)) @@ -2417,7 +2429,6 @@ async fn 
timeline_checkpoint_handler( let state = get_state(&request); let mut flags = EnumSet::empty(); - flags |= CompactFlags::NoYield; // run compaction to completion if Some(true) == parse_query_param::<_, bool>(&request, "force_l0_compaction")? { flags |= CompactFlags::ForceL0Compaction; } @@ -2687,11 +2698,12 @@ async fn getpage_at_lsn_handler_inner( let lsn: Option = parse_query_param(&request, "lsn")?; async { - let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download); - // Enable read path debugging let timeline = active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id).await?; - let ctx = RequestContextBuilder::extend(&ctx).read_path_debug(true) - .scope(context::Scope::new_timeline(&timeline)).build(); + let ctx = RequestContextBuilder::new(TaskKind::MgmtRequest) + .download_behavior(DownloadBehavior::Download) + .scope(context::Scope::new_timeline(&timeline)) + .read_path_debug(true) + .root(); // Use last_record_lsn if no lsn is provided let lsn = lsn.unwrap_or_else(|| timeline.get_last_record_lsn()); @@ -2900,9 +2912,22 @@ async fn tenant_scan_remote_handler( }; } + let result = + download_tenant_manifest(&state.remote_storage, &tenant_shard_id, generation, &cancel) + .instrument(info_span!("download_tenant_manifest", + tenant_id=%tenant_shard_id.tenant_id, + shard_id=%tenant_shard_id.shard_slug())) + .await; + let stripe_size = match result { + Ok((manifest, _, _)) => manifest.stripe_size, + Err(DownloadError::NotFound) => None, + Err(err) => return Err(ApiError::InternalServerError(anyhow!(err))), + }; + response.shards.push(TenantScanRemoteStorageShard { tenant_shard_id, generation: generation.into(), + stripe_size, }); } @@ -3178,7 +3203,8 @@ async fn list_aux_files( timeline.gate.enter().map_err(|_| ApiError::Cancelled)?, ); - let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download); + let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download) + 
.with_scope_timeline(&timeline); let files = timeline .list_aux_files(body.lsn, &ctx, io_concurrency) .await?; @@ -3227,7 +3253,7 @@ async fn ingest_aux_files( modification .put_file(&fname, content.as_bytes(), &ctx) .await - .map_err(ApiError::InternalServerError)?; + .map_err(|e| ApiError::InternalServerError(e.into()))?; } modification .commit(&ctx) @@ -3356,11 +3382,11 @@ async fn put_tenant_timeline_import_basebackup( let broker_client = state.broker_client.clone(); - let mut body = StreamReader::new(request.into_body().map(|res| { - res.map_err(|error| { - std::io::Error::new(std::io::ErrorKind::Other, anyhow::anyhow!(error)) - }) - })); + let mut body = StreamReader::new( + request + .into_body() + .map(|res| res.map_err(|error| std::io::Error::other(anyhow::anyhow!(error)))), + ); tenant.wait_to_become_active(ACTIVE_TENANT_TIMEOUT).await?; @@ -3422,18 +3448,19 @@ async fn put_tenant_timeline_import_wal( check_permission(&request, Some(tenant_id))?; - let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Warn); - let span = info_span!("import_wal", tenant_id=%tenant_id, timeline_id=%timeline_id, start_lsn=%start_lsn, end_lsn=%end_lsn); async move { let state = get_state(&request); let timeline = active_timeline_of_active_tenant(&state.tenant_manager, TenantShardId::unsharded(tenant_id), timeline_id).await?; - let ctx = RequestContextBuilder::extend(&ctx).scope(context::Scope::new_timeline(&timeline)).build(); + let ctx = RequestContextBuilder::new(TaskKind::MgmtRequest) + .download_behavior(DownloadBehavior::Warn) + .scope(context::Scope::new_timeline(&timeline)) + .root(); let mut body = StreamReader::new(request.into_body().map(|res| { res.map_err(|error| { - std::io::Error::new(std::io::ErrorKind::Other, anyhow::anyhow!(error)) + std::io::Error::other( anyhow::anyhow!(error)) }) })); @@ -3776,7 +3803,7 @@ pub fn make_router( ) .put( "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/mark_invisible", - |r| testing_api_handler("mark 
timeline invisible", r, timeline_mark_invisible_handler), + |r| api_handler( r, timeline_mark_invisible_handler), ) .put( "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/checkpoint", diff --git a/pageserver/src/import_datadir.rs b/pageserver/src/import_datadir.rs index 6dd005de50..911449c7c5 100644 --- a/pageserver/src/import_datadir.rs +++ b/pageserver/src/import_datadir.rs @@ -27,7 +27,7 @@ use crate::context::RequestContext; use crate::metrics::WAL_INGEST; use crate::pgdatadir_mapping::*; use crate::tenant::Timeline; -use crate::walingest::WalIngest; +use crate::walingest::{WalIngest, WalIngestErrorKind}; // Returns checkpoint LSN from controlfile pub fn get_lsn_from_controlfile(path: &Utf8Path) -> Result { @@ -157,9 +157,9 @@ async fn import_rel( .put_rel_creation(rel, nblocks as u32, ctx) .await { - match e { - RelationError::AlreadyExists => { - debug!("Relation {} already exist. We must be extending it.", rel) + match e.kind { + WalIngestErrorKind::RelationAlreadyExists(rel) => { + debug!("Relation {rel} already exists. We must be extending it.") } _ => return Err(e.into()), } diff --git a/pageserver/src/lib.rs b/pageserver/src/lib.rs index 8373d0bd87..bda218444d 100644 --- a/pageserver/src/lib.rs +++ b/pageserver/src/lib.rs @@ -55,6 +55,9 @@ pub const DEFAULT_PG_VERSION: u32 = 16; pub const IMAGE_FILE_MAGIC: u16 = 0x5A60; pub const DELTA_FILE_MAGIC: u16 = 0x5A61; +// Target used for performance traces. 
+pub const PERF_TRACE_TARGET: &str = "P"; + static ZERO_PAGE: bytes::Bytes = bytes::Bytes::from_static(&[0u8; 8192]); pub use crate::metrics::preinitialize_metrics; diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs index 9820d50e7b..2a779b0daa 100644 --- a/pageserver/src/metrics.rs +++ b/pageserver/src/metrics.rs @@ -1,10 +1,8 @@ use std::collections::HashMap; use std::num::NonZeroUsize; use std::os::fd::RawFd; -use std::pin::Pin; use std::sync::atomic::AtomicU64; use std::sync::{Arc, Mutex}; -use std::task::{Context, Poll}; use std::time::{Duration, Instant}; use enum_map::{Enum as _, EnumMap}; @@ -19,17 +17,17 @@ use metrics::{ use once_cell::sync::Lazy; use pageserver_api::config::{ PageServicePipeliningConfig, PageServicePipeliningConfigPipelined, - PageServiceProtocolPipelinedExecutionStrategy, + PageServiceProtocolPipelinedBatchingStrategy, PageServiceProtocolPipelinedExecutionStrategy, }; use pageserver_api::models::InMemoryLayerInfo; use pageserver_api::shard::TenantShardId; -use pin_project_lite::pin_project; use postgres_backend::{QueryError, is_expected_io_error}; use pq_proto::framed::ConnectionError; use strum::{EnumCount, IntoEnumIterator as _, VariantNames}; use strum_macros::{IntoStaticStr, VariantNames}; use utils::id::TimelineId; +use crate::config; use crate::config::PageServerConf; use crate::context::{PageContentKind, RequestContext}; use crate::pgdatadir_mapping::DatadirModificationStats; @@ -499,6 +497,100 @@ pub(crate) static WAIT_LSN_IN_PROGRESS_GLOBAL_MICROS: Lazy = Lazy::n .expect("failed to define a metric") }); +pub(crate) mod wait_ondemand_download_time { + use super::*; + const WAIT_ONDEMAND_DOWNLOAD_TIME_BUCKETS: &[f64] = &[ + 0.01, 0.02, 0.03, 0.04, 0.05, 0.06, 0.07, 0.08, 0.09, // 10 ms - 100ms + 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, // 100ms to 1s + 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, // 1s to 10s + 10.0, 20.0, 30.0, 40.0, 50.0, 60.0, // 10s to 1m + ]; + + /// The task kinds for which we want to 
track wait times for on-demand downloads. + /// Other task kinds' wait times are accumulated in label value `unknown`. + pub(crate) const WAIT_ONDEMAND_DOWNLOAD_METRIC_TASK_KINDS: [TaskKind; 2] = [ + TaskKind::PageRequestHandler, + TaskKind::WalReceiverConnectionHandler, + ]; + + pub(crate) static WAIT_ONDEMAND_DOWNLOAD_TIME_GLOBAL: Lazy> = Lazy::new(|| { + let histo = register_histogram_vec!( + "pageserver_wait_ondemand_download_seconds_global", + "Observations are individual tasks' wait times for on-demand downloads. \ + If N tasks coalesce on an on-demand download, and it takes 10s, than we observe N * 10s.", + &["task_kind"], + WAIT_ONDEMAND_DOWNLOAD_TIME_BUCKETS.into(), + ) + .expect("failed to define a metric"); + WAIT_ONDEMAND_DOWNLOAD_METRIC_TASK_KINDS + .iter() + .map(|task_kind| histo.with_label_values(&[task_kind.into()])) + .collect::>() + }); + + pub(crate) static WAIT_ONDEMAND_DOWNLOAD_TIME_SUM: Lazy = Lazy::new(|| { + register_counter_vec!( + // use a name that _could_ be evolved into a per-timeline histogram later + "pageserver_wait_ondemand_download_seconds_sum", + "Like `pageserver_wait_ondemand_download_seconds_global` but per timeline", + &["tenant_id", "shard_id", "timeline_id", "task_kind"], + ) + .unwrap() + }); + + pub struct WaitOndemandDownloadTimeSum { + counters: [Counter; WAIT_ONDEMAND_DOWNLOAD_METRIC_TASK_KINDS.len()], + } + + impl WaitOndemandDownloadTimeSum { + pub(crate) fn new(tenant_id: &str, shard_id: &str, timeline_id: &str) -> Self { + let counters = WAIT_ONDEMAND_DOWNLOAD_METRIC_TASK_KINDS + .iter() + .map(|task_kind| { + WAIT_ONDEMAND_DOWNLOAD_TIME_SUM + .get_metric_with_label_values(&[ + tenant_id, + shard_id, + timeline_id, + task_kind.into(), + ]) + .unwrap() + }) + .collect::>(); + Self { + counters: counters.try_into().unwrap(), + } + } + pub(crate) fn observe(&self, task_kind: TaskKind, duration: Duration) { + let maybe = WAIT_ONDEMAND_DOWNLOAD_METRIC_TASK_KINDS + .iter() + .enumerate() + .find(|(_, kind)| **kind == 
task_kind); + let Some((idx, _)) = maybe else { + return; + }; + WAIT_ONDEMAND_DOWNLOAD_TIME_GLOBAL[idx].observe(duration.as_secs_f64()); + let counter = &self.counters[idx]; + counter.inc_by(duration.as_secs_f64()); + } + } + + pub(crate) fn shutdown_timeline(tenant_id: &str, shard_id: &str, timeline_id: &str) { + for task_kind in WAIT_ONDEMAND_DOWNLOAD_METRIC_TASK_KINDS { + let _ = WAIT_ONDEMAND_DOWNLOAD_TIME_SUM.remove_label_values(&[ + tenant_id, + shard_id, + timeline_id, + task_kind.into(), + ]); + } + } + + pub(crate) fn preinitialize_global_metrics() { + Lazy::force(&WAIT_ONDEMAND_DOWNLOAD_TIME_GLOBAL); + } +} + static LAST_RECORD_LSN: Lazy = Lazy::new(|| { register_int_gauge_vec!( "pageserver_last_record_lsn", @@ -1248,13 +1340,13 @@ pub(crate) static STORAGE_IO_TIME_METRIC: Lazy = Lazy::new(Storag #[derive(Clone, Copy)] #[repr(usize)] -enum StorageIoSizeOperation { +pub(crate) enum StorageIoSizeOperation { Read, Write, } impl StorageIoSizeOperation { - const VARIANTS: &'static [&'static str] = &["read", "write"]; + pub(crate) const VARIANTS: &'static [&'static str] = &["read", "write"]; fn as_str(&self) -> &'static str { Self::VARIANTS[*self as usize] @@ -1262,7 +1354,7 @@ impl StorageIoSizeOperation { } // Needed for the https://neonprod.grafana.net/d/5uK9tHL4k/picking-tenant-for-relocation?orgId=1 -static STORAGE_IO_SIZE: Lazy = Lazy::new(|| { +pub(crate) static STORAGE_IO_SIZE: Lazy = Lazy::new(|| { register_uint_gauge_vec!( "pageserver_io_operations_bytes_total", "Total amount of bytes read/written in IO operations", @@ -1622,6 +1714,28 @@ pub enum SmgrQueryType { Test, } +#[derive( + Debug, + Clone, + Copy, + IntoStaticStr, + strum_macros::EnumCount, + strum_macros::EnumIter, + strum_macros::FromRepr, + enum_map::Enum, +)] +#[strum(serialize_all = "snake_case")] +pub enum GetPageBatchBreakReason { + BatchFull, + NonBatchableRequest, + NonUniformLsn, + SamePageAtDifferentLsn, + NonUniformTimeline, + ExecutorSteal, + #[cfg(feature = "testing")] + 
NonUniformKey, +} + pub(crate) struct SmgrQueryTimePerTimeline { global_started: [IntCounter; SmgrQueryType::COUNT], global_latency: [Histogram; SmgrQueryType::COUNT], @@ -1633,6 +1747,8 @@ pub(crate) struct SmgrQueryTimePerTimeline { per_timeline_flush_in_progress_micros: IntCounter, global_batch_wait_time: Histogram, per_timeline_batch_wait_time: Histogram, + global_batch_break_reason: [IntCounter; GetPageBatchBreakReason::COUNT], + per_timeline_batch_break_reason: GetPageBatchBreakReasonTimelineMetrics, throttling: Arc, } @@ -1766,12 +1882,55 @@ static PAGE_SERVICE_BATCH_SIZE_PER_TENANT_TIMELINE: Lazy = Lazy::n .expect("failed to define a metric") }); +static PAGE_SERVICE_BATCH_BREAK_REASON_GLOBAL: Lazy = Lazy::new(|| { + register_int_counter_vec!( + // it's a counter, but, name is prepared to extend it to a histogram of queue depth + "pageserver_page_service_batch_break_reason_global", + "Reason for breaking batches of get page requests", + &["reason"], + ) + .expect("failed to define a metric") +}); + +struct GetPageBatchBreakReasonTimelineMetrics { + map: EnumMap, +} + +impl GetPageBatchBreakReasonTimelineMetrics { + fn new(tenant_id: &str, shard_slug: &str, timeline_id: &str) -> Self { + GetPageBatchBreakReasonTimelineMetrics { + map: EnumMap::from_array(std::array::from_fn(|reason_idx| { + let reason = GetPageBatchBreakReason::from_usize(reason_idx); + PAGE_SERVICE_BATCH_BREAK_REASON_PER_TENANT_TIMELINE.with_label_values(&[ + tenant_id, + shard_slug, + timeline_id, + reason.into(), + ]) + })), + } + } + + fn inc(&self, reason: GetPageBatchBreakReason) { + self.map[reason].inc() + } +} + +static PAGE_SERVICE_BATCH_BREAK_REASON_PER_TENANT_TIMELINE: Lazy = Lazy::new(|| { + register_int_counter_vec!( + "pageserver_page_service_batch_break_reason", + "Reason for breaking batches of get page requests", + &["tenant_id", "shard_id", "timeline_id", "reason"], + ) + .expect("failed to define a metric") +}); + pub(crate) static PAGE_SERVICE_CONFIG_MAX_BATCH_SIZE: Lazy 
= Lazy::new(|| { register_int_gauge_vec!( "pageserver_page_service_config_max_batch_size", "Configured maximum batch size for the server-side batching functionality of page_service. \ Labels expose more of the configuration parameters.", - &["mode", "execution"] + &["mode", "execution", "batching"] ) .expect("failed to define a metric") }); @@ -1779,10 +1938,11 @@ pub(crate) static PAGE_SERVICE_CONFIG_MAX_BATCH_SIZE: Lazy = Lazy:: fn set_page_service_config_max_batch_size(conf: &PageServicePipeliningConfig) { PAGE_SERVICE_CONFIG_MAX_BATCH_SIZE.reset(); let (label_values, value) = match conf { - PageServicePipeliningConfig::Serial => (["serial", "-"], 1), + PageServicePipeliningConfig::Serial => (["serial", "-", "-"], 1), PageServicePipeliningConfig::Pipelined(PageServicePipeliningConfigPipelined { max_batch_size, execution, + batching, }) => { let mode = "pipelined"; let execution = match execution { @@ -1791,7 +1951,12 @@ fn set_page_service_config_max_batch_size(conf: &PageServicePipeliningConfig) { } PageServiceProtocolPipelinedExecutionStrategy::Tasks => "tasks", }; - ([mode, execution], max_batch_size.get()) + let batching = match batching { + PageServiceProtocolPipelinedBatchingStrategy::UniformLsn => "uniform-lsn", + PageServiceProtocolPipelinedBatchingStrategy::ScatteredLsn => "scattered-lsn", + }; + + ([mode, execution, batching], max_batch_size.get()) } }; PAGE_SERVICE_CONFIG_MAX_BATCH_SIZE @@ -1887,6 +2052,15 @@ impl SmgrQueryTimePerTimeline { .get_metric_with_label_values(&[&tenant_id, &shard_slug, &timeline_id]) .unwrap(); + let global_batch_break_reason = std::array::from_fn(|i| { + let reason = GetPageBatchBreakReason::from_usize(i); + PAGE_SERVICE_BATCH_BREAK_REASON_GLOBAL + .get_metric_with_label_values(&[reason.into()]) + .unwrap() + }); + let per_timeline_batch_break_reason = + GetPageBatchBreakReasonTimelineMetrics::new(&tenant_id, &shard_slug, &timeline_id); + let global_flush_in_progress_micros = 
PAGE_SERVICE_SMGR_FLUSH_INPROGRESS_MICROS_GLOBAL.clone(); let per_timeline_flush_in_progress_micros = PAGE_SERVICE_SMGR_FLUSH_INPROGRESS_MICROS @@ -1904,6 +2078,8 @@ impl SmgrQueryTimePerTimeline { per_timeline_flush_in_progress_micros, global_batch_wait_time, per_timeline_batch_wait_time, + global_batch_break_reason, + per_timeline_batch_break_reason, throttling: pagestream_throttle_metrics, } } @@ -1932,9 +2108,16 @@ impl SmgrQueryTimePerTimeline { } /// TODO: do something about this? seems odd, we have a similar call on SmgrOpTimer - pub(crate) fn observe_getpage_batch_start(&self, batch_size: usize) { + pub(crate) fn observe_getpage_batch_start( + &self, + batch_size: usize, + break_reason: GetPageBatchBreakReason, + ) { self.global_batch_size.observe(batch_size as f64); self.per_timeline_batch_size.observe(batch_size as f64); + + self.global_batch_break_reason[break_reason.into_usize()].inc(); + self.per_timeline_batch_break_reason.inc(break_reason); } } @@ -2314,13 +2497,18 @@ impl RemoteOpFileKind { } } -pub(crate) static REMOTE_OPERATION_TIME: Lazy = Lazy::new(|| { +pub(crate) static REMOTE_TIMELINE_CLIENT_COMPLETION_LATENCY: Lazy = Lazy::new(|| { register_histogram_vec!( - "pageserver_remote_operation_seconds", - "Time spent on remote storage operations. \ - Grouped by tenant, timeline, operation_kind and status. \ + "pageserver_remote_timeline_client_seconds_global", + "Time spent on remote timeline client operations. \ + Grouped by task_kind, file_kind, operation_kind and status. \ + The task_kind is \ + - for layer downloads, populated from RequestContext (primary objective of having the label) \ + - for index downloads, set to 'unknown' \ + - for any upload operation, set to 'RemoteUploadTask' \ + This keeps dimensionality at bay. 
\ Does not account for time spent waiting in remote timeline client's queues.", - &["file_kind", "op_kind", "status"] + &["task_kind", "file_kind", "op_kind", "status"] ) .expect("failed to define a metric") }); @@ -2882,6 +3070,7 @@ pub(crate) struct TimelineMetrics { pub storage_io_size: StorageIoSizeMetrics, pub wait_lsn_in_progress_micros: GlobalAndPerTenantIntCounter, pub wait_lsn_start_finish_counterpair: IntCounterPair, + pub wait_ondemand_download_time: wait_ondemand_download_time::WaitOndemandDownloadTimeSum, shutdown: std::sync::atomic::AtomicBool, } @@ -3027,6 +3216,13 @@ impl TimelineMetrics { .get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id]) .unwrap(); + let wait_ondemand_download_time = + wait_ondemand_download_time::WaitOndemandDownloadTimeSum::new( + &tenant_id, + &shard_id, + &timeline_id, + ); + TimelineMetrics { tenant_id, shard_id, @@ -3060,6 +3256,7 @@ impl TimelineMetrics { wal_records_received, wait_lsn_in_progress_micros, wait_lsn_start_finish_counterpair, + wait_ondemand_download_time, shutdown: std::sync::atomic::AtomicBool::default(), } } @@ -3252,6 +3449,8 @@ impl TimelineMetrics { .remove_label_values(&mut res, &[tenant_id, shard_id, timeline_id]); } + wait_ondemand_download_time::shutdown_timeline(tenant_id, shard_id, timeline_id); + let _ = SMGR_QUERY_STARTED_PER_TENANT_TIMELINE.remove_label_values(&[ SmgrQueryType::GetPageAtLsn.into(), tenant_id, @@ -3284,6 +3483,15 @@ impl TimelineMetrics { shard_id, timeline_id, ]); + + for reason in GetPageBatchBreakReason::iter() { + let _ = PAGE_SERVICE_BATCH_BREAK_REASON_PER_TENANT_TIMELINE.remove_label_values(&[ + tenant_id, + shard_id, + timeline_id, + reason.into(), + ]); + } } } @@ -3373,13 +3581,18 @@ impl RemoteTimelineClientMetrics { pub fn remote_operation_time( &self, + task_kind: Option, file_kind: &RemoteOpFileKind, op_kind: &RemoteOpKind, status: &'static str, ) -> Histogram { - let key = (file_kind.as_str(), op_kind.as_str(), status); - REMOTE_OPERATION_TIME - 
.get_metric_with_label_values(&[key.0, key.1, key.2]) + REMOTE_TIMELINE_CLIENT_COMPLETION_LATENCY + .get_metric_with_label_values(&[ + task_kind.as_ref().map(|tk| tk.into()).unwrap_or("unknown"), + file_kind.as_str(), + op_kind.as_str(), + status, + ]) .unwrap() } @@ -3624,54 +3837,26 @@ impl Drop for RemoteTimelineClientMetrics { /// Wrapper future that measures the time spent by a remote storage operation, /// and records the time and success/failure as a prometheus metric. -pub(crate) trait MeasureRemoteOp: Sized { - fn measure_remote_op( +pub(crate) trait MeasureRemoteOp: Sized + Future> { + async fn measure_remote_op( self, + task_kind: Option, // not all caller contexts have a RequestContext / TaskKind handy file_kind: RemoteOpFileKind, op: RemoteOpKind, metrics: Arc, - ) -> MeasuredRemoteOp { + ) -> Result { let start = Instant::now(); - MeasuredRemoteOp { - inner: self, - file_kind, - op, - start, - metrics, - } + let res = self.await; + let duration = start.elapsed(); + let status = if res.is_ok() { &"success" } else { &"failure" }; + metrics + .remote_operation_time(task_kind, &file_kind, &op, status) + .observe(duration.as_secs_f64()); + res } } -impl MeasureRemoteOp for T {} - -pin_project! 
{ - pub(crate) struct MeasuredRemoteOp - { - #[pin] - inner: F, - file_kind: RemoteOpFileKind, - op: RemoteOpKind, - start: Instant, - metrics: Arc, - } -} - -impl>, O, E> Future for MeasuredRemoteOp { - type Output = Result; - - fn poll(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll { - let this = self.project(); - let poll_result = this.inner.poll(cx); - if let Poll::Ready(ref res) = poll_result { - let duration = this.start.elapsed(); - let status = if res.is_ok() { &"success" } else { &"failure" }; - this.metrics - .remote_operation_time(this.file_kind, this.op, status) - .observe(duration.as_secs_f64()); - } - poll_result - } -} +impl MeasureRemoteOp for Fut where Fut: Sized + Future> {} pub mod tokio_epoll_uring { use std::collections::HashMap; @@ -4107,9 +4292,33 @@ pub(crate) fn set_tokio_runtime_setup(setup: &str, num_threads: NonZeroUsize) { .set(u64::try_from(num_threads.get()).unwrap()); } -pub fn preinitialize_metrics(conf: &'static PageServerConf) { +static PAGESERVER_CONFIG_IGNORED_ITEMS: Lazy = Lazy::new(|| { + register_uint_gauge_vec!( + "pageserver_config_ignored_items", + "TOML items present in the on-disk configuration file but ignored by the pageserver config parser.\ + The `item` label is the dot-separated path of the ignored item in the on-disk configuration file.\ + The value for an unknown config item is always 1.\ + There is a special label value \"\", which is 0, so that there is always a metric exposed (simplifies dashboards).", + &["item"] + ) + .unwrap() +}); + +pub fn preinitialize_metrics( + conf: &'static PageServerConf, + ignored: config::ignored_fields::Paths, +) { set_page_service_config_max_batch_size(&conf.page_service_pipelining); + PAGESERVER_CONFIG_IGNORED_ITEMS + .with_label_values(&[""]) + .set(0); + for path in &ignored.paths { + PAGESERVER_CONFIG_IGNORED_ITEMS + .with_label_values(&[path]) + .set(1); + } + // Python tests need these and on some we do alerting. 
// // FIXME(4813): make it so that we have no top level metrics as this fn will easily fall out of @@ -4161,6 +4370,7 @@ pub fn preinitialize_metrics(conf: &'static PageServerConf) { [ &BACKGROUND_LOOP_PERIOD_OVERRUN_COUNT, &SMGR_QUERY_STARTED_GLOBAL, + &PAGE_SERVICE_BATCH_BREAK_REASON_GLOBAL, ] .into_iter() .for_each(|c| { @@ -4195,4 +4405,5 @@ pub fn preinitialize_metrics(conf: &'static PageServerConf) { Lazy::force(&tokio_epoll_uring::THREAD_LOCAL_METRICS_STORAGE); tenant_throttling::preinitialize_global_metrics(); + wait_ondemand_download_time::preinitialize_global_metrics(); } diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index 2ed3e0ecb0..7a62d8049b 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -9,6 +9,7 @@ use std::sync::Arc; use std::time::{Duration, Instant, SystemTime}; use std::{io, str}; +use crate::PERF_TRACE_TARGET; use anyhow::{Context, bail}; use async_compression::tokio::write::GzipEncoder; use bytes::Buf; @@ -17,7 +18,7 @@ use itertools::Itertools; use once_cell::sync::OnceCell; use pageserver_api::config::{ PageServicePipeliningConfig, PageServicePipeliningConfigPipelined, - PageServiceProtocolPipelinedExecutionStrategy, + PageServiceProtocolPipelinedBatchingStrategy, PageServiceProtocolPipelinedExecutionStrategy, }; use pageserver_api::key::rel_block_to_key; use pageserver_api::models::{ @@ -53,10 +54,12 @@ use utils::sync::spsc_fold; use crate::auth::check_permission; use crate::basebackup::BasebackupError; use crate::config::PageServerConf; -use crate::context::{DownloadBehavior, RequestContext}; +use crate::context::{ + DownloadBehavior, PerfInstrumentFutureExt, RequestContext, RequestContextBuilder, +}; use crate::metrics::{ - self, COMPUTE_COMMANDS_COUNTERS, ComputeCommandKind, LIVE_CONNECTIONS, SmgrOpTimer, - TimelineMetrics, + self, COMPUTE_COMMANDS_COUNTERS, ComputeCommandKind, GetPageBatchBreakReason, LIVE_CONNECTIONS, + SmgrOpTimer, TimelineMetrics, }; use 
crate::pgdatadir_mapping::Version; use crate::span::{ @@ -100,7 +103,9 @@ pub fn spawn( conf: &'static PageServerConf, tenant_manager: Arc, pg_auth: Option>, + perf_trace_dispatch: Option, tcp_listener: tokio::net::TcpListener, + tls_config: Option>, ) -> Listener { let cancel = CancellationToken::new(); let libpq_ctx = RequestContext::todo_child( @@ -117,8 +122,10 @@ pub fn spawn( conf, tenant_manager, pg_auth, + perf_trace_dispatch, tcp_listener, conf.pg_auth_type, + tls_config, conf.page_service_pipelining.clone(), libpq_ctx, cancel.clone(), @@ -173,8 +180,10 @@ pub async fn libpq_listener_main( conf: &'static PageServerConf, tenant_manager: Arc, auth: Option>, + perf_trace_dispatch: Option, listener: tokio::net::TcpListener, auth_type: AuthType, + tls_config: Option>, pipelining_config: PageServicePipeliningConfig, listener_ctx: RequestContext, listener_cancel: CancellationToken, @@ -205,14 +214,19 @@ pub async fn libpq_listener_main( // Connection established. Spawn a new task to handle it. debug!("accepted connection from {}", peer_addr); let local_auth = auth.clone(); - let connection_ctx = listener_ctx - .detached_child(TaskKind::PageRequestHandler, DownloadBehavior::Download); + let connection_ctx = RequestContextBuilder::from(&listener_ctx) + .task_kind(TaskKind::PageRequestHandler) + .download_behavior(DownloadBehavior::Download) + .perf_span_dispatch(perf_trace_dispatch.clone()) + .detached_child(); + connection_handler_tasks.spawn(page_service_conn_main( conf, tenant_manager.clone(), local_auth, socket, auth_type, + tls_config.clone(), pipelining_config.clone(), connection_ctx, connections_cancel.child_token(), @@ -237,6 +251,15 @@ pub async fn libpq_listener_main( type ConnectionHandlerResult = anyhow::Result<()>; +/// Perf root spans start at the per-request level, after shard routing. +/// This struct carries connection-level information to the root perf span definition. 
+#[derive(Clone)] +struct ConnectionPerfSpanFields { + peer_addr: String, + application_name: Option, + compute_mode: Option, +} + #[instrument(skip_all, fields(peer_addr, application_name, compute_mode))] #[allow(clippy::too_many_arguments)] async fn page_service_conn_main( @@ -245,6 +268,7 @@ async fn page_service_conn_main( auth: Option>, socket: tokio::net::TcpStream, auth_type: AuthType, + tls_config: Option>, pipelining_config: PageServicePipeliningConfig, connection_ctx: RequestContext, cancel: CancellationToken, @@ -261,6 +285,12 @@ async fn page_service_conn_main( let socket_fd = socket.as_raw_fd(); let peer_addr = socket.peer_addr().context("get peer address")?; + + let perf_span_fields = ConnectionPerfSpanFields { + peer_addr: peer_addr.to_string(), + application_name: None, // filled in later + compute_mode: None, // filled in later + }; tracing::Span::current().record("peer_addr", field::display(peer_addr)); // setup read timeout of 10 minutes. the timeout is rather arbitrary for requirements: @@ -304,11 +334,13 @@ async fn page_service_conn_main( tenant_manager, auth, pipelining_config, + perf_span_fields, connection_ctx, cancel.clone(), gate_guard, ); - let pgbackend = PostgresBackend::new_from_io(socket_fd, socket, peer_addr, auth_type, None)?; + let pgbackend = + PostgresBackend::new_from_io(socket_fd, socket, peer_addr, auth_type, tls_config)?; match pgbackend.run(&mut conn_handler, &cancel).await { Ok(()) => { @@ -348,6 +380,8 @@ struct PageServerHandler { /// `process_query` creates a child context from this one. connection_ctx: RequestContext, + perf_span_fields: ConnectionPerfSpanFields, + cancel: CancellationToken, /// None only while pagestream protocol is being processed. 
@@ -607,6 +641,8 @@ impl std::fmt::Display for BatchedPageStreamError { struct BatchedGetPageRequest { req: PagestreamGetPageRequest, timer: SmgrOpTimer, + effective_request_lsn: Lsn, + ctx: RequestContext, } #[cfg(feature = "testing")] @@ -635,8 +671,8 @@ enum BatchedFeMessage { GetPage { span: Span, shard: timeline::handle::WeakHandle, - effective_request_lsn: Lsn, pages: smallvec::SmallVec<[BatchedGetPageRequest; 1]>, + batch_break_reason: GetPageBatchBreakReason, }, DbSize { span: Span, @@ -689,14 +725,129 @@ impl BatchedFeMessage { BatchedFeMessage::RespondError { .. } => {} } } + + fn should_break_batch( + &self, + other: &BatchedFeMessage, + max_batch_size: NonZeroUsize, + batching_strategy: PageServiceProtocolPipelinedBatchingStrategy, + ) -> Option { + match (self, other) { + ( + BatchedFeMessage::GetPage { + shard: accum_shard, + pages: accum_pages, + .. + }, + BatchedFeMessage::GetPage { + shard: this_shard, + pages: this_pages, + .. + }, + ) => { + assert_eq!(this_pages.len(), 1); + if accum_pages.len() >= max_batch_size.get() { + trace!(%max_batch_size, "stopping batching because of batch size"); + assert_eq!(accum_pages.len(), max_batch_size.get()); + + return Some(GetPageBatchBreakReason::BatchFull); + } + if !accum_shard.is_same_handle_as(this_shard) { + trace!("stopping batching because timeline object mismatch"); + // TODO: we _could_ batch & execute each shard seperately (and in parallel). + // But the current logic for keeping responses in order does not support that. 
+ + return Some(GetPageBatchBreakReason::NonUniformTimeline); + } + + match batching_strategy { + PageServiceProtocolPipelinedBatchingStrategy::UniformLsn => { + if let Some(last_in_batch) = accum_pages.last() { + if last_in_batch.effective_request_lsn + != this_pages[0].effective_request_lsn + { + trace!( + accum_lsn = %last_in_batch.effective_request_lsn, + this_lsn = %this_pages[0].effective_request_lsn, + "stopping batching because LSN changed" + ); + + return Some(GetPageBatchBreakReason::NonUniformLsn); + } + } + } + PageServiceProtocolPipelinedBatchingStrategy::ScatteredLsn => { + // The read path doesn't curently support serving the same page at different LSNs. + // While technically possible, it's uncertain if the complexity is worth it. + // Break the batch if such a case is encountered. + let same_page_different_lsn = accum_pages.iter().any(|batched| { + batched.req.rel == this_pages[0].req.rel + && batched.req.blkno == this_pages[0].req.blkno + && batched.effective_request_lsn + != this_pages[0].effective_request_lsn + }); + + if same_page_different_lsn { + trace!( + rel=%this_pages[0].req.rel, + blkno=%this_pages[0].req.blkno, + lsn=%this_pages[0].effective_request_lsn, + "stopping batching because same page was requested at different LSNs" + ); + + return Some(GetPageBatchBreakReason::SamePageAtDifferentLsn); + } + } + } + + None + } + #[cfg(feature = "testing")] + ( + BatchedFeMessage::Test { + shard: accum_shard, + requests: accum_requests, + .. + }, + BatchedFeMessage::Test { + shard: this_shard, + requests: this_requests, + .. 
+ }, + ) => { + assert!(this_requests.len() == 1); + if accum_requests.len() >= max_batch_size.get() { + trace!(%max_batch_size, "stopping batching because of batch size"); + assert_eq!(accum_requests.len(), max_batch_size.get()); + return Some(GetPageBatchBreakReason::BatchFull); + } + if !accum_shard.is_same_handle_as(this_shard) { + trace!("stopping batching because timeline object mismatch"); + // TODO: we _could_ batch & execute each shard seperately (and in parallel). + // But the current logic for keeping responses in order does not support that. + return Some(GetPageBatchBreakReason::NonUniformTimeline); + } + let this_batch_key = this_requests[0].req.batch_key; + let accum_batch_key = accum_requests[0].req.batch_key; + if this_requests[0].req.batch_key != accum_requests[0].req.batch_key { + trace!(%accum_batch_key, %this_batch_key, "stopping batching because batch key changed"); + return Some(GetPageBatchBreakReason::NonUniformKey); + } + None + } + (_, _) => Some(GetPageBatchBreakReason::NonBatchableRequest), + } + } } impl PageServerHandler { + #[allow(clippy::too_many_arguments)] pub fn new( conf: &'static PageServerConf, tenant_manager: Arc, auth: Option>, pipelining_config: PageServicePipeliningConfig, + perf_span_fields: ConnectionPerfSpanFields, connection_ctx: RequestContext, cancel: CancellationToken, gate_guard: GateGuard, @@ -706,6 +857,7 @@ impl PageServerHandler { auth, claims: None, connection_ctx, + perf_span_fields, timeline_handles: Some(TimelineHandles::new(tenant_manager)), cancel, pipelining_config, @@ -743,6 +895,7 @@ impl PageServerHandler { tenant_id: TenantId, timeline_id: TimelineId, timeline_handles: &mut TimelineHandles, + conn_perf_span_fields: &ConnectionPerfSpanFields, cancel: &CancellationToken, ctx: &RequestContext, protocol_version: PagestreamProtocolVersion, @@ -902,10 +1055,12 @@ impl PageServerHandler { } let key = rel_block_to_key(req.rel, req.blkno); - let shard = match timeline_handles + + let res = timeline_handles 
.get(tenant_id, timeline_id, ShardSelector::Page(key)) - .await - { + .await; + + let shard = match res { Ok(tl) => tl, Err(e) => { let span = mkspan!(before shard routing); @@ -932,6 +1087,41 @@ impl PageServerHandler { } } }; + + let ctx = if shard.is_get_page_request_sampled() { + RequestContextBuilder::from(ctx) + .root_perf_span(|| { + info_span!( + target: PERF_TRACE_TARGET, + "GET_PAGE", + peer_addr = conn_perf_span_fields.peer_addr, + application_name = conn_perf_span_fields.application_name, + compute_mode = conn_perf_span_fields.compute_mode, + tenant_id = %tenant_id, + shard_id = %shard.get_shard_identity().shard_slug(), + timeline_id = %timeline_id, + lsn = %req.hdr.request_lsn, + request_id = %req.hdr.reqid, + key = %key, + ) + }) + .attached_child() + } else { + ctx.attached_child() + }; + + // This ctx travels as part of the BatchedFeMessage through + // batching into the request handler. + // The request handler needs to do some per-request work + // (relsize check) before dispatching the batch as a single + // get_vectored call to the Timeline. + // This ctx will be used for the reslize check, whereas the + // get_vectored call will be a different ctx with separate + // perf span. + let ctx = ctx.with_scope_page_service_pagestream(&shard); + + // Similar game for this `span`: we funnel it through so that + // request handler log messages contain the request-specific fields. 
let span = mkspan!(shard.tenant_shard_id.shard_slug()); let timer = record_op_start_and_throttle( @@ -939,29 +1129,42 @@ impl PageServerHandler { metrics::SmgrQueryType::GetPageAtLsn, received_at, ) + .maybe_perf_instrument(&ctx, |current_perf_span| { + info_span!( + target: PERF_TRACE_TARGET, + parent: current_perf_span, + "THROTTLE", + ) + }) .await?; // We're holding the Handle - let effective_request_lsn = match Self::wait_or_get_last_lsn( + let effective_request_lsn = match Self::effective_request_lsn( &shard, + shard.get_last_record_lsn(), req.hdr.request_lsn, req.hdr.not_modified_since, &shard.get_applied_gc_cutoff_lsn(), - ctx, - ) - // TODO: if we actually need to wait for lsn here, it delays the entire batch which doesn't need to wait - .await - { + ) { Ok(lsn) => lsn, Err(e) => { return respond_error!(span, e); } }; + BatchedFeMessage::GetPage { span, shard: shard.downgrade(), - effective_request_lsn, - pages: smallvec::smallvec![BatchedGetPageRequest { req, timer }], + pages: smallvec::smallvec![BatchedGetPageRequest { + req, + timer, + effective_request_lsn, + ctx, + }], + // The executor grabs the batch when it becomes idle. + // Hence, [`GetPageBatchBreakReason::ExecutorSteal`] is the + // default reason for breaking the batch. 
+ batch_break_reason: GetPageBatchBreakReason::ExecutorSteal, } } #[cfg(feature = "testing")] @@ -987,6 +1190,7 @@ impl PageServerHandler { #[instrument(skip_all, level = tracing::Level::TRACE)] #[allow(clippy::boxed_local)] fn pagestream_do_batch( + batching_strategy: PageServiceProtocolPipelinedBatchingStrategy, max_batch_size: NonZeroUsize, batch: &mut Result, this_msg: Result, @@ -998,90 +1202,59 @@ impl PageServerHandler { Err(e) => return Err(Err(e)), }; - match (&mut *batch, this_msg) { - // something batched already, let's see if we can add this message to the batch - ( - Ok(BatchedFeMessage::GetPage { - span: _, - shard: accum_shard, - pages: accum_pages, - effective_request_lsn: accum_lsn, - }), - BatchedFeMessage::GetPage { - span: _, - shard: this_shard, - pages: this_pages, - effective_request_lsn: this_lsn, - }, - ) if (|| { - assert_eq!(this_pages.len(), 1); - if accum_pages.len() >= max_batch_size.get() { - trace!(%accum_lsn, %this_lsn, %max_batch_size, "stopping batching because of batch size"); - assert_eq!(accum_pages.len(), max_batch_size.get()); - return false; - } - if !accum_shard.is_same_handle_as(&this_shard) { - trace!(%accum_lsn, %this_lsn, "stopping batching because timeline object mismatch"); - // TODO: we _could_ batch & execute each shard seperately (and in parallel). - // But the current logic for keeping responses in order does not support that. - return false; - } - // the vectored get currently only supports a single LSN, so, bounce as soon - // as the effective request_lsn changes - if *accum_lsn != this_lsn { - trace!(%accum_lsn, %this_lsn, "stopping batching because LSN changed"); - return false; - } - true - })() => - { - // ok to batch - accum_pages.extend(this_pages); - Ok(()) + let eligible_batch = match batch { + Ok(b) => b, + Err(_) => { + return Err(Ok(this_msg)); } - #[cfg(feature = "testing")] - ( - Ok(BatchedFeMessage::Test { - shard: accum_shard, - requests: accum_requests, - .. 
- }), - BatchedFeMessage::Test { - shard: this_shard, - requests: this_requests, - .. - }, - ) if (|| { - assert!(this_requests.len() == 1); - if accum_requests.len() >= max_batch_size.get() { - trace!(%max_batch_size, "stopping batching because of batch size"); - assert_eq!(accum_requests.len(), max_batch_size.get()); - return false; + }; + + let batch_break = + eligible_batch.should_break_batch(&this_msg, max_batch_size, batching_strategy); + + match batch_break { + Some(reason) => { + if let BatchedFeMessage::GetPage { + batch_break_reason, .. + } = eligible_batch + { + *batch_break_reason = reason; } - if !accum_shard.is_same_handle_as(&this_shard) { - trace!("stopping batching because timeline object mismatch"); - // TODO: we _could_ batch & execute each shard seperately (and in parallel). - // But the current logic for keeping responses in order does not support that. - return false; - } - let this_batch_key = this_requests[0].req.batch_key; - let accum_batch_key = accum_requests[0].req.batch_key; - if this_requests[0].req.batch_key != accum_requests[0].req.batch_key { - trace!(%accum_batch_key, %this_batch_key, "stopping batching because batch key changed"); - return false; - } - true - })() => - { - // ok to batch - accum_requests.extend(this_requests); - Ok(()) - } - // something batched already but this message is unbatchable - (_, this_msg) => { - // by default, don't continue batching + Err(Ok(this_msg)) } + None => { + // ok to batch + match (eligible_batch, this_msg) { + ( + BatchedFeMessage::GetPage { + pages: accum_pages, .. + }, + BatchedFeMessage::GetPage { + pages: this_pages, .. + }, + ) => { + accum_pages.extend(this_pages); + Ok(()) + } + #[cfg(feature = "testing")] + ( + BatchedFeMessage::Test { + requests: accum_requests, + .. + }, + BatchedFeMessage::Test { + requests: this_requests, + .. 
+ }, + ) => { + accum_requests.extend(this_requests); + Ok(()) + } + // Shape guaranteed by [`BatchedFeMessage::should_break_batch`] + _ => unreachable!(), + } + } } } @@ -1302,8 +1475,8 @@ impl PageServerHandler { BatchedFeMessage::GetPage { span, shard, - effective_request_lsn, pages, + batch_break_reason, } => { fail::fail_point!("ps::handle-pagerequest-message::getpage"); let (shard, ctx) = upgrade_handle_and_set_context!(shard); @@ -1314,9 +1487,9 @@ impl PageServerHandler { let res = self .handle_get_page_at_lsn_request_batched( &shard, - effective_request_lsn, pages, io_concurrency, + batch_break_reason, &ctx, ) .instrument(span.clone()) @@ -1514,12 +1687,14 @@ impl PageServerHandler { IO: AsyncRead + AsyncWrite + Send + Sync + Unpin + 'static, { let cancel = self.cancel.clone(); + let err = loop { let msg = Self::pagestream_read_message( &mut pgb_reader, tenant_id, timeline_id, &mut timeline_handles, + &self.perf_span_fields, &cancel, ctx, protocol_version, @@ -1631,6 +1806,7 @@ impl PageServerHandler { let PageServicePipeliningConfigPipelined { max_batch_size, execution, + batching: batching_strategy, } = pipelining_config; // Macro to _define_ a pipeline stage. 
@@ -1653,6 +1829,8 @@ impl PageServerHandler { // Batcher // + let perf_span_fields = self.perf_span_fields.clone(); + let cancel_batcher = self.cancel.child_token(); let (mut batch_tx, mut batch_rx) = spsc_fold::channel(); let batcher = pipeline_stage!("batcher", cancel_batcher.clone(), move |cancel_batcher| { @@ -1666,6 +1844,7 @@ impl PageServerHandler { tenant_id, timeline_id, &mut timeline_handles, + &perf_span_fields, &cancel_batcher, &ctx, protocol_version, @@ -1679,7 +1858,7 @@ impl PageServerHandler { exit |= read_res.is_err(); let could_send = batch_tx .send(read_res, |batch, res| { - Self::pagestream_do_batch(max_batch_size, batch, res) + Self::pagestream_do_batch(batching_strategy, max_batch_size, batch, res) }) .await; exit |= could_send.is_err(); @@ -1775,7 +1954,39 @@ impl PageServerHandler { ctx: &RequestContext, ) -> Result { let last_record_lsn = timeline.get_last_record_lsn(); + let effective_request_lsn = Self::effective_request_lsn( + timeline, + last_record_lsn, + request_lsn, + not_modified_since, + latest_gc_cutoff_lsn, + )?; + if effective_request_lsn > last_record_lsn { + timeline + .wait_lsn( + not_modified_since, + crate::tenant::timeline::WaitLsnWaiter::PageService, + timeline::WaitLsnTimeout::Default, + ctx, + ) + .await?; + + // Since we waited for 'effective_request_lsn' to arrive, that is now the last + // record LSN. 
(Or close enough for our purposes; the last-record LSN can + // advance immediately after we return anyway) + } + + Ok(effective_request_lsn) + } + + fn effective_request_lsn( + timeline: &Timeline, + last_record_lsn: Lsn, + request_lsn: Lsn, + not_modified_since: Lsn, + latest_gc_cutoff_lsn: &RcuReadGuard, + ) -> Result { // Sanity check the request if request_lsn < not_modified_since { return Err(PageStreamError::BadRequest( @@ -1810,19 +2021,7 @@ impl PageServerHandler { } } - // Wait for WAL up to 'not_modified_since' to arrive, if necessary if not_modified_since > last_record_lsn { - timeline - .wait_lsn( - not_modified_since, - crate::tenant::timeline::WaitLsnWaiter::PageService, - timeline::WaitLsnTimeout::Default, - ctx, - ) - .await?; - // Since we waited for 'not_modified_since' to arrive, that is now the last - // record LSN. (Or close enough for our purposes; the last-record LSN can - // advance immediately after we return anyway) Ok(not_modified_since) } else { // It might be better to use max(not_modified_since, latest_gc_cutoff_lsn) @@ -1977,16 +2176,16 @@ impl PageServerHandler { async fn handle_get_page_at_lsn_request_batched( &mut self, timeline: &Timeline, - effective_lsn: Lsn, requests: smallvec::SmallVec<[BatchedGetPageRequest; 1]>, io_concurrency: IoConcurrency, + batch_break_reason: GetPageBatchBreakReason, ctx: &RequestContext, ) -> Vec> { debug_assert_current_span_has_tenant_and_timeline_id(); timeline .query_metrics - .observe_getpage_batch_start(requests.len()); + .observe_getpage_batch_start(requests.len(), batch_break_reason); // If a page trace is running, submit an event for this request. if let Some(page_trace) = timeline.page_trace.load().as_ref() { @@ -1996,18 +2195,81 @@ impl PageServerHandler { // Ignore error (trace buffer may be full or tracer may have disconnected). 
_ = page_trace.try_send(PageTraceEvent { key, - effective_lsn, + effective_lsn: batch.effective_request_lsn, time, }); } } + // If any request in the batch needs to wait for LSN, then do so now. + let mut perf_instrument = false; + let max_effective_lsn = requests + .iter() + .map(|req| { + if req.ctx.has_perf_span() { + perf_instrument = true; + } + + req.effective_request_lsn + }) + .max() + .expect("batch is never empty"); + + let ctx = match perf_instrument { + true => RequestContextBuilder::from(ctx) + .root_perf_span(|| { + info_span!( + target: PERF_TRACE_TARGET, + "GET_VECTORED", + tenant_id = %timeline.tenant_shard_id.tenant_id, + timeline_id = %timeline.timeline_id, + shard = %timeline.tenant_shard_id.shard_slug(), + %max_effective_lsn + ) + }) + .attached_child(), + false => ctx.attached_child(), + }; + + let last_record_lsn = timeline.get_last_record_lsn(); + if max_effective_lsn > last_record_lsn { + if let Err(e) = timeline + .wait_lsn( + max_effective_lsn, + crate::tenant::timeline::WaitLsnWaiter::PageService, + timeline::WaitLsnTimeout::Default, + &ctx, + ) + .maybe_perf_instrument(&ctx, |current_perf_span| { + info_span!( + target: PERF_TRACE_TARGET, + parent: current_perf_span, + "WAIT_LSN", + ) + }) + .await + { + return Vec::from_iter(requests.into_iter().map(|req| { + Err(BatchedPageStreamError { + err: PageStreamError::from(e.clone()), + req: req.req.hdr, + }) + })); + } + } + let results = timeline .get_rel_page_at_lsn_batched( - requests.iter().map(|p| (&p.req.rel, &p.req.blkno)), - effective_lsn, + requests.iter().map(|p| { + ( + &p.req.rel, + &p.req.blkno, + p.effective_request_lsn, + p.ctx.attached_child(), + ) + }), io_concurrency, - ctx, + &ctx, ) .await; assert_eq!(results.len(), requests.len()); @@ -2606,12 +2868,14 @@ where if let FeStartupPacket::StartupMessage { params, .. 
} = sm { if let Some(app_name) = params.get("application_name") { + self.perf_span_fields.application_name = Some(app_name.to_string()); Span::current().record("application_name", field::display(app_name)); } if let Some(options) = params.get("options") { let (config, _) = parse_options(options); for (key, value) in config { if key == "neon.compute_mode" { + self.perf_span_fields.compute_mode = Some(value.clone()); Span::current().record("compute_mode", field::display(value)); } } diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs index 4685f9383b..81e548a095 100644 --- a/pageserver/src/pgdatadir_mapping.rs +++ b/pageserver/src/pgdatadir_mapping.rs @@ -6,13 +6,14 @@ //! walingest.rs handles a few things like implicit relation creation and extension. //! Clarify that) //! -use std::collections::{BTreeMap, HashMap, HashSet, hash_map}; +use std::collections::{HashMap, HashSet, hash_map}; use std::ops::{ControlFlow, Range}; -use anyhow::{Context, ensure}; +use crate::walingest::{WalIngestError, WalIngestErrorKind}; +use crate::{PERF_TRACE_TARGET, ensure_walingest}; +use anyhow::Context; use bytes::{Buf, Bytes, BytesMut}; use enum_map::Enum; -use itertools::Itertools; use pageserver_api::key::{ AUX_FILES_KEY, CHECKPOINT_KEY, CONTROLFILE_KEY, CompactKey, DBDIR_KEY, Key, RelDirExists, TWOPHASEDIR_KEY, dbdir_key_range, rel_block_to_key, rel_dir_to_key, rel_key_range, @@ -20,7 +21,7 @@ use pageserver_api::key::{ repl_origin_key, repl_origin_key_range, slru_block_to_key, slru_dir_to_key, slru_segment_key_range, slru_segment_size_to_key, twophase_file_key, twophase_key_range, }; -use pageserver_api::keyspace::SparseKeySpace; +use pageserver_api::keyspace::{KeySpaceRandomAccum, SparseKeySpace}; use pageserver_api::models::RelSizeMigration; use pageserver_api::record::NeonWalRecord; use pageserver_api::reltag::{BlockNumber, RelTag, SlruKind}; @@ -31,7 +32,7 @@ use postgres_ffi::{BLCKSZ, Oid, RepOriginId, TimestampTz, TransactionId}; use 
serde::{Deserialize, Serialize}; use strum::IntoEnumIterator; use tokio_util::sync::CancellationToken; -use tracing::{debug, info, trace, warn}; +use tracing::{debug, info, info_span, trace, warn}; use utils::bin_ser::{BeSer, DeserializeError}; use utils::lsn::Lsn; use utils::pausable_failpoint; @@ -39,7 +40,7 @@ use wal_decoder::serialized_batch::{SerializedValueBatch, ValueMeta}; use super::tenant::{PageReconstructError, Timeline}; use crate::aux_file; -use crate::context::RequestContext; +use crate::context::{PerfInstrumentFutureExt, RequestContext}; use crate::keyspace::{KeySpace, KeySpaceAccum}; use crate::metrics::{ RELSIZE_CACHE_ENTRIES, RELSIZE_CACHE_HITS, RELSIZE_CACHE_MISSES, RELSIZE_CACHE_MISSES_OLD, @@ -49,7 +50,7 @@ use crate::span::{ debug_assert_current_span_has_tenant_and_timeline_id_no_shard_id, }; use crate::tenant::storage_layer::IoConcurrency; -use crate::tenant::timeline::GetVectoredError; +use crate::tenant::timeline::{GetVectoredError, VersionedKeySpaceQuery}; /// Max delta records appended to the AUX_FILES_KEY (for aux v1). The write path will write a full image once this threshold is reached. pub const MAX_AUX_FILE_DELTAS: usize = 1024; @@ -135,12 +136,8 @@ impl From for CalculateLogicalSizeError { #[derive(Debug, thiserror::Error)] pub enum RelationError { - #[error("Relation Already Exists")] - AlreadyExists, #[error("invalid relnode")] InvalidRelnode, - #[error(transparent)] - Other(#[from] anyhow::Error), } /// @@ -209,8 +206,9 @@ impl Timeline { let pages: smallvec::SmallVec<[_; 1]> = smallvec::smallvec![(tag, blknum)]; let res = self .get_rel_page_at_lsn_batched( - pages.iter().map(|(tag, blknum)| (tag, blknum)), - effective_lsn, + pages.iter().map(|(tag, blknum)| { + (tag, blknum, effective_lsn, ctx.attached_child()) + }), io_concurrency.clone(), ctx, ) @@ -248,8 +246,7 @@ impl Timeline { /// The ordering of the returned vec corresponds to the ordering of `pages`. 
pub(crate) async fn get_rel_page_at_lsn_batched( &self, - pages: impl ExactSizeIterator, - effective_lsn: Lsn, + pages: impl ExactSizeIterator, io_concurrency: IoConcurrency, ctx: &RequestContext, ) -> Vec> { @@ -262,8 +259,13 @@ impl Timeline { let mut result = Vec::with_capacity(pages.len()); let result_slots = result.spare_capacity_mut(); - let mut keys_slots: BTreeMap> = BTreeMap::default(); - for (response_slot_idx, (tag, blknum)) in pages.enumerate() { + let mut keys_slots: HashMap> = + HashMap::with_capacity(pages.len()); + + let mut req_keyspaces: HashMap = + HashMap::with_capacity(pages.len()); + + for (response_slot_idx, (tag, blknum, lsn, ctx)) in pages.enumerate() { if tag.relnode == 0 { result_slots[response_slot_idx].write(Err(PageReconstructError::Other( RelationError::InvalidRelnode.into(), @@ -274,7 +276,16 @@ impl Timeline { } let nblocks = match self - .get_rel_size(*tag, Version::Lsn(effective_lsn), ctx) + .get_rel_size(*tag, Version::Lsn(lsn), &ctx) + .maybe_perf_instrument(&ctx, |crnt_perf_span| { + info_span!( + target: PERF_TRACE_TARGET, + parent: crnt_perf_span, + "GET_REL_SIZE", + reltag=%tag, + lsn=%lsn, + ) + }) .await { Ok(nblocks) => nblocks, @@ -288,7 +299,7 @@ impl Timeline { if *blknum >= nblocks { debug!( "read beyond EOF at {} blk {} at {}, size is {}: returning all-zeros page", - tag, blknum, effective_lsn, nblocks + tag, blknum, lsn, nblocks ); result_slots[response_slot_idx].write(Ok(ZERO_PAGE.clone())); slots_filled += 1; @@ -298,32 +309,37 @@ impl Timeline { let key = rel_block_to_key(*tag, *blknum); let key_slots = keys_slots.entry(key).or_default(); - key_slots.push(response_slot_idx); + key_slots.push((response_slot_idx, ctx)); + + let acc = req_keyspaces.entry(lsn).or_default(); + acc.add_key(key); } - let keyspace = { - // add_key requires monotonicity - let mut acc = KeySpaceAccum::new(); - for key in keys_slots - .keys() - // in fact it requires strong monotonicity - .dedup() - { - acc.add_key(*key); - } - 
acc.to_keyspace() - }; + let query: Vec<(Lsn, KeySpace)> = req_keyspaces + .into_iter() + .map(|(lsn, acc)| (lsn, acc.to_keyspace())) + .collect(); - match self - .get_vectored(keyspace, effective_lsn, io_concurrency, ctx) - .await - { + let query = VersionedKeySpaceQuery::scattered(query); + let res = self + .get_vectored(query, io_concurrency, ctx) + .maybe_perf_instrument(ctx, |current_perf_span| { + info_span!( + target: PERF_TRACE_TARGET, + parent: current_perf_span, + "GET_BATCH", + batch_size = %page_count, + ) + }) + .await; + + match res { Ok(results) => { for (key, res) in results { let mut key_slots = keys_slots.remove(&key).unwrap().into_iter(); - let first_slot = key_slots.next().unwrap(); + let (first_slot, first_req_ctx) = key_slots.next().unwrap(); - for slot in key_slots { + for (slot, req_ctx) in key_slots { let clone = match &res { Ok(buf) => Ok(buf.clone()), Err(err) => Err(match err { @@ -341,17 +357,22 @@ impl Timeline { }; result_slots[slot].write(clone); + // There is no standardized way to express that the batched span followed from N request spans. + // So, abuse the system and mark the request contexts as follows_from the batch span, so we get + // some linkage in our trace viewer. It allows us to answer: which GET_VECTORED did this GET_PAGE wait for. + req_ctx.perf_follows_from(ctx); slots_filled += 1; } result_slots[first_slot].write(res); + first_req_ctx.perf_follows_from(ctx); slots_filled += 1; } } Err(err) => { // this cannot really happen because get_vectored only errors globally on invalid LSN or too large batch size // (We enforce the max batch size outside of this function, in the code that constructs the batch request.) 
- for slot in keys_slots.values().flatten() { + for (slot, req_ctx) in keys_slots.values().flatten() { // this whole `match` is a lot like `From for PageReconstructError` // but without taking ownership of the GetVectoredError let err = match &err { @@ -383,6 +404,7 @@ impl Timeline { } }; + req_ctx.perf_follows_from(ctx); result_slots[*slot].write(err); } @@ -621,8 +643,9 @@ impl Timeline { let mut segment = BytesMut::with_capacity(n_blocks as usize * BLCKSZ as usize); for batch in batches.parts { + let query = VersionedKeySpaceQuery::uniform(batch, lsn); let blocks = self - .get_vectored(batch, lsn, io_concurrency.clone(), ctx) + .get_vectored(query, io_concurrency.clone(), ctx) .await?; for (_key, block) in blocks { @@ -648,7 +671,7 @@ impl Timeline { Ok(buf.get_u32_le()) } - /// Get size of an SLRU segment + /// Does the slru segment exist? pub(crate) async fn get_slru_segment_exists( &self, kind: SlruKind, @@ -801,9 +824,9 @@ impl Timeline { .await } - /// Obtain the possible timestamp range for the given lsn. + /// Obtain the timestamp for the given lsn. /// - /// If the lsn has no timestamps, returns None. returns `(min, max, median)` if it has timestamps. + /// If the lsn has no timestamps (e.g. no commits), returns None. 
pub(crate) async fn get_timestamp_for_lsn( &self, probe_lsn: Lsn, @@ -859,8 +882,9 @@ impl Timeline { ); for batch in batches.parts.into_iter().rev() { + let query = VersionedKeySpaceQuery::uniform(batch, probe_lsn); let blocks = self - .get_vectored(batch, probe_lsn, io_concurrency.clone(), ctx) + .get_vectored(query, io_concurrency.clone(), ctx) .await?; for (_key, clog_page) in blocks.into_iter().rev() { @@ -1435,8 +1459,8 @@ impl DatadirModification<'_> { } /// Set the current lsn - pub(crate) fn set_lsn(&mut self, lsn: Lsn) -> anyhow::Result<()> { - ensure!( + pub(crate) fn set_lsn(&mut self, lsn: Lsn) -> Result<(), WalIngestError> { + ensure_walingest!( lsn >= self.lsn, "setting an older lsn {} than {} is not allowed", lsn, @@ -1535,7 +1559,7 @@ impl DatadirModification<'_> { &mut self, rel: RelTag, ctx: &RequestContext, - ) -> Result { + ) -> Result { // Get current size and put rel creation if rel doesn't exist // // NOTE: we check the cache first even though get_rel_exists and get_rel_size would @@ -1550,14 +1574,13 @@ impl DatadirModification<'_> { .await? { // create it with 0 size initially, the logic below will extend it - self.put_rel_creation(rel, 0, ctx) - .await - .context("Relation Error")?; + self.put_rel_creation(rel, 0, ctx).await?; Ok(0) } else { - self.tline + Ok(self + .tline .get_rel_size(rel, Version::Modified(self), ctx) - .await + .await?) 
} } @@ -1594,11 +1617,14 @@ impl DatadirModification<'_> { // TODO(vlad): remove this argument and replace the shard check with is_key_local shard: &ShardIdentity, ctx: &RequestContext, - ) -> anyhow::Result<()> { + ) -> Result<(), WalIngestError> { let mut gaps_at_lsns = Vec::default(); for meta in batch.metadata.iter() { - let (rel, blkno) = Key::from_compact(meta.key()).to_rel_block()?; + let key = Key::from_compact(meta.key()); + let (rel, blkno) = key + .to_rel_block() + .map_err(|_| WalIngestErrorKind::InvalidKey(key, meta.lsn()))?; let new_nblocks = blkno + 1; let old_nblocks = self.create_relation_if_required(rel, ctx).await?; @@ -1640,8 +1666,8 @@ impl DatadirModification<'_> { rel: RelTag, blknum: BlockNumber, rec: NeonWalRecord, - ) -> anyhow::Result<()> { - anyhow::ensure!(rel.relnode != 0, RelationError::InvalidRelnode); + ) -> Result<(), WalIngestError> { + ensure_walingest!(rel.relnode != 0, RelationError::InvalidRelnode); self.put(rel_block_to_key(rel, blknum), Value::WalRecord(rec)); Ok(()) } @@ -1653,7 +1679,7 @@ impl DatadirModification<'_> { segno: u32, blknum: BlockNumber, rec: NeonWalRecord, - ) -> anyhow::Result<()> { + ) -> Result<(), WalIngestError> { if !self.tline.tenant_shard_id.is_shard_zero() { return Ok(()); } @@ -1671,14 +1697,11 @@ impl DatadirModification<'_> { rel: RelTag, blknum: BlockNumber, img: Bytes, - ) -> anyhow::Result<()> { - anyhow::ensure!(rel.relnode != 0, RelationError::InvalidRelnode); + ) -> Result<(), WalIngestError> { + ensure_walingest!(rel.relnode != 0, RelationError::InvalidRelnode); let key = rel_block_to_key(rel, blknum); if !key.is_valid_key_on_write_path() { - anyhow::bail!( - "the request contains data not supported by pageserver at {}", - key - ); + Err(WalIngestErrorKind::InvalidKey(key, self.lsn))?; } self.put(rel_block_to_key(rel, blknum), Value::Image(img)); Ok(()) @@ -1690,15 +1713,12 @@ impl DatadirModification<'_> { segno: u32, blknum: BlockNumber, img: Bytes, - ) -> anyhow::Result<()> { + ) -> 
Result<(), WalIngestError> { assert!(self.tline.tenant_shard_id.is_shard_zero()); let key = slru_block_to_key(kind, segno, blknum); if !key.is_valid_key_on_write_path() { - anyhow::bail!( - "the request contains data not supported by pageserver at {}", - key - ); + Err(WalIngestErrorKind::InvalidKey(key, self.lsn))?; } self.put(key, Value::Image(img)); Ok(()) @@ -1708,15 +1728,11 @@ impl DatadirModification<'_> { &mut self, rel: RelTag, blknum: BlockNumber, - ) -> anyhow::Result<()> { - anyhow::ensure!(rel.relnode != 0, RelationError::InvalidRelnode); + ) -> Result<(), WalIngestError> { + ensure_walingest!(rel.relnode != 0, RelationError::InvalidRelnode); let key = rel_block_to_key(rel, blknum); if !key.is_valid_key_on_write_path() { - anyhow::bail!( - "the request contains data not supported by pageserver: {} @ {}", - key, - self.lsn - ); + Err(WalIngestErrorKind::InvalidKey(key, self.lsn))?; } let batch = self @@ -1733,15 +1749,11 @@ impl DatadirModification<'_> { kind: SlruKind, segno: u32, blknum: BlockNumber, - ) -> anyhow::Result<()> { + ) -> Result<(), WalIngestError> { assert!(self.tline.tenant_shard_id.is_shard_zero()); let key = slru_block_to_key(kind, segno, blknum); if !key.is_valid_key_on_write_path() { - anyhow::bail!( - "the request contains data not supported by pageserver: {} @ {}", - key, - self.lsn - ); + Err(WalIngestErrorKind::InvalidKey(key, self.lsn))?; } let batch = self @@ -1789,8 +1801,10 @@ impl DatadirModification<'_> { dbnode: Oid, img: Bytes, ctx: &RequestContext, - ) -> anyhow::Result<()> { - let v2_enabled = self.maybe_enable_rel_size_v2()?; + ) -> Result<(), WalIngestError> { + let v2_enabled = self + .maybe_enable_rel_size_v2() + .map_err(WalIngestErrorKind::MaybeRelSizeV2Error)?; // Add it to the directory (if it doesn't exist already) let buf = self.get(DBDIR_KEY, ctx).await?; @@ -1831,13 +1845,13 @@ impl DatadirModification<'_> { xid: u64, img: Bytes, ctx: &RequestContext, - ) -> anyhow::Result<()> { + ) -> Result<(), 
WalIngestError> { // Add it to the directory entry let dirbuf = self.get(TWOPHASEDIR_KEY, ctx).await?; let newdirbuf = if self.tline.pg_version >= 17 { let mut dir = TwoPhaseDirectoryV17::des(&dirbuf)?; if !dir.xids.insert(xid) { - anyhow::bail!("twophase file for xid {} already exists", xid); + Err(WalIngestErrorKind::FileAlreadyExists(xid))?; } self.pending_directory_entries.push(( DirectoryKind::TwoPhase, @@ -1848,7 +1862,7 @@ impl DatadirModification<'_> { let xid = xid as u32; let mut dir = TwoPhaseDirectory::des(&dirbuf)?; if !dir.xids.insert(xid) { - anyhow::bail!("twophase file for xid {} already exists", xid); + Err(WalIngestErrorKind::FileAlreadyExists(xid.into()))?; } self.pending_directory_entries.push(( DirectoryKind::TwoPhase, @@ -1866,22 +1880,22 @@ impl DatadirModification<'_> { &mut self, origin_id: RepOriginId, origin_lsn: Lsn, - ) -> anyhow::Result<()> { + ) -> Result<(), WalIngestError> { let key = repl_origin_key(origin_id); self.put(key, Value::Image(origin_lsn.ser().unwrap().into())); Ok(()) } - pub async fn drop_replorigin(&mut self, origin_id: RepOriginId) -> anyhow::Result<()> { + pub async fn drop_replorigin(&mut self, origin_id: RepOriginId) -> Result<(), WalIngestError> { self.set_replorigin(origin_id, Lsn::INVALID).await } - pub fn put_control_file(&mut self, img: Bytes) -> anyhow::Result<()> { + pub fn put_control_file(&mut self, img: Bytes) -> Result<(), WalIngestError> { self.put(CONTROLFILE_KEY, Value::Image(img)); Ok(()) } - pub fn put_checkpoint(&mut self, img: Bytes) -> anyhow::Result<()> { + pub fn put_checkpoint(&mut self, img: Bytes) -> Result<(), WalIngestError> { self.put(CHECKPOINT_KEY, Value::Image(img)); Ok(()) } @@ -1891,7 +1905,7 @@ impl DatadirModification<'_> { spcnode: Oid, dbnode: Oid, ctx: &RequestContext, - ) -> anyhow::Result<()> { + ) -> Result<(), WalIngestError> { let total_blocks = self .tline .get_db_size(spcnode, dbnode, Version::Modified(self), ctx) @@ -1930,20 +1944,21 @@ impl DatadirModification<'_> { 
rel: RelTag, nblocks: BlockNumber, ctx: &RequestContext, - ) -> Result<(), RelationError> { + ) -> Result<(), WalIngestError> { if rel.relnode == 0 { - return Err(RelationError::InvalidRelnode); + Err(WalIngestErrorKind::LogicalError(anyhow::anyhow!( + "invalid relnode" + )))?; } // It's possible that this is the first rel for this db in this // tablespace. Create the reldir entry for it if so. - let mut dbdir = DbDirectory::des(&self.get(DBDIR_KEY, ctx).await.context("read db")?) - .context("deserialize db")?; + let mut dbdir = DbDirectory::des(&self.get(DBDIR_KEY, ctx).await?)?; let dbdir_exists = if let hash_map::Entry::Vacant(e) = dbdir.dbdirs.entry((rel.spcnode, rel.dbnode)) { // Didn't exist. Update dbdir e.insert(false); - let buf = DbDirectory::ser(&dbdir).context("serialize db")?; + let buf = DbDirectory::ser(&dbdir)?; self.pending_directory_entries.push(( DirectoryKind::Db, MetricsUpdate::Set(dbdir.dbdirs.len() as u64), @@ -1960,27 +1975,25 @@ impl DatadirModification<'_> { RelDirectory::default() } else { // reldir already exists, fetch it - RelDirectory::des(&self.get(rel_dir_key, ctx).await.context("read db")?) - .context("deserialize db")? + RelDirectory::des(&self.get(rel_dir_key, ctx).await?)? 
}; - let v2_enabled = self.maybe_enable_rel_size_v2()?; + let v2_enabled = self + .maybe_enable_rel_size_v2() + .map_err(WalIngestErrorKind::MaybeRelSizeV2Error)?; if v2_enabled { if rel_dir.rels.contains(&(rel.relnode, rel.forknum)) { - return Err(RelationError::AlreadyExists); + Err(WalIngestErrorKind::RelationAlreadyExists(rel))?; } let sparse_rel_dir_key = rel_tag_sparse_key(rel.spcnode, rel.dbnode, rel.relnode, rel.forknum); // check if the rel_dir_key exists in v2 - let val = self - .sparse_get(sparse_rel_dir_key, ctx) - .await - .map_err(|e| RelationError::Other(e.into()))?; + let val = self.sparse_get(sparse_rel_dir_key, ctx).await?; let val = RelDirExists::decode_option(val) - .map_err(|_| RelationError::Other(anyhow::anyhow!("invalid reldir key")))?; + .map_err(|_| WalIngestErrorKind::InvalidRelDirKey(sparse_rel_dir_key))?; if val == RelDirExists::Exists { - return Err(RelationError::AlreadyExists); + Err(WalIngestErrorKind::RelationAlreadyExists(rel))?; } self.put( sparse_rel_dir_key, @@ -1996,9 +2009,7 @@ impl DatadirModification<'_> { // will be key not found errors if we don't create an empty one for rel_size_v2. 
self.put( rel_dir_key, - Value::Image(Bytes::from( - RelDirectory::ser(&RelDirectory::default()).context("serialize")?, - )), + Value::Image(Bytes::from(RelDirectory::ser(&RelDirectory::default())?)), ); } self.pending_directory_entries @@ -2006,7 +2017,7 @@ impl DatadirModification<'_> { } else { // Add the new relation to the rel directory entry, and write it back if !rel_dir.rels.insert((rel.relnode, rel.forknum)) { - return Err(RelationError::AlreadyExists); + Err(WalIngestErrorKind::RelationAlreadyExists(rel))?; } if !dbdir_exists { self.pending_directory_entries @@ -2016,9 +2027,7 @@ impl DatadirModification<'_> { .push((DirectoryKind::Rel, MetricsUpdate::Add(1))); self.put( rel_dir_key, - Value::Image(Bytes::from( - RelDirectory::ser(&rel_dir).context("serialize")?, - )), + Value::Image(Bytes::from(RelDirectory::ser(&rel_dir)?)), ); } @@ -2043,8 +2052,8 @@ impl DatadirModification<'_> { rel: RelTag, nblocks: BlockNumber, ctx: &RequestContext, - ) -> anyhow::Result<()> { - anyhow::ensure!(rel.relnode != 0, RelationError::InvalidRelnode); + ) -> Result<(), WalIngestError> { + ensure_walingest!(rel.relnode != 0, RelationError::InvalidRelnode); if self .tline .get_rel_exists(rel, Version::Modified(self), ctx) @@ -2074,8 +2083,8 @@ impl DatadirModification<'_> { rel: RelTag, nblocks: BlockNumber, ctx: &RequestContext, - ) -> anyhow::Result<()> { - anyhow::ensure!(rel.relnode != 0, RelationError::InvalidRelnode); + ) -> Result<(), WalIngestError> { + ensure_walingest!(rel.relnode != 0, RelationError::InvalidRelnode); // Put size let size_key = rel_size_to_key(rel); @@ -2099,8 +2108,10 @@ impl DatadirModification<'_> { &mut self, drop_relations: HashMap<(u32, u32), Vec>, ctx: &RequestContext, - ) -> anyhow::Result<()> { - let v2_enabled = self.maybe_enable_rel_size_v2()?; + ) -> Result<(), WalIngestError> { + let v2_enabled = self + .maybe_enable_rel_size_v2() + .map_err(WalIngestErrorKind::MaybeRelSizeV2Error)?; for ((spc_node, db_node), rel_tags) in 
drop_relations { let dir_key = rel_dir_to_key(spc_node, db_node); let buf = self.get(dir_key, ctx).await?; @@ -2120,7 +2131,7 @@ impl DatadirModification<'_> { let key = rel_tag_sparse_key(spc_node, db_node, rel_tag.relnode, rel_tag.forknum); let val = RelDirExists::decode_option(self.sparse_get(key, ctx).await?) - .map_err(|_| RelationError::Other(anyhow::anyhow!("invalid reldir key")))?; + .map_err(|_| WalIngestErrorKind::InvalidKey(key, self.lsn))?; if val == RelDirExists::Exists { self.pending_directory_entries .push((DirectoryKind::RelV2, MetricsUpdate::Sub(1))); @@ -2163,7 +2174,7 @@ impl DatadirModification<'_> { segno: u32, nblocks: BlockNumber, ctx: &RequestContext, - ) -> anyhow::Result<()> { + ) -> Result<(), WalIngestError> { assert!(self.tline.tenant_shard_id.is_shard_zero()); // Add it to the directory entry @@ -2172,7 +2183,7 @@ impl DatadirModification<'_> { let mut dir = SlruSegmentDirectory::des(&buf)?; if !dir.segments.insert(segno) { - anyhow::bail!("slru segment {kind:?}/{segno} already exists"); + Err(WalIngestErrorKind::SlruAlreadyExists(kind, segno))?; } self.pending_directory_entries.push(( DirectoryKind::SlruSegment(kind), @@ -2199,7 +2210,7 @@ impl DatadirModification<'_> { kind: SlruKind, segno: u32, nblocks: BlockNumber, - ) -> anyhow::Result<()> { + ) -> Result<(), WalIngestError> { assert!(self.tline.tenant_shard_id.is_shard_zero()); // Put size @@ -2215,7 +2226,7 @@ impl DatadirModification<'_> { kind: SlruKind, segno: u32, ctx: &RequestContext, - ) -> anyhow::Result<()> { + ) -> Result<(), WalIngestError> { // Remove it from the directory entry let dir_key = slru_dir_to_key(kind); let buf = self.get(dir_key, ctx).await?; @@ -2240,7 +2251,7 @@ impl DatadirModification<'_> { } /// Drop a relmapper file (pg_filenode.map) - pub fn drop_relmap_file(&mut self, _spcnode: Oid, _dbnode: Oid) -> anyhow::Result<()> { + pub fn drop_relmap_file(&mut self, _spcnode: Oid, _dbnode: Oid) -> Result<(), WalIngestError> { // TODO Ok(()) } @@ -2250,7 
+2261,7 @@ impl DatadirModification<'_> { &mut self, xid: u64, ctx: &RequestContext, - ) -> anyhow::Result<()> { + ) -> Result<(), WalIngestError> { // Remove it from the directory entry let buf = self.get(TWOPHASEDIR_KEY, ctx).await?; let newdirbuf = if self.tline.pg_version >= 17 { @@ -2265,7 +2276,8 @@ impl DatadirModification<'_> { )); Bytes::from(TwoPhaseDirectoryV17::ser(&dir)?) } else { - let xid: u32 = u32::try_from(xid)?; + let xid: u32 = u32::try_from(xid) + .map_err(|e| WalIngestErrorKind::LogicalError(anyhow::Error::from(e)))?; let mut dir = TwoPhaseDirectory::des(&buf)?; if !dir.xids.remove(&xid) { @@ -2290,7 +2302,7 @@ impl DatadirModification<'_> { path: &str, content: &[u8], ctx: &RequestContext, - ) -> anyhow::Result<()> { + ) -> Result<(), WalIngestError> { let key = aux_file::encode_aux_file_key(path); // retrieve the key from the engine let old_val = match self.get(key, ctx).await { @@ -2299,7 +2311,7 @@ impl DatadirModification<'_> { Err(e) => return Err(e.into()), }; let files: Vec<(&str, &[u8])> = if let Some(ref old_val) = old_val { - aux_file::decode_file_value(old_val)? + aux_file::decode_file_value(old_val).map_err(WalIngestErrorKind::EncodeAuxFileError)? 
} else { Vec::new() }; @@ -2344,7 +2356,8 @@ impl DatadirModification<'_> { } (None, true) => warn!("removing non-existing aux file: {}", path), } - let new_val = aux_file::encode_file_value(&new_files)?; + let new_val = aux_file::encode_file_value(&new_files) + .map_err(WalIngestErrorKind::EncodeAuxFileError)?; self.put(key, Value::Image(new_val.into())); Ok(()) diff --git a/pageserver/src/task_mgr.rs b/pageserver/src/task_mgr.rs index 9cc604f86d..d4873e60a1 100644 --- a/pageserver/src/task_mgr.rs +++ b/pageserver/src/task_mgr.rs @@ -219,8 +219,7 @@ pageserver_runtime!(MGMT_REQUEST_RUNTIME, "mgmt request worker"); pageserver_runtime!(WALRECEIVER_RUNTIME, "walreceiver worker"); pageserver_runtime!(BACKGROUND_RUNTIME, "background op worker"); // Bump this number when adding a new pageserver_runtime! -// SAFETY: it's obviously correct -const NUM_MULTIPLE_RUNTIMES: NonZeroUsize = unsafe { NonZeroUsize::new_unchecked(4) }; +const NUM_MULTIPLE_RUNTIMES: NonZeroUsize = NonZeroUsize::new(4).unwrap(); #[derive(Debug, Clone, Copy)] pub struct PageserverTaskId(u64); diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index e7d8ed75ed..0ba70f45b2 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -45,6 +45,7 @@ use remote_timeline_client::manifest::{ }; use remote_timeline_client::{ FAILED_REMOTE_OP_RETRIES, FAILED_UPLOAD_WARN_THRESHOLD, UploadQueueNotReadyError, + download_tenant_manifest, }; use secondary::heatmap::{HeatMapTenant, HeatMapTimeline}; use storage_broker::BrokerClientChannel; @@ -99,7 +100,7 @@ use crate::tenant::timeline::delete::DeleteTimelineFlow; use crate::tenant::timeline::uninit::cleanup_timeline_directory; use crate::virtual_file::VirtualFile; use crate::walingest::WalLagCooldown; -use crate::walredo::PostgresRedoManager; +use crate::walredo::{PostgresRedoManager, RedoAttemptType}; use crate::{InitializationOrder, TEMP_FILE_SUFFIX, import_datadir, span, task_mgr, walredo}; static INIT_DB_SEMAPHORE: Lazy = Lazy::new(|| 
Semaphore::new(8)); @@ -226,7 +227,8 @@ struct TimelinePreload { } pub(crate) struct TenantPreload { - tenant_manifest: TenantManifest, + /// The tenant manifest from remote storage, or None if no manifest was found. + tenant_manifest: Option, /// Map from timeline ID to a possible timeline preload. It is None iff the timeline is offloaded according to the manifest. timelines: HashMap>, } @@ -282,12 +284,15 @@ pub struct Tenant { /// **Lock order**: if acquiring all (or a subset), acquire them in order `timelines`, `timelines_offloaded`, `timelines_creating` timelines_offloaded: Mutex>>, - /// Serialize writes of the tenant manifest to remote storage. If there are concurrent operations - /// affecting the manifest, such as timeline deletion and timeline offload, they must wait for - /// each other (this could be optimized to coalesce writes if necessary). + /// The last tenant manifest known to be in remote storage. None if the manifest has not yet + /// been either downloaded or uploaded. Always Some after tenant attach. /// - /// The contents of the Mutex are the last manifest we successfully uploaded - tenant_manifest_upload: tokio::sync::Mutex>, + /// Initially populated during tenant attach, updated via `maybe_upload_tenant_manifest`. + /// + /// Do not modify this directly. It is used to check whether a new manifest needs to be + /// uploaded. The manifest is constructed in `build_tenant_manifest`, and uploaded via + /// `maybe_upload_tenant_manifest`. + remote_tenant_manifest: tokio::sync::Mutex>, // This mutex prevents creation of new timelines during GC. 
// Adding yet another mutex (in addition to `timelines`) is needed because holding @@ -468,15 +473,16 @@ impl WalRedoManager { base_img: Option<(Lsn, bytes::Bytes)>, records: Vec<(Lsn, pageserver_api::record::NeonWalRecord)>, pg_version: u32, + redo_attempt_type: RedoAttemptType, ) -> Result { match self { Self::Prod(_, mgr) => { - mgr.request_redo(key, lsn, base_img, records, pg_version) + mgr.request_redo(key, lsn, base_img, records, pg_version, redo_attempt_type) .await } #[cfg(test)] Self::Test(mgr) => { - mgr.request_redo(key, lsn, base_img, records, pg_version) + mgr.request_redo(key, lsn, base_img, records, pg_version, redo_attempt_type) .await } } @@ -915,6 +921,7 @@ enum StartCreatingTimelineResult { Idempotent(Arc), } +#[allow(clippy::large_enum_variant, reason = "TODO")] enum TimelineInitAndSyncResult { ReadyToActivate(Arc), NeedsSpawnImportPgdata(TimelineInitAndSyncNeedsSpawnImportPgdata), @@ -1001,6 +1008,7 @@ enum CreateTimelineCause { Delete, } +#[allow(clippy::large_enum_variant, reason = "TODO")] enum LoadTimelineCause { Attach, Unoffload, @@ -1354,36 +1362,41 @@ impl Tenant { } } - // Ideally we should use Tenant::set_broken_no_wait, but it is not supposed to be used when tenant is in loading state. - enum BrokenVerbosity { - Error, - Info - } - let make_broken = - |t: &Tenant, err: anyhow::Error, verbosity: BrokenVerbosity| { - match verbosity { - BrokenVerbosity::Info => { - info!("attach cancelled, setting tenant state to Broken: {err}"); - }, - BrokenVerbosity::Error => { - error!("attach failed, setting tenant state to Broken: {err:?}"); - } + fn make_broken_or_stopping(t: &Tenant, err: anyhow::Error) { + t.state.send_modify(|state| match state { + // TODO: the old code alluded to DeleteTenantFlow sometimes setting + // TenantState::Stopping before we get here, but this may be outdated. + // Let's find out with a testing assertion. If this doesn't fire, and the + // logs don't show this happening in production, remove the Stopping cases. 
+ TenantState::Stopping{..} if cfg!(any(test, feature = "testing")) => { + panic!("unexpected TenantState::Stopping during attach") } - t.state.send_modify(|state| { - // The Stopping case is for when we have passed control on to DeleteTenantFlow: - // if it errors, we will call make_broken when tenant is already in Stopping. - assert!( - matches!(*state, TenantState::Attaching | TenantState::Stopping { .. }), - "the attach task owns the tenant state until activation is complete" - ); - - *state = TenantState::broken_from_reason(err.to_string()); - }); - }; + // If the tenant is cancelled, assume the error was caused by cancellation. + TenantState::Attaching if t.cancel.is_cancelled() => { + info!("attach cancelled, setting tenant state to Stopping: {err}"); + // NB: progress None tells `set_stopping` that attach has cancelled. + *state = TenantState::Stopping { progress: None }; + } + // According to the old code, DeleteTenantFlow may already have set this to + // Stopping. Retain its progress. + // TODO: there is no DeleteTenantFlow. Is this still needed? See above. + TenantState::Stopping { progress } if t.cancel.is_cancelled() => { + assert!(progress.is_some(), "concurrent attach cancellation"); + info!("attach cancelled, already Stopping: {err}"); + } + // Mark the tenant as broken. + TenantState::Attaching | TenantState::Stopping { .. } => { + error!("attach failed, setting tenant state to Broken (was {state}): {err:?}"); + *state = TenantState::broken_from_reason(err.to_string()) + } + // The attach task owns the tenant state until activated. + state => panic!("invalid tenant state {state} during attach: {err:?}"), + }); + } // TODO: should also be rejecting tenant conf changes that violate this check. 
if let Err(e) = crate::tenant::storage_layer::inmemory_layer::IndexEntry::validate_checkpoint_distance(tenant_clone.get_checkpoint_distance()) { - make_broken(&tenant_clone, anyhow::anyhow!(e), BrokenVerbosity::Error); + make_broken_or_stopping(&tenant_clone, anyhow::anyhow!(e)); return Ok(()); } @@ -1435,10 +1448,8 @@ impl Tenant { // stayed in Activating for such a long time that shutdown found it in // that state. tracing::info!(state=%tenant_clone.current_state(), "Tenant shut down before activation"); - // Make the tenant broken so that set_stopping will not hang waiting for it to leave - // the Attaching state. This is an over-reaction (nothing really broke, the tenant is - // just shutting down), but ensures progress. - make_broken(&tenant_clone, anyhow::anyhow!("Shut down while Attaching"), BrokenVerbosity::Info); + // Set the tenant to Stopping to signal `set_stopping` that we're done. + make_broken_or_stopping(&tenant_clone, anyhow::anyhow!("Shut down while Attaching")); return Ok(()); }, ) @@ -1457,7 +1468,7 @@ impl Tenant { match res { Ok(p) => Some(p), Err(e) => { - make_broken(&tenant_clone, anyhow::anyhow!(e), BrokenVerbosity::Error); + make_broken_or_stopping(&tenant_clone, anyhow::anyhow!(e)); return Ok(()); } } @@ -1483,9 +1494,7 @@ impl Tenant { info!("attach finished, activating"); tenant_clone.activate(broker_client, None, &ctx); } - Err(e) => { - make_broken(&tenant_clone, anyhow::anyhow!(e), BrokenVerbosity::Error); - } + Err(e) => make_broken_or_stopping(&tenant_clone, anyhow::anyhow!(e)), } // If we are doing an opportunistic warmup attachment at startup, initialize @@ -1525,28 +1534,27 @@ impl Tenant { cancel.clone(), ) .await?; - let (offloaded_add, tenant_manifest) = - match remote_timeline_client::download_tenant_manifest( - remote_storage, - &self.tenant_shard_id, - self.generation, - &cancel, - ) - .await - { - Ok((tenant_manifest, _generation, _manifest_mtime)) => ( - format!("{} offloaded", 
tenant_manifest.offloaded_timelines.len()), - tenant_manifest, - ), - Err(DownloadError::NotFound) => { - ("no manifest".to_string(), TenantManifest::empty()) - } - Err(e) => Err(e)?, - }; + + let tenant_manifest = match download_tenant_manifest( + remote_storage, + &self.tenant_shard_id, + self.generation, + &cancel, + ) + .await + { + Ok((tenant_manifest, _, _)) => Some(tenant_manifest), + Err(DownloadError::NotFound) => None, + Err(err) => return Err(err.into()), + }; info!( - "found {} timelines, and {offloaded_add}", - remote_timeline_ids.len() + "found {} timelines ({} offloaded timelines)", + remote_timeline_ids.len(), + tenant_manifest + .as_ref() + .map(|m| m.offloaded_timelines.len()) + .unwrap_or(0) ); for k in other_keys { @@ -1555,11 +1563,13 @@ impl Tenant { // Avoid downloading IndexPart of offloaded timelines. let mut offloaded_with_prefix = HashSet::new(); - for offloaded in tenant_manifest.offloaded_timelines.iter() { - if remote_timeline_ids.remove(&offloaded.timeline_id) { - offloaded_with_prefix.insert(offloaded.timeline_id); - } else { - // We'll take care later of timelines in the manifest without a prefix + if let Some(tenant_manifest) = &tenant_manifest { + for offloaded in tenant_manifest.offloaded_timelines.iter() { + if remote_timeline_ids.remove(&offloaded.timeline_id) { + offloaded_with_prefix.insert(offloaded.timeline_id); + } else { + // We'll take care later of timelines in the manifest without a prefix + } } } @@ -1633,12 +1643,14 @@ impl Tenant { let mut offloaded_timeline_ids = HashSet::new(); let mut offloaded_timelines_list = Vec::new(); - for timeline_manifest in preload.tenant_manifest.offloaded_timelines.iter() { - let timeline_id = timeline_manifest.timeline_id; - let offloaded_timeline = - OffloadedTimeline::from_manifest(self.tenant_shard_id, timeline_manifest); - offloaded_timelines_list.push((timeline_id, Arc::new(offloaded_timeline))); - offloaded_timeline_ids.insert(timeline_id); + if let Some(tenant_manifest) = 
&preload.tenant_manifest { + for timeline_manifest in tenant_manifest.offloaded_timelines.iter() { + let timeline_id = timeline_manifest.timeline_id; + let offloaded_timeline = + OffloadedTimeline::from_manifest(self.tenant_shard_id, timeline_manifest); + offloaded_timelines_list.push((timeline_id, Arc::new(offloaded_timeline))); + offloaded_timeline_ids.insert(timeline_id); + } } // Complete deletions for offloaded timeline id's from manifest. // The manifest will be uploaded later in this function. @@ -1796,15 +1808,21 @@ impl Tenant { .context("resume_deletion") .map_err(LoadLocalTimelineError::ResumeDeletion)?; } - let needs_manifest_upload = - offloaded_timelines_list.len() != preload.tenant_manifest.offloaded_timelines.len(); { let mut offloaded_timelines_accessor = self.timelines_offloaded.lock().unwrap(); offloaded_timelines_accessor.extend(offloaded_timelines_list.into_iter()); } - if needs_manifest_upload { - self.store_tenant_manifest().await?; + + // Stash the preloaded tenant manifest, and upload a new manifest if changed. + // + // NB: this must happen after the tenant is fully populated above. In particular the + // offloaded timelines, which are included in the manifest. + { + let mut guard = self.remote_tenant_manifest.lock().await; + assert!(guard.is_none(), "tenant manifest set before preload"); // first populated here + *guard = preload.tenant_manifest; } + self.maybe_upload_tenant_manifest().await?; // The local filesystem contents are a cache of what's in the remote IndexPart; // IndexPart is the source of truth. 
@@ -2218,7 +2236,7 @@ impl Tenant { }; // Upload new list of offloaded timelines to S3 - self.store_tenant_manifest().await?; + self.maybe_upload_tenant_manifest().await?; // Activate the timeline (if it makes sense) if !(timeline.is_broken() || timeline.is_stopping()) { @@ -3080,6 +3098,7 @@ impl Tenant { let mut has_pending_l0 = false; for timeline in compact_l0 { let ctx = &ctx.with_scope_timeline(&timeline); + // NB: don't set CompactFlags::YieldForL0, since this is an L0-only compaction pass. let outcome = timeline .compact(cancel, CompactFlags::OnlyL0Compaction.into(), ctx) .instrument(info_span!("compact_timeline", timeline_id = %timeline.timeline_id)) @@ -3097,14 +3116,9 @@ impl Tenant { } } - // Pass 2: image compaction and timeline offloading. If any timelines have accumulated - // more L0 layers, they may also be compacted here. - // - // NB: image compaction may yield if there is pending L0 compaction. - // - // TODO: it will only yield if there is pending L0 compaction on the same timeline. If a - // different timeline needs compaction, it won't. It should check `l0_compaction_trigger`. - // We leave this for a later PR. + // Pass 2: image compaction and timeline offloading. If any timelines have accumulated more + // L0 layers, they may also be compacted here. Image compaction will yield if there is + // pending L0 compaction on any tenant timeline. // // TODO: consider ordering timelines by some priority, e.g. time since last full compaction, // amount of L1 delta debt or garbage, offload-eligible timelines first, etc. @@ -3115,8 +3129,14 @@ impl Tenant { } let ctx = &ctx.with_scope_timeline(&timeline); + // Yield for L0 if the separate L0 pass is enabled (otherwise there's no point). 
+ let mut flags = EnumSet::default(); + if self.get_compaction_l0_first() { + flags |= CompactFlags::YieldForL0; + } + let mut outcome = timeline - .compact(cancel, EnumSet::default(), ctx) + .compact(cancel, flags, ctx) .instrument(info_span!("compact_timeline", timeline_id = %timeline.timeline_id)) .await .inspect_err(|err| self.maybe_trip_compaction_breaker(err))?; @@ -3246,17 +3266,23 @@ impl Tenant { async fn housekeeping(&self) { // Call through to all timelines to freeze ephemeral layers as needed. This usually happens // during ingest, but we don't want idle timelines to hold open layers for too long. - let timelines = self - .timelines - .lock() - .unwrap() - .values() - .filter(|tli| tli.is_active()) - .cloned() - .collect_vec(); + // + // We don't do this if the tenant can't upload layers (i.e. it's in stale attachment mode). + // We don't run compaction in this case either, and don't want to keep flushing tiny L0 + // layers that won't be compacted down. + if self.tenant_conf.load().location.may_upload_layers_hint() { + let timelines = self + .timelines + .lock() + .unwrap() + .values() + .filter(|tli| tli.is_active()) + .cloned() + .collect_vec(); - for timeline in timelines { - timeline.maybe_freeze_ephemeral_layer().await; + for timeline in timelines { + timeline.maybe_freeze_ephemeral_layer().await; + } } // Shut down walredo if idle. @@ -3421,7 +3447,7 @@ impl Tenant { shutdown_mode }; - match self.set_stopping(shutdown_progress, false, false).await { + match self.set_stopping(shutdown_progress).await { Ok(()) => {} Err(SetStoppingError::Broken) => { // assume that this is acceptable @@ -3501,25 +3527,13 @@ impl Tenant { /// This function waits for the tenant to become active if it isn't already, before transitioning it into Stopping state. /// /// This function is not cancel-safe! - /// - /// `allow_transition_from_loading` is needed for the special case of loading task deleting the tenant. 
- /// `allow_transition_from_attaching` is needed for the special case of attaching deleted tenant. - async fn set_stopping( - &self, - progress: completion::Barrier, - _allow_transition_from_loading: bool, - allow_transition_from_attaching: bool, - ) -> Result<(), SetStoppingError> { + async fn set_stopping(&self, progress: completion::Barrier) -> Result<(), SetStoppingError> { let mut rx = self.state.subscribe(); // cannot stop before we're done activating, so wait out until we're done activating rx.wait_for(|state| match state { - TenantState::Attaching if allow_transition_from_attaching => true, TenantState::Activating(_) | TenantState::Attaching => { - info!( - "waiting for {} to turn Active|Broken|Stopping", - <&'static str>::from(state) - ); + info!("waiting for {state} to turn Active|Broken|Stopping"); false } TenantState::Active | TenantState::Broken { .. } | TenantState::Stopping { .. } => true, @@ -3530,25 +3544,24 @@ impl Tenant { // we now know we're done activating, let's see whether this task is the winner to transition into Stopping let mut err = None; let stopping = self.state.send_if_modified(|current_state| match current_state { - TenantState::Activating(_) => { - unreachable!("1we ensured above that we're done with activation, and, there is no re-activation") - } - TenantState::Attaching => { - if !allow_transition_from_attaching { - unreachable!("2we ensured above that we're done with activation, and, there is no re-activation") - }; - *current_state = TenantState::Stopping { progress }; - true + TenantState::Activating(_) | TenantState::Attaching => { + unreachable!("we ensured above that we're done with activation, and, there is no re-activation") } TenantState::Active => { // FIXME: due to time-of-check vs time-of-use issues, it can happen that new timelines // are created after the transition to Stopping. That's harmless, as the Timelines // won't be accessible to anyone afterwards, because the Tenant is in Stopping state. 
- *current_state = TenantState::Stopping { progress }; + *current_state = TenantState::Stopping { progress: Some(progress) }; // Continue stopping outside the closure. We need to grab timelines.lock() // and we plan to turn it into a tokio::sync::Mutex in a future patch. true } + TenantState::Stopping { progress: None } => { + // An attach was cancelled, and the attach transitioned the tenant from Attaching to + // Stopping(None) to let us know it exited. Register our progress and continue. + *current_state = TenantState::Stopping { progress: Some(progress) }; + true + } TenantState::Broken { reason, .. } => { info!( "Cannot set tenant to Stopping state, it is in Broken state due to: {reason}" @@ -3556,7 +3569,7 @@ impl Tenant { err = Some(SetStoppingError::Broken); false } - TenantState::Stopping { progress } => { + TenantState::Stopping { progress: Some(progress) } => { info!("Tenant is already in Stopping state"); err = Some(SetStoppingError::AlreadyStopping(progress.clone())); false @@ -3681,7 +3694,7 @@ impl Tenant { } } } - TenantState::Active { .. } => { + TenantState::Active => { return Ok(()); } TenantState::Broken { reason, .. } => { @@ -4057,18 +4070,20 @@ impl Tenant { /// Generate an up-to-date TenantManifest based on the state of this Tenant. fn build_tenant_manifest(&self) -> TenantManifest { - let timelines_offloaded = self.timelines_offloaded.lock().unwrap(); - - let mut timeline_manifests = timelines_offloaded - .iter() - .map(|(_timeline_id, offloaded)| offloaded.manifest()) - .collect::>(); - // Sort the manifests so that our output is deterministic - timeline_manifests.sort_by_key(|timeline_manifest| timeline_manifest.timeline_id); + // Collect the offloaded timelines, and sort them for deterministic output. 
+ let offloaded_timelines = self + .timelines_offloaded + .lock() + .unwrap() + .values() + .map(|tli| tli.manifest()) + .sorted_by_key(|m| m.timeline_id) + .collect_vec(); TenantManifest { version: LATEST_TENANT_MANIFEST_VERSION, - offloaded_timelines: timeline_manifests, + stripe_size: Some(self.get_shard_stripe_size()), + offloaded_timelines, } } @@ -4197,9 +4212,9 @@ impl Tenant { self.cancel.child_token(), ); - let timeline_ctx = RequestContextBuilder::extend(ctx) + let timeline_ctx = RequestContextBuilder::from(ctx) .scope(context::Scope::new_timeline(&timeline)) - .build(); + .detached_child(); Ok((timeline, timeline_ctx)) } @@ -4291,7 +4306,7 @@ impl Tenant { timelines: Mutex::new(HashMap::new()), timelines_creating: Mutex::new(HashSet::new()), timelines_offloaded: Mutex::new(HashMap::new()), - tenant_manifest_upload: Default::default(), + remote_tenant_manifest: Default::default(), gc_cs: tokio::sync::Mutex::new(()), walredo_mgr, remote_storage, @@ -4387,10 +4402,7 @@ impl Tenant { .to_string(); fail::fail_point!("tenant-config-before-write", |_| { - Err(std::io::Error::new( - std::io::ErrorKind::Other, - "tenant-config-before-write", - )) + Err(std::io::Error::other("tenant-config-before-write")) }); // Convert the config to a toml file. @@ -5524,27 +5536,35 @@ impl Tenant { .unwrap_or(0) } - /// Serialize and write the latest TenantManifest to remote storage. - pub(crate) async fn store_tenant_manifest(&self) -> Result<(), TenantManifestError> { - // Only one manifest write may be done at at time, and the contents of the manifest - // must be loaded while holding this lock. This makes it safe to call this function - // from anywhere without worrying about colliding updates. + /// Builds a new tenant manifest, and uploads it if it differs from the last-known tenant + /// manifest in `Self::remote_tenant_manifest`. 
+ /// + /// TODO: instead of requiring callers to remember to call `maybe_upload_tenant_manifest` after + /// changing any `Tenant` state that's included in the manifest, consider making the manifest + /// the authoritative source of data with an API that automatically uploads on changes. Revisit + /// this when the manifest is more widely used and we have a better idea of the data model. + pub(crate) async fn maybe_upload_tenant_manifest(&self) -> Result<(), TenantManifestError> { + // Multiple tasks may call this function concurrently after mutating the Tenant runtime + // state, affecting the manifest generated by `build_tenant_manifest`. We use an async mutex + // to serialize these callers. `eq_ignoring_version` acts as a slightly inefficient but + // simple coalescing mechanism. let mut guard = tokio::select! { - g = self.tenant_manifest_upload.lock() => { - g - }, - _ = self.cancel.cancelled() => { - return Err(TenantManifestError::Cancelled); - } + guard = self.remote_tenant_manifest.lock() => guard, + _ = self.cancel.cancelled() => return Err(TenantManifestError::Cancelled), }; + // Build a new manifest. let manifest = self.build_tenant_manifest(); - if Some(&manifest) == (*guard).as_ref() { - // Optimisation: skip uploads that don't change anything. - return Ok(()); + + // Check if the manifest has changed. We ignore the version number here, to avoid + // uploading every manifest on version number bumps. + if let Some(old) = guard.as_ref() { + if manifest.eq_ignoring_version(old) { + return Ok(()); + } } - // Remote storage does no retries internally, so wrap it + // Upload the manifest. Remote storage does no retries internally, so retry here. 
match backoff::retry( || async { upload_tenant_manifest( @@ -5556,7 +5576,7 @@ impl Tenant { ) .await }, - |_e| self.cancel.is_cancelled(), + |_| self.cancel.is_cancelled(), FAILED_UPLOAD_WARN_THRESHOLD, FAILED_REMOTE_OP_RETRIES, "uploading tenant manifest", @@ -5860,6 +5880,7 @@ pub(crate) mod harness { base_img: Option<(Lsn, Bytes)>, records: Vec<(Lsn, NeonWalRecord)>, _pg_version: u32, + _redo_attempt_type: RedoAttemptType, ) -> Result { let records_neon = records.iter().all(|r| apply_neon::can_apply_in_neon(&r.1)); if records_neon { @@ -5912,12 +5933,20 @@ mod tests { use models::CompactLsnRange; use pageserver_api::key::{AUX_KEY_PREFIX, Key, NON_INHERITED_RANGE, RELATION_SIZE_PREFIX}; use pageserver_api::keyspace::KeySpace; + #[cfg(feature = "testing")] + use pageserver_api::keyspace::KeySpaceRandomAccum; use pageserver_api::models::{CompactionAlgorithm, CompactionAlgorithmSettings}; #[cfg(feature = "testing")] use pageserver_api::record::NeonWalRecord; use pageserver_api::value::Value; use pageserver_compaction::helpers::overlaps_with; + #[cfg(feature = "testing")] + use rand::SeedableRng; + #[cfg(feature = "testing")] + use rand::rngs::StdRng; use rand::{Rng, thread_rng}; + #[cfg(feature = "testing")] + use std::ops::Range; use storage_layer::{IoConcurrency, PersistentLayerKey}; use tests::storage_layer::ValuesReconstructState; use tests::timeline::{GetVectoredError, ShutdownMode}; @@ -5927,7 +5956,7 @@ mod tests { use timeline::InMemoryLayerTestDesc; #[cfg(feature = "testing")] use timeline::compaction::{KeyHistoryRetention, KeyLogAtLsn}; - use timeline::{CompactOptions, DeltaLayerTestDesc}; + use timeline::{CompactOptions, DeltaLayerTestDesc, VersionedKeySpaceQuery}; use utils::id::TenantId; use super::*; @@ -5939,6 +5968,318 @@ mod tests { static TEST_KEY: Lazy = Lazy::new(|| Key::from_slice(&hex!("010000000033333333444444445500000001"))); + #[cfg(feature = "testing")] + struct TestTimelineSpecification { + start_lsn: Lsn, + last_record_lsn: Lsn, + + 
in_memory_layers_shape: Vec<(Range, Range)>, + delta_layers_shape: Vec<(Range, Range)>, + image_layers_shape: Vec<(Range, Lsn)>, + + gap_chance: u8, + will_init_chance: u8, + } + + #[cfg(feature = "testing")] + struct Storage { + storage: HashMap<(Key, Lsn), Value>, + start_lsn: Lsn, + } + + #[cfg(feature = "testing")] + impl Storage { + fn get(&self, key: Key, lsn: Lsn) -> Bytes { + use bytes::BufMut; + + let mut crnt_lsn = lsn; + let mut got_base = false; + + let mut acc = Vec::new(); + + while crnt_lsn >= self.start_lsn { + if let Some(value) = self.storage.get(&(key, crnt_lsn)) { + acc.push(value.clone()); + + match value { + Value::WalRecord(NeonWalRecord::Test { will_init, .. }) => { + if *will_init { + got_base = true; + break; + } + } + Value::Image(_) => { + got_base = true; + break; + } + _ => unreachable!(), + } + } + + crnt_lsn = crnt_lsn.checked_sub(1u64).unwrap(); + } + + assert!( + got_base, + "Input data was incorrect. No base image for {key}@{lsn}" + ); + + tracing::debug!("Wal redo depth for {key}@{lsn} is {}", acc.len()); + + let mut blob = BytesMut::new(); + for value in acc.into_iter().rev() { + match value { + Value::WalRecord(NeonWalRecord::Test { append, .. 
}) => { + blob.extend_from_slice(append.as_bytes()); + } + Value::Image(img) => { + blob.put(img); + } + _ => unreachable!(), + } + } + + blob.into() + } + } + + #[cfg(feature = "testing")] + #[allow(clippy::too_many_arguments)] + async fn randomize_timeline( + tenant: &Arc, + new_timeline_id: TimelineId, + pg_version: u32, + spec: TestTimelineSpecification, + random: &mut rand::rngs::StdRng, + ctx: &RequestContext, + ) -> anyhow::Result<(Arc, Storage, Vec)> { + let mut storage: HashMap<(Key, Lsn), Value> = HashMap::default(); + let mut interesting_lsns = vec![spec.last_record_lsn]; + + for (key_range, lsn_range) in spec.in_memory_layers_shape.iter() { + let mut lsn = lsn_range.start; + while lsn < lsn_range.end { + let mut key = key_range.start; + while key < key_range.end { + let gap = random.gen_range(1..=100) <= spec.gap_chance; + let will_init = random.gen_range(1..=100) <= spec.will_init_chance; + + if gap { + continue; + } + + let record = if will_init { + Value::WalRecord(NeonWalRecord::wal_init(format!("[wil_init {key}@{lsn}]"))) + } else { + Value::WalRecord(NeonWalRecord::wal_append(format!("[delta {key}@{lsn}]"))) + }; + + storage.insert((key, lsn), record); + + key = key.next(); + } + lsn = Lsn(lsn.0 + 1); + } + + // Stash some interesting LSN for future use + for offset in [0, 5, 100].iter() { + if *offset == 0 { + interesting_lsns.push(lsn_range.start); + } else { + let below = lsn_range.start.checked_sub(*offset); + match below { + Some(v) if v >= spec.start_lsn => { + interesting_lsns.push(v); + } + _ => {} + } + + let above = Lsn(lsn_range.start.0 + offset); + interesting_lsns.push(above); + } + } + } + + for (key_range, lsn_range) in spec.delta_layers_shape.iter() { + let mut lsn = lsn_range.start; + while lsn < lsn_range.end { + let mut key = key_range.start; + while key < key_range.end { + let gap = random.gen_range(1..=100) <= spec.gap_chance; + let will_init = random.gen_range(1..=100) <= spec.will_init_chance; + + if gap { + continue; + } + 
+ let record = if will_init { + Value::WalRecord(NeonWalRecord::wal_init(format!("[wil_init {key}@{lsn}]"))) + } else { + Value::WalRecord(NeonWalRecord::wal_append(format!("[delta {key}@{lsn}]"))) + }; + + storage.insert((key, lsn), record); + + key = key.next(); + } + lsn = Lsn(lsn.0 + 1); + } + + // Stash some interesting LSN for future use + for offset in [0, 5, 100].iter() { + if *offset == 0 { + interesting_lsns.push(lsn_range.start); + } else { + let below = lsn_range.start.checked_sub(*offset); + match below { + Some(v) if v >= spec.start_lsn => { + interesting_lsns.push(v); + } + _ => {} + } + + let above = Lsn(lsn_range.start.0 + offset); + interesting_lsns.push(above); + } + } + } + + for (key_range, lsn) in spec.image_layers_shape.iter() { + let mut key = key_range.start; + while key < key_range.end { + let blob = Bytes::from(format!("[image {key}@{lsn}]")); + let record = Value::Image(blob.clone()); + storage.insert((key, *lsn), record); + + key = key.next(); + } + + // Stash some interesting LSN for future use + for offset in [0, 5, 100].iter() { + if *offset == 0 { + interesting_lsns.push(*lsn); + } else { + let below = lsn.checked_sub(*offset); + match below { + Some(v) if v >= spec.start_lsn => { + interesting_lsns.push(v); + } + _ => {} + } + + let above = Lsn(lsn.0 + offset); + interesting_lsns.push(above); + } + } + } + + let in_memory_test_layers = { + let mut acc = Vec::new(); + + for (key_range, lsn_range) in spec.in_memory_layers_shape.iter() { + let mut data = Vec::new(); + + let mut lsn = lsn_range.start; + while lsn < lsn_range.end { + let mut key = key_range.start; + while key < key_range.end { + if let Some(record) = storage.get(&(key, lsn)) { + data.push((key, lsn, record.clone())); + } + + key = key.next(); + } + lsn = Lsn(lsn.0 + 1); + } + + acc.push(InMemoryLayerTestDesc { + data, + lsn_range: lsn_range.clone(), + is_open: false, + }) + } + + acc + }; + + let delta_test_layers = { + let mut acc = Vec::new(); + + for (key_range, 
lsn_range) in spec.delta_layers_shape.iter() { + let mut data = Vec::new(); + + let mut lsn = lsn_range.start; + while lsn < lsn_range.end { + let mut key = key_range.start; + while key < key_range.end { + if let Some(record) = storage.get(&(key, lsn)) { + data.push((key, lsn, record.clone())); + } + + key = key.next(); + } + lsn = Lsn(lsn.0 + 1); + } + + acc.push(DeltaLayerTestDesc { + data, + lsn_range: lsn_range.clone(), + key_range: key_range.clone(), + }) + } + + acc + }; + + let image_test_layers = { + let mut acc = Vec::new(); + + for (key_range, lsn) in spec.image_layers_shape.iter() { + let mut data = Vec::new(); + + let mut key = key_range.start; + while key < key_range.end { + if let Some(record) = storage.get(&(key, *lsn)) { + let blob = match record { + Value::Image(blob) => blob.clone(), + _ => unreachable!(), + }; + + data.push((key, blob)); + } + + key = key.next(); + } + + acc.push((*lsn, data)); + } + + acc + }; + + let tline = tenant + .create_test_timeline_with_layers( + new_timeline_id, + spec.start_lsn, + pg_version, + ctx, + in_memory_test_layers, + delta_test_layers, + image_test_layers, + spec.last_record_lsn, + ) + .await?; + + Ok(( + tline, + Storage { + storage, + start_lsn: spec.start_lsn, + }, + interesting_lsns, + )) + } + #[tokio::test] async fn test_basic() -> anyhow::Result<()> { let (tenant, ctx) = TenantHarness::create("test_basic").await?.load().await; @@ -6516,11 +6857,7 @@ mod tests { tline.freeze_and_flush().await?; tline - .compact( - &CancellationToken::new(), - CompactFlags::NoYield.into(), - &ctx, - ) + .compact(&CancellationToken::new(), EnumSet::default(), &ctx) .await?; let mut writer = tline.writer().await; @@ -6537,11 +6874,7 @@ mod tests { tline.freeze_and_flush().await?; tline - .compact( - &CancellationToken::new(), - CompactFlags::NoYield.into(), - &ctx, - ) + .compact(&CancellationToken::new(), EnumSet::default(), &ctx) .await?; let mut writer = tline.writer().await; @@ -6558,11 +6891,7 @@ mod tests { 
tline.freeze_and_flush().await?; tline - .compact( - &CancellationToken::new(), - CompactFlags::NoYield.into(), - &ctx, - ) + .compact(&CancellationToken::new(), EnumSet::default(), &ctx) .await?; let mut writer = tline.writer().await; @@ -6579,11 +6908,7 @@ mod tests { tline.freeze_and_flush().await?; tline - .compact( - &CancellationToken::new(), - CompactFlags::NoYield.into(), - &ctx, - ) + .compact(&CancellationToken::new(), EnumSet::default(), &ctx) .await?; assert_eq!( @@ -6666,9 +6991,7 @@ mod tests { timeline.freeze_and_flush().await?; if compact { // this requires timeline to be &Arc - timeline - .compact(&cancel, CompactFlags::NoYield.into(), ctx) - .await?; + timeline.compact(&cancel, EnumSet::default(), ctx).await?; } // this doesn't really need to use the timeline_id target, but it is closer to what it @@ -6783,10 +7106,11 @@ mod tests { for read in reads { info!("Doing vectored read on {:?}", read); + let query = VersionedKeySpaceQuery::uniform(read.clone(), reads_lsn); + let vectored_res = tline .get_vectored_impl( - read.clone(), - reads_lsn, + query, &mut ValuesReconstructState::new(io_concurrency.clone()), &ctx, ) @@ -6865,10 +7189,11 @@ mod tests { }; let read_lsn = child_timeline.get_last_record_lsn(); + let query = VersionedKeySpaceQuery::uniform(aux_keyspace.clone(), read_lsn); + let vectored_res = child_timeline .get_vectored_impl( - aux_keyspace.clone(), - read_lsn, + query, &mut ValuesReconstructState::new(io_concurrency.clone()), &ctx, ) @@ -6995,7 +7320,6 @@ mod tests { child_timeline.freeze_and_flush().await?; let mut flags = EnumSet::new(); flags.insert(CompactFlags::ForceRepartition); - flags.insert(CompactFlags::NoYield); child_timeline .compact(&CancellationToken::new(), flags, &ctx) .await?; @@ -7015,10 +7339,12 @@ mod tests { let read = KeySpace { ranges: vec![key_near_gap..gap_at_key.next(), key_near_end..current_key], }; + + let query = VersionedKeySpaceQuery::uniform(read.clone(), current_lsn); + let results = child_timeline 
.get_vectored_impl( - read.clone(), - current_lsn, + query, &mut ValuesReconstructState::new(io_concurrency.clone()), &ctx, ) @@ -7149,12 +7475,16 @@ mod tests { } for query_lsn in query_lsns { + let query = VersionedKeySpaceQuery::uniform( + KeySpace { + ranges: vec![child_gap_at_key..child_gap_at_key.next()], + }, + query_lsn, + ); + let results = child_timeline .get_vectored_impl( - KeySpace { - ranges: vec![child_gap_at_key..child_gap_at_key.next()], - }, - query_lsn, + query, &mut ValuesReconstructState::new(io_concurrency.clone()), &ctx, ) @@ -7374,9 +7704,7 @@ mod tests { // Perform a cycle of flush, compact, and GC tline.freeze_and_flush().await?; - tline - .compact(&cancel, CompactFlags::NoYield.into(), &ctx) - .await?; + tline.compact(&cancel, EnumSet::default(), &ctx).await?; tenant .gc_iteration(Some(tline.timeline_id), 0, Duration::ZERO, &cancel, &ctx) .await?; @@ -7655,10 +7983,11 @@ mod tests { } let mut cnt = 0; + let query = VersionedKeySpaceQuery::uniform(keyspace.clone(), lsn); + for (key, value) in tline .get_vectored_impl( - keyspace.clone(), - lsn, + query, &mut ValuesReconstructState::new(io_concurrency.clone()), &ctx, ) @@ -7705,7 +8034,6 @@ mod tests { let mut flags = EnumSet::new(); flags.insert(CompactFlags::ForceImageLayerCreation); flags.insert(CompactFlags::ForceRepartition); - flags.insert(CompactFlags::NoYield); flags } else { EnumSet::empty() @@ -7756,9 +8084,7 @@ mod tests { let before_num_l0_delta_files = tline.layers.read().await.layer_map()?.level0_deltas().len(); - tline - .compact(&cancel, CompactFlags::NoYield.into(), &ctx) - .await?; + tline.compact(&cancel, EnumSet::default(), &ctx).await?; let after_num_l0_delta_files = tline.layers.read().await.layer_map()?.level0_deltas().len(); @@ -7868,8 +8194,9 @@ mod tests { io_concurrency: IoConcurrency, ) -> anyhow::Result<(BTreeMap>, usize)> { let mut reconstruct_state = ValuesReconstructState::new(io_concurrency); + let query = VersionedKeySpaceQuery::uniform(keyspace.clone(), 
lsn); let res = tline - .get_vectored_impl(keyspace.clone(), lsn, &mut reconstruct_state, ctx) + .get_vectored_impl(query, &mut reconstruct_state, ctx) .await?; Ok((res, reconstruct_state.get_delta_layers_visited() as usize)) } @@ -7923,7 +8250,6 @@ mod tests { let mut flags = EnumSet::new(); flags.insert(CompactFlags::ForceImageLayerCreation); flags.insert(CompactFlags::ForceRepartition); - flags.insert(CompactFlags::NoYield); flags }, &ctx, @@ -8167,13 +8493,10 @@ mod tests { // test vectored scan on parent timeline let mut reconstruct_state = ValuesReconstructState::new(io_concurrency.clone()); + let query = + VersionedKeySpaceQuery::uniform(KeySpace::single(Key::metadata_key_range()), lsn); let res = tline - .get_vectored_impl( - KeySpace::single(Key::metadata_key_range()), - lsn, - &mut reconstruct_state, - &ctx, - ) + .get_vectored_impl(query, &mut reconstruct_state, &ctx) .await?; assert_eq!( @@ -8193,13 +8516,10 @@ mod tests { // test vectored scan on child timeline let mut reconstruct_state = ValuesReconstructState::new(io_concurrency.clone()); + let query = + VersionedKeySpaceQuery::uniform(KeySpace::single(Key::metadata_key_range()), lsn); let res = child - .get_vectored_impl( - KeySpace::single(Key::metadata_key_range()), - lsn, - &mut reconstruct_state, - &ctx, - ) + .get_vectored_impl(query, &mut reconstruct_state, &ctx) .await?; assert_eq!( @@ -8233,13 +8553,9 @@ mod tests { let io_concurrency = IoConcurrency::spawn_from_conf(tline.conf, tline.gate.enter().unwrap()); let mut reconstruct_state = ValuesReconstructState::new(io_concurrency); + let query = VersionedKeySpaceQuery::uniform(KeySpace::single(key..key.next()), lsn); let mut res = tline - .get_vectored_impl( - KeySpace::single(key..key.next()), - lsn, - &mut reconstruct_state, - ctx, - ) + .get_vectored_impl(query, &mut reconstruct_state, ctx) .await?; Ok(res.pop_last().map(|(k, v)| { assert_eq!(k, key); @@ -8386,7 +8702,6 @@ mod tests { let mut flags = EnumSet::new(); 
flags.insert(CompactFlags::ForceImageLayerCreation); flags.insert(CompactFlags::ForceRepartition); - flags.insert(CompactFlags::NoYield); flags }, &ctx, @@ -8454,7 +8769,6 @@ mod tests { let mut flags = EnumSet::new(); flags.insert(CompactFlags::ForceImageLayerCreation); flags.insert(CompactFlags::ForceRepartition); - flags.insert(CompactFlags::NoYield); flags }, &ctx, @@ -8741,6 +9055,21 @@ mod tests { Lsn(0x20), Value::WalRecord(NeonWalRecord::wal_init("i")), ), + ( + get_key(4), + Lsn(0x30), + Value::WalRecord(NeonWalRecord::wal_append_conditional("j", "i")), + ), + ( + get_key(5), + Lsn(0x20), + Value::WalRecord(NeonWalRecord::wal_init("1")), + ), + ( + get_key(5), + Lsn(0x30), + Value::WalRecord(NeonWalRecord::wal_append_conditional("j", "2")), + ), ]; let image1 = vec![(get_key(1), "0x10".into())]; @@ -8771,8 +9100,18 @@ mod tests { // Need to remove the limit of "Neon WAL redo requires base image". - // assert_eq!(tline.get(get_key(3), Lsn(0x50), &ctx).await?, Bytes::new()); - // assert_eq!(tline.get(get_key(4), Lsn(0x50), &ctx).await?, Bytes::new()); + assert_eq!( + tline.get(get_key(3), Lsn(0x50), &ctx).await?, + Bytes::from_static(b"c") + ); + assert_eq!( + tline.get(get_key(4), Lsn(0x50), &ctx).await?, + Bytes::from_static(b"ij") + ); + + // Manual testing required: currently, read errors will panic the process in debug mode. So we + // cannot enable this assertion in the unit test. 
+ // assert!(tline.get(get_key(5), Lsn(0x50), &ctx).await.is_err()); Ok(()) } @@ -9238,6 +9577,7 @@ mod tests { &[Lsn(0x20), Lsn(0x40), Lsn(0x50)], 3, None, + true, ) .await .unwrap(); @@ -9362,7 +9702,15 @@ mod tests { ), ]; let res = tline - .generate_key_retention(key, &history, Lsn(0x60), &[Lsn(0x40), Lsn(0x50)], 3, None) + .generate_key_retention( + key, + &history, + Lsn(0x60), + &[Lsn(0x40), Lsn(0x50)], + 3, + None, + true, + ) .await .unwrap(); let expected_res = KeyHistoryRetention { @@ -9441,6 +9789,7 @@ mod tests { &[], 3, Some((key, Lsn(0x10), Bytes::copy_from_slice(b"0x10"))), + true, ) .await .unwrap(); @@ -9489,6 +9838,7 @@ mod tests { &[Lsn(0x30)], 3, Some((key, Lsn(0x10), Bytes::copy_from_slice(b"0x10"))), + true, ) .await .unwrap(); @@ -10339,14 +10689,13 @@ mod tests { ) .await?; - let keyspace = KeySpace::single(get_key(0)..get_key(10)); + let query = VersionedKeySpaceQuery::uniform( + KeySpace::single(get_key(0)..get_key(10)), + delta_layer_end_lsn, + ); + let results = tline - .get_vectored( - keyspace, - delta_layer_end_lsn, - IoConcurrency::sequential(), - &ctx, - ) + .get_vectored(query, IoConcurrency::sequential(), &ctx) .await .expect("No vectored errors"); for (key, res) in results { @@ -10494,9 +10843,13 @@ mod tests { ) .await?; - let keyspace = KeySpace::single(get_key(0)..get_key(10)); + let query = VersionedKeySpaceQuery::uniform( + KeySpace::single(get_key(0)..get_key(10)), + last_record_lsn, + ); + let results = tline - .get_vectored(keyspace, last_record_lsn, IoConcurrency::sequential(), &ctx) + .get_vectored(query, IoConcurrency::sequential(), &ctx) .await .expect("No vectored errors"); for (key, res) in results { @@ -10510,6 +10863,214 @@ mod tests { Ok(()) } + // A randomized read path test. Generates a layer map according to a deterministic + // specification. Fills the (key, LSN) space in random manner and then performs + // random scattered queries validating the results against in-memory storage. 
+ // + // See this internal Notion page for a diagram of the layer map: + // https://www.notion.so/neondatabase/Read-Path-Unit-Testing-Fuzzing-1d1f189e0047806c8e5cd37781b0a350?pvs=4 + // + // A fuzzing mode is also supported. In this mode, the test will use a random + // seed instead of a hardcoded one. Use it in conjunction with `cargo stress` + // to run multiple instances in parallel: + // + // $ RUST_BACKTRACE=1 RUST_LOG=INFO \ + // cargo stress --package=pageserver --features=testing,fuzz-read-path --release -- test_read_path + #[cfg(feature = "testing")] + #[tokio::test] + async fn test_read_path() -> anyhow::Result<()> { + use rand::seq::SliceRandom; + + let seed = if cfg!(feature = "fuzz-read-path") { + let seed: u64 = thread_rng().r#gen(); + seed + } else { + // Use a hard-coded seed when not in fuzzing mode. + // Note that with the current approach results are not reproducible + // accross platforms and Rust releases. + const SEED: u64 = 0; + SEED + }; + + let mut random = StdRng::seed_from_u64(seed); + + let (queries, will_init_chance, gap_chance) = if cfg!(feature = "fuzz-read-path") { + const QUERIES: u64 = 5000; + let will_init_chance: u8 = random.gen_range(0..=10); + let gap_chance: u8 = random.gen_range(0..=50); + + (QUERIES, will_init_chance, gap_chance) + } else { + const QUERIES: u64 = 1000; + const WILL_INIT_CHANCE: u8 = 1; + const GAP_CHANCE: u8 = 5; + + (QUERIES, WILL_INIT_CHANCE, GAP_CHANCE) + }; + + let harness = TenantHarness::create("test_read_path").await?; + let (tenant, ctx) = harness.load().await; + + tracing::info!("Using random seed: {seed}"); + tracing::info!(%will_init_chance, %gap_chance, "Fill params"); + + // Define the layer map shape. Note that this part is not randomized. 
+ + const KEY_DIMENSION_SIZE: u32 = 99; + let start_key = Key::from_hex("110000000033333333444444445500000000").unwrap(); + let end_key = start_key.add(KEY_DIMENSION_SIZE); + let total_key_range = start_key..end_key; + let total_key_range_size = end_key.to_i128() - start_key.to_i128(); + let total_start_lsn = Lsn(104); + let last_record_lsn = Lsn(504); + + assert!(total_key_range_size % 3 == 0); + + let in_memory_layers_shape = vec![ + (total_key_range.clone(), Lsn(304)..Lsn(400)), + (total_key_range.clone(), Lsn(400)..last_record_lsn), + ]; + + let delta_layers_shape = vec![ + ( + start_key..(start_key.add((total_key_range_size / 3) as u32)), + Lsn(200)..Lsn(304), + ), + ( + (start_key.add((total_key_range_size / 3) as u32)) + ..(start_key.add((total_key_range_size * 2 / 3) as u32)), + Lsn(200)..Lsn(304), + ), + ( + (start_key.add((total_key_range_size * 2 / 3) as u32)) + ..(start_key.add(total_key_range_size as u32)), + Lsn(200)..Lsn(304), + ), + ]; + + let image_layers_shape = vec![ + ( + start_key.add((total_key_range_size * 2 / 3 - 10) as u32) + ..start_key.add((total_key_range_size * 2 / 3 + 10) as u32), + Lsn(456), + ), + ( + start_key.add((total_key_range_size / 3 - 10) as u32) + ..start_key.add((total_key_range_size / 3 + 10) as u32), + Lsn(256), + ), + (total_key_range.clone(), total_start_lsn), + ]; + + let specification = TestTimelineSpecification { + start_lsn: total_start_lsn, + last_record_lsn, + in_memory_layers_shape, + delta_layers_shape, + image_layers_shape, + gap_chance, + will_init_chance, + }; + + // Create and randomly fill in the layers according to the specification + let (tline, storage, interesting_lsns) = randomize_timeline( + &tenant, + TIMELINE_ID, + DEFAULT_PG_VERSION, + specification, + &mut random, + &ctx, + ) + .await?; + + // Now generate queries based on the interesting lsns that we've collected. + // + // While there's still room in the query, pick and interesting LSN and a random + // key. 
Then roll the dice to see if the next key should also be included in + // the query. When the roll fails, break the "batch" and pick another point in the + // (key, LSN) space. + + const PICK_NEXT_CHANCE: u8 = 50; + for _ in 0..queries { + let query = { + let mut keyspaces_at_lsn: HashMap = HashMap::default(); + let mut used_keys: HashSet = HashSet::default(); + + while used_keys.len() < Timeline::MAX_GET_VECTORED_KEYS as usize { + let selected_lsn = interesting_lsns.choose(&mut random).expect("not empty"); + let mut selected_key = start_key.add(random.gen_range(0..KEY_DIMENSION_SIZE)); + + while used_keys.len() < Timeline::MAX_GET_VECTORED_KEYS as usize { + if used_keys.contains(&selected_key) + || selected_key >= start_key.add(KEY_DIMENSION_SIZE) + { + break; + } + + keyspaces_at_lsn + .entry(*selected_lsn) + .or_default() + .add_key(selected_key); + used_keys.insert(selected_key); + + let pick_next = random.gen_range(0..=100) <= PICK_NEXT_CHANCE; + if pick_next { + selected_key = selected_key.next(); + } else { + break; + } + } + } + + VersionedKeySpaceQuery::scattered( + keyspaces_at_lsn + .into_iter() + .map(|(lsn, acc)| (lsn, acc.to_keyspace())) + .collect(), + ) + }; + + // Run the query and validate the results + + let results = tline + .get_vectored(query.clone(), IoConcurrency::Sequential, &ctx) + .await; + + let blobs = match results { + Ok(ok) => ok, + Err(err) => { + panic!("seed={seed} Error returned for query {query}: {err}"); + } + }; + + for (key, key_res) in blobs.into_iter() { + match key_res { + Ok(blob) => { + let requested_at_lsn = query.map_key_to_lsn(&key); + let expected = storage.get(key, requested_at_lsn); + + if blob != expected { + tracing::error!( + "seed={seed} Mismatch for {key}@{requested_at_lsn} from query: {query}" + ); + } + + assert_eq!(blob, expected); + } + Err(err) => { + let requested_at_lsn = query.map_key_to_lsn(&key); + + panic!( + "seed={seed} Error returned for {key}@{requested_at_lsn} from query {query}: {err}" + ); + 
} + } + } + } + + Ok(()) + } + fn sort_layer_key(k1: &PersistentLayerKey, k2: &PersistentLayerKey) -> std::cmp::Ordering { ( k1.is_delta, @@ -11551,4 +12112,348 @@ mod tests { Ok(()) } + + #[cfg(feature = "testing")] + #[tokio::test] + async fn test_bottom_most_compation_redo_failure() -> anyhow::Result<()> { + let harness = TenantHarness::create("test_bottom_most_compation_redo_failure").await?; + let (tenant, ctx) = harness.load().await; + + fn get_key(id: u32) -> Key { + // using aux key here b/c they are guaranteed to be inside `collect_keyspace`. + let mut key = Key::from_hex("620000000033333333444444445500000000").unwrap(); + key.field6 = id; + key + } + + let img_layer = (0..10) + .map(|id| (get_key(id), Bytes::from(format!("value {id}@0x10")))) + .collect_vec(); + + let delta1 = vec![ + ( + get_key(1), + Lsn(0x20), + Value::WalRecord(NeonWalRecord::wal_append("@0x20")), + ), + ( + get_key(1), + Lsn(0x24), + Value::WalRecord(NeonWalRecord::wal_append("@0x24")), + ), + ( + get_key(1), + Lsn(0x28), + // This record will fail to redo + Value::WalRecord(NeonWalRecord::wal_append_conditional("@0x28", "???")), + ), + ]; + + let tline = tenant + .create_test_timeline_with_layers( + TIMELINE_ID, + Lsn(0x10), + DEFAULT_PG_VERSION, + &ctx, + vec![], // in-memory layers + vec![DeltaLayerTestDesc::new_with_inferred_key_range( + Lsn(0x20)..Lsn(0x30), + delta1, + )], // delta layers + vec![(Lsn(0x10), img_layer)], // image layers + Lsn(0x50), + ) + .await?; + { + tline + .applied_gc_cutoff_lsn + .lock_for_write() + .store_and_unlock(Lsn(0x30)) + .wait() + .await; + // Update GC info + let mut guard = tline.gc_info.write().unwrap(); + *guard = GcInfo { + retain_lsns: vec![], + cutoffs: GcCutoffs { + time: Lsn(0x30), + space: Lsn(0x30), + }, + leases: Default::default(), + within_ancestor_pitr: false, + }; + } + + let cancel = CancellationToken::new(); + + // Compaction will fail, but should not fire any critical error. 
+ // Gc-compaction currently cannot figure out what keys are not in the keyspace during the compaction + // process. It will always try to redo the logs it reads and if it doesn't work, fail the entire + // compaction job. Tracked in . + let res = tline + .compact_with_gc( + &cancel, + CompactOptions { + compact_key_range: None, + compact_lsn_range: None, + ..Default::default() + }, + &ctx, + ) + .await; + assert!(res.is_err()); + + Ok(()) + } + + #[cfg(feature = "testing")] + #[tokio::test] + async fn test_synthetic_size_calculation_with_invisible_branches() -> anyhow::Result<()> { + use pageserver_api::models::TimelineVisibilityState; + + use crate::tenant::size::gather_inputs; + + let tenant_conf = pageserver_api::models::TenantConfig { + // Ensure that we don't compute gc_cutoffs (which needs reading the layer files) + pitr_interval: Some(Duration::ZERO), + ..Default::default() + }; + let harness = TenantHarness::create_custom( + "test_synthetic_size_calculation_with_invisible_branches", + tenant_conf, + TenantId::generate(), + ShardIdentity::unsharded(), + Generation::new(0xdeadbeef), + ) + .await?; + let (tenant, ctx) = harness.load().await; + let main_tline = tenant + .create_test_timeline_with_layers( + TIMELINE_ID, + Lsn(0x10), + DEFAULT_PG_VERSION, + &ctx, + vec![], + vec![], + vec![], + Lsn(0x100), + ) + .await?; + + let snapshot1 = TimelineId::from_array(hex!("11223344556677881122334455667790")); + tenant + .branch_timeline_test_with_layers( + &main_tline, + snapshot1, + Some(Lsn(0x20)), + &ctx, + vec![], + vec![], + Lsn(0x50), + ) + .await?; + let snapshot2 = TimelineId::from_array(hex!("11223344556677881122334455667791")); + tenant + .branch_timeline_test_with_layers( + &main_tline, + snapshot2, + Some(Lsn(0x30)), + &ctx, + vec![], + vec![], + Lsn(0x50), + ) + .await?; + let snapshot3 = TimelineId::from_array(hex!("11223344556677881122334455667792")); + tenant + .branch_timeline_test_with_layers( + &main_tline, + snapshot3, + Some(Lsn(0x40)), + &ctx, 
+ vec![], + vec![], + Lsn(0x50), + ) + .await?; + let limit = Arc::new(Semaphore::new(1)); + let max_retention_period = None; + let mut logical_size_cache = HashMap::new(); + let cause = LogicalSizeCalculationCause::EvictionTaskImitation; + let cancel = CancellationToken::new(); + + let inputs = gather_inputs( + &tenant, + &limit, + max_retention_period, + &mut logical_size_cache, + cause, + &cancel, + &ctx, + ) + .instrument(info_span!( + "gather_inputs", + tenant_id = "unknown", + shard_id = "unknown", + )) + .await?; + use crate::tenant::size::{LsnKind, ModelInputs, SegmentMeta}; + use LsnKind::*; + use tenant_size_model::Segment; + let ModelInputs { mut segments, .. } = inputs; + segments.retain(|s| s.timeline_id == TIMELINE_ID); + for segment in segments.iter_mut() { + segment.segment.parent = None; // We don't care about the parent for the test + segment.segment.size = None; // We don't care about the size for the test + } + assert_eq!( + segments, + [ + SegmentMeta { + segment: Segment { + parent: None, + lsn: 0x10, + size: None, + needed: false, + }, + timeline_id: TIMELINE_ID, + kind: BranchStart, + }, + SegmentMeta { + segment: Segment { + parent: None, + lsn: 0x20, + size: None, + needed: false, + }, + timeline_id: TIMELINE_ID, + kind: BranchPoint, + }, + SegmentMeta { + segment: Segment { + parent: None, + lsn: 0x30, + size: None, + needed: false, + }, + timeline_id: TIMELINE_ID, + kind: BranchPoint, + }, + SegmentMeta { + segment: Segment { + parent: None, + lsn: 0x40, + size: None, + needed: false, + }, + timeline_id: TIMELINE_ID, + kind: BranchPoint, + }, + SegmentMeta { + segment: Segment { + parent: None, + lsn: 0x100, + size: None, + needed: false, + }, + timeline_id: TIMELINE_ID, + kind: GcCutOff, + }, // we need to retain everything above the last branch point + SegmentMeta { + segment: Segment { + parent: None, + lsn: 0x100, + size: None, + needed: true, + }, + timeline_id: TIMELINE_ID, + kind: BranchEnd, + }, + ] + ); + + main_tline + 
.remote_client + .schedule_index_upload_for_timeline_invisible_state( + TimelineVisibilityState::Invisible, + )?; + main_tline.remote_client.wait_completion().await?; + let inputs = gather_inputs( + &tenant, + &limit, + max_retention_period, + &mut logical_size_cache, + cause, + &cancel, + &ctx, + ) + .instrument(info_span!( + "gather_inputs", + tenant_id = "unknown", + shard_id = "unknown", + )) + .await?; + let ModelInputs { mut segments, .. } = inputs; + segments.retain(|s| s.timeline_id == TIMELINE_ID); + for segment in segments.iter_mut() { + segment.segment.parent = None; // We don't care about the parent for the test + segment.segment.size = None; // We don't care about the size for the test + } + assert_eq!( + segments, + [ + SegmentMeta { + segment: Segment { + parent: None, + lsn: 0x10, + size: None, + needed: false, + }, + timeline_id: TIMELINE_ID, + kind: BranchStart, + }, + SegmentMeta { + segment: Segment { + parent: None, + lsn: 0x20, + size: None, + needed: false, + }, + timeline_id: TIMELINE_ID, + kind: BranchPoint, + }, + SegmentMeta { + segment: Segment { + parent: None, + lsn: 0x30, + size: None, + needed: false, + }, + timeline_id: TIMELINE_ID, + kind: BranchPoint, + }, + SegmentMeta { + segment: Segment { + parent: None, + lsn: 0x40, + size: None, + needed: false, + }, + timeline_id: TIMELINE_ID, + kind: BranchPoint, + }, + SegmentMeta { + segment: Segment { + parent: None, + lsn: 0x40, // Branch end LSN == last branch point LSN + size: None, + needed: true, + }, + timeline_id: TIMELINE_ID, + kind: BranchEnd, + }, + ] + ); + Ok(()) + } } diff --git a/pageserver/src/tenant/blob_io.rs b/pageserver/src/tenant/blob_io.rs index ff9a7e57b6..abeaa166a4 100644 --- a/pageserver/src/tenant/blob_io.rs +++ b/pageserver/src/tenant/blob_io.rs @@ -15,13 +15,14 @@ //! len >= 128: 1CCCXXXX XXXXXXXX XXXXXXXX XXXXXXXX //! 
use std::cmp::min; -use std::io::{Error, ErrorKind}; +use std::io::Error; use async_compression::Level; use bytes::{BufMut, BytesMut}; use pageserver_api::models::ImageCompressionAlgorithm; use tokio::io::AsyncWriteExt; use tokio_epoll_uring::{BoundedBuf, IoBuf, Slice}; +use tokio_util::sync::CancellationToken; use tracing::warn; use crate::context::RequestContext; @@ -169,7 +170,13 @@ pub struct BlobWriter { } impl BlobWriter { - pub fn new(inner: VirtualFile, start_offset: u64) -> Self { + pub fn new( + inner: VirtualFile, + start_offset: u64, + _gate: &utils::sync::gate::Gate, + _cancel: CancellationToken, + _ctx: &RequestContext, + ) -> Self { Self { inner, offset: start_offset, @@ -331,10 +338,7 @@ impl BlobWriter { return ( ( io_buf.slice_len(), - Err(Error::new( - ErrorKind::Other, - format!("blob too large ({len} bytes)"), - )), + Err(Error::other(format!("blob too large ({len} bytes)"))), ), srcbuf, ); @@ -435,12 +439,14 @@ pub(crate) mod tests { ) -> Result<(Utf8TempDir, Utf8PathBuf, Vec), Error> { let temp_dir = camino_tempfile::tempdir()?; let pathbuf = temp_dir.path().join("file"); + let gate = utils::sync::gate::Gate::default(); + let cancel = CancellationToken::new(); // Write part (in block to drop the file) let mut offsets = Vec::new(); { let file = VirtualFile::create(pathbuf.as_path(), ctx).await?; - let mut wtr = BlobWriter::::new(file, 0); + let mut wtr = BlobWriter::::new(file, 0, &gate, cancel.clone(), ctx); for blob in blobs.iter() { let (_, res) = if compression { let res = wtr diff --git a/pageserver/src/tenant/block_io.rs b/pageserver/src/tenant/block_io.rs index 66c586daff..6723155626 100644 --- a/pageserver/src/tenant/block_io.rs +++ b/pageserver/src/tenant/block_io.rs @@ -216,12 +216,8 @@ impl<'a> FileBlockReader<'a> { match cache .read_immutable_buf(self.file_id, blknum, ctx) .await - .map_err(|e| { - std::io::Error::new( - std::io::ErrorKind::Other, - format!("Failed to read immutable buf: {e:#}"), - ) - })? 
{ + .map_err(|e| std::io::Error::other(format!("Failed to read immutable buf: {e:#}")))? + { ReadBufResult::Found(guard) => Ok(guard.into()), ReadBufResult::NotFound(write_guard) => { // Read the page from disk into the buffer diff --git a/pageserver/src/tenant/layer_map.rs b/pageserver/src/tenant/layer_map.rs index 96cee922ff..23052ccee7 100644 --- a/pageserver/src/tenant/layer_map.rs +++ b/pageserver/src/tenant/layer_map.rs @@ -714,7 +714,7 @@ impl LayerMap { true } - pub fn iter_historic_layers(&self) -> impl '_ + Iterator> { + pub fn iter_historic_layers(&self) -> impl ExactSizeIterator> { self.historic.iter() } diff --git a/pageserver/src/tenant/layer_map/historic_layer_coverage.rs b/pageserver/src/tenant/layer_map/historic_layer_coverage.rs index b3dc8e56a3..5ccc75fff6 100644 --- a/pageserver/src/tenant/layer_map/historic_layer_coverage.rs +++ b/pageserver/src/tenant/layer_map/historic_layer_coverage.rs @@ -504,7 +504,7 @@ impl BufferedHistoricLayerCoverage { } /// Iterate all the layers - pub fn iter(&self) -> impl '_ + Iterator { + pub fn iter(&self) -> impl ExactSizeIterator { // NOTE we can actually perform this without rebuilding, // but it's not necessary for now. 
if !self.buffer.is_empty() { diff --git a/pageserver/src/tenant/layer_map/layer_coverage.rs b/pageserver/src/tenant/layer_map/layer_coverage.rs index cf0085c071..a42ac92973 100644 --- a/pageserver/src/tenant/layer_map/layer_coverage.rs +++ b/pageserver/src/tenant/layer_map/layer_coverage.rs @@ -53,7 +53,7 @@ impl LayerCoverage { /// /// Complexity: O(log N) fn add_node(&mut self, key: i128) { - let value = match self.nodes.range(..=key).last() { + let value = match self.nodes.range(..=key).next_back() { Some((_, Some(v))) => Some(v.clone()), Some((_, None)) => None, None => None, diff --git a/pageserver/src/tenant/mgr.rs b/pageserver/src/tenant/mgr.rs index 61ad682a14..ac81b8e3d7 100644 --- a/pageserver/src/tenant/mgr.rs +++ b/pageserver/src/tenant/mgr.rs @@ -58,7 +58,7 @@ use crate::{InitializationOrder, TEMP_FILE_SUFFIX}; /// For a tenant that appears in TenantsMap, it may either be /// - `Attached`: has a full Tenant object, is elegible to service -/// reads and ingest WAL. +/// reads and ingest WAL. /// - `Secondary`: is only keeping a local cache warm. /// /// Secondary is a totally distinct state rather than being a mode of a `Tenant`, because diff --git a/pageserver/src/tenant/remote_timeline_client.rs b/pageserver/src/tenant/remote_timeline_client.rs index 32c0571b97..10a13ef1a2 100644 --- a/pageserver/src/tenant/remote_timeline_client.rs +++ b/pageserver/src/tenant/remote_timeline_client.rs @@ -642,6 +642,7 @@ impl RemoteTimelineClient { cancel, ) .measure_remote_op( + Option::::None, RemoteOpFileKind::Index, RemoteOpKind::Download, Arc::clone(&self.metrics), @@ -739,6 +740,7 @@ impl RemoteTimelineClient { ctx, ) .measure_remote_op( + Some(ctx.task_kind()), RemoteOpFileKind::Layer, RemoteOpKind::Download, Arc::clone(&self.metrics), @@ -1968,9 +1970,7 @@ impl RemoteTimelineClient { /// Pick next tasks from the queue, and start as many of them as possible without violating /// the ordering constraints. 
/// - /// TODO: consider limiting the number of in-progress tasks, beyond what remote_storage does. - /// This can launch an unbounded number of queued tasks. `UploadQueue::next_ready()` also has - /// worst-case quadratic cost in the number of tasks, and may struggle beyond 10,000 tasks. + /// The number of inprogress tasks is limited by `Self::inprogress_tasks`, see `next_ready`. fn launch_queued_tasks(self: &Arc, upload_queue: &mut UploadQueueInitialized) { while let Some((mut next_op, coalesced_ops)) = upload_queue.next_ready() { debug!("starting op: {next_op}"); @@ -2177,6 +2177,7 @@ impl RemoteTimelineClient { &self.cancel, ) .measure_remote_op( + Some(TaskKind::RemoteUploadTask), RemoteOpFileKind::Layer, RemoteOpKind::Upload, Arc::clone(&self.metrics), @@ -2193,6 +2194,7 @@ impl RemoteTimelineClient { &self.cancel, ) .measure_remote_op( + Some(TaskKind::RemoteUploadTask), RemoteOpFileKind::Index, RemoteOpKind::Upload, Arc::clone(&self.metrics), @@ -2218,6 +2220,11 @@ impl RemoteTimelineClient { } res } + // TODO: this should wait for the deletion to be executed by the deletion queue. + // Otherwise, the deletion may race with an upload and wrongfully delete a newer + // file. Some of the above logic attempts to work around this, it should be replaced + // by the upload queue ordering guarantees (see `can_bypass`). See: + // . 
UploadOp::Delete(delete) => { if self.config.read().unwrap().block_deletions { let mut queue_locked = self.upload_queue.lock().unwrap(); diff --git a/pageserver/src/tenant/remote_timeline_client/index.rs b/pageserver/src/tenant/remote_timeline_client/index.rs index 5635cf3268..a5cd8989aa 100644 --- a/pageserver/src/tenant/remote_timeline_client/index.rs +++ b/pageserver/src/tenant/remote_timeline_client/index.rs @@ -130,7 +130,7 @@ impl IndexPart { /// Version history /// - 2: added `deleted_at` /// - 3: no longer deserialize `timeline_layers` (serialized format is the same, but timeline_layers - /// is always generated from the keys of `layer_metadata`) + /// is always generated from the keys of `layer_metadata`) /// - 4: timeline_layers is fully removed. /// - 5: lineage was added /// - 6: last_aux_file_policy is added. diff --git a/pageserver/src/tenant/remote_timeline_client/manifest.rs b/pageserver/src/tenant/remote_timeline_client/manifest.rs index 543ccc219d..7dba4508e2 100644 --- a/pageserver/src/tenant/remote_timeline_client/manifest.rs +++ b/pageserver/src/tenant/remote_timeline_client/manifest.rs @@ -1,21 +1,33 @@ use chrono::NaiveDateTime; +use pageserver_api::shard::ShardStripeSize; use serde::{Deserialize, Serialize}; use utils::id::TimelineId; use utils::lsn::Lsn; -/// Tenant-shard scoped manifest -#[derive(Clone, Serialize, Deserialize, PartialEq, Eq)] +/// Tenant shard manifest, stored in remote storage. Contains offloaded timelines and other tenant +/// shard-wide information that must be persisted in remote storage. +/// +/// The manifest is always updated on tenant attach, and as needed. +#[derive(Clone, Debug, Serialize, Deserialize, PartialEq, Eq)] pub struct TenantManifest { - /// Debugging aid describing the version of this manifest. - /// Can also be used for distinguishing breaking changes later on. + /// The manifest version. Incremented on manifest format changes, even non-breaking ones. 
+ /// Manifests must generally always be backwards and forwards compatible for one release, to + /// allow release rollbacks. pub version: usize, + /// This tenant's stripe size. This is only advisory, and used to recover tenant data from + /// remote storage. The authoritative source is the storage controller. If None, assume the + /// original default value of 32768 blocks (256 MB). + #[serde(skip_serializing_if = "Option::is_none")] + pub stripe_size: Option, + /// The list of offloaded timelines together with enough information /// to not have to actually load them. /// /// Note: the timelines mentioned in this list might be deleted, i.e. /// we don't hold an invariant that the references aren't dangling. /// Existence of index-part.json is the actual indicator of timeline existence. + #[serde(default)] pub offloaded_timelines: Vec, } @@ -24,7 +36,7 @@ pub struct TenantManifest { /// Very similar to [`pageserver_api::models::OffloadedTimelineInfo`], /// but the two datastructures serve different needs, this is for a persistent disk format /// that must be backwards compatible, while the other is only for informative purposes. -#[derive(Clone, Serialize, Deserialize, Copy, PartialEq, Eq)] +#[derive(Clone, Debug, Serialize, Deserialize, Copy, PartialEq, Eq)] pub struct OffloadedTimelineManifest { pub timeline_id: TimelineId, /// Whether the timeline has a parent it has been branched off from or not @@ -35,20 +47,166 @@ pub struct OffloadedTimelineManifest { pub archived_at: NaiveDateTime, } -pub const LATEST_TENANT_MANIFEST_VERSION: usize = 1; +/// The newest manifest version. This should be incremented on changes, even non-breaking ones. We +/// do not use deny_unknown_fields, so new fields are not breaking. +/// +/// 1: initial version +/// 2: +stripe_size +/// +/// When adding new versions, also add a parse_vX test case below. 
+pub const LATEST_TENANT_MANIFEST_VERSION: usize = 2; impl TenantManifest { - pub(crate) fn empty() -> Self { - Self { - version: LATEST_TENANT_MANIFEST_VERSION, - offloaded_timelines: vec![], + /// Returns true if the manifests are equal, ignoring the version number. This avoids + /// re-uploading all manifests just because the version number is bumped. + pub fn eq_ignoring_version(&self, other: &Self) -> bool { + // Fast path: if the version is equal, just compare directly. + if self.version == other.version { + return self == other; } - } - pub fn from_json_bytes(bytes: &[u8]) -> Result { - serde_json::from_slice::(bytes) + + // We could alternatively just clone and modify the version here. + let Self { + version: _, // ignore version + stripe_size, + offloaded_timelines, + } = self; + + stripe_size == &other.stripe_size && offloaded_timelines == &other.offloaded_timelines } - pub(crate) fn to_json_bytes(&self) -> serde_json::Result> { + /// Decodes a manifest from JSON. + pub fn from_json_bytes(bytes: &[u8]) -> Result { + serde_json::from_slice(bytes) + } + + /// Encodes a manifest as JSON. + pub fn to_json_bytes(&self) -> serde_json::Result> { serde_json::to_vec(self) } } + +#[cfg(test)] +mod tests { + use std::str::FromStr; + + use utils::id::TimelineId; + + use super::*; + + /// Empty manifests should be parsed. Version is required. + #[test] + fn parse_empty() -> anyhow::Result<()> { + let json = r#"{ + "version": 0 + }"#; + let expected = TenantManifest { + version: 0, + stripe_size: None, + offloaded_timelines: Vec::new(), + }; + assert_eq!(expected, TenantManifest::from_json_bytes(json.as_bytes())?); + Ok(()) + } + + /// Unknown fields should be ignored, for forwards compatibility. 
+ #[test] + fn parse_unknown_fields() -> anyhow::Result<()> { + let json = r#"{ + "version": 1, + "foo": "bar" + }"#; + let expected = TenantManifest { + version: 1, + stripe_size: None, + offloaded_timelines: Vec::new(), + }; + assert_eq!(expected, TenantManifest::from_json_bytes(json.as_bytes())?); + Ok(()) + } + + /// v1 manifests should be parsed, for backwards compatibility. + #[test] + fn parse_v1() -> anyhow::Result<()> { + let json = r#"{ + "version": 1, + "offloaded_timelines": [ + { + "timeline_id": "5c4df612fd159e63c1b7853fe94d97da", + "archived_at": "2025-03-07T11:07:11.373105434" + }, + { + "timeline_id": "f3def5823ad7080d2ea538d8e12163fa", + "ancestor_timeline_id": "5c4df612fd159e63c1b7853fe94d97da", + "ancestor_retain_lsn": "0/1F79038", + "archived_at": "2025-03-05T11:10:22.257901390" + } + ] + }"#; + let expected = TenantManifest { + version: 1, + stripe_size: None, + offloaded_timelines: vec![ + OffloadedTimelineManifest { + timeline_id: TimelineId::from_str("5c4df612fd159e63c1b7853fe94d97da")?, + ancestor_timeline_id: None, + ancestor_retain_lsn: None, + archived_at: NaiveDateTime::from_str("2025-03-07T11:07:11.373105434")?, + }, + OffloadedTimelineManifest { + timeline_id: TimelineId::from_str("f3def5823ad7080d2ea538d8e12163fa")?, + ancestor_timeline_id: Some(TimelineId::from_str( + "5c4df612fd159e63c1b7853fe94d97da", + )?), + ancestor_retain_lsn: Some(Lsn::from_str("0/1F79038")?), + archived_at: NaiveDateTime::from_str("2025-03-05T11:10:22.257901390")?, + }, + ], + }; + assert_eq!(expected, TenantManifest::from_json_bytes(json.as_bytes())?); + Ok(()) + } + + /// v2 manifests should be parsed, for backwards compatibility. 
+ #[test] + fn parse_v2() -> anyhow::Result<()> { + let json = r#"{ + "version": 2, + "stripe_size": 32768, + "offloaded_timelines": [ + { + "timeline_id": "5c4df612fd159e63c1b7853fe94d97da", + "archived_at": "2025-03-07T11:07:11.373105434" + }, + { + "timeline_id": "f3def5823ad7080d2ea538d8e12163fa", + "ancestor_timeline_id": "5c4df612fd159e63c1b7853fe94d97da", + "ancestor_retain_lsn": "0/1F79038", + "archived_at": "2025-03-05T11:10:22.257901390" + } + ] + }"#; + let expected = TenantManifest { + version: 2, + stripe_size: Some(ShardStripeSize(32768)), + offloaded_timelines: vec![ + OffloadedTimelineManifest { + timeline_id: TimelineId::from_str("5c4df612fd159e63c1b7853fe94d97da")?, + ancestor_timeline_id: None, + ancestor_retain_lsn: None, + archived_at: NaiveDateTime::from_str("2025-03-07T11:07:11.373105434")?, + }, + OffloadedTimelineManifest { + timeline_id: TimelineId::from_str("f3def5823ad7080d2ea538d8e12163fa")?, + ancestor_timeline_id: Some(TimelineId::from_str( + "5c4df612fd159e63c1b7853fe94d97da", + )?), + ancestor_retain_lsn: Some(Lsn::from_str("0/1F79038")?), + archived_at: NaiveDateTime::from_str("2025-03-05T11:10:22.257901390")?, + }, + ], + }; + assert_eq!(expected, TenantManifest::from_json_bytes(json.as_bytes())?); + Ok(()) + } +} diff --git a/pageserver/src/tenant/remote_timeline_client/upload.rs b/pageserver/src/tenant/remote_timeline_client/upload.rs index 7d9f47665a..89f6136530 100644 --- a/pageserver/src/tenant/remote_timeline_client/upload.rs +++ b/pageserver/src/tenant/remote_timeline_client/upload.rs @@ -61,6 +61,7 @@ pub(crate) async fn upload_index_part( .await .with_context(|| format!("upload index part for '{tenant_shard_id} / {timeline_id}'")) } + /// Serializes and uploads the given tenant manifest data to the remote storage. 
pub(crate) async fn upload_tenant_manifest( storage: &GenericRemoteStorage, @@ -76,16 +77,14 @@ pub(crate) async fn upload_tenant_manifest( }); pausable_failpoint!("before-upload-manifest-pausable"); - let serialized = tenant_manifest.to_json_bytes()?; - let serialized = Bytes::from(serialized); - - let tenant_manifest_site = serialized.len(); - + let serialized = Bytes::from(tenant_manifest.to_json_bytes()?); + let tenant_manifest_size = serialized.len(); let remote_path = remote_tenant_manifest_path(tenant_shard_id, generation); + storage .upload_storage_object( futures::stream::once(futures::future::ready(Ok(serialized))), - tenant_manifest_site, + tenant_manifest_size, &remote_path, cancel, ) diff --git a/pageserver/src/tenant/secondary.rs b/pageserver/src/tenant/secondary.rs index a378961620..2fa0ed9be9 100644 --- a/pageserver/src/tenant/secondary.rs +++ b/pageserver/src/tenant/secondary.rs @@ -167,10 +167,17 @@ impl SecondaryTenant { self.validate_metrics(); + // Metrics are subtracted from and/or removed eagerly. + // Deletions are done in the background via [`BackgroundPurges::spawn`]. 
let tenant_id = self.tenant_shard_id.tenant_id.to_string(); let shard_id = format!("{}", self.tenant_shard_id.shard_slug()); let _ = SECONDARY_RESIDENT_PHYSICAL_SIZE.remove_label_values(&[&tenant_id, &shard_id]); let _ = SECONDARY_HEATMAP_TOTAL_SIZE.remove_label_values(&[&tenant_id, &shard_id]); + + self.detail + .lock() + .unwrap() + .drain_timelines(&self.tenant_shard_id, &self.resident_size_metric); } pub(crate) fn set_config(&self, config: &SecondaryLocationConfig) { diff --git a/pageserver/src/tenant/secondary/downloader.rs b/pageserver/src/tenant/secondary/downloader.rs index 1cf0241631..60cf7ac79e 100644 --- a/pageserver/src/tenant/secondary/downloader.rs +++ b/pageserver/src/tenant/secondary/downloader.rs @@ -4,6 +4,7 @@ use std::str::FromStr; use std::sync::Arc; use std::time::{Duration, Instant, SystemTime}; +use crate::metrics::{STORAGE_IO_SIZE, StorageIoSizeOperation}; use camino::Utf8PathBuf; use chrono::format::{DelayedFormat, StrftimeItems}; use futures::Future; @@ -124,15 +125,53 @@ impl OnDiskState { } } -#[derive(Debug, Clone, Default)] pub(super) struct SecondaryDetailTimeline { on_disk_layers: HashMap, /// We remember when layers were evicted, to prevent re-downloading them. pub(super) evicted_at: HashMap, + + ctx: RequestContext, +} + +impl Clone for SecondaryDetailTimeline { + fn clone(&self) -> Self { + Self { + on_disk_layers: self.on_disk_layers.clone(), + evicted_at: self.evicted_at.clone(), + // This is a bit awkward. The downloader code operates on a snapshot + // of the secondary list to avoid locking it for extended periods of time. + // No particularly strong reason to chose [`RequestContext::detached_child`], + // but makes more sense than [`RequestContext::attached_child`]. 
+ ctx: self + .ctx + .detached_child(self.ctx.task_kind(), self.ctx.download_behavior()), + } + } +} + +impl std::fmt::Debug for SecondaryDetailTimeline { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_struct("SecondaryDetailTimeline") + .field("on_disk_layers", &self.on_disk_layers) + .field("evicted_at", &self.evicted_at) + .finish() + } } impl SecondaryDetailTimeline { + pub(super) fn empty(ctx: RequestContext) -> Self { + SecondaryDetailTimeline { + on_disk_layers: Default::default(), + evicted_at: Default::default(), + ctx, + } + } + + pub(super) fn context(&self) -> &RequestContext { + &self.ctx + } + pub(super) fn remove_layer( &mut self, name: &LayerName, @@ -258,18 +297,50 @@ impl SecondaryDetail { pub(super) fn remove_timeline( &mut self, + tenant_shard_id: &TenantShardId, timeline_id: &TimelineId, resident_metric: &UIntGauge, ) { let removed = self.timelines.remove(timeline_id); if let Some(removed) = removed { - resident_metric.sub( - removed - .on_disk_layers - .values() - .map(|l| l.metadata.file_size) - .sum(), - ); + Self::clear_timeline_metrics(tenant_shard_id, timeline_id, removed, resident_metric); + } + } + + pub(super) fn drain_timelines( + &mut self, + tenant_shard_id: &TenantShardId, + resident_metric: &UIntGauge, + ) { + for (timeline_id, removed) in self.timelines.drain() { + Self::clear_timeline_metrics(tenant_shard_id, &timeline_id, removed, resident_metric); + } + } + + fn clear_timeline_metrics( + tenant_shard_id: &TenantShardId, + timeline_id: &TimelineId, + detail: SecondaryDetailTimeline, + resident_metric: &UIntGauge, + ) { + resident_metric.sub( + detail + .on_disk_layers + .values() + .map(|l| l.metadata.file_size) + .sum(), + ); + + let shard_id = format!("{}", tenant_shard_id.shard_slug()); + let tenant_id = tenant_shard_id.tenant_id.to_string(); + let timeline_id = timeline_id.to_string(); + for op in StorageIoSizeOperation::VARIANTS { + let _ = STORAGE_IO_SIZE.remove_label_values(&[ + op, + 
tenant_id.as_str(), + shard_id.as_str(), + timeline_id.as_str(), + ]); } } @@ -727,6 +798,7 @@ impl<'a> TenantDownloader<'a> { last_heatmap, timeline, &self.secondary_state.resident_size_metric, + ctx, ) .await; @@ -774,7 +846,6 @@ impl<'a> TenantDownloader<'a> { // Download the layers in the heatmap for timeline in heatmap.timelines { - let ctx = &ctx.with_scope_secondary_timeline(tenant_shard_id, &timeline.timeline_id); let timeline_state = timeline_states .remove(&timeline.timeline_id) .expect("Just populated above"); @@ -917,7 +988,11 @@ impl<'a> TenantDownloader<'a> { for delete_timeline in &delete_timelines { // We haven't removed from disk yet, but optimistically remove from in-memory state: if removal // from disk fails that will be a fatal error. - detail.remove_timeline(delete_timeline, &self.secondary_state.resident_size_metric); + detail.remove_timeline( + self.secondary_state.get_tenant_shard_id(), + delete_timeline, + &self.secondary_state.resident_size_metric, + ); } } @@ -1013,7 +1088,6 @@ impl<'a> TenantDownloader<'a> { timeline: HeatMapTimeline, timeline_state: SecondaryDetailTimeline, deadline: Instant, - ctx: &RequestContext, ) -> (Result<(), UpdateError>, Vec) { // Accumulate updates to the state let mut touched = Vec::new(); @@ -1044,7 +1118,12 @@ impl<'a> TenantDownloader<'a> { } match self - .download_layer(tenant_shard_id, &timeline_id, layer, ctx) + .download_layer( + tenant_shard_id, + &timeline_id, + layer, + timeline_state.context(), + ) .await { Ok(Some(layer)) => touched.push(layer), @@ -1155,13 +1234,16 @@ impl<'a> TenantDownloader<'a> { tracing::debug!(timeline_id=%timeline_id, "Downloading layers, {} in heatmap", timeline.hot_layers().count()); let (result, touched) = self - .download_timeline_layers(tenant_shard_id, timeline, timeline_state, deadline, ctx) + .download_timeline_layers(tenant_shard_id, timeline, timeline_state, deadline) .await; // Write updates to state to record layers we just downloaded or touched, irrespective 
of whether the overall result was successful { let mut detail = self.secondary_state.detail.lock().unwrap(); - let timeline_detail = detail.timelines.entry(timeline_id).or_default(); + let timeline_detail = detail.timelines.entry(timeline_id).or_insert_with(|| { + let ctx = ctx.with_scope_secondary_timeline(tenant_shard_id, &timeline_id); + SecondaryDetailTimeline::empty(ctx) + }); tracing::info!("Wrote timeline_detail for {} touched layers", touched.len()); touched.into_iter().for_each(|t| { @@ -1295,10 +1377,12 @@ async fn init_timeline_state( last_heatmap: Option<&HeatMapTimeline>, heatmap: &HeatMapTimeline, resident_metric: &UIntGauge, + ctx: &RequestContext, ) -> SecondaryDetailTimeline { - let timeline_path = conf.timeline_path(tenant_shard_id, &heatmap.timeline_id); - let mut detail = SecondaryDetailTimeline::default(); + let ctx = ctx.with_scope_secondary_timeline(tenant_shard_id, &heatmap.timeline_id); + let mut detail = SecondaryDetailTimeline::empty(ctx); + let timeline_path = conf.timeline_path(tenant_shard_id, &heatmap.timeline_id); let mut dir = match tokio::fs::read_dir(&timeline_path).await { Ok(d) => d, Err(e) => { diff --git a/pageserver/src/tenant/size.rs b/pageserver/src/tenant/size.rs index 8cc94b4e4d..c7ac50ca6a 100644 --- a/pageserver/src/tenant/size.rs +++ b/pageserver/src/tenant/size.rs @@ -33,7 +33,7 @@ pub struct ModelInputs { } /// A [`Segment`], with some extra information for display purposes -#[derive(Debug, serde::Serialize, serde::Deserialize)] +#[derive(Debug, serde::Serialize, serde::Deserialize, PartialEq, Eq)] pub struct SegmentMeta { pub segment: Segment, pub timeline_id: TimelineId, @@ -248,6 +248,8 @@ pub(super) async fn gather_inputs( None }; + let branch_is_invisible = timeline.is_invisible() == Some(true); + let lease_points = gc_info .leases .keys() @@ -271,7 +273,10 @@ pub(super) async fn gather_inputs( .map(|(lsn, _child_id, _is_offloaded)| (lsn, LsnKind::BranchPoint)) .collect::>(); - 
lsns.extend(lease_points.iter().map(|&lsn| (lsn, LsnKind::LeasePoint))); + if !branch_is_invisible { + // Do not count lease points for invisible branches. + lsns.extend(lease_points.iter().map(|&lsn| (lsn, LsnKind::LeasePoint))); + } drop(gc_info); @@ -287,7 +292,9 @@ pub(super) async fn gather_inputs( // Add a point for the PITR cutoff let branch_start_needed = next_pitr_cutoff <= branch_start_lsn; - if !branch_start_needed { + if !branch_start_needed && !branch_is_invisible { + // Only add the GcCutOff point when the timeline is visible; otherwise, do not compute the size for the LSN + // range from the last branch point to the latest data. lsns.push((next_pitr_cutoff, LsnKind::GcCutOff)); } @@ -373,11 +380,19 @@ pub(super) async fn gather_inputs( } } + let branch_end_lsn = if branch_is_invisible { + // If the branch is invisible, the branch end is the last requested LSN (likely a branch cutoff point). + segments.last().unwrap().segment.lsn + } else { + // Otherwise, the branch end is the last record LSN. 
+ last_record_lsn.0 + }; + // Current end of the timeline segments.push(SegmentMeta { segment: Segment { parent: Some(parent), - lsn: last_record_lsn.0, + lsn: branch_end_lsn, size: None, // Filled in later, if necessary needed: true, }, @@ -609,6 +624,7 @@ async fn calculate_logical_size( Ok(TimelineAtLsnSizeResult(timeline, lsn, size_res)) } +#[cfg(test)] #[test] fn verify_size_for_multiple_branches() { // this is generated from integration test test_tenant_size_with_multiple_branches, but this way @@ -766,6 +782,7 @@ fn verify_size_for_multiple_branches() { assert_eq!(inputs.calculate(), 37_851_408); } +#[cfg(test)] #[test] fn verify_size_for_one_branch() { let doc = r#" diff --git a/pageserver/src/tenant/storage_layer.rs b/pageserver/src/tenant/storage_layer.rs index ece163b24a..796ad01e54 100644 --- a/pageserver/src/tenant/storage_layer.rs +++ b/pageserver/src/tenant/storage_layer.rs @@ -13,13 +13,13 @@ pub mod merge_iterator; use std::cmp::Ordering; use std::collections::hash_map::Entry; use std::collections::{BinaryHeap, HashMap}; -use std::future::Future; use std::ops::Range; use std::pin::Pin; use std::sync::Arc; use std::sync::atomic::AtomicUsize; use std::time::{Duration, SystemTime, UNIX_EPOCH}; +use crate::PERF_TRACE_TARGET; pub use batch_split_writer::{BatchLayerWriter, SplitDeltaLayerWriter, SplitImageLayerWriter}; use bytes::Bytes; pub use delta_layer::{DeltaLayer, DeltaLayerWriter, ValueRef}; @@ -34,7 +34,7 @@ use pageserver_api::key::Key; use pageserver_api::keyspace::{KeySpace, KeySpaceRandomAccum}; use pageserver_api::record::NeonWalRecord; use pageserver_api::value::Value; -use tracing::{Instrument, trace}; +use tracing::{Instrument, info_span, trace}; use utils::lsn::Lsn; use utils::sync::gate::GateGuard; @@ -43,7 +43,9 @@ use super::PageReconstructError; use super::layer_map::InMemoryLayerDesc; use super::timeline::{GetVectoredError, ReadPath}; use crate::config::PageServerConf; -use crate::context::{AccessStatsBehavior, RequestContext}; +use 
crate::context::{ + AccessStatsBehavior, PerfInstrumentFutureExt, RequestContext, RequestContextBuilder, +}; pub fn range_overlaps(a: &Range, b: &Range) -> bool where @@ -713,13 +715,34 @@ pub(crate) enum LayerId { } /// Uniquely identify a layer visit by the layer -/// and LSN floor (or start LSN) of the reads. -/// The layer itself is not enough since we may -/// have different LSN lower bounds for delta layer reads. +/// and LSN range of the reads. Note that the end of the range is exclusive. +/// +/// The layer itself is not enough since we may have different LSN lower +/// bounds for delta layer reads. Scenarios where this can happen are: +/// +/// 1. Layer overlaps: imagine an image layer inside and in-memory layer +/// and a query that only partially hits the image layer. Part of the query +/// needs to read the whole in-memory layer and the other part needs to read +/// only up to the image layer. Hence, they'll have different LSN floor values +/// for the read. +/// +/// 2. Scattered reads: the read path supports starting at different LSNs. Imagine +/// The start LSN for one range is inside a layer and the start LSN for another range +/// Is above the layer (includes all of it). Both ranges need to read the layer all the +/// Way to the end but starting at different points. Hence, they'll have different LSN +/// Ceil values. +/// +/// The implication is that we might visit the same layer multiple times +/// in order to read different LSN ranges from it. In practice, this isn't very concerning +/// because: +/// 1. Layer overlaps are rare and generally not intended +/// 2. Scattered reads will stabilise after the first few layers provided their starting LSNs +/// are grouped tightly enough (likely the case). 
#[derive(Debug, PartialEq, Eq, Clone, Hash)] struct LayerToVisitId { layer_id: LayerId, lsn_floor: Lsn, + lsn_ceil: Lsn, } #[derive(Debug, PartialEq, Eq, Hash)] @@ -803,6 +826,7 @@ impl LayerFringe { let layer_to_visit_id = LayerToVisitId { layer_id: layer.id(), lsn_floor: lsn_range.start, + lsn_ceil: lsn_range.end, }; let entry = self.visit_reads.entry(layer_to_visit_id.clone()); @@ -874,13 +898,37 @@ impl ReadableLayer { ) -> Result<(), GetVectoredError> { match self { ReadableLayer::PersistentLayer(layer) => { + let ctx = RequestContextBuilder::from(ctx) + .perf_span(|crnt_perf_span| { + info_span!( + target: PERF_TRACE_TARGET, + parent: crnt_perf_span, + "PLAN_LAYER", + layer = %layer + ) + }) + .attached_child(); + layer - .get_values_reconstruct_data(keyspace, lsn_range, reconstruct_state, ctx) + .get_values_reconstruct_data(keyspace, lsn_range, reconstruct_state, &ctx) + .maybe_perf_instrument(&ctx, |crnt_perf_span| crnt_perf_span.clone()) .await } ReadableLayer::InMemoryLayer(layer) => { + let ctx = RequestContextBuilder::from(ctx) + .perf_span(|crnt_perf_span| { + info_span!( + target: PERF_TRACE_TARGET, + parent: crnt_perf_span, + "PLAN_LAYER", + layer = %layer + ) + }) + .attached_child(); + layer - .get_values_reconstruct_data(keyspace, lsn_range, reconstruct_state, ctx) + .get_values_reconstruct_data(keyspace, lsn_range, reconstruct_state, &ctx) + .maybe_perf_instrument(&ctx, |crnt_perf_span| crnt_perf_span.clone()) .await } } diff --git a/pageserver/src/tenant/storage_layer/batch_split_writer.rs b/pageserver/src/tenant/storage_layer/batch_split_writer.rs index fd50e4805d..39cd02d101 100644 --- a/pageserver/src/tenant/storage_layer/batch_split_writer.rs +++ b/pageserver/src/tenant/storage_layer/batch_split_writer.rs @@ -5,6 +5,7 @@ use std::sync::Arc; use bytes::Bytes; use pageserver_api::key::{KEY_SIZE, Key}; use pageserver_api::value::Value; +use tokio_util::sync::CancellationToken; use utils::id::TimelineId; use utils::lsn::Lsn; use 
utils::shard::TenantShardId; @@ -179,7 +180,7 @@ impl BatchLayerWriter { /// An image writer that takes images and produces multiple image layers. #[must_use] -pub struct SplitImageLayerWriter { +pub struct SplitImageLayerWriter<'a> { inner: ImageLayerWriter, target_layer_size: u64, lsn: Lsn, @@ -188,9 +189,12 @@ pub struct SplitImageLayerWriter { tenant_shard_id: TenantShardId, batches: BatchLayerWriter, start_key: Key, + gate: &'a utils::sync::gate::Gate, + cancel: CancellationToken, } -impl SplitImageLayerWriter { +impl<'a> SplitImageLayerWriter<'a> { + #[allow(clippy::too_many_arguments)] pub async fn new( conf: &'static PageServerConf, timeline_id: TimelineId, @@ -198,6 +202,8 @@ impl SplitImageLayerWriter { start_key: Key, lsn: Lsn, target_layer_size: u64, + gate: &'a utils::sync::gate::Gate, + cancel: CancellationToken, ctx: &RequestContext, ) -> anyhow::Result { Ok(Self { @@ -208,6 +214,8 @@ impl SplitImageLayerWriter { tenant_shard_id, &(start_key..Key::MAX), lsn, + gate, + cancel.clone(), ctx, ) .await?, @@ -217,6 +225,8 @@ impl SplitImageLayerWriter { batches: BatchLayerWriter::new(conf).await?, lsn, start_key, + gate, + cancel, }) } @@ -239,6 +249,8 @@ impl SplitImageLayerWriter { self.tenant_shard_id, &(key..Key::MAX), self.lsn, + self.gate, + self.cancel.clone(), ctx, ) .await?; @@ -291,7 +303,7 @@ impl SplitImageLayerWriter { /// into a single file. This behavior might change in the future. For reference, the legacy compaction algorithm /// will split them into multiple files based on size. 
#[must_use] -pub struct SplitDeltaLayerWriter { +pub struct SplitDeltaLayerWriter<'a> { inner: Option<(Key, DeltaLayerWriter)>, target_layer_size: u64, conf: &'static PageServerConf, @@ -300,15 +312,19 @@ pub struct SplitDeltaLayerWriter { lsn_range: Range, last_key_written: Key, batches: BatchLayerWriter, + gate: &'a utils::sync::gate::Gate, + cancel: CancellationToken, } -impl SplitDeltaLayerWriter { +impl<'a> SplitDeltaLayerWriter<'a> { pub async fn new( conf: &'static PageServerConf, timeline_id: TimelineId, tenant_shard_id: TenantShardId, lsn_range: Range, target_layer_size: u64, + gate: &'a utils::sync::gate::Gate, + cancel: CancellationToken, ) -> anyhow::Result { Ok(Self { target_layer_size, @@ -319,6 +335,8 @@ impl SplitDeltaLayerWriter { lsn_range, last_key_written: Key::MIN, batches: BatchLayerWriter::new(conf).await?, + gate, + cancel, }) } @@ -344,6 +362,8 @@ impl SplitDeltaLayerWriter { self.tenant_shard_id, key, self.lsn_range.clone(), + self.gate, + self.cancel.clone(), ctx, ) .await?, @@ -362,11 +382,13 @@ impl SplitDeltaLayerWriter { self.tenant_shard_id, key, self.lsn_range.clone(), + self.gate, + self.cancel.clone(), ctx, ) .await?; let (start_key, prev_delta_writer) = - std::mem::replace(&mut self.inner, Some((key, next_delta_writer))).unwrap(); + self.inner.replace((key, next_delta_writer)).unwrap(); self.batches.add_unfinished_delta_writer( prev_delta_writer, start_key..key, @@ -469,6 +491,8 @@ mod tests { get_key(0), Lsn(0x18), 4 * 1024 * 1024, + &tline.gate, + tline.cancel.clone(), &ctx, ) .await @@ -480,6 +504,8 @@ mod tests { tenant.tenant_shard_id, Lsn(0x18)..Lsn(0x20), 4 * 1024 * 1024, + &tline.gate, + tline.cancel.clone(), ) .await .unwrap(); @@ -546,6 +572,8 @@ mod tests { get_key(0), Lsn(0x18), 4 * 1024 * 1024, + &tline.gate, + tline.cancel.clone(), &ctx, ) .await @@ -556,6 +584,8 @@ mod tests { tenant.tenant_shard_id, Lsn(0x18)..Lsn(0x20), 4 * 1024 * 1024, + &tline.gate, + tline.cancel.clone(), ) .await .unwrap(); @@ -643,6 +673,8 
@@ mod tests { get_key(0), Lsn(0x18), 4 * 1024, + &tline.gate, + tline.cancel.clone(), &ctx, ) .await @@ -654,6 +686,8 @@ mod tests { tenant.tenant_shard_id, Lsn(0x18)..Lsn(0x20), 4 * 1024, + &tline.gate, + tline.cancel.clone(), ) .await .unwrap(); @@ -730,6 +764,8 @@ mod tests { tenant.tenant_shard_id, Lsn(0x10)..Lsn(N as u64 * 16 + 0x10), 4 * 1024 * 1024, + &tline.gate, + tline.cancel.clone(), ) .await .unwrap(); diff --git a/pageserver/src/tenant/storage_layer/delta_layer.rs b/pageserver/src/tenant/storage_layer/delta_layer.rs index 62adae1680..4417b8aa51 100644 --- a/pageserver/src/tenant/storage_layer/delta_layer.rs +++ b/pageserver/src/tenant/storage_layer/delta_layer.rs @@ -50,6 +50,7 @@ use rand::distributions::Alphanumeric; use serde::{Deserialize, Serialize}; use tokio::sync::OnceCell; use tokio_epoll_uring::IoBuf; +use tokio_util::sync::CancellationToken; use tracing::*; use utils::bin_ser::BeSer; use utils::id::{TenantId, TimelineId}; @@ -400,12 +401,15 @@ impl DeltaLayerWriterInner { /// /// Start building a new delta layer. /// + #[allow(clippy::too_many_arguments)] async fn new( conf: &'static PageServerConf, timeline_id: TimelineId, tenant_shard_id: TenantShardId, key_start: Key, lsn_range: Range, + gate: &utils::sync::gate::Gate, + cancel: CancellationToken, ctx: &RequestContext, ) -> anyhow::Result { // Create the file initially with a temporary filename. We don't know @@ -420,7 +424,7 @@ impl DeltaLayerWriterInner { let mut file = VirtualFile::create(&path, ctx).await?; // make room for the header block file.seek(SeekFrom::Start(PAGE_SZ as u64)).await?; - let blob_writer = BlobWriter::new(file, PAGE_SZ as u64); + let blob_writer = BlobWriter::new(file, PAGE_SZ as u64, gate, cancel, ctx); // Initialize the b-tree index builder let block_buf = BlockBuf::new(); @@ -628,12 +632,15 @@ impl DeltaLayerWriter { /// /// Start building a new delta layer. 
/// + #[allow(clippy::too_many_arguments)] pub async fn new( conf: &'static PageServerConf, timeline_id: TimelineId, tenant_shard_id: TenantShardId, key_start: Key, lsn_range: Range, + gate: &utils::sync::gate::Gate, + cancel: CancellationToken, ctx: &RequestContext, ) -> anyhow::Result { Ok(Self { @@ -644,6 +651,8 @@ impl DeltaLayerWriter { tenant_shard_id, key_start, lsn_range, + gate, + cancel, ctx, ) .await?, @@ -896,9 +905,9 @@ impl DeltaLayerInner { where Reader: BlockReader + Clone, { - let ctx = RequestContextBuilder::extend(ctx) + let ctx = RequestContextBuilder::from(ctx) .page_content_kind(PageContentKind::DeltaLayerBtreeNode) - .build(); + .attached_child(); for range in keyspace.ranges.iter() { let mut range_end_handled = false; @@ -1105,9 +1114,9 @@ impl DeltaLayerInner { all_keys.push(entry); true }, - &RequestContextBuilder::extend(ctx) + &RequestContextBuilder::from(ctx) .page_content_kind(PageContentKind::DeltaLayerBtreeNode) - .build(), + .attached_child(), ) .await?; if let Some(last) = all_keys.last_mut() { @@ -1885,6 +1894,8 @@ pub(crate) mod test { harness.tenant_shard_id, entries_meta.key_range.start, entries_meta.lsn_range.clone(), + &timeline.gate, + timeline.cancel.clone(), &ctx, ) .await?; @@ -2079,6 +2090,8 @@ pub(crate) mod test { tenant.tenant_shard_id, Key::MIN, Lsn(0x11)..truncate_at, + &branch.gate, + branch.cancel.clone(), ctx, ) .await @@ -2213,6 +2226,8 @@ pub(crate) mod test { tenant.tenant_shard_id, *key_start, (*lsn_min)..lsn_end, + &tline.gate, + tline.cancel.clone(), ctx, ) .await?; diff --git a/pageserver/src/tenant/storage_layer/image_layer.rs b/pageserver/src/tenant/storage_layer/image_layer.rs index b211eb5416..3744d615f2 100644 --- a/pageserver/src/tenant/storage_layer/image_layer.rs +++ b/pageserver/src/tenant/storage_layer/image_layer.rs @@ -48,6 +48,7 @@ use rand::distributions::Alphanumeric; use serde::{Deserialize, Serialize}; use tokio::sync::OnceCell; use tokio_stream::StreamExt; +use 
tokio_util::sync::CancellationToken; use tracing::*; use utils::bin_ser::BeSer; use utils::id::{TenantId, TimelineId}; @@ -481,9 +482,9 @@ impl ImageLayerInner { let tree_reader = DiskBtreeReader::new(self.index_start_blk, self.index_root_blk, block_reader); - let ctx = RequestContextBuilder::extend(ctx) + let ctx = RequestContextBuilder::from(ctx) .page_content_kind(PageContentKind::ImageLayerBtreeNode) - .build(); + .attached_child(); for range in keyspace.ranges.iter() { let mut range_end_handled = false; @@ -748,12 +749,15 @@ impl ImageLayerWriterInner { /// /// Start building a new image layer. /// + #[allow(clippy::too_many_arguments)] async fn new( conf: &'static PageServerConf, timeline_id: TimelineId, tenant_shard_id: TenantShardId, key_range: &Range, lsn: Lsn, + gate: &utils::sync::gate::Gate, + cancel: CancellationToken, ctx: &RequestContext, ) -> anyhow::Result { // Create the file initially with a temporary filename. @@ -780,7 +784,7 @@ impl ImageLayerWriterInner { }; // make room for the header block file.seek(SeekFrom::Start(PAGE_SZ as u64)).await?; - let blob_writer = BlobWriter::new(file, PAGE_SZ as u64); + let blob_writer = BlobWriter::new(file, PAGE_SZ as u64, gate, cancel, ctx); // Initialize the b-tree index builder let block_buf = BlockBuf::new(); @@ -988,18 +992,30 @@ impl ImageLayerWriter { /// /// Start building a new image layer. 
/// + #[allow(clippy::too_many_arguments)] pub async fn new( conf: &'static PageServerConf, timeline_id: TimelineId, tenant_shard_id: TenantShardId, key_range: &Range, lsn: Lsn, + gate: &utils::sync::gate::Gate, + cancel: CancellationToken, ctx: &RequestContext, ) -> anyhow::Result { Ok(Self { inner: Some( - ImageLayerWriterInner::new(conf, timeline_id, tenant_shard_id, key_range, lsn, ctx) - .await?, + ImageLayerWriterInner::new( + conf, + timeline_id, + tenant_shard_id, + key_range, + lsn, + gate, + cancel, + ctx, + ) + .await?, ), }) } @@ -1192,7 +1208,7 @@ mod test { // This key range contains several 0x8000 page stripes, only one of which belongs to shard zero let input_start = Key::from_hex("000000067f00000001000000ae0000000000").unwrap(); - let input_end = Key::from_hex("000000067f00000001000000ae0000020000").unwrap(); + let input_end = Key::from_hex("000000067f00000001000000ae0000002000").unwrap(); let range = input_start..input_end; // Build an image layer to filter @@ -1203,6 +1219,8 @@ mod test { harness.tenant_shard_id, &range, lsn, + &timeline.gate, + timeline.cancel.clone(), &ctx, ) .await @@ -1235,7 +1253,7 @@ mod test { let shard_identity = ShardIdentity::new( ShardNumber(shard_number), shard_count, - ShardStripeSize(0x8000), + ShardStripeSize(0x800), ) .unwrap(); let harness = TenantHarness::create_custom( @@ -1268,6 +1286,8 @@ mod test { harness.tenant_shard_id, &range, lsn, + &timeline.gate, + timeline.cancel.clone(), &ctx, ) .await @@ -1287,12 +1307,12 @@ mod test { // This exact size and those below will need updating as/when the layer encoding changes, but // should be deterministic for a given version of the format, as we used no randomness generating the input. 
- assert_eq!(original_size, 1597440); + assert_eq!(original_size, 122880); match shard_number { 0 => { // We should have written out just one stripe for our shard identity - assert_eq!(wrote_keys, 0x8000); + assert_eq!(wrote_keys, 0x800); let replacement = replacement.unwrap(); // We should have dropped some of the data @@ -1300,7 +1320,7 @@ mod test { assert!(replacement.metadata().file_size > 0); // Assert that we dropped ~3/4 of the data. - assert_eq!(replacement.metadata().file_size, 417792); + assert_eq!(replacement.metadata().file_size, 49152); } 1 => { // Shard 1 has no keys in our input range @@ -1309,19 +1329,19 @@ mod test { } 2 => { // Shard 2 has one stripes in the input range - assert_eq!(wrote_keys, 0x8000); + assert_eq!(wrote_keys, 0x800); let replacement = replacement.unwrap(); assert!(replacement.metadata().file_size < original_size); assert!(replacement.metadata().file_size > 0); - assert_eq!(replacement.metadata().file_size, 417792); + assert_eq!(replacement.metadata().file_size, 49152); } 3 => { // Shard 3 has two stripes in the input range - assert_eq!(wrote_keys, 0x10000); + assert_eq!(wrote_keys, 0x1000); let replacement = replacement.unwrap(); assert!(replacement.metadata().file_size < original_size); assert!(replacement.metadata().file_size > 0); - assert_eq!(replacement.metadata().file_size, 811008); + assert_eq!(replacement.metadata().file_size, 73728); } _ => unreachable!(), } @@ -1346,6 +1366,8 @@ mod test { tenant.tenant_shard_id, &key_range, lsn, + &tline.gate, + tline.cancel.clone(), ctx, ) .await?; diff --git a/pageserver/src/tenant/storage_layer/inmemory_layer.rs b/pageserver/src/tenant/storage_layer/inmemory_layer.rs index bb4ae38ad1..5d558e66cc 100644 --- a/pageserver/src/tenant/storage_layer/inmemory_layer.rs +++ b/pageserver/src/tenant/storage_layer/inmemory_layer.rs @@ -421,9 +421,9 @@ impl InMemoryLayer { reconstruct_state: &mut ValuesReconstructState, ctx: &RequestContext, ) -> Result<(), GetVectoredError> { - let ctx = 
RequestContextBuilder::extend(ctx) + let ctx = RequestContextBuilder::from(ctx) .page_content_kind(PageContentKind::InMemoryLayer) - .build(); + .attached_child(); let inner = self.inner.read().await; @@ -719,6 +719,8 @@ impl InMemoryLayer { ctx: &RequestContext, key_range: Option>, l0_flush_global_state: &l0_flush::Inner, + gate: &utils::sync::gate::Gate, + cancel: CancellationToken, ) -> Result> { // Grab the lock in read-mode. We hold it over the I/O, but because this // layer is not writeable anymore, no one should be trying to acquire the @@ -759,6 +761,8 @@ impl InMemoryLayer { self.tenant_shard_id, Key::MIN, self.start_lsn..end_lsn, + gate, + cancel, ctx, ) .await?; diff --git a/pageserver/src/tenant/storage_layer/inmemory_layer/vectored_dio_read.rs b/pageserver/src/tenant/storage_layer/inmemory_layer/vectored_dio_read.rs index 90455fd0ca..ea354fc716 100644 --- a/pageserver/src/tenant/storage_layer/inmemory_layer/vectored_dio_read.rs +++ b/pageserver/src/tenant/storage_layer/inmemory_layer/vectored_dio_read.rs @@ -766,7 +766,7 @@ mod tests { rand::Rng::fill(&mut rand::thread_rng(), &mut dst_slice[len..]); // to discover bugs Ok((dst, len)) } - Err(e) => Err(std::io::Error::new(std::io::ErrorKind::Other, e)), + Err(e) => Err(std::io::Error::other(e)), } } } diff --git a/pageserver/src/tenant/storage_layer/layer.rs b/pageserver/src/tenant/storage_layer/layer.rs index 247092bf45..b7f6e5dc77 100644 --- a/pageserver/src/tenant/storage_layer/layer.rs +++ b/pageserver/src/tenant/storage_layer/layer.rs @@ -3,12 +3,13 @@ use std::sync::atomic::{AtomicBool, AtomicUsize, Ordering}; use std::sync::{Arc, Weak}; use std::time::{Duration, SystemTime}; +use crate::PERF_TRACE_TARGET; use anyhow::Context; use camino::{Utf8Path, Utf8PathBuf}; use pageserver_api::keyspace::KeySpace; use pageserver_api::models::HistoricLayerInfo; use pageserver_api::shard::{ShardIdentity, ShardIndex, TenantShardId}; -use tracing::Instrument; +use tracing::{Instrument, info_span}; use 
utils::generation::Generation; use utils::id::TimelineId; use utils::lsn::Lsn; @@ -18,7 +19,7 @@ use super::delta_layer::{self}; use super::image_layer::{self}; use super::{ AsLayerDesc, ImageLayerWriter, LayerAccessStats, LayerAccessStatsReset, LayerName, - LayerVisibilityHint, PersistentLayerDesc, ValuesReconstructState, + LayerVisibilityHint, PerfInstrumentFutureExt, PersistentLayerDesc, ValuesReconstructState, }; use crate::config::PageServerConf; use crate::context::{DownloadBehavior, RequestContext, RequestContextBuilder}; @@ -324,16 +325,29 @@ impl Layer { reconstruct_data: &mut ValuesReconstructState, ctx: &RequestContext, ) -> Result<(), GetVectoredError> { - let downloaded = + let downloaded = { + let ctx = RequestContextBuilder::from(ctx) + .perf_span(|crnt_perf_span| { + info_span!( + target: PERF_TRACE_TARGET, + parent: crnt_perf_span, + "GET_LAYER", + ) + }) + .attached_child(); + self.0 - .get_or_maybe_download(true, ctx) + .get_or_maybe_download(true, &ctx) + .maybe_perf_instrument(&ctx, |crnt_perf_context| crnt_perf_context.clone()) .await .map_err(|err| match err { DownloadError::TimelineShutdown | DownloadError::DownloadCancelled => { GetVectoredError::Cancelled } other => GetVectoredError::Other(anyhow::anyhow!(other)), - })?; + })? 
+ }; + let this = ResidentLayer { downloaded: downloaded.clone(), owner: self.clone(), @@ -341,9 +355,20 @@ impl Layer { self.record_access(ctx); + let ctx = RequestContextBuilder::from(ctx) + .perf_span(|crnt_perf_span| { + info_span!( + target: PERF_TRACE_TARGET, + parent: crnt_perf_span, + "VISIT_LAYER", + ) + }) + .attached_child(); + downloaded - .get_values_reconstruct_data(this, keyspace, lsn_range, reconstruct_data, ctx) + .get_values_reconstruct_data(this, keyspace, lsn_range, reconstruct_data, &ctx) .instrument(tracing::debug_span!("get_values_reconstruct_data", layer=%self)) + .maybe_perf_instrument(&ctx, |crnt_perf_span| crnt_perf_span.clone()) .await .map_err(|err| match err { GetVectoredError::Other(err) => GetVectoredError::Other( @@ -950,6 +975,10 @@ impl LayerInner { allow_download: bool, ctx: &RequestContext, ) -> Result, DownloadError> { + let mut wait_for_download_recorder = + scopeguard::guard(utils::elapsed_accum::ElapsedAccum::default(), |accum| { + ctx.ondemand_download_wait_observe(accum.get()); + }); let (weak, permit) = { // get_or_init_detached can: // - be fast (mutex lock) OR uncontested semaphore permit acquire @@ -958,7 +987,7 @@ impl LayerInner { let locked = self .inner - .get_or_init_detached() + .get_or_init_detached_measured(Some(&mut wait_for_download_recorder)) .await .map(|mut guard| guard.get_and_upgrade().ok_or(guard)); @@ -988,6 +1017,7 @@ impl LayerInner { Err(permit) => (None, permit), } }; + let _guard = wait_for_download_recorder.guard(); if let Some(weak) = weak { // only drop the weak after dropping the heavier_once_cell guard @@ -1045,15 +1075,34 @@ impl LayerInner { return Err(DownloadError::DownloadRequired); } - let download_ctx = ctx.detached_child(TaskKind::LayerDownload, DownloadBehavior::Download); + let ctx = if ctx.has_perf_span() { + let dl_ctx = RequestContextBuilder::from(ctx) + .task_kind(TaskKind::LayerDownload) + .download_behavior(DownloadBehavior::Download) + .root_perf_span(|| { + info_span!( + 
target: PERF_TRACE_TARGET, + "DOWNLOAD_LAYER", + layer = %self, + reason = %reason + ) + }) + .detached_child(); + ctx.perf_follows_from(&dl_ctx); + dl_ctx + } else { + ctx.attached_child() + }; async move { tracing::info!(%reason, "downloading on-demand"); let init_cancelled = scopeguard::guard((), |_| LAYER_IMPL_METRICS.inc_init_cancelled()); let res = self - .download_init_and_wait(timeline, permit, download_ctx) + .download_init_and_wait(timeline, permit, ctx.attached_child()) + .maybe_perf_instrument(&ctx, |crnt_perf_span| crnt_perf_span.clone()) .await?; + scopeguard::ScopeGuard::into_inner(init_cancelled); Ok(res) } @@ -1158,6 +1207,7 @@ impl LayerInner { permit: heavier_once_cell::InitPermit, ctx: &RequestContext, ) -> Result, remote_storage::DownloadError> { + let start = std::time::Instant::now(); let result = timeline .remote_client .download_layer_file( @@ -1169,7 +1219,8 @@ impl LayerInner { ctx, ) .await; - + let latency = start.elapsed(); + let latency_millis = u64::try_from(latency.as_millis()).unwrap(); match result { Ok(size) => { assert_eq!(size, self.desc.file_size); @@ -1185,9 +1236,8 @@ impl LayerInner { Err(e) => { panic!("post-condition failed: needs_download errored: {e:?}"); } - } - - tracing::info!(size=%self.desc.file_size, "on-demand download successful"); + }; + tracing::info!(size=%self.desc.file_size, %latency_millis, "on-demand download successful"); timeline .metrics .resident_physical_size_add(self.desc.file_size); @@ -1216,7 +1266,7 @@ impl LayerInner { return Err(e); } - tracing::error!(consecutive_failures, "layer file download failed: {e:#}"); + tracing::error!(consecutive_failures, %latency_millis, "layer file download failed: {e:#}"); let backoff = utils::backoff::exponential_backoff_duration_seconds( consecutive_failures.min(u32::MAX as usize) as u32, @@ -1720,9 +1770,9 @@ impl DownloadedLayer { ); let res = if owner.desc.is_delta { - let ctx = RequestContextBuilder::extend(ctx) + let ctx = RequestContextBuilder::from(ctx) 
.page_content_kind(crate::context::PageContentKind::DeltaLayerSummary) - .build(); + .attached_child(); let summary = Some(delta_layer::Summary::expected( owner.desc.tenant_shard_id.tenant_id, owner.desc.timeline_id, @@ -1738,9 +1788,9 @@ impl DownloadedLayer { .await .map(LayerKind::Delta) } else { - let ctx = RequestContextBuilder::extend(ctx) + let ctx = RequestContextBuilder::from(ctx) .page_content_kind(crate::context::PageContentKind::ImageLayerSummary) - .build(); + .attached_child(); let lsn = owner.desc.image_layer_lsn(); let summary = Some(image_layer::Summary::expected( owner.desc.tenant_shard_id.tenant_id, diff --git a/pageserver/src/tenant/storage_layer/layer/tests.rs b/pageserver/src/tenant/storage_layer/layer/tests.rs index 7086429bfe..b6fd4678d6 100644 --- a/pageserver/src/tenant/storage_layer/layer/tests.rs +++ b/pageserver/src/tenant/storage_layer/layer/tests.rs @@ -119,6 +119,10 @@ async fn smoke_test() { let e = layer.evict_and_wait(FOREVER).await.unwrap_err(); assert!(matches!(e, EvictionError::NotFound)); + let dl_ctx = RequestContextBuilder::from(ctx) + .download_behavior(DownloadBehavior::Download) + .attached_child(); + // on accesses when the layer is evicted, it will automatically be downloaded. 
let img_after = { let mut data = ValuesReconstructState::new(io_concurrency.clone()); @@ -127,7 +131,7 @@ async fn smoke_test() { controlfile_keyspace.clone(), Lsn(0x10)..Lsn(0x11), &mut data, - ctx, + &dl_ctx, ) .instrument(download_span.clone()) .await @@ -177,7 +181,7 @@ async fn smoke_test() { // plain downloading is rarely needed layer - .download_and_keep_resident(ctx) + .download_and_keep_resident(&dl_ctx) .instrument(download_span) .await .unwrap(); @@ -645,9 +649,10 @@ async fn cancelled_get_or_maybe_download_does_not_cancel_eviction() { let ctx = ctx.with_scope_timeline(&timeline); // This test does downloads - let ctx = RequestContextBuilder::extend(&ctx) + let ctx = RequestContextBuilder::from(&ctx) .download_behavior(DownloadBehavior::Download) - .build(); + .attached_child(); + let layer = { let mut layers = { let layers = timeline.layers.read().await; @@ -730,9 +735,9 @@ async fn evict_and_wait_does_not_wait_for_download() { let ctx = ctx.with_scope_timeline(&timeline); // This test does downloads - let ctx = RequestContextBuilder::extend(&ctx) + let ctx = RequestContextBuilder::from(&ctx) .download_behavior(DownloadBehavior::Download) - .build(); + .attached_child(); let layer = { let mut layers = { diff --git a/pageserver/src/tenant/storage_layer/merge_iterator.rs b/pageserver/src/tenant/storage_layer/merge_iterator.rs index 76cdddd06a..55db9fe06a 100644 --- a/pageserver/src/tenant/storage_layer/merge_iterator.rs +++ b/pageserver/src/tenant/storage_layer/merge_iterator.rs @@ -59,6 +59,7 @@ impl LayerIterRef<'_> { /// 1. Unified iterator for image and delta layers. /// 2. `Ord` for use in [`MergeIterator::heap`] (for the k-merge). /// 3. Lazy creation of the real delta/image iterator. 
+#[allow(clippy::large_enum_variant, reason = "TODO")] pub(crate) enum IteratorWrapper<'a> { NotLoaded { ctx: &'a RequestContext, diff --git a/pageserver/src/tenant/tasks.rs b/pageserver/src/tenant/tasks.rs index 034e5f8c91..54588e788c 100644 --- a/pageserver/src/tenant/tasks.rs +++ b/pageserver/src/tenant/tasks.rs @@ -268,7 +268,12 @@ async fn compaction_loop(tenant: Arc, cancel: CancellationToken) { error_run += 1; let backoff = exponential_backoff_duration(error_run, BASE_BACKOFF_SECS, MAX_BACKOFF_SECS); - log_compaction_error(&err, Some((error_run, backoff)), cancel.is_cancelled()); + log_compaction_error( + &err, + Some((error_run, backoff)), + cancel.is_cancelled(), + false, + ); continue; } } @@ -285,6 +290,7 @@ pub(crate) fn log_compaction_error( err: &CompactionError, retry_info: Option<(u32, Duration)>, task_cancelled: bool, + degrade_to_warning: bool, ) { use CompactionError::*; @@ -333,6 +339,7 @@ pub(crate) fn log_compaction_error( } } else { match level { + Level::ERROR if degrade_to_warning => warn!("Compaction failed and discarded: {err:#}"), Level::ERROR => error!("Compaction failed: {err:#}"), Level::INFO => info!("Compaction failed: {err:#}"), level => unimplemented!("unexpected level {level:?}"), diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 75f9225302..c27a4b62da 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -23,6 +23,8 @@ use std::sync::atomic::{AtomicBool, AtomicU64, Ordering as AtomicOrdering}; use std::sync::{Arc, Mutex, OnceLock, RwLock, Weak}; use std::time::{Duration, Instant, SystemTime}; +use crate::PERF_TRACE_TARGET; +use crate::walredo::RedoAttemptType; use anyhow::{Context, Result, anyhow, bail, ensure}; use arc_swap::{ArcSwap, ArcSwapOption}; use bytes::Bytes; @@ -96,7 +98,9 @@ use super::{ }; use crate::aux_file::AuxFileSizeEstimator; use crate::config::PageServerConf; -use crate::context::{DownloadBehavior, RequestContext}; +use 
crate::context::{ + DownloadBehavior, PerfInstrumentFutureExt, RequestContext, RequestContextBuilder, +}; use crate::disk_usage_eviction_task::{DiskUsageEvictionInfo, EvictionCandidate, finite_f32}; use crate::keyspace::{KeyPartitioning, KeySpace}; use crate::l0_flush::{self, L0FlushGlobalState}; @@ -112,7 +116,7 @@ use crate::pgdatadir_mapping::{ use crate::task_mgr::TaskKind; use crate::tenant::config::AttachmentMode; use crate::tenant::gc_result::GcResult; -use crate::tenant::layer_map::{LayerMap, SearchResult}; +use crate::tenant::layer_map::LayerMap; use crate::tenant::metadata::TimelineMetadata; use crate::tenant::storage_layer::delta_layer::DeltaEntry; use crate::tenant::storage_layer::inmemory_layer::IndexEntry; @@ -581,7 +585,7 @@ pub(crate) enum PageReconstructError { WalRedo(anyhow::Error), #[error("{0}")] - MissingKey(MissingKeyError), + MissingKey(Box), } impl From for PageReconstructError { @@ -686,16 +690,23 @@ impl std::fmt::Display for ReadPath { #[derive(thiserror::Error)] pub struct MissingKeyError { - key: Key, + keyspace: KeySpace, shard: ShardNumber, - cont_lsn: Lsn, - request_lsn: Lsn, + query: Option, + // This is largest request LSN from the get page request batch + original_hwm_lsn: Lsn, ancestor_lsn: Option, /// Debug information about the read path if there's an error read_path: Option, backtrace: Option, } +impl MissingKeyError { + fn enrich(&mut self, query: VersionedKeySpaceQuery) { + self.query = Some(query); + } +} + impl std::fmt::Debug for MissingKeyError { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { write!(f, "{}", self) @@ -706,14 +717,18 @@ impl std::fmt::Display for MissingKeyError { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { write!( f, - "could not find data for key {} (shard {:?}) at LSN {}, request LSN {}", - self.key, self.shard, self.cont_lsn, self.request_lsn + "could not find data for key {} (shard {:?}), original HWM LSN {}", + self.keyspace, self.shard, 
self.original_hwm_lsn )?; if let Some(ref ancestor_lsn) = self.ancestor_lsn { write!(f, ", ancestor {}", ancestor_lsn)?; } + if let Some(ref query) = self.query { + write!(f, ", query {}", query)?; + } + if let Some(ref read_path) = self.read_path { write!(f, "\n{}", read_path)?; } @@ -813,7 +828,7 @@ pub(crate) enum GetVectoredError { InvalidLsn(Lsn), #[error("requested key not found: {0}")] - MissingKey(MissingKeyError), + MissingKey(Box), #[error("ancestry walk")] GetReadyAncestorError(#[source] GetReadyAncestorError), @@ -870,9 +885,14 @@ pub(crate) enum CompactFlags { OnlyL0Compaction, EnhancedGcBottomMostCompaction, DryRun, - /// Disables compaction yielding e.g. due to high L0 count. This is set e.g. when requesting - /// compaction via HTTP API. - NoYield, + /// Makes image compaction yield if there's pending L0 compaction. This should always be used in + /// the background compaction task, since we want to aggressively compact down L0 to bound + /// read amplification. + /// + /// It only makes sense to use this when `compaction_l0_first` is enabled (such that we yield to + /// an L0 compaction pass), and without `OnlyL0Compaction` (L0 compaction shouldn't yield for L0 + /// compaction). 
+ YieldForL0, } #[serde_with::serde_as] @@ -890,6 +910,12 @@ pub(crate) struct CompactRequest { pub sub_compaction_max_job_size_mb: Option, } +#[derive(Debug, Clone, serde::Deserialize)] +pub(crate) struct MarkInvisibleRequest { + #[serde(default)] + pub is_visible: Option, +} + #[derive(Debug, Clone, Default)] pub(crate) struct CompactOptions { pub flags: EnumSet, @@ -913,7 +939,7 @@ impl std::fmt::Debug for Timeline { } } -#[derive(thiserror::Error, Debug)] +#[derive(thiserror::Error, Debug, Clone)] pub(crate) enum WaitLsnError { // Called on a timeline which is shutting down #[error("Shutdown")] @@ -1025,6 +1051,7 @@ pub(crate) enum ShutdownMode { Hard, } +#[allow(clippy::large_enum_variant, reason = "TODO")] enum ImageLayerCreationOutcome { /// We generated an image layer Generated { @@ -1112,14 +1139,12 @@ impl Timeline { // page_service. debug_assert!(!self.shard_identity.is_key_disposable(&key)); - let keyspace = KeySpace { - ranges: vec![key..key.next()], - }; - let mut reconstruct_state = ValuesReconstructState::new(IoConcurrency::sequential()); + let query = VersionedKeySpaceQuery::uniform(KeySpace::single(key..key.next()), lsn); + let vectored_res = self - .get_vectored_impl(keyspace.clone(), lsn, &mut reconstruct_state, ctx) + .get_vectored_impl(query, &mut reconstruct_state, ctx) .await; let key_value = vectored_res?.pop_first(); @@ -1137,15 +1162,17 @@ impl Timeline { value } } - None => Err(PageReconstructError::MissingKey(MissingKeyError { - key, - shard: self.shard_identity.get_shard_number(&key), - cont_lsn: Lsn(0), - request_lsn: lsn, - ancestor_lsn: None, - backtrace: None, - read_path: None, - })), + None => Err(PageReconstructError::MissingKey(Box::new( + MissingKeyError { + keyspace: KeySpace::single(key..key.next()), + shard: self.shard_identity.get_shard_number(&key), + original_hwm_lsn: lsn, + ancestor_lsn: None, + backtrace: None, + read_path: None, + query: None, + }, + ))), } } @@ -1158,21 +1185,18 @@ impl Timeline { /// which actually 
vectorizes the read path. pub(crate) async fn get_vectored( &self, - keyspace: KeySpace, - lsn: Lsn, + query: VersionedKeySpaceQuery, io_concurrency: super::storage_layer::IoConcurrency, ctx: &RequestContext, ) -> Result>, GetVectoredError> { - if !lsn.is_valid() { - return Err(GetVectoredError::InvalidLsn(lsn)); - } + let total_keyspace = query.total_keyspace(); - let key_count = keyspace.total_raw_size().try_into().unwrap(); + let key_count = total_keyspace.total_raw_size().try_into().unwrap(); if key_count > Timeline::MAX_GET_VECTORED_KEYS { return Err(GetVectoredError::Oversized(key_count)); } - for range in &keyspace.ranges { + for range in &total_keyspace.ranges { let mut key = range.start; while key != range.end { assert!(!self.shard_identity.is_key_disposable(&key)); @@ -1181,9 +1205,8 @@ impl Timeline { } trace!( - "get vectored request for {:?}@{} from task kind {:?}", - keyspace, - lsn, + "get vectored query {} from task kind {:?}", + query, ctx.task_kind(), ); @@ -1192,12 +1215,7 @@ impl Timeline { .map(|metric| (metric, Instant::now())); let res = self - .get_vectored_impl( - keyspace.clone(), - lsn, - &mut ValuesReconstructState::new(io_concurrency), - ctx, - ) + .get_vectored_impl(query, &mut ValuesReconstructState::new(io_concurrency), ctx) .await; if let Some((metric, start)) = start { @@ -1248,13 +1266,10 @@ impl Timeline { .for_task_kind(ctx.task_kind()) .map(ScanLatencyOngoingRecording::start_recording); + let query = VersionedKeySpaceQuery::uniform(keyspace, lsn); + let vectored_res = self - .get_vectored_impl( - keyspace.clone(), - lsn, - &mut ValuesReconstructState::new(io_concurrency), - ctx, - ) + .get_vectored_impl(query, &mut ValuesReconstructState::new(io_concurrency), ctx) .await; if let Some(recording) = start { @@ -1266,21 +1281,43 @@ impl Timeline { pub(super) async fn get_vectored_impl( &self, - keyspace: KeySpace, - lsn: Lsn, + query: VersionedKeySpaceQuery, reconstruct_state: &mut ValuesReconstructState, ctx: &RequestContext, ) -> 
Result>, GetVectoredError> { let read_path = if self.conf.enable_read_path_debugging || ctx.read_path_debug() { - Some(ReadPath::new(keyspace.clone(), lsn)) + Some(ReadPath::new( + query.total_keyspace(), + query.high_watermark_lsn()?, + )) } else { None }; + reconstruct_state.read_path = read_path; - let traversal_res: Result<(), _> = self - .get_vectored_reconstruct_data(keyspace.clone(), lsn, reconstruct_state, ctx) - .await; + let redo_attempt_type = if ctx.task_kind() == TaskKind::Compaction { + RedoAttemptType::LegacyCompaction + } else { + RedoAttemptType::ReadPage + }; + + let traversal_res: Result<(), _> = { + let ctx = RequestContextBuilder::from(ctx) + .perf_span(|crnt_perf_span| { + info_span!( + target: PERF_TRACE_TARGET, + parent: crnt_perf_span, + "PLAN_IO", + ) + }) + .attached_child(); + + self.get_vectored_reconstruct_data(query.clone(), reconstruct_state, &ctx) + .maybe_perf_instrument(&ctx, |crnt_perf_span| crnt_perf_span.clone()) + .await + }; + if let Err(err) = traversal_res { // Wait for all the spawned IOs to complete. // See comments on `spawn_io` inside `storage_layer` for more details. @@ -1289,19 +1326,60 @@ impl Timeline { .map(|state| state.collect_pending_ios()) .collect::>(); while collect_futs.next().await.is_some() {} + + // Enrich the missing key error with the original query. 
+ if let GetVectoredError::MissingKey(mut missing_err) = err { + missing_err.enrich(query.clone()); + return Err(GetVectoredError::MissingKey(missing_err)); + } + return Err(err); }; let layers_visited = reconstruct_state.get_layers_visited(); + let ctx = RequestContextBuilder::from(ctx) + .perf_span(|crnt_perf_span| { + info_span!( + target: PERF_TRACE_TARGET, + parent: crnt_perf_span, + "RECONSTRUCT", + ) + }) + .attached_child(); + let futs = FuturesUnordered::new(); for (key, state) in std::mem::take(&mut reconstruct_state.keys) { + let req_lsn_for_key = query.map_key_to_lsn(&key); + futs.push({ let walredo_self = self.myself.upgrade().expect("&self method holds the arc"); + let ctx = RequestContextBuilder::from(&ctx) + .perf_span(|crnt_perf_span| { + info_span!( + target: PERF_TRACE_TARGET, + parent: crnt_perf_span, + "RECONSTRUCT_KEY", + key = %key, + ) + }) + .attached_child(); + async move { assert_eq!(state.situation, ValueReconstructSituation::Complete); - let converted = match state.collect_pending_ios().await { + let res = state + .collect_pending_ios() + .maybe_perf_instrument(&ctx, |crnt_perf_span| { + info_span!( + target: PERF_TRACE_TARGET, + parent: crnt_perf_span, + "WAIT_FOR_IO_COMPLETIONS", + ) + }) + .await; + + let converted = match res { Ok(ok) => ok, Err(err) => { return (key, Err(err)); @@ -1318,16 +1396,27 @@ impl Timeline { "{converted:?}" ); - ( - key, - walredo_self.reconstruct_value(key, lsn, converted).await, - ) + let walredo_deltas = converted.num_deltas(); + let walredo_res = walredo_self + .reconstruct_value(key, req_lsn_for_key, converted, redo_attempt_type) + .maybe_perf_instrument(&ctx, |crnt_perf_span| { + info_span!( + target: PERF_TRACE_TARGET, + parent: crnt_perf_span, + "WALREDO", + deltas = %walredo_deltas, + ) + }) + .await; + + (key, walredo_res) } }); } let results = futs .collect::>>() + .maybe_perf_instrument(&ctx, |crnt_perf_span| crnt_perf_span.clone()) .await; // For aux file keys (v1 or v2) the vectored read path 
does not return an error @@ -1336,15 +1425,18 @@ impl Timeline { // to avoid infinite results. if !results.is_empty() { if layers_visited >= Self::LAYERS_VISITED_WARN_THRESHOLD { + let total_keyspace = query.total_keyspace(); + let max_request_lsn = query.high_watermark_lsn().expect("Validated previously"); + static LOG_PACER: Lazy> = Lazy::new(|| Mutex::new(RateLimit::new(Duration::from_secs(60)))); LOG_PACER.lock().unwrap().call(|| { - let num_keys = keyspace.total_raw_size(); + let num_keys = total_keyspace.total_raw_size(); let num_pages = results.len(); tracing::info!( shard_id = %self.tenant_shard_id.shard_slug(), - lsn = %lsn, - "Vectored read for {keyspace} visited {layers_visited} layers. Returned {num_pages}/{num_keys} pages.", + lsn = %max_request_lsn, + "Vectored read for {total_keyspace} visited {layers_visited} layers. Returned {num_pages}/{num_keys} pages.", ); }); } @@ -1870,7 +1962,7 @@ impl Timeline { ) .await; if let Err(err) = &res { - log_compaction_error(err, None, cancel.is_cancelled()); + log_compaction_error(err, None, cancel.is_cancelled(), false); } res } @@ -1891,18 +1983,19 @@ impl Timeline { // out by other background tasks (including image compaction). We request this via // `BackgroundLoopKind::L0Compaction`. // - // If this is a regular compaction pass, and L0-only compaction is enabled in the config, - // then we should yield for immediate L0 compaction if necessary while we're waiting for the - // background task semaphore. There's no point yielding otherwise, since we'd just end up - // right back here. + // Yield for pending L0 compaction while waiting for the semaphore. 
let is_l0_only = options.flags.contains(CompactFlags::OnlyL0Compaction); let semaphore_kind = match is_l0_only && self.get_compaction_l0_semaphore() { true => BackgroundLoopKind::L0Compaction, false => BackgroundLoopKind::Compaction, }; - let yield_for_l0 = !is_l0_only - && self.get_compaction_l0_first() - && !options.flags.contains(CompactFlags::NoYield); + let yield_for_l0 = options.flags.contains(CompactFlags::YieldForL0); + if yield_for_l0 { + // If this is an L0 pass, it doesn't make sense to yield for L0. + debug_assert!(!is_l0_only, "YieldForL0 during L0 pass"); + // If `compaction_l0_first` is disabled, there's no point yielding. + debug_assert!(self.get_compaction_l0_first(), "YieldForL0 without L0 pass"); + } let acquire = async move { let guard = self.compaction_lock.lock().await; @@ -2209,6 +2302,10 @@ impl Timeline { self.remote_client.is_archived() } + pub(crate) fn is_invisible(&self) -> Option { + self.remote_client.is_invisible() + } + pub(crate) fn is_stopping(&self) -> bool { self.current_state() == TimelineState::Stopping } @@ -2231,7 +2328,7 @@ impl Timeline { .await .expect("holding a reference to self"); } - TimelineState::Active { .. } => { + TimelineState::Active => { return Ok(()); } TimelineState::Broken { .. } | TimelineState::Stopping => { @@ -2401,6 +2498,31 @@ impl Timeline { .unwrap_or(self.conf.default_tenant_conf.lazy_slru_download) } + /// Checks if a get page request should get perf tracing + /// + /// The configuration priority is: tenant config override, default tenant config, + /// pageserver config. 
+ pub(crate) fn is_get_page_request_sampled(&self) -> bool { + let tenant_conf = self.tenant_conf.load(); + let ratio = tenant_conf + .tenant_conf + .sampling_ratio + .flatten() + .or(self.conf.default_tenant_conf.sampling_ratio) + .or(self.conf.tracing.as_ref().map(|t| t.sampling_ratio)); + + match ratio { + Some(r) => { + if r.numerator == 0 { + false + } else { + rand::thread_rng().gen_range(0..r.denominator) < r.numerator + } + } + None => false, + } + } + fn get_checkpoint_distance(&self) -> u64 { let tenant_conf = self.tenant_conf.load(); tenant_conf @@ -2615,6 +2737,10 @@ impl Timeline { .tenant_conf .gc_compaction_enabled .unwrap_or(self.conf.default_tenant_conf.gc_compaction_enabled); + let gc_compaction_verification = tenant_conf + .tenant_conf + .gc_compaction_verification + .unwrap_or(self.conf.default_tenant_conf.gc_compaction_verification); let gc_compaction_initial_threshold_kb = tenant_conf .tenant_conf .gc_compaction_initial_threshold_kb @@ -2629,6 +2755,7 @@ impl Timeline { .unwrap_or(self.conf.default_tenant_conf.gc_compaction_ratio_percent); GcCompactionCombinedSettings { gc_compaction_enabled, + gc_compaction_verification, gc_compaction_initial_threshold_kb, gc_compaction_ratio_percent, } @@ -3827,6 +3954,154 @@ impl Timeline { } } +#[derive(Clone)] +/// Type representing a query in the ([`Lsn`], [`Key`]) space. +/// In other words, a set of segments in a 2D space. +/// +/// This representation has the advatange of avoiding hash map +/// allocations for uniform queries. 
+pub(crate) enum VersionedKeySpaceQuery { + /// Variant for queries at a single [`Lsn`] + Uniform { keyspace: KeySpace, lsn: Lsn }, + /// Variant for queries at multiple [`Lsn`]s + Scattered { + keyspaces_at_lsn: Vec<(Lsn, KeySpace)>, + }, +} + +impl VersionedKeySpaceQuery { + pub(crate) fn uniform(keyspace: KeySpace, lsn: Lsn) -> Self { + Self::Uniform { keyspace, lsn } + } + + pub(crate) fn scattered(keyspaces_at_lsn: Vec<(Lsn, KeySpace)>) -> Self { + Self::Scattered { keyspaces_at_lsn } + } + + /// Returns the most recent (largest) LSN included in the query. + /// If any of the LSNs included in the query are invalid, returns + /// an error instead. + fn high_watermark_lsn(&self) -> Result { + match self { + Self::Uniform { lsn, .. } => { + if !lsn.is_valid() { + return Err(GetVectoredError::InvalidLsn(*lsn)); + } + + Ok(*lsn) + } + Self::Scattered { keyspaces_at_lsn } => { + let mut max_lsn = None; + for (lsn, _keyspace) in keyspaces_at_lsn.iter() { + if !lsn.is_valid() { + return Err(GetVectoredError::InvalidLsn(*lsn)); + } + max_lsn = std::cmp::max(max_lsn, Some(lsn)); + } + + if let Some(computed) = max_lsn { + Ok(*computed) + } else { + Err(GetVectoredError::Other(anyhow!("empty input"))) + } + } + } + } + + /// Returns the total keyspace being queried: the result of projecting + /// everything in the key dimensions onto the key axis. + fn total_keyspace(&self) -> KeySpace { + match self { + Self::Uniform { keyspace, .. } => keyspace.clone(), + Self::Scattered { keyspaces_at_lsn } => keyspaces_at_lsn + .iter() + .map(|(_lsn, keyspace)| keyspace) + .fold(KeySpace::default(), |mut acc, v| { + acc.merge(v); + acc + }), + } + } + + /// Returns LSN for a specific key. + /// + /// Invariant: requested key must be part of [`Self::total_keyspace`] + pub(super) fn map_key_to_lsn(&self, key: &Key) -> Lsn { + match self { + Self::Uniform { lsn, .. 
} => *lsn, + Self::Scattered { keyspaces_at_lsn } => { + keyspaces_at_lsn + .iter() + .find(|(_lsn, keyspace)| keyspace.contains(key)) + .expect("Returned key was requested") + .0 + } + } + } + + /// Remove any parts of the query (segments) which overlap with the provided + /// key space (also segments). + fn remove_overlapping_with(&mut self, to_remove: &KeySpace) -> KeySpace { + match self { + Self::Uniform { keyspace, .. } => keyspace.remove_overlapping_with(to_remove), + Self::Scattered { keyspaces_at_lsn } => { + let mut removed_accum = KeySpaceRandomAccum::new(); + keyspaces_at_lsn.iter_mut().for_each(|(_lsn, keyspace)| { + let removed = keyspace.remove_overlapping_with(to_remove); + removed_accum.add_keyspace(removed); + }); + + removed_accum.to_keyspace() + } + } + } + + fn is_empty(&self) -> bool { + match self { + Self::Uniform { keyspace, .. } => keyspace.is_empty(), + Self::Scattered { keyspaces_at_lsn } => keyspaces_at_lsn + .iter() + .all(|(_lsn, keyspace)| keyspace.is_empty()), + } + } + + /// "Lower" the query on the LSN dimension + fn lower(&mut self, to: Lsn) { + match self { + Self::Uniform { lsn, .. } => { + // If the originally requested LSN is smaller than the starting + // LSN of the ancestor we are descending into, we need to respect that. + // Hence the min. 
+ *lsn = std::cmp::min(*lsn, to); + } + Self::Scattered { keyspaces_at_lsn } => { + keyspaces_at_lsn.iter_mut().for_each(|(lsn, _keyspace)| { + *lsn = std::cmp::min(*lsn, to); + }); + } + } + } +} + +impl std::fmt::Display for VersionedKeySpaceQuery { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "[")?; + + match self { + VersionedKeySpaceQuery::Uniform { keyspace, lsn } => { + write!(f, "{keyspace} @ {lsn}")?; + } + VersionedKeySpaceQuery::Scattered { keyspaces_at_lsn } => { + for (lsn, keyspace) in keyspaces_at_lsn.iter() { + write!(f, "{keyspace} @ {lsn},")?; + } + } + } + + write!(f, "]") + } +} + impl Timeline { #[allow(clippy::doc_lazy_continuation)] /// Get the data needed to reconstruct all keys in the provided keyspace @@ -3841,16 +4116,15 @@ impl Timeline { /// 2.4. If the fringe is empty, go back to 1 async fn get_vectored_reconstruct_data( &self, - mut keyspace: KeySpace, - request_lsn: Lsn, + mut query: VersionedKeySpaceQuery, reconstruct_state: &mut ValuesReconstructState, ctx: &RequestContext, ) -> Result<(), GetVectoredError> { + let original_hwm_lsn = query.high_watermark_lsn().unwrap(); + let mut timeline_owned: Arc; let mut timeline = self; - let mut cont_lsn = Lsn(request_lsn.0 + 1); - let missing_keyspace = loop { if self.cancel.is_cancelled() { return Err(GetVectoredError::Cancelled); @@ -3859,33 +4133,47 @@ impl Timeline { let TimelineVisitOutcome { completed_keyspace: completed, image_covered_keyspace, - } = Self::get_vectored_reconstruct_data_timeline( - timeline, - keyspace.clone(), - cont_lsn, - reconstruct_state, - &self.cancel, - ctx, - ) - .await?; + } = { + let ctx = RequestContextBuilder::from(ctx) + .perf_span(|crnt_perf_span| { + info_span!( + target: PERF_TRACE_TARGET, + parent: crnt_perf_span, + "PLAN_IO_TIMELINE", + timeline = %timeline.timeline_id, + high_watermark_lsn = %query.high_watermark_lsn().unwrap(), + ) + }) + .attached_child(); - keyspace.remove_overlapping_with(&completed); + 
Self::get_vectored_reconstruct_data_timeline( + timeline, + &query, + reconstruct_state, + &self.cancel, + &ctx, + ) + .maybe_perf_instrument(&ctx, |crnt_perf_span| crnt_perf_span.clone()) + .await? + }; + + query.remove_overlapping_with(&completed); // Do not descend into the ancestor timeline for aux files. // We don't return a blanket [`GetVectoredError::MissingKey`] to avoid // stalling compaction. - keyspace.remove_overlapping_with(&KeySpace { + query.remove_overlapping_with(&KeySpace { ranges: vec![NON_INHERITED_RANGE, Key::sparse_non_inherited_keyspace()], }); // Keyspace is fully retrieved - if keyspace.is_empty() { + if query.is_empty() { break None; } let Some(ancestor_timeline) = timeline.ancestor_timeline.as_ref() else { // Not fully retrieved but no ancestor timeline. - break Some(keyspace); + break Some(query.total_keyspace()); }; // Now we see if there are keys covered by the image layer but does not exist in the @@ -3896,7 +4184,7 @@ impl Timeline { // keys from `keyspace`, we expect there to be no overlap between it and the image covered key // space. If that's not the case, we had at least one key encounter a gap in the image layer // and stop the search as a result of that. - let mut removed = keyspace.remove_overlapping_with(&image_covered_keyspace); + let mut removed = query.remove_overlapping_with(&image_covered_keyspace); // Do not fire missing key error and end early for sparse keys. Note that we hava already removed // non-inherited keyspaces before, so we can safely do a full `SPARSE_RANGE` remove instead of // figuring out what is the inherited key range and do a fine-grained pruning. @@ -3906,13 +4194,28 @@ impl Timeline { if !removed.is_empty() { break Some(removed); } - // If we reached this point, `remove_overlapping_with` should not have made any change to the - // keyspace. - // Take the min to avoid reconstructing a page with data newer than request Lsn. 
- cont_lsn = std::cmp::min(Lsn(request_lsn.0 + 1), Lsn(timeline.ancestor_lsn.0 + 1)); + // Each key range in the original query is at some point in the LSN space. + // When descending into the ancestor, lower all ranges in the LSN space + // such that new changes on the parent timeline are not visible. + query.lower(timeline.ancestor_lsn); + + let ctx = RequestContextBuilder::from(ctx) + .perf_span(|crnt_perf_span| { + info_span!( + target: PERF_TRACE_TARGET, + parent: crnt_perf_span, + "GET_ANCESTOR", + timeline = %timeline.timeline_id, + ancestor = %ancestor_timeline.timeline_id, + ancestor_lsn = %timeline.ancestor_lsn + ) + }) + .attached_child(); + timeline_owned = timeline - .get_ready_ancestor_timeline(ancestor_timeline, ctx) + .get_ready_ancestor_timeline(ancestor_timeline, &ctx) + .maybe_perf_instrument(&ctx, |crnt_perf_span| crnt_perf_span.clone()) .await?; timeline = &*timeline_owned; }; @@ -3933,22 +4236,47 @@ impl Timeline { }; if let Some(missing_keyspace) = missing_keyspace { - return Err(GetVectoredError::MissingKey(MissingKeyError { - key: missing_keyspace.start().unwrap(), /* better if we can store the full keyspace */ - shard: self - .shard_identity - .get_shard_number(&missing_keyspace.start().unwrap()), - cont_lsn, - request_lsn, + return Err(GetVectoredError::MissingKey(Box::new(MissingKeyError { + keyspace: missing_keyspace, /* better if we can store the full keyspace */ + shard: self.shard_identity.number, + original_hwm_lsn, ancestor_lsn: Some(timeline.ancestor_lsn), backtrace: None, read_path: std::mem::take(&mut reconstruct_state.read_path), - })); + query: None, + }))); } Ok(()) } + async fn get_vectored_init_fringe( + &self, + query: &VersionedKeySpaceQuery, + ) -> Result { + let mut fringe = LayerFringe::new(); + let guard = self.layers.read().await; + + match query { + VersionedKeySpaceQuery::Uniform { keyspace, lsn } => { + // LSNs requested by the compute or determined by the pageserver + // are inclusive. 
Queries to the layer map use exclusive LSNs. + // Hence, bump the value before the query - same in the other + // match arm. + let cont_lsn = Lsn(lsn.0 + 1); + guard.update_search_fringe(keyspace, cont_lsn, &mut fringe)?; + } + VersionedKeySpaceQuery::Scattered { keyspaces_at_lsn } => { + for (lsn, keyspace) in keyspaces_at_lsn.iter() { + let cont_lsn_for_keyspace = Lsn(lsn.0 + 1); + guard.update_search_fringe(keyspace, cont_lsn_for_keyspace, &mut fringe)?; + } + } + } + + Ok(fringe) + } + /// Collect the reconstruct data for a keyspace from the specified timeline. /// /// Maintain a fringe [`LayerFringe`] which tracks all the layers that intersect @@ -3967,18 +4295,11 @@ impl Timeline { /// decides how to deal with these two keyspaces. async fn get_vectored_reconstruct_data_timeline( timeline: &Timeline, - keyspace: KeySpace, - mut cont_lsn: Lsn, + query: &VersionedKeySpaceQuery, reconstruct_state: &mut ValuesReconstructState, cancel: &CancellationToken, ctx: &RequestContext, ) -> Result { - let mut unmapped_keyspace = keyspace.clone(); - let mut fringe = LayerFringe::new(); - - let mut completed_keyspace = KeySpace::default(); - let mut image_covered_keyspace = KeySpaceRandomAccum::new(); - // Prevent GC from progressing while visiting the current timeline. // If we are GC-ing because a new image layer was added while traversing // the timeline, then it will remove layers that are required for fulfilling @@ -3989,11 +4310,37 @@ impl Timeline { // See `compaction::compact_with_gc` for why we need this. 
let _guard = timeline.gc_compaction_layer_update_lock.read().await; - loop { + // Initialize the fringe + let mut fringe = timeline.get_vectored_init_fringe(query).await?; + + let mut completed_keyspace = KeySpace::default(); + let mut image_covered_keyspace = KeySpaceRandomAccum::new(); + + while let Some((layer_to_read, keyspace_to_read, lsn_range)) = fringe.next_layer() { if cancel.is_cancelled() { return Err(GetVectoredError::Cancelled); } + if let Some(ref mut read_path) = reconstruct_state.read_path { + read_path.record_layer_visit(&layer_to_read, &keyspace_to_read, &lsn_range); + } + + // Visit the layer and plan IOs for it + let next_cont_lsn = lsn_range.start; + layer_to_read + .get_values_reconstruct_data( + keyspace_to_read.clone(), + lsn_range, + reconstruct_state, + ctx, + ) + .await?; + + let mut unmapped_keyspace = keyspace_to_read; + let cont_lsn = next_cont_lsn; + + reconstruct_state.on_layer_visited(&layer_to_read); + let (keys_done_last_step, keys_with_image_coverage) = reconstruct_state.consume_done_keys(); unmapped_keyspace.remove_overlapping_with(&keys_done_last_step); @@ -4004,31 +4351,15 @@ impl Timeline { image_covered_keyspace.add_range(keys_with_image_coverage); } + // Query the layer map for the next layers to read. + // // Do not descent any further if the last layer we visited // completed all keys in the keyspace it inspected. This is not // required for correctness, but avoids visiting extra layers // which turns out to be a perf bottleneck in some cases. 
if !unmapped_keyspace.is_empty() { let guard = timeline.layers.read().await; - let layers = guard.layer_map()?; - - for range in unmapped_keyspace.ranges.iter() { - let results = layers.range_search(range.clone(), cont_lsn); - - results - .found - .into_iter() - .map(|(SearchResult { layer, lsn_floor }, keyspace_accum)| { - ( - guard.upgrade(layer), - keyspace_accum.to_keyspace(), - lsn_floor..cont_lsn, - ) - }) - .for_each(|(layer, keyspace, lsn_range)| { - fringe.update(layer, keyspace, lsn_range) - }); - } + guard.update_search_fringe(&unmapped_keyspace, cont_lsn, &mut fringe)?; // It's safe to drop the layer map lock after planning the next round of reads. // The fringe keeps readable handles for the layers which are safe to read even @@ -4042,28 +4373,6 @@ impl Timeline { // at two different time points. drop(guard); } - - if let Some((layer_to_read, keyspace_to_read, lsn_range)) = fringe.next_layer() { - if let Some(ref mut read_path) = reconstruct_state.read_path { - read_path.record_layer_visit(&layer_to_read, &keyspace_to_read, &lsn_range); - } - let next_cont_lsn = lsn_range.start; - layer_to_read - .get_values_reconstruct_data( - keyspace_to_read.clone(), - lsn_range, - reconstruct_state, - ctx, - ) - .await?; - - unmapped_keyspace = keyspace_to_read; - cont_lsn = next_cont_lsn; - - reconstruct_state.on_layer_visited(&layer_to_read); - } else { - break; - } } Ok(TimelineVisitOutcome { @@ -4677,7 +4986,13 @@ impl Timeline { let ctx = ctx.attached_child(); let work = async move { let Some((desc, path)) = frozen_layer - .write_to_disk(&ctx, key_range, self_clone.l0_flush_global_state.inner()) + .write_to_disk( + &ctx, + key_range, + self_clone.l0_flush_global_state.inner(), + &self_clone.gate, + self_clone.cancel.clone(), + ) .await? 
else { return Ok(None); @@ -4863,13 +5178,11 @@ impl Timeline { if key_request_accum.raw_size() >= Timeline::MAX_GET_VECTORED_KEYS || (last_key_in_range && key_request_accum.raw_size() > 0) { + let query = + VersionedKeySpaceQuery::uniform(key_request_accum.consume_keyspace(), lsn); + let results = self - .get_vectored( - key_request_accum.consume_keyspace(), - lsn, - io_concurrency.clone(), - ctx, - ) + .get_vectored(query, io_concurrency.clone(), ctx) .await?; if self.cancel.is_cancelled() { @@ -4958,7 +5271,11 @@ impl Timeline { // Directly use `get_vectored_impl` to skip the max_vectored_read_key limit check. Note that the keyspace should // not contain too many keys, otherwise this takes a lot of memory. let data = self - .get_vectored_impl(partition.clone(), lsn, &mut reconstruct_state, ctx) + .get_vectored_impl( + VersionedKeySpaceQuery::uniform(partition.clone(), lsn), + &mut reconstruct_state, + ctx, + ) .await?; let (data, total_kb_retrieved, total_keys_retrieved) = { let mut new_data = BTreeMap::new(); @@ -5215,6 +5532,8 @@ impl Timeline { self.tenant_shard_id, &img_range, lsn, + &self.gate, + self.cancel.clone(), ctx, ) .await?; @@ -6226,10 +6545,17 @@ impl Timeline { key: Key, request_lsn: Lsn, mut data: ValueReconstructState, + redo_attempt_type: RedoAttemptType, ) -> Result { // Perform WAL redo if needed data.records.reverse(); + let fire_critical_error = match redo_attempt_type { + RedoAttemptType::ReadPage => true, + RedoAttemptType::LegacyCompaction => true, + RedoAttemptType::GcCompaction => false, + }; + // If we have a page image, and no WAL, we're all set if data.records.is_empty() { if let Some((img_lsn, img)) = &data.img { @@ -6276,13 +6602,22 @@ impl Timeline { .as_ref() .context("timeline has no walredo manager") .map_err(PageReconstructError::WalRedo)? 
- .request_redo(key, request_lsn, data.img, data.records, self.pg_version) + .request_redo( + key, + request_lsn, + data.img, + data.records, + self.pg_version, + redo_attempt_type, + ) .await; let img = match res { Ok(img) => img, Err(walredo::Error::Cancelled) => return Err(PageReconstructError::Cancelled), Err(walredo::Error::Other(err)) => { - critical!("walredo failure during page reconstruction: {err:?}"); + if fire_critical_error { + critical!("walredo failure during page reconstruction: {err:?}"); + } return Err(PageReconstructError::WalRedo( err.context("reconstruct a page image"), )); @@ -6563,6 +6898,8 @@ impl Timeline { self.tenant_shard_id, &(min_key..end_key), lsn, + &self.gate, + self.cancel.clone(), ctx, ) .await?; @@ -6624,6 +6961,8 @@ impl Timeline { self.tenant_shard_id, deltas.key_range.start, deltas.lsn_range, + &self.gate, + self.cancel.clone(), ctx, ) .await?; @@ -7243,9 +7582,9 @@ mod tests { eprintln!("Downloading {layer} and re-generating heatmap"); - let ctx = &RequestContextBuilder::extend(ctx) + let ctx = &RequestContextBuilder::from(ctx) .download_behavior(crate::context::DownloadBehavior::Download) - .build(); + .attached_child(); let _resident = layer .download_and_keep_resident(ctx) diff --git a/pageserver/src/tenant/timeline/compaction.rs b/pageserver/src/tenant/timeline/compaction.rs index 711501caa9..3d5f11aeb9 100644 --- a/pageserver/src/tenant/timeline/compaction.rs +++ b/pageserver/src/tenant/timeline/compaction.rs @@ -7,7 +7,7 @@ use std::collections::{BinaryHeap, HashMap, HashSet, VecDeque}; use std::ops::{Deref, Range}; use std::sync::Arc; -use std::time::Instant; +use std::time::{Duration, Instant}; use super::layer_manager::LayerManager; use super::{ @@ -16,6 +16,8 @@ use super::{ Timeline, }; +use crate::tenant::timeline::DeltaEntry; +use crate::walredo::RedoAttemptType; use anyhow::{Context, anyhow}; use bytes::Bytes; use enumset::EnumSet; @@ -26,7 +28,7 @@ use once_cell::sync::Lazy; use 
pageserver_api::config::tenant_conf_defaults::DEFAULT_CHECKPOINT_DISTANCE; use pageserver_api::key::{KEY_SIZE, Key}; use pageserver_api::keyspace::{KeySpace, ShardedRange}; -use pageserver_api::models::CompactInfoResponse; +use pageserver_api::models::{CompactInfoResponse, CompactKeyRange}; use pageserver_api::record::NeonWalRecord; use pageserver_api::shard::{ShardCount, ShardIdentity, TenantShardId}; use pageserver_api::value::Value; @@ -61,7 +63,7 @@ use crate::tenant::timeline::{ DeltaLayerWriter, ImageLayerCreationOutcome, ImageLayerWriter, IoConcurrency, Layer, ResidentLayer, drop_rlock, }; -use crate::tenant::{DeltaLayer, MaybeOffloaded, gc_block}; +use crate::tenant::{DeltaLayer, MaybeOffloaded}; use crate::virtual_file::{MaybeFatalIo, VirtualFile}; /// Maximum number of deltas before generating an image layer in bottom-most compaction. @@ -78,6 +80,7 @@ impl std::fmt::Display for GcCompactionJobId { pub struct GcCompactionCombinedSettings { pub gc_compaction_enabled: bool, + pub gc_compaction_verification: bool, pub gc_compaction_initial_threshold_kb: u64, pub gc_compaction_ratio_percent: u64, } @@ -123,7 +126,6 @@ impl GcCompactionQueueItem { #[derive(Default)] struct GcCompactionGuardItems { notify: Option>, - gc_guard: Option, permit: Option, } @@ -224,6 +226,7 @@ impl GcCompactionQueue { gc_compaction_enabled, gc_compaction_initial_threshold_kb, gc_compaction_ratio_percent, + .. 
} = timeline.get_gc_compaction_settings(); if !gc_compaction_enabled { return Ok(()); @@ -279,7 +282,7 @@ impl GcCompactionQueue { gc_compaction_ratio_percent: u64, ) -> bool { const AUTO_TRIGGER_LIMIT: u64 = 150 * 1024 * 1024 * 1024; // 150GB - if l1_size >= AUTO_TRIGGER_LIMIT || l2_size >= AUTO_TRIGGER_LIMIT { + if l1_size + l2_size >= AUTO_TRIGGER_LIMIT { // Do not auto-trigger when physical size >= 150GB return false; } @@ -316,10 +319,18 @@ impl GcCompactionQueue { flags: { let mut flags = EnumSet::new(); flags |= CompactFlags::EnhancedGcBottomMostCompaction; + if timeline.get_compaction_l0_first() { + flags |= CompactFlags::YieldForL0; + } flags }, sub_compaction: true, - compact_key_range: None, + // Only auto-trigger gc-compaction over the data keyspace due to concerns in + // https://github.com/neondatabase/neon/issues/11318. + compact_key_range: Some(CompactKeyRange { + start: Key::MIN, + end: Key::metadata_key_range().start, + }), compact_lsn_range: None, sub_compaction_max_job_size_mb: None, }, @@ -343,44 +354,45 @@ impl GcCompactionQueue { info!("compaction job id={} finished", id); let mut guard = self.inner.lock().unwrap(); if let Some(items) = guard.guards.remove(&id) { - drop(items.gc_guard); if let Some(tx) = items.notify { let _ = tx.send(()); } } } + fn clear_running_job(&self) { + let mut guard = self.inner.lock().unwrap(); + guard.running = None; + } + async fn handle_sub_compaction( &self, id: GcCompactionJobId, options: CompactOptions, timeline: &Arc, - gc_block: &GcBlock, auto: bool, ) -> Result<(), CompactionError> { info!( "running scheduled enhanced gc bottom-most compaction with sub-compaction, splitting compaction jobs" ); - let jobs = timeline + let res = timeline .gc_compaction_split_jobs( GcCompactJob::from_compact_options(options.clone()), options.sub_compaction_max_job_size_mb, ) - .await?; + .await; + let jobs = match res { + Ok(jobs) => jobs, + Err(err) => { + warn!("cannot split gc-compaction jobs: {}, unblocked gc", err); + 
self.notify_and_unblock(id); + return Err(err); + } + }; if jobs.is_empty() { info!("no jobs to run, skipping scheduled compaction task"); self.notify_and_unblock(id); } else { - let gc_guard = match gc_block.start().await { - Ok(guard) => guard, - Err(e) => { - return Err(CompactionError::Other(anyhow!( - "cannot run gc-compaction because gc is blocked: {}", - e - ))); - } - }; - let jobs_len = jobs.len(); let mut pending_tasks = Vec::new(); // gc-compaction might pick more layers or fewer layers to compact. The L2 LSN does not need to be accurate. @@ -394,8 +406,8 @@ impl GcCompactionQueue { if job.dry_run { flags |= CompactFlags::DryRun; } - if options.flags.contains(CompactFlags::NoYield) { - flags |= CompactFlags::NoYield; + if options.flags.contains(CompactFlags::YieldForL0) { + flags |= CompactFlags::YieldForL0; } let options = CompactOptions { flags, @@ -415,7 +427,6 @@ impl GcCompactionQueue { { let mut guard = self.inner.lock().unwrap(); - guard.guards.entry(id).or_default().gc_guard = Some(gc_guard); let mut tasks = Vec::new(); for task in pending_tasks { let id = guard.next_id(); @@ -444,9 +455,20 @@ impl GcCompactionQueue { ) -> Result { let res = self.iteration_inner(cancel, ctx, gc_block, timeline).await; if let Err(err) = &res { - log_compaction_error(err, None, cancel.is_cancelled()); + log_compaction_error(err, None, cancel.is_cancelled(), true); + } + match res { + Ok(res) => Ok(res), + Err(CompactionError::ShuttingDown) => Err(CompactionError::ShuttingDown), + Err(_) => { + // There are some cases where traditional gc might collect some layer + // files causing gc-compaction cannot read the full history of the key. + // This needs to be resolved in the long-term by improving the compaction + // process. For now, let's simply avoid such errors triggering the + // circuit breaker. 
+ Ok(CompactionOutcome::Skipped) + } } - res } async fn iteration_inner( @@ -494,27 +516,32 @@ impl GcCompactionQueue { info!( "running scheduled enhanced gc bottom-most compaction with sub-compaction, splitting compaction jobs" ); - self.handle_sub_compaction(id, options, timeline, gc_block, auto) + self.handle_sub_compaction(id, options, timeline, auto) .await?; } else { // Auto compaction always enables sub-compaction so we don't need to handle update_l2_lsn // in this branch. - let gc_guard = match gc_block.start().await { + let _gc_guard = match gc_block.start().await { Ok(guard) => guard, Err(e) => { + self.notify_and_unblock(id); + self.clear_running_job(); return Err(CompactionError::Other(anyhow!( "cannot run gc-compaction because gc is blocked: {}", e ))); } }; - { - let mut guard = self.inner.lock().unwrap(); - guard.guards.entry(id).or_default().gc_guard = Some(gc_guard); - } - let compaction_result = - timeline.compact_with_options(cancel, options, ctx).await?; - self.notify_and_unblock(id); + let res = timeline.compact_with_options(cancel, options, ctx).await; + let compaction_result = match res { + Ok(res) => res, + Err(err) => { + warn!(%err, "failed to run gc-compaction"); + self.notify_and_unblock(id); + self.clear_running_job(); + return Err(err); + } + }; if compaction_result == CompactionOutcome::YieldForL0 { yield_for_l0 = true; } @@ -522,7 +549,25 @@ impl GcCompactionQueue { } GcCompactionQueueItem::SubCompactionJob(options) => { // TODO: error handling, clear the queue if any task fails? 
- let compaction_result = timeline.compact_with_options(cancel, options, ctx).await?; + let _gc_guard = match gc_block.start().await { + Ok(guard) => guard, + Err(e) => { + self.clear_running_job(); + return Err(CompactionError::Other(anyhow!( + "cannot run gc-compaction because gc is blocked: {}", + e + ))); + } + }; + let res = timeline.compact_with_options(cancel, options, ctx).await; + let compaction_result = match res { + Ok(res) => res, + Err(err) => { + warn!(%err, "failed to run gc-compaction subcompaction job"); + self.clear_running_job(); + return Err(err); + } + }; if compaction_result == CompactionOutcome::YieldForL0 { // We will permenantly give up a task if we yield for L0 compaction: the preempted subcompaction job won't be running // again. This ensures that we don't keep doing duplicated work within gc-compaction. Not directly returning here because @@ -553,10 +598,7 @@ impl GcCompactionQueue { } } } - { - let mut guard = self.inner.lock().unwrap(); - guard.running = None; - } + self.clear_running_job(); Ok(if yield_for_l0 { tracing::info!("give up gc-compaction: yield for L0 compaction"); CompactionOutcome::YieldForL0 @@ -707,8 +749,8 @@ impl KeyHistoryRetention { async fn pipe_to( self, key: Key, - delta_writer: &mut SplitDeltaLayerWriter, - mut image_writer: Option<&mut SplitImageLayerWriter>, + delta_writer: &mut SplitDeltaLayerWriter<'_>, + mut image_writer: Option<&mut SplitImageLayerWriter<'_>>, stat: &mut CompactionStatistics, ctx: &RequestContext, ) -> anyhow::Result<()> { @@ -748,6 +790,123 @@ impl KeyHistoryRetention { } Ok(()) } + + /// Verify if every key in the retention is readable by replaying the logs. + async fn verify( + &self, + key: Key, + base_img_from_ancestor: &Option<(Key, Lsn, Bytes)>, + full_history: &[(Key, Lsn, Value)], + tline: &Arc, + ) -> anyhow::Result<()> { + // Usually the min_lsn should be the first record but we do a full iteration to be safe. 
+ let Some(min_lsn) = full_history.iter().map(|(_, lsn, _)| *lsn).min() else { + // This should never happen b/c if we don't have any history of a key, we won't even do `generate_key_retention`. + return Ok(()); + }; + let Some(max_lsn) = full_history.iter().map(|(_, lsn, _)| *lsn).max() else { + // This should never happen b/c if we don't have any history of a key, we won't even do `generate_key_retention`. + return Ok(()); + }; + let mut base_img = base_img_from_ancestor + .as_ref() + .map(|(_, lsn, img)| (*lsn, img)); + let mut history = Vec::new(); + + async fn collect_and_verify( + key: Key, + lsn: Lsn, + base_img: &Option<(Lsn, &Bytes)>, + history: &[(Lsn, &NeonWalRecord)], + tline: &Arc, + skip_empty: bool, + ) -> anyhow::Result<()> { + if base_img.is_none() && history.is_empty() { + if skip_empty { + return Ok(()); + } + anyhow::bail!("verification failed: key {} has no history at {}", key, lsn); + }; + + let mut records = history + .iter() + .map(|(lsn, val)| (*lsn, (*val).clone())) + .collect::>(); + + // WAL redo requires records in the reverse LSN order + records.reverse(); + let data = ValueReconstructState { + img: base_img.as_ref().map(|(lsn, img)| (*lsn, (*img).clone())), + records, + }; + + tline + .reconstruct_value(key, lsn, data, RedoAttemptType::GcCompaction) + .await + .with_context(|| format!("verification failed for key {} at lsn {}", key, lsn))?; + + Ok(()) + } + + for (retain_lsn, KeyLogAtLsn(logs)) in &self.below_horizon { + for (lsn, val) in logs { + match val { + Value::Image(img) => { + base_img = Some((*lsn, img)); + history.clear(); + } + Value::WalRecord(rec) if val.will_init() => { + base_img = None; + history.clear(); + history.push((*lsn, rec)); + } + Value::WalRecord(rec) => { + history.push((*lsn, rec)); + } + } + } + if *retain_lsn >= min_lsn { + // Only verify after the key appears in the full history for the first time. 
+ + // We don't modify history: in theory, we could replace the history with a single + // image as in `generate_key_retention` to make redos at later LSNs faster. But we + // want to verify everything as if they are read from the real layer map. + collect_and_verify(key, *retain_lsn, &base_img, &history, tline, false) + .await + .context("below horizon retain_lsn")?; + } + } + + for (lsn, val) in &self.above_horizon.0 { + match val { + Value::Image(img) => { + // Above the GC horizon, we verify every time we see an image. + collect_and_verify(key, *lsn, &base_img, &history, tline, true) + .await + .context("above horizon full image")?; + base_img = Some((*lsn, img)); + history.clear(); + } + Value::WalRecord(rec) if val.will_init() => { + // Above the GC horizon, we verify every time we see an init record. + collect_and_verify(key, *lsn, &base_img, &history, tline, true) + .await + .context("above horizon init record")?; + base_img = None; + history.clear(); + history.push((*lsn, rec)); + } + Value::WalRecord(rec) => { + history.push((*lsn, rec)); + } + } + } + // Ensure the latest record is readable. + collect_and_verify(key, max_lsn, &base_img, &history, tline, false) + .await + .context("latest record")?; + Ok(()) + } } #[derive(Debug, Serialize, Default)] @@ -784,15 +943,16 @@ pub struct CompactionStatistics { time_acquire_lock_secs: f64, time_analyze_secs: f64, time_download_layer_secs: f64, + time_to_first_kv_pair_secs: f64, time_main_loop_secs: f64, time_final_phase_secs: f64, time_total_secs: f64, // Summary - /// Ratio of the key-value size before/after gc-compaction. - uncompressed_size_ratio: f64, - /// Ratio of the physical size before/after gc-compaction. - physical_size_ratio: f64, + /// Ratio of the key-value size after/before gc-compaction. + uncompressed_retention_ratio: f64, + /// Ratio of the physical size after/before gc-compaction. 
+ compressed_retention_ratio: f64, } impl CompactionStatistics { @@ -861,15 +1021,15 @@ impl CompactionStatistics { fn finalize(&mut self) { let original_key_value_size = self.image_keys_visited.size + self.wal_keys_visited.size; let produced_key_value_size = self.image_produced.size + self.wal_produced.size; - self.uncompressed_size_ratio = - original_key_value_size as f64 / (produced_key_value_size as f64 + 1.0); // avoid div by 0 + self.uncompressed_retention_ratio = + produced_key_value_size as f64 / (original_key_value_size as f64 + 1.0); // avoid div by 0 let original_physical_size = self.image_layer_visited.size + self.delta_layer_visited.size; let produced_physical_size = self.image_layer_produced.size + self.delta_layer_produced.size + self.image_layer_discarded.size + self.delta_layer_discarded.size; // Also include the discarded layers to make the ratio accurate - self.physical_size_ratio = - original_physical_size as f64 / (produced_physical_size as f64 + 1.0); // avoid div by 0 + self.compressed_retention_ratio = + produced_physical_size as f64 / (original_physical_size as f64 + 1.0); // avoid div by 0 } } @@ -983,7 +1143,7 @@ impl Timeline { // Yield if we have pending L0 compaction. The scheduler will do another pass. 
if (l0_outcome == CompactionOutcome::Pending || l0_outcome == CompactionOutcome::YieldForL0) - && !options.flags.contains(CompactFlags::NoYield) + && options.flags.contains(CompactFlags::YieldForL0) { info!("image/ancestor compaction yielding for L0 compaction"); return Ok(CompactionOutcome::YieldForL0); @@ -1001,9 +1161,9 @@ impl Timeline { { Ok(((dense_partitioning, sparse_partitioning), lsn)) => { // Disables access_stats updates, so that the files we read remain candidates for eviction after we're done with them - let image_ctx = RequestContextBuilder::extend(ctx) + let image_ctx = RequestContextBuilder::from(ctx) .access_stats_behavior(AccessStatsBehavior::Skip) - .build(); + .attached_child(); let mut partitioning = dense_partitioning; partitioning @@ -1028,7 +1188,7 @@ impl Timeline { .load() .as_ref() .clone(), - !options.flags.contains(CompactFlags::NoYield), + options.flags.contains(CompactFlags::YieldForL0), ) .await .inspect_err(|err| { @@ -1078,7 +1238,17 @@ impl Timeline { // being potentially much longer. let rewrite_max = partition_count; - self.compact_shard_ancestors(rewrite_max, ctx).await?; + let outcome = self + .compact_shard_ancestors( + rewrite_max, + options.flags.contains(CompactFlags::YieldForL0), + ctx, + ) + .await?; + match outcome { + CompactionOutcome::Pending | CompactionOutcome::YieldForL0 => return Ok(outcome), + CompactionOutcome::Done | CompactionOutcome::Skipped => {} + } } Ok(CompactionOutcome::Done) @@ -1095,8 +1265,10 @@ impl Timeline { async fn compact_shard_ancestors( self: &Arc, rewrite_max: usize, + yield_for_l0: bool, ctx: &RequestContext, - ) -> Result<(), CompactionError> { + ) -> Result { + let mut outcome = CompactionOutcome::Done; let mut drop_layers = Vec::new(); let mut layers_to_rewrite: Vec = Vec::new(); @@ -1107,15 +1279,13 @@ impl Timeline { // Holding this read guard also blocks [`Self::gc_timeline`] from entering while we // are rewriting layers. 
let latest_gc_cutoff = self.get_applied_gc_cutoff_lsn(); - - tracing::info!( - "starting shard ancestor compaction, latest_gc_cutoff: {}, pitr cutoff {}", - *latest_gc_cutoff, - self.gc_info.read().unwrap().cutoffs.time - ); + let pitr_cutoff = self.gc_info.read().unwrap().cutoffs.time; let layers = self.layers.read().await; - for layer_desc in layers.layer_map()?.iter_historic_layers() { + let layers_iter = layers.layer_map()?.iter_historic_layers(); + let (layers_total, mut layers_checked) = (layers_iter.len(), 0); + for layer_desc in layers_iter { + layers_checked += 1; let layer = layers.get_from_desc(&layer_desc); if layer.metadata().shard.shard_count == self.shard_identity.count { // This layer does not belong to a historic ancestor, no need to re-image it. @@ -1130,8 +1300,8 @@ impl Timeline { // This ancestral layer only covers keys that belong to other shards. // We include the full metadata in the log: if we had some critical bug that caused // us to incorrectly drop layers, this would simplify manually debugging + reinstating those layers. - info!(%layer, old_metadata=?layer.metadata(), - "dropping layer after shard split, contains no keys for this shard.", + debug!(%layer, old_metadata=?layer.metadata(), + "dropping layer after shard split, contains no keys for this shard", ); if cfg!(debug_assertions) { @@ -1193,29 +1363,52 @@ impl Timeline { } if layers_to_rewrite.len() >= rewrite_max { - tracing::info!(%layer, "Will rewrite layer on a future compaction, already rewrote {}", + debug!(%layer, "Will rewrite layer on a future compaction, already rewrote {}", layers_to_rewrite.len() ); - continue; + outcome = CompactionOutcome::Pending; + break; } // Fall through: all our conditions for doing a rewrite passed. layers_to_rewrite.push(layer); } - // Drop read lock on layer map before we start doing time-consuming I/O + // Drop read lock on layer map before we start doing time-consuming I/O. drop(layers); + // Drop out early if there's nothing to do. 
+ if layers_to_rewrite.is_empty() && drop_layers.is_empty() { + return Ok(CompactionOutcome::Done); + } + + info!( + "starting shard ancestor compaction, rewriting {} layers and dropping {} layers, \ + checked {layers_checked}/{layers_total} layers \ + (latest_gc_cutoff={} pitr_cutoff={})", + layers_to_rewrite.len(), + drop_layers.len(), + *latest_gc_cutoff, + pitr_cutoff, + ); + let started = Instant::now(); + let mut replace_image_layers = Vec::new(); for layer in layers_to_rewrite { - tracing::info!(layer=%layer, "Rewriting layer after shard split..."); + if self.cancel.is_cancelled() { + return Err(CompactionError::ShuttingDown); + } + + info!(layer=%layer, "rewriting layer after shard split"); let mut image_layer_writer = ImageLayerWriter::new( self.conf, self.timeline_id, self.tenant_shard_id, &layer.layer_desc().key_range, layer.layer_desc().image_layer_lsn(), + &self.gate, + self.cancel.clone(), ctx, ) .await @@ -1247,7 +1440,7 @@ impl Timeline { .map_err(CompactionError::Other)?; let new_layer = Layer::finish_creating(self.conf, self, desc, &path) .map_err(CompactionError::Other)?; - tracing::info!(layer=%new_layer, "Rewrote layer, {} -> {} bytes", + info!(layer=%new_layer, "rewrote layer, {} -> {} bytes", layer.metadata().file_size, new_layer.metadata().file_size); @@ -1257,6 +1450,26 @@ impl Timeline { // the layer has no data for us with the ShardedRange check above, but drop_layers.push(layer); } + + // Yield for L0 compaction if necessary, but make sure we update the layer map below + // with the work we've already done. 
+ if yield_for_l0 + && self + .l0_compaction_trigger + .notified() + .now_or_never() + .is_some() + { + info!("shard ancestor compaction yielding for L0 compaction"); + outcome = CompactionOutcome::YieldForL0; + break; + } + } + + for layer in &drop_layers { + info!(%layer, old_metadata=?layer.metadata(), + "dropping layer after shard split (no keys for this shard)", + ); } // At this point, we have replaced local layer files with their rewritten form, but not yet uploaded @@ -1274,17 +1487,36 @@ impl Timeline { // necessary for correctness, but it simplifies testing, and avoids proceeding with another // Timeline's compaction while this timeline's uploads may be generating lots of disk I/O // load. - match self.remote_client.wait_completion().await { - Ok(()) => (), - Err(WaitCompletionError::NotInitialized(ni)) => return Err(CompactionError::from(ni)), - Err(WaitCompletionError::UploadQueueShutDownOrStopped) => { - return Err(CompactionError::ShuttingDown); + if outcome != CompactionOutcome::YieldForL0 { + info!("shard ancestor compaction waiting for uploads"); + tokio::select! { + result = self.remote_client.wait_completion() => match result { + Ok(()) => {}, + Err(WaitCompletionError::NotInitialized(ni)) => return Err(CompactionError::from(ni)), + Err(WaitCompletionError::UploadQueueShutDownOrStopped) => { + return Err(CompactionError::ShuttingDown); + } + }, + // Don't wait if there's L0 compaction to do. We don't need to update the outcome + // here, because we've already done the actual work. 
+ _ = self.l0_compaction_trigger.notified(), if yield_for_l0 => {}, } } + info!( + "shard ancestor compaction done in {:.3}s{}", + started.elapsed().as_secs_f64(), + match outcome { + CompactionOutcome::Pending => + format!(", with pending work (rewrite_max={rewrite_max})"), + CompactionOutcome::YieldForL0 => String::from(", yielding for L0 compaction"), + CompactionOutcome::Skipped | CompactionOutcome::Done => String::new(), + } + ); + fail::fail_point!("compact-shard-ancestors-persistent"); - Ok(()) + Ok(outcome) } /// Update the LayerVisibilityHint of layers covered by image layers, based on whether there is @@ -1816,6 +2048,8 @@ impl Timeline { debug!("Create new layer {}..{}", lsn_range.start, lsn_range.end); lsn_range.clone() }, + &self.gate, + self.cancel.clone(), ctx, ) .await @@ -2103,6 +2337,7 @@ impl Timeline { /// ``` /// /// Note that `accumulated_values` must be sorted by LSN and should belong to a single key. + #[allow(clippy::too_many_arguments)] pub(crate) async fn generate_key_retention( self: &Arc, key: Key, @@ -2111,6 +2346,7 @@ impl Timeline { retain_lsn_below_horizon: &[Lsn], delta_threshold_cnt: usize, base_img_from_ancestor: Option<(Key, Lsn, Bytes)>, + verification: bool, ) -> anyhow::Result { // Pre-checks for the invariants @@ -2197,8 +2433,8 @@ impl Timeline { "should have at least below + above horizon batches" ); let mut replay_history: Vec<(Key, Lsn, Value)> = Vec::new(); - if let Some((key, lsn, img)) = base_img_from_ancestor { - replay_history.push((key, lsn, Value::Image(img))); + if let Some((key, lsn, ref img)) = base_img_from_ancestor { + replay_history.push((key, lsn, Value::Image(img.clone()))); } /// Generate debug information for the replay history @@ -2312,22 +2548,15 @@ impl Timeline { // Whether to reconstruct the image. In debug mode, we will generate an image // at every retain_lsn to ensure data is not corrupted, but we won't put the // image into the final layer. 
- let generate_image = produce_image || debug_mode; - if produce_image { + let img_and_lsn = if produce_image { records_since_last_image = 0; - } - let img_and_lsn = if generate_image { let replay_history_for_debug = if debug_mode { Some(replay_history.clone()) } else { None }; let replay_history_for_debug_ref = replay_history_for_debug.as_deref(); - let history = if produce_image { - std::mem::take(&mut replay_history) - } else { - replay_history.clone() - }; + let history = std::mem::take(&mut replay_history); let mut img = None; let mut records = Vec::with_capacity(history.len()); if let (_, lsn, Value::Image(val)) = history.first().as_ref().unwrap() { @@ -2362,6 +2591,7 @@ impl Timeline { records.push((lsn, rec)); } } + // WAL redo requires records in the reverse LSN order records.reverse(); let state = ValueReconstructState { img, records }; // last batch does not generate image so i is always in range, unless we force generate @@ -2371,7 +2601,9 @@ impl Timeline { } else { lsn_split_points[i] }; - let img = self.reconstruct_value(key, request_lsn, state).await?; + let img = self + .reconstruct_value(key, request_lsn, state, RedoAttemptType::GcCompaction) + .await?; Some((request_lsn, img)) } else { None @@ -2392,10 +2624,16 @@ impl Timeline { assert_eq!(retention.len(), lsn_split_points.len() + 1); for (idx, logs) in retention.into_iter().enumerate() { if idx == lsn_split_points.len() { - return Ok(KeyHistoryRetention { + let retention = KeyHistoryRetention { below_horizon: result, above_horizon: KeyLogAtLsn(logs), - }); + }; + if verification { + retention + .verify(key, &base_img_from_ancestor, full_history, self) + .await?; + } + return Ok(retention); } else { result.push((lsn_split_points[idx], KeyLogAtLsn(logs))); } @@ -2635,7 +2873,7 @@ impl Timeline { ) -> Result { let sub_compaction = options.sub_compaction; let job = GcCompactJob::from_compact_options(options.clone()); - let no_yield = options.flags.contains(CompactFlags::NoYield); + let yield_for_l0 
= options.flags.contains(CompactFlags::YieldForL0); if sub_compaction { info!( "running enhanced gc bottom-most compaction with sub-compaction, splitting compaction jobs" @@ -2650,7 +2888,7 @@ impl Timeline { idx + 1, jobs_len ); - self.compact_with_gc_inner(cancel, job, ctx, no_yield) + self.compact_with_gc_inner(cancel, job, ctx, yield_for_l0) .await?; } if jobs_len == 0 { @@ -2658,7 +2896,8 @@ impl Timeline { } return Ok(CompactionOutcome::Done); } - self.compact_with_gc_inner(cancel, job, ctx, no_yield).await + self.compact_with_gc_inner(cancel, job, ctx, yield_for_l0) + .await } async fn compact_with_gc_inner( @@ -2666,7 +2905,7 @@ impl Timeline { cancel: &CancellationToken, job: GcCompactJob, ctx: &RequestContext, - no_yield: bool, + yield_for_l0: bool, ) -> Result { // Block other compaction/GC tasks from running for now. GC-compaction could run along // with legacy compaction tasks in the future. Always ensure the lock order is compaction -> gc. @@ -2861,6 +3100,9 @@ impl Timeline { } (false, res) }; + + let verification = self.get_gc_compaction_settings().gc_compaction_verification; + info!( "picked {} layers for compaction ({} layers need rewriting) with max_layer_lsn={} min_layer_lsn={} gc_cutoff={} lowest_retain_lsn={}, key_range={}..{}, has_data_below={}", job_desc.selected_layers.len(), @@ -2936,18 +3178,15 @@ impl Timeline { if cancel.is_cancelled() { return Err(CompactionError::ShuttingDown); } - if !no_yield { - let should_yield = self + let should_yield = yield_for_l0 + && self .l0_compaction_trigger .notified() .now_or_never() .is_some(); - if should_yield { - tracing::info!( - "preempt gc-compaction when downloading layers: too many L0 layers" - ); - return Ok(CompactionOutcome::YieldForL0); - } + if should_yield { + tracing::info!("preempt gc-compaction when downloading layers: too many L0 layers"); + return Ok(CompactionOutcome::YieldForL0); } let resident_layer = layer .download_and_keep_resident(ctx) @@ -2993,7 +3232,7 @@ impl Timeline { 
.map_err(CompactionError::Other)?; let time_download_layer = timer.elapsed(); - let timer = Instant::now(); + let mut timer = Instant::now(); // Step 2: Produce images+deltas. let mut accumulated_values = Vec::new(); @@ -3010,6 +3249,8 @@ impl Timeline { job_desc.compaction_key_range.start, lowest_retain_lsn, self.get_compaction_target_size(), + &self.gate, + self.cancel.clone(), ctx, ) .await @@ -3026,6 +3267,8 @@ impl Timeline { self.tenant_shard_id, lowest_retain_lsn..end_lsn, self.get_compaction_target_size(), + &self.gate, + self.cancel.clone(), ) .await .context("failed to create delta layer writer") @@ -3068,8 +3311,7 @@ impl Timeline { // Actually, we can decide not to write to the image layer at all at this point because // the key and LSN range are determined. However, to keep things simple here, we still // create this writer, and discard the writer in the end. - - let mut keys_processed = 0; + let mut time_to_first_kv_pair = None; while let Some(((key, lsn, val), desc)) = merge_iter .next_with_trace() @@ -3077,25 +3319,24 @@ impl Timeline { .context("failed to get next key-value pair") .map_err(CompactionError::Other)? 
{ + if time_to_first_kv_pair.is_none() { + time_to_first_kv_pair = Some(timer.elapsed()); + timer = Instant::now(); + } + if cancel.is_cancelled() { return Err(CompactionError::ShuttingDown); } - if !no_yield { - keys_processed += 1; - if keys_processed % 1000 == 0 { - let should_yield = self - .l0_compaction_trigger - .notified() - .now_or_never() - .is_some(); - if should_yield { - tracing::info!( - "preempt gc-compaction in the main loop: too many L0 layers" - ); - return Ok(CompactionOutcome::YieldForL0); - } - } + let should_yield = yield_for_l0 + && self + .l0_compaction_trigger + .notified() + .now_or_never() + .is_some(); + if should_yield { + tracing::info!("preempt gc-compaction in the main loop: too many L0 layers"); + return Ok(CompactionOutcome::YieldForL0); } if self.shard_identity.is_key_disposable(&key) { // If this shard does not need to store this key, simply skip it. @@ -3124,6 +3365,8 @@ impl Timeline { self.tenant_shard_id, desc.key_range.start, desc.lsn_range.clone(), + &self.gate, + self.cancel.clone(), ctx, ) .await @@ -3141,6 +3384,8 @@ impl Timeline { self.tenant_shard_id, job_desc.compaction_key_range.end, desc.lsn_range.clone(), + &self.gate, + self.cancel.clone(), ctx, ) .await @@ -3182,6 +3427,7 @@ impl Timeline { .await .context("failed to get ancestor image") .map_err(CompactionError::Other)?, + verification, ) .await .context("failed to generate key retention") @@ -3222,6 +3468,7 @@ impl Timeline { .await .context("failed to get ancestor image") .map_err(CompactionError::Other)?, + verification, ) .await .context("failed to generate key retention") @@ -3418,6 +3665,9 @@ impl Timeline { let time_final_phase = timer.elapsed(); stat.time_final_phase_secs = time_final_phase.as_secs_f64(); + stat.time_to_first_kv_pair_secs = time_to_first_kv_pair + .unwrap_or(Duration::ZERO) + .as_secs_f64(); stat.time_main_loop_secs = time_main_loop.as_secs_f64(); stat.time_acquire_lock_secs = time_acquire_lock.as_secs_f64(); 
stat.time_download_layer_secs = time_download_layer.as_secs_f64(); @@ -3707,6 +3957,8 @@ impl CompactionJobExecutor for TimelineAdaptor { self.timeline.tenant_shard_id, key_range.start, lsn_range.clone(), + &self.timeline.gate, + self.timeline.cancel.clone(), ctx, ) .await?; @@ -3782,6 +4034,8 @@ impl TimelineAdaptor { self.timeline.tenant_shard_id, key_range, lsn, + &self.timeline.gate, + self.timeline.cancel.clone(), ctx, ) .await?; @@ -3878,8 +4132,6 @@ impl CompactionLayer for OwnArc { } } -use crate::tenant::timeline::DeltaEntry; - impl CompactionLayer for ResidentDeltaLayer { fn key_range(&self) -> &Range { &self.0.layer_desc().key_range diff --git a/pageserver/src/tenant/timeline/delete.rs b/pageserver/src/tenant/timeline/delete.rs index 740f590735..64fcf1fe0d 100644 --- a/pageserver/src/tenant/timeline/delete.rs +++ b/pageserver/src/tenant/timeline/delete.rs @@ -410,10 +410,13 @@ impl DeleteTimelineFlow { // So indeed, the tenant manifest might refer to an offloaded timeline which has already been deleted. // However, we handle this case in tenant loading code so the next time we attach, the issue is // resolved. 
- tenant.store_tenant_manifest().await.map_err(|e| match e { - TenantManifestError::Cancelled => DeleteTimelineError::Cancelled, - _ => DeleteTimelineError::Other(e.into()), - })?; + tenant + .maybe_upload_tenant_manifest() + .await + .map_err(|err| match err { + TenantManifestError::Cancelled => DeleteTimelineError::Cancelled, + err => DeleteTimelineError::Other(err.into()), + })?; *guard = Self::Finished; diff --git a/pageserver/src/tenant/timeline/detach_ancestor.rs b/pageserver/src/tenant/timeline/detach_ancestor.rs index ac9d9a4579..a841cc55f0 100644 --- a/pageserver/src/tenant/timeline/detach_ancestor.rs +++ b/pageserver/src/tenant/timeline/detach_ancestor.rs @@ -2,10 +2,14 @@ use std::collections::HashSet; use std::sync::Arc; use anyhow::Context; +use bytes::Bytes; use http_utils::error::ApiError; +use pageserver_api::key::Key; +use pageserver_api::keyspace::KeySpace; use pageserver_api::models::DetachBehavior; use pageserver_api::models::detach_ancestor::AncestorDetached; use pageserver_api::shard::ShardIdentity; +use pageserver_compaction::helpers::overlaps_with; use tokio::sync::Semaphore; use tokio_util::sync::CancellationToken; use tracing::Instrument; @@ -22,7 +26,11 @@ use crate::task_mgr::TaskKind; use crate::tenant::Tenant; use crate::tenant::remote_timeline_client::index::GcBlockingReason::DetachAncestor; use crate::tenant::storage_layer::layer::local_layer_path; -use crate::tenant::storage_layer::{AsLayerDesc as _, DeltaLayerWriter, Layer, ResidentLayer}; +use crate::tenant::storage_layer::{ + AsLayerDesc as _, DeltaLayerWriter, ImageLayerWriter, IoConcurrency, Layer, ResidentLayer, + ValuesReconstructState, +}; +use crate::tenant::timeline::VersionedKeySpaceQuery; use crate::virtual_file::{MaybeFatalIo, VirtualFile}; #[derive(Debug, thiserror::Error)] @@ -170,6 +178,90 @@ impl Attempt { } } +async fn generate_tombstone_image_layer( + detached: &Arc, + ancestor: &Arc, + ancestor_lsn: Lsn, + ctx: &RequestContext, +) -> Result, Error> { + 
tracing::info!( + "removing non-inherited keys by writing an image layer with tombstones at the detach LSN" + ); + let io_concurrency = IoConcurrency::spawn_from_conf( + detached.conf, + detached.gate.enter().map_err(|_| Error::ShuttingDown)?, + ); + let mut reconstruct_state = ValuesReconstructState::new(io_concurrency); + // Directly use `get_vectored_impl` to skip the max_vectored_read_key limit check. Note that the keyspace should + // not contain too many keys, otherwise this takes a lot of memory. Currently we limit it to 10k keys in the compute. + let key_range = Key::sparse_non_inherited_keyspace(); + // avoid generating a "future layer" which will then be removed + let image_lsn = ancestor_lsn; + + { + let layers = detached.layers.read().await; + for layer in layers.all_persistent_layers() { + if !layer.is_delta + && layer.lsn_range.start == image_lsn + && overlaps_with(&key_range, &layer.key_range) + { + tracing::warn!( + layer=%layer, "image layer at the detach LSN already exists, skipping removing aux files" + ); + return Ok(None); + } + } + } + + let query = VersionedKeySpaceQuery::uniform(KeySpace::single(key_range.clone()), image_lsn); + let data = ancestor + .get_vectored_impl(query, &mut reconstruct_state, ctx) + .await + .context("failed to retrieve aux keys") + .map_err(|e| Error::launder(e, Error::Prepare))?; + if !data.is_empty() { + // TODO: is it possible that we can have an image at `image_lsn`? Unlikely because image layers are only generated + // upon compaction but theoretically possible. 
+ let mut image_layer_writer = ImageLayerWriter::new( + detached.conf, + detached.timeline_id, + detached.tenant_shard_id, + &key_range, + image_lsn, + &detached.gate, + detached.cancel.clone(), + ctx, + ) + .await + .context("failed to create image layer writer") + .map_err(Error::Prepare)?; + for key in data.keys() { + image_layer_writer + .put_image(*key, Bytes::new(), ctx) + .await + .context("failed to write key") + .map_err(|e| Error::launder(e, Error::Prepare))?; + } + let (desc, path) = image_layer_writer + .finish(ctx) + .await + .context("failed to finish image layer writer for removing the metadata keys") + .map_err(|e| Error::launder(e, Error::Prepare))?; + let generated = Layer::finish_creating(detached.conf, detached, desc, &path) + .map_err(|e| Error::launder(e, Error::Prepare))?; + detached + .remote_client + .upload_layer_file(&generated, &detached.cancel) + .await + .map_err(|e| Error::launder(e, Error::Prepare))?; + tracing::info!(layer=%generated, "wrote image layer"); + Ok(Some(generated)) + } else { + tracing::info!("no aux keys found in ancestor"); + Ok(None) + } +} + /// See [`Timeline::prepare_to_detach_from_ancestor`] pub(super) async fn prepare( detached: &Arc, @@ -235,7 +327,7 @@ pub(super) async fn prepare( return Err(NoAncestor); } - check_no_archived_children_of_ancestor(tenant, detached, &ancestor, ancestor_lsn)?; + check_no_archived_children_of_ancestor(tenant, detached, &ancestor, ancestor_lsn, behavior)?; if let DetachBehavior::MultiLevelAndNoReparent = behavior { // If the ancestor has an ancestor, we might be able to fast-path detach it if the current ancestor does not have any data written/used by the detaching timeline. @@ -249,7 +341,13 @@ pub(super) async fn prepare( ancestor_lsn = ancestor.ancestor_lsn; // Get the LSN first before resetting the `ancestor` variable ancestor = ancestor_of_ancestor; // TODO: do we still need to check if we don't want to reparent? 
- check_no_archived_children_of_ancestor(tenant, detached, &ancestor, ancestor_lsn)?; + check_no_archived_children_of_ancestor( + tenant, + detached, + &ancestor, + ancestor_lsn, + behavior, + )?; } } else if ancestor.ancestor_timeline.is_some() { // non-technical requirement; we could flatten N ancestors just as easily but we chose @@ -346,10 +444,16 @@ pub(super) async fn prepare( // TODO: copying and lsn prefix copying could be done at the same time with a single fsync after let mut new_layers: Vec = - Vec::with_capacity(straddling_branchpoint.len() + rest_of_historic.len()); + Vec::with_capacity(straddling_branchpoint.len() + rest_of_historic.len() + 1); + + if let Some(tombstone_layer) = + generate_tombstone_image_layer(detached, &ancestor, ancestor_lsn, ctx).await? + { + new_layers.push(tombstone_layer.into()); + } { - tracing::debug!(to_rewrite = %straddling_branchpoint.len(), "copying prefix of delta layers"); + tracing::info!(to_rewrite = %straddling_branchpoint.len(), "copying prefix of delta layers"); let mut tasks = tokio::task::JoinSet::new(); @@ -674,6 +778,8 @@ async fn copy_lsn_prefix( target_timeline.tenant_shard_id, layer.layer_desc().key_range.start, layer.layer_desc().lsn_range.start..end_lsn, + &target_timeline.gate, + target_timeline.cancel.clone(), ctx, ) .await @@ -1156,31 +1262,44 @@ fn check_no_archived_children_of_ancestor( detached: &Arc, ancestor: &Arc, ancestor_lsn: Lsn, + detach_behavior: DetachBehavior, ) -> Result<(), Error> { - let timelines = tenant.timelines.lock().unwrap(); - let timelines_offloaded = tenant.timelines_offloaded.lock().unwrap(); - for timeline in reparentable_timelines(timelines.values(), detached, ancestor, ancestor_lsn) { - if timeline.is_archived() == Some(true) { - return Err(Error::Archived(timeline.timeline_id)); - } - } - for timeline_offloaded in timelines_offloaded.values() { - if timeline_offloaded.ancestor_timeline_id != Some(ancestor.timeline_id) { - continue; - } - // This forbids the detach ancestor 
feature if flattened timelines are present, - // even if the ancestor_lsn is from after the branchpoint of the detached timeline. - // But as per current design, we don't record the ancestor_lsn of flattened timelines. - // This is a bit unfortunate, but as of writing this we don't support flattening - // anyway. Maybe we can evolve the data model in the future. - if let Some(retain_lsn) = timeline_offloaded.ancestor_retain_lsn { - let is_earlier = retain_lsn <= ancestor_lsn; - if !is_earlier { - continue; + match detach_behavior { + DetachBehavior::NoAncestorAndReparent => { + let timelines = tenant.timelines.lock().unwrap(); + let timelines_offloaded = tenant.timelines_offloaded.lock().unwrap(); + + for timeline in + reparentable_timelines(timelines.values(), detached, ancestor, ancestor_lsn) + { + if timeline.is_archived() == Some(true) { + return Err(Error::Archived(timeline.timeline_id)); + } + } + + for timeline_offloaded in timelines_offloaded.values() { + if timeline_offloaded.ancestor_timeline_id != Some(ancestor.timeline_id) { + continue; + } + // This forbids the detach ancestor feature if flattened timelines are present, + // even if the ancestor_lsn is from after the branchpoint of the detached timeline. + // But as per current design, we don't record the ancestor_lsn of flattened timelines. + // This is a bit unfortunate, but as of writing this we don't support flattening + // anyway. Maybe we can evolve the data model in the future. + if let Some(retain_lsn) = timeline_offloaded.ancestor_retain_lsn { + let is_earlier = retain_lsn <= ancestor_lsn; + if !is_earlier { + continue; + } + } + return Err(Error::Archived(timeline_offloaded.timeline_id)); } } - return Err(Error::Archived(timeline_offloaded.timeline_id)); + DetachBehavior::MultiLevelAndNoReparent => { + // We don't need to check anything if the user requested to not reparent. 
+ } } + Ok(()) } diff --git a/pageserver/src/tenant/timeline/import_pgdata/flow.rs b/pageserver/src/tenant/timeline/import_pgdata/flow.rs index 3ef82b3658..c6d2944769 100644 --- a/pageserver/src/tenant/timeline/import_pgdata/flow.rs +++ b/pageserver/src/tenant/timeline/import_pgdata/flow.rs @@ -738,6 +738,8 @@ impl ChunkProcessingJob { self.timeline.tenant_shard_id, &self.range, self.pgdata_lsn, + &self.timeline.gate, + self.timeline.cancel.clone(), ctx, ) .await?; diff --git a/pageserver/src/tenant/timeline/import_pgdata/upcall_api.rs b/pageserver/src/tenant/timeline/import_pgdata/upcall_api.rs index 7c7a4de2fc..352bbbc4d4 100644 --- a/pageserver/src/tenant/timeline/import_pgdata/upcall_api.rs +++ b/pageserver/src/tenant/timeline/import_pgdata/upcall_api.rs @@ -32,9 +32,15 @@ impl Client { let Some(ref base_url) = conf.import_pgdata_upcall_api else { anyhow::bail!("import_pgdata_upcall_api is not configured") }; + let mut http_client = reqwest::Client::builder(); + for cert in &conf.ssl_ca_certs { + http_client = http_client.add_root_certificate(cert.clone()); + } + let http_client = http_client.build()?; + Ok(Self { base_url: base_url.to_string(), - client: reqwest::Client::new(), + client: http_client, cancel, authorization_header: conf .import_pgdata_upcall_api_token diff --git a/pageserver/src/tenant/timeline/layer_manager.rs b/pageserver/src/tenant/timeline/layer_manager.rs index ed92ea28ce..ae898260d2 100644 --- a/pageserver/src/tenant/timeline/layer_manager.rs +++ b/pageserver/src/tenant/timeline/layer_manager.rs @@ -3,17 +3,18 @@ use std::sync::Arc; use anyhow::{Context, bail, ensure}; use itertools::Itertools; +use pageserver_api::keyspace::KeySpace; use pageserver_api::shard::TenantShardId; use tokio_util::sync::CancellationToken; use tracing::trace; use utils::id::TimelineId; use utils::lsn::{AtomicLsn, Lsn}; -use super::{ReadableLayer, TimelineWriterState}; +use super::{LayerFringe, ReadableLayer, TimelineWriterState}; use 
crate::config::PageServerConf; use crate::context::RequestContext; use crate::metrics::TimelineMetrics; -use crate::tenant::layer_map::{BatchedUpdates, LayerMap}; +use crate::tenant::layer_map::{BatchedUpdates, LayerMap, SearchResult}; use crate::tenant::storage_layer::{ AsLayerDesc, InMemoryLayer, Layer, LayerVisibilityHint, PersistentLayerDesc, PersistentLayerKey, ReadableLayerWeak, ResidentLayer, @@ -38,7 +39,7 @@ impl Default for LayerManager { } impl LayerManager { - pub(crate) fn upgrade(&self, weak: ReadableLayerWeak) -> ReadableLayer { + fn upgrade(&self, weak: ReadableLayerWeak) -> ReadableLayer { match weak { ReadableLayerWeak::PersistentLayer(desc) => { ReadableLayer::PersistentLayer(self.get_from_desc(&desc)) @@ -147,6 +148,36 @@ impl LayerManager { self.layers().keys().cloned().collect_vec() } + /// Update the [`LayerFringe`] of a read request + /// + /// Take a key space at a given LSN and query the layer map below each range + /// of the key space to find the next layers to visit. + pub(crate) fn update_search_fringe( + &self, + keyspace: &KeySpace, + cont_lsn: Lsn, + fringe: &mut LayerFringe, + ) -> Result<(), Shutdown> { + let map = self.layer_map()?; + + for range in keyspace.ranges.iter() { + let results = map.range_search(range.clone(), cont_lsn); + results + .found + .into_iter() + .map(|(SearchResult { layer, lsn_floor }, keyspace_accum)| { + ( + self.upgrade(layer), + keyspace_accum.to_keyspace(), + lsn_floor..cont_lsn, + ) + }) + .for_each(|(layer, keyspace, lsn_range)| fringe.update(layer, keyspace, lsn_range)); + } + + Ok(()) + } + fn layers(&self) -> &HashMap { use LayerManager::*; match self { diff --git a/pageserver/src/tenant/timeline/offload.rs b/pageserver/src/tenant/timeline/offload.rs index 43ffaa6aab..f46f1676c9 100644 --- a/pageserver/src/tenant/timeline/offload.rs +++ b/pageserver/src/tenant/timeline/offload.rs @@ -111,7 +111,7 @@ pub(crate) async fn offload_timeline( // at the next restart attach it again. 
// For that to happen, we'd need to make the manifest reflect our *intended* state, // not our actual state of offloaded timelines. - tenant.store_tenant_manifest().await?; + tenant.maybe_upload_tenant_manifest().await?; tracing::info!("Timeline offload complete (remaining arc refcount: {remaining_refcount})"); diff --git a/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs b/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs index df2663f6bb..3c3608d1bd 100644 --- a/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs +++ b/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs @@ -580,6 +580,7 @@ impl ConnectionManagerState { ); Ok(()) } + WalReceiverError::Cancelled => Ok(()), WalReceiverError::Other(e) => { // give out an error to have task_mgr give it a really verbose logging if cancellation.is_cancelled() { diff --git a/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs b/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs index f41a9cfe82..52259f205b 100644 --- a/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs +++ b/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs @@ -73,6 +73,7 @@ pub(super) enum WalReceiverError { /// Generic error Other(anyhow::Error), ClosedGate, + Cancelled, } impl From for WalReceiverError { @@ -200,6 +201,9 @@ pub(super) async fn handle_walreceiver_connection( // with a similar error. 
}, WalReceiverError::SuccessfulCompletion(_) => {} + WalReceiverError::Cancelled => { + debug!("Connection cancelled") + } WalReceiverError::ClosedGate => { // doesn't happen at runtime } @@ -273,7 +277,12 @@ pub(super) async fn handle_walreceiver_connection( let mut waldecoder = WalStreamDecoder::new(startpoint, timeline.pg_version); - let mut walingest = WalIngest::new(timeline.as_ref(), startpoint, &ctx).await?; + let mut walingest = WalIngest::new(timeline.as_ref(), startpoint, &ctx) + .await + .map_err(|e| match e.kind { + crate::walingest::WalIngestErrorKind::Cancelled => WalReceiverError::Cancelled, + _ => WalReceiverError::Other(e.into()), + })?; let shard = vec![*timeline.get_shard_identity()]; @@ -445,7 +454,7 @@ pub(super) async fn handle_walreceiver_connection( .inspect_err(|err| { // TODO: we can't differentiate cancellation errors with // anyhow::Error, so just ignore it if we're cancelled. - if !cancellation.is_cancelled() { + if !cancellation.is_cancelled() && !timeline.is_stopping() { critical!("{err:?}") } })?; @@ -577,7 +586,7 @@ pub(super) async fn handle_walreceiver_connection( .inspect_err(|err| { // TODO: we can't differentiate cancellation errors with // anyhow::Error, so just ignore it if we're cancelled. 
- if !cancellation.is_cancelled() { + if !cancellation.is_cancelled() && !timeline.is_stopping() { critical!("{err:?}") } })?; diff --git a/pageserver/src/tenant/upload_queue.rs b/pageserver/src/tenant/upload_queue.rs index d5dc9666ce..be1b55ffa3 100644 --- a/pageserver/src/tenant/upload_queue.rs +++ b/pageserver/src/tenant/upload_queue.rs @@ -302,6 +302,7 @@ pub struct UploadQueueStoppedDeletable { pub(super) deleted_at: SetDeletedFlagProgress, } +#[allow(clippy::large_enum_variant, reason = "TODO")] pub enum UploadQueueStopped { Deletable(UploadQueueStoppedDeletable), Uninitialized, diff --git a/pageserver/src/virtual_file/owned_buffers_io/aligned_buffer/buffer_mut.rs b/pageserver/src/virtual_file/owned_buffers_io/aligned_buffer/buffer_mut.rs index df5c911e50..3ee1a3c162 100644 --- a/pageserver/src/virtual_file/owned_buffers_io/aligned_buffer/buffer_mut.rs +++ b/pageserver/src/virtual_file/owned_buffers_io/aligned_buffer/buffer_mut.rs @@ -25,8 +25,8 @@ impl AlignedBufferMut> { /// * `align` must be a power of two, /// /// * `capacity`, when rounded up to the nearest multiple of `align`, - /// must not overflow isize (i.e., the rounded value must be - /// less than or equal to `isize::MAX`). + /// must not overflow isize (i.e., the rounded value must be + /// less than or equal to `isize::MAX`). 
pub fn with_capacity(capacity: usize) -> Self { AlignedBufferMut { raw: RawAlignedBuffer::with_capacity(capacity), diff --git a/pageserver/src/virtual_file/owned_buffers_io/aligned_buffer/raw.rs b/pageserver/src/virtual_file/owned_buffers_io/aligned_buffer/raw.rs index 97a6c4049a..d273772411 100644 --- a/pageserver/src/virtual_file/owned_buffers_io/aligned_buffer/raw.rs +++ b/pageserver/src/virtual_file/owned_buffers_io/aligned_buffer/raw.rs @@ -37,8 +37,8 @@ impl RawAlignedBuffer> { /// * `align` must be a power of two, /// /// * `capacity`, when rounded up to the nearest multiple of `align`, - /// must not overflow isize (i.e., the rounded value must be - /// less than or equal to `isize::MAX`). + /// must not overflow isize (i.e., the rounded value must be + /// less than or equal to `isize::MAX`). pub fn with_capacity(capacity: usize) -> Self { let align = ConstAlign::; let layout = Layout::from_size_align(capacity, align.align()).expect("Invalid layout"); diff --git a/pageserver/src/walingest.rs b/pageserver/src/walingest.rs index 18df065f76..e60c590f87 100644 --- a/pageserver/src/walingest.rs +++ b/pageserver/src/walingest.rs @@ -21,13 +21,13 @@ //! redo Postgres process, but some records it can handle directly with //! bespoken Rust code. 
+use std::backtrace::Backtrace; use std::collections::HashMap; use std::sync::{Arc, OnceLock}; use std::time::{Duration, Instant, SystemTime}; -use anyhow::{Result, bail}; use bytes::{Buf, Bytes}; -use pageserver_api::key::rel_block_to_key; +use pageserver_api::key::{Key, rel_block_to_key}; use pageserver_api::record::NeonWalRecord; use pageserver_api::reltag::{BlockNumber, RelTag, SlruKind}; use pageserver_api::shard::ShardIdentity; @@ -38,7 +38,7 @@ use postgres_ffi::{ fsm_logical_to_physical, pg_constants, }; use tracing::*; -use utils::bin_ser::SerializeError; +use utils::bin_ser::{DeserializeError, SerializeError}; use utils::lsn::Lsn; use utils::rate_limit::RateLimit; use utils::{critical, failpoint_support}; @@ -104,12 +104,101 @@ struct WarnIngestLag { timestamp_invalid_msg_ratelimit: RateLimit, } +pub struct WalIngestError { + pub backtrace: std::backtrace::Backtrace, + pub kind: WalIngestErrorKind, +} + +#[derive(thiserror::Error, Debug)] +pub enum WalIngestErrorKind { + #[error(transparent)] + #[allow(private_interfaces)] + PageReconstructError(#[from] PageReconstructError), + #[error(transparent)] + DeserializationFailure(#[from] DeserializeError), + #[error(transparent)] + SerializationFailure(#[from] SerializeError), + #[error("the request contains data not supported by pageserver: {0} @ {1}")] + InvalidKey(Key, Lsn), + #[error("twophase file for xid {0} already exists")] + FileAlreadyExists(u64), + #[error("slru segment {0:?}/{1} already exists")] + SlruAlreadyExists(SlruKind, u32), + #[error("relation already exists")] + RelationAlreadyExists(RelTag), + #[error("invalid reldir key {0}")] + InvalidRelDirKey(Key), + + #[error(transparent)] + LogicalError(anyhow::Error), + #[error(transparent)] + EncodeAuxFileError(anyhow::Error), + #[error(transparent)] + MaybeRelSizeV2Error(anyhow::Error), + + #[error("timeline shutting down")] + Cancelled, +} + +impl From for WalIngestError +where + WalIngestErrorKind: From, +{ + fn from(value: T) -> Self { + 
WalIngestError { + backtrace: Backtrace::capture(), + kind: WalIngestErrorKind::from(value), + } + } +} + +impl std::error::Error for WalIngestError { + fn source(&self) -> Option<&(dyn std::error::Error + 'static)> { + self.kind.source() + } +} + +impl core::fmt::Display for WalIngestError { + fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { + self.kind.fmt(f) + } +} + +impl core::fmt::Debug for WalIngestError { + fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { + if f.alternate() { + f.debug_map() + .key(&"backtrace") + .value(&self.backtrace) + .key(&"kind") + .value(&self.kind) + .finish() + } else { + writeln!(f, "Error: {:?}", self.kind)?; + if self.backtrace.status() == std::backtrace::BacktraceStatus::Captured { + writeln!(f, "Stack backtrace: {:?}", self.backtrace)?; + } + Ok(()) + } + } +} + +#[macro_export] +macro_rules! ensure_walingest { + ($($t:tt)*) => { + _ = || -> Result<(), anyhow::Error> { + anyhow::ensure!($($t)*); + Ok(()) + }().map_err(WalIngestErrorKind::LogicalError)?; + }; +} + impl WalIngest { pub async fn new( timeline: &Timeline, startpoint: Lsn, ctx: &RequestContext, - ) -> anyhow::Result { + ) -> Result { // Fetch the latest checkpoint into memory, so that we can compare with it // quickly in `ingest_record` and update it when it changes. 
let checkpoint_bytes = timeline.get_checkpoint(startpoint, ctx).await?; @@ -145,7 +234,7 @@ impl WalIngest { interpreted: InterpretedWalRecord, modification: &mut DatadirModification<'_>, ctx: &RequestContext, - ) -> anyhow::Result { + ) -> Result { WAL_INGEST.records_received.inc(); let prev_len = modification.len(); @@ -288,7 +377,7 @@ impl WalIngest { } /// This is the same as AdjustToFullTransactionId(xid) in PostgreSQL - fn adjust_to_full_transaction_id(&self, xid: TransactionId) -> Result { + fn adjust_to_full_transaction_id(&self, xid: TransactionId) -> Result { let next_full_xid = enum_pgversion_dispatch!(&self.checkpoint, CheckPoint, cp, { cp.nextXid.value }); @@ -298,9 +387,9 @@ impl WalIngest { if xid > next_xid { // Wraparound occurred, must be from a prev epoch. if epoch == 0 { - bail!( + Err(WalIngestErrorKind::LogicalError(anyhow::anyhow!( "apparent XID wraparound with prepared transaction XID {xid}, nextXid is {next_full_xid}" - ); + )))?; } epoch -= 1; } @@ -313,7 +402,7 @@ impl WalIngest { clear_vm_bits: ClearVmBits, modification: &mut DatadirModification<'_>, ctx: &RequestContext, - ) -> anyhow::Result<()> { + ) -> Result<(), WalIngestError> { let ClearVmBits { new_heap_blkno, old_heap_blkno, @@ -402,7 +491,7 @@ impl WalIngest { create: DbaseCreate, modification: &mut DatadirModification<'_>, ctx: &RequestContext, - ) -> anyhow::Result<()> { + ) -> Result<(), WalIngestError> { let DbaseCreate { db_id, tablespace_id, @@ -505,7 +594,7 @@ impl WalIngest { dbase_drop: DbaseDrop, modification: &mut DatadirModification<'_>, ctx: &RequestContext, - ) -> anyhow::Result<()> { + ) -> Result<(), WalIngestError> { let DbaseDrop { db_id, tablespace_ids, @@ -523,7 +612,7 @@ impl WalIngest { create: SmgrCreate, modification: &mut DatadirModification<'_>, ctx: &RequestContext, - ) -> anyhow::Result<()> { + ) -> Result<(), WalIngestError> { let SmgrCreate { rel } = create; self.put_rel_creation(modification, rel, ctx).await?; Ok(()) @@ -537,7 +626,7 @@ impl 
WalIngest { truncate: XlSmgrTruncate, modification: &mut DatadirModification<'_>, ctx: &RequestContext, - ) -> anyhow::Result<()> { + ) -> Result<(), WalIngestError> { let XlSmgrTruncate { blkno, rnode, @@ -689,7 +778,7 @@ impl WalIngest { record: XactRecord, modification: &mut DatadirModification<'_>, ctx: &RequestContext, - ) -> anyhow::Result<()> { + ) -> Result<(), WalIngestError> { let (xact_common, is_commit, is_prepared) = match record { XactRecord::Prepare(XactPrepare { xl_xid, data }) => { let xid: u64 = if modification.tline.pg_version >= 17 { @@ -813,7 +902,7 @@ impl WalIngest { truncate: ClogTruncate, modification: &mut DatadirModification<'_>, ctx: &RequestContext, - ) -> anyhow::Result<()> { + ) -> Result<(), WalIngestError> { let ClogTruncate { pageno, oldest_xid, @@ -889,7 +978,7 @@ impl WalIngest { zero_page: ClogZeroPage, modification: &mut DatadirModification<'_>, ctx: &RequestContext, - ) -> anyhow::Result<()> { + ) -> Result<(), WalIngestError> { let ClogZeroPage { segno, rpageno } = zero_page; self.put_slru_page_image( @@ -907,7 +996,7 @@ impl WalIngest { &mut self, modification: &mut DatadirModification, xlrec: &XlMultiXactCreate, - ) -> Result<()> { + ) -> Result<(), WalIngestError> { // Create WAL record for updating the multixact-offsets page let pageno = xlrec.mid / pg_constants::MULTIXACT_OFFSETS_PER_PAGE as u32; let segno = pageno / pg_constants::SLRU_PAGES_PER_SEGMENT; @@ -1010,7 +1099,7 @@ impl WalIngest { modification: &mut DatadirModification<'_>, xlrec: &XlMultiXactTruncate, ctx: &RequestContext, - ) -> Result<()> { + ) -> Result<(), WalIngestError> { let (maxsegment, startsegment, endsegment) = enum_pgversion_dispatch!(&mut self.checkpoint, CheckPoint, cp, { cp.oldestMulti = xlrec.end_trunc_off; @@ -1058,7 +1147,7 @@ impl WalIngest { zero_page: MultiXactZeroPage, modification: &mut DatadirModification<'_>, ctx: &RequestContext, - ) -> Result<()> { + ) -> Result<(), WalIngestError> { let MultiXactZeroPage { slru_kind, segno, @@ 
-1080,7 +1169,7 @@ impl WalIngest { update: RelmapUpdate, modification: &mut DatadirModification<'_>, ctx: &RequestContext, - ) -> Result<()> { + ) -> Result<(), WalIngestError> { let RelmapUpdate { update, buf } = update; modification @@ -1093,7 +1182,7 @@ impl WalIngest { raw_record: RawXlogRecord, modification: &mut DatadirModification<'_>, ctx: &RequestContext, - ) -> Result<()> { + ) -> Result<(), WalIngestError> { let RawXlogRecord { info, lsn, mut buf } = raw_record; let pg_version = modification.tline.pg_version; @@ -1235,12 +1324,12 @@ impl WalIngest { put: PutLogicalMessage, modification: &mut DatadirModification<'_>, ctx: &RequestContext, - ) -> Result<()> { + ) -> Result<(), WalIngestError> { let PutLogicalMessage { path, buf } = put; modification.put_file(path.as_str(), &buf, ctx).await } - fn ingest_standby_record(&mut self, record: StandbyRecord) -> Result<()> { + fn ingest_standby_record(&mut self, record: StandbyRecord) -> Result<(), WalIngestError> { match record { StandbyRecord::RunningXacts(running_xacts) => { enum_pgversion_dispatch!(&mut self.checkpoint, CheckPoint, cp, { @@ -1258,7 +1347,7 @@ impl WalIngest { &mut self, record: ReploriginRecord, modification: &mut DatadirModification<'_>, - ) -> Result<()> { + ) -> Result<(), WalIngestError> { match record { ReploriginRecord::Set(set) => { modification @@ -1278,7 +1367,7 @@ impl WalIngest { modification: &mut DatadirModification<'_>, rel: RelTag, ctx: &RequestContext, - ) -> Result<()> { + ) -> Result<(), WalIngestError> { modification.put_rel_creation(rel, 0, ctx).await?; Ok(()) } @@ -1291,7 +1380,7 @@ impl WalIngest { blknum: BlockNumber, img: Bytes, ctx: &RequestContext, - ) -> Result<(), PageReconstructError> { + ) -> Result<(), WalIngestError> { self.handle_rel_extend(modification, rel, blknum, ctx) .await?; modification.put_rel_page_image(rel, blknum, img)?; @@ -1305,7 +1394,7 @@ impl WalIngest { blknum: BlockNumber, rec: NeonWalRecord, ctx: &RequestContext, - ) -> Result<()> { + ) -> 
Result<(), WalIngestError> { self.handle_rel_extend(modification, rel, blknum, ctx) .await?; modification.put_rel_wal_record(rel, blknum, rec)?; @@ -1318,7 +1407,7 @@ impl WalIngest { rel: RelTag, nblocks: BlockNumber, ctx: &RequestContext, - ) -> anyhow::Result<()> { + ) -> Result<(), WalIngestError> { modification.put_rel_truncation(rel, nblocks, ctx).await?; Ok(()) } @@ -1329,7 +1418,7 @@ impl WalIngest { rel: RelTag, blknum: BlockNumber, ctx: &RequestContext, - ) -> Result<(), PageReconstructError> { + ) -> Result<(), WalIngestError> { let new_nblocks = blknum + 1; // Check if the relation exists. We implicitly create relations on first // record. @@ -1423,7 +1512,7 @@ impl WalIngest { blknum: BlockNumber, img: Bytes, ctx: &RequestContext, - ) -> Result<()> { + ) -> Result<(), WalIngestError> { if !self.shard.is_shard_zero() { return Ok(()); } @@ -1441,7 +1530,7 @@ impl WalIngest { segno: u32, blknum: BlockNumber, ctx: &RequestContext, - ) -> anyhow::Result<()> { + ) -> Result<(), WalIngestError> { // we don't use a cache for this like we do for relations. SLRUS are explcitly // extended with ZEROPAGE records, not with commit records, so it happens // a lot less frequently. @@ -1509,6 +1598,7 @@ async fn get_relsize( #[allow(clippy::bool_assert_comparison)] #[cfg(test)] mod tests { + use anyhow::Result; use postgres_ffi::RELSEG_SIZE; use super::*; @@ -1530,7 +1620,7 @@ mod tests { } #[tokio::test] - async fn test_zeroed_checkpoint_decodes_correctly() -> Result<()> { + async fn test_zeroed_checkpoint_decodes_correctly() -> Result<(), anyhow::Error> { for i in 14..=16 { dispatch_pgversion!(i, { pgv::CheckPoint::decode(&pgv::ZERO_CHECKPOINT)?; diff --git a/pageserver/src/walredo.rs b/pageserver/src/walredo.rs index 22d8d83811..ed8a954369 100644 --- a/pageserver/src/walredo.rs +++ b/pageserver/src/walredo.rs @@ -136,6 +136,16 @@ macro_rules! bail { } } +#[derive(Debug, Clone, Copy)] +pub enum RedoAttemptType { + /// Used for the read path. 
Will fire critical errors and retry twice if failure. + ReadPage, + // Used for legacy compaction (only used in image compaction). Will fire critical errors and retry once if failure. + LegacyCompaction, + // Used for gc compaction. Will not fire critical errors and not retry. + GcCompaction, +} + /// /// Public interface of WAL redo manager /// @@ -156,11 +166,18 @@ impl PostgresRedoManager { base_img: Option<(Lsn, Bytes)>, records: Vec<(Lsn, NeonWalRecord)>, pg_version: u32, + redo_attempt_type: RedoAttemptType, ) -> Result { if records.is_empty() { bail!("invalid WAL redo request with no records"); } + let max_retry_attempts = match redo_attempt_type { + RedoAttemptType::ReadPage => 2, + RedoAttemptType::LegacyCompaction => 1, + RedoAttemptType::GcCompaction => 0, + }; + let base_img_lsn = base_img.as_ref().map(|p| p.0).unwrap_or(Lsn::INVALID); let mut img = base_img.map(|p| p.1); let mut batch_neon = apply_neon::can_apply_in_neon(&records[0].1); @@ -180,6 +197,7 @@ impl PostgresRedoManager { &records[batch_start..i], self.conf.wal_redo_timeout, pg_version, + max_retry_attempts, ) .await }; @@ -201,6 +219,7 @@ impl PostgresRedoManager { &records[batch_start..], self.conf.wal_redo_timeout, pg_version, + max_retry_attempts, ) .await } @@ -424,11 +443,11 @@ impl PostgresRedoManager { records: &[(Lsn, NeonWalRecord)], wal_redo_timeout: Duration, pg_version: u32, + max_retry_attempts: u32, ) -> Result { *(self.last_redo_at.lock().unwrap()) = Some(Instant::now()); let (rel, blknum) = key.to_rel_block().context("invalid record")?; - const MAX_RETRY_ATTEMPTS: u32 = 1; let mut n_attempts = 0u32; loop { let base_img = &base_img; @@ -486,7 +505,7 @@ impl PostgresRedoManager { info!(n_attempts, "retried walredo succeeded"); } n_attempts += 1; - if n_attempts > MAX_RETRY_ATTEMPTS || result.is_ok() { + if n_attempts > max_retry_attempts || result.is_ok() { return result; } } @@ -560,6 +579,7 @@ mod tests { use super::PostgresRedoManager; use crate::config::PageServerConf; + 
use crate::walredo::RedoAttemptType; #[tokio::test] async fn test_ping() { @@ -593,6 +613,7 @@ mod tests { None, short_records(), 14, + RedoAttemptType::ReadPage, ) .instrument(h.span()) .await @@ -621,6 +642,7 @@ mod tests { None, short_records(), 14, + RedoAttemptType::ReadPage, ) .instrument(h.span()) .await @@ -642,6 +664,7 @@ mod tests { None, short_records(), 16, /* 16 currently produces stderr output on startup, which adds a nice extra edge */ + RedoAttemptType::ReadPage, ) .instrument(h.span()) .await diff --git a/pageserver/src/walredo/apply_neon.rs b/pageserver/src/walredo/apply_neon.rs index 61ae1eb970..a3840f1f6f 100644 --- a/pageserver/src/walredo/apply_neon.rs +++ b/pageserver/src/walredo/apply_neon.rs @@ -276,6 +276,7 @@ pub(crate) fn apply_in_neon( append, clear, will_init, + only_if, } => { use bytes::BufMut; if *will_init { @@ -288,6 +289,13 @@ pub(crate) fn apply_in_neon( if *clear { page.clear(); } + if let Some(only_if) = only_if { + if page != only_if.as_bytes() { + return Err(anyhow::anyhow!( + "the current image does not match the expected image, cannot append" + )); + } + } page.put_slice(append.as_bytes()); } } diff --git a/pgxn/neon/Makefile b/pgxn/neon/Makefile index 8259d24359..426b176af9 100644 --- a/pgxn/neon/Makefile +++ b/pgxn/neon/Makefile @@ -4,6 +4,7 @@ MODULE_big = neon OBJS = \ $(WIN32RES) \ + communicator.o \ extension_server.o \ file_cache.o \ hll.o \ diff --git a/pgxn/neon/bitmap.h b/pgxn/neon/bitmap.h index 0a131816ef..21efd13547 100644 --- a/pgxn/neon/bitmap.h +++ b/pgxn/neon/bitmap.h @@ -9,4 +9,4 @@ #define BITMAP_SET(bm, bit) (bm)[(bit) >> 3] |= (1 << ((bit) & 7)) #define BITMAP_CLR(bm, bit) (bm)[(bit) >> 3] &= ~(1 << ((bit) & 7)) -#endif //NEON_BITMAP_H +#endif /* NEON_BITMAP_H */ diff --git a/pgxn/neon/communicator.c b/pgxn/neon/communicator.c new file mode 100644 index 0000000000..932034e22e --- /dev/null +++ b/pgxn/neon/communicator.c @@ -0,0 +1,2504 @@ 
+/*------------------------------------------------------------------------- + * + * communicator.c + * Functions for communicating with remote pageservers. + * + * This is the so-called "legacy" communicator. It consists of functions that + * are called from the smgr implementation, in pagestore_smgr.c. There are + * plans to replace this with a different implementation, see RFC. + * + * The communicator is a collection of functions that are called in each + * backend, when the backend needs to read a page or other information. It + * does not spawn background threads or anything like that. To process + * responses to prefetch requests in a timely fashion, however, it registers + * a ProcessInterrupts hook that gets called periodically from any + * CHECK_FOR_INTERRUPTS() point in the backend. + * + * By the time the functions in this file are called, the caller has already + * established that a request to the pageserver is necessary. The functions + * are only called for permanent relations (i.e. not temp or unlogged tables). + * Before making a call to the communicator, the caller has already checked + * the relation size or local file cache. + * + * However, when processing responses to getpage requests, the communicator + * writes pages directly to the LFC. + * + * The communicator functions take request LSNs as arguments; the caller is + * responsible for determining the correct LSNs to use. There's one exception + * to that, in prefetch_do_request(); it sometimes calls back to + * neon_get_request_lsns(). That's because sometimes a suitable response is + * found in the prefetch buffer and the request LSns are not needed, and the + * caller doesn't know whether it's needed or not. 
+ * + * The main interface consists of the following "synchronous" calls: + * + * communicator_exists - Returns true if a relation file exists + * communicator_nblocks - Returns a relation's size + * communicator_dbsize - Returns a databases's total size + * communicator_read_at_lsnv - Read contents of one relation block + * communicator_read_slru_segment - Read contents of one SLRU segment + * + * In addition, there functions related to prefetching: + * communicator_prefetch_register_bufferv - Start prefetching a page + * communicator_prefetch_lookupv - Check if a page is already in prefetch queue + * + * Misc other functions: + * - communicator_init - Initialize the module at startup + * - communicator_prefetch_pump_state - Called periodically to advance the state + * + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "access/xlog.h" +#include "access/xlogdefs.h" +#include "access/xlog_internal.h" +#include "access/xlogutils.h" +#include "common/hashfn.h" +#include "executor/instrument.h" +#include "libpq/pqformat.h" +#include "miscadmin.h" +#include "port/pg_iovec.h" +#include "postmaster/interrupt.h" +#include "replication/walsender.h" +#include "utils/timeout.h" + +#include "bitmap.h" +#include "communicator.h" +#include "file_cache.h" +#include "neon.h" +#include "neon_perf_counters.h" +#include "pagestore_client.h" + +#if PG_VERSION_NUM >= 150000 +#include "access/xlogrecovery.h" +#endif + +#if PG_VERSION_NUM < 160000 +typedef PGAlignedBlock PGIOAlignedBlock; +#endif + +#define NEON_PANIC_CONNECTION_STATE(shard_no, elvl, message, ...) 
\ + neon_shard_log(shard_no, elvl, "Broken connection state: " message, \ + ##__VA_ARGS__) + +page_server_api *page_server; + +static uint32 local_request_counter; +#define GENERATE_REQUEST_ID() (((NeonRequestId)MyProcPid << 32) | ++local_request_counter) + +/* + * Various settings related to prompt (fast) handling of PageStream responses + * at any CHECK_FOR_INTERRUPTS point. + */ +int readahead_getpage_pull_timeout_ms = 0; +static int PS_TIMEOUT_ID = 0; +static bool timeout_set = false; +static bool timeout_signaled = false; + +/* + * We have a CHECK_FOR_INTERRUPTS in page_server->receive(), and we don't want + * that to handle any getpage responses if we're already working on the + * backlog of those, as we'd hit issues with determining which prefetch slot + * we just got a response for. + * + * To protect against that, we have this variable that's set whenever we start + * receiving data for prefetch slots, so that we don't get confused. + * + * Note that in certain error cases during readpage we may leak r_r_g=true, + * which results in a failure to pick up further responses until we first + * actively try to receive new getpage responses. + */ +static bool readpage_reentrant_guard = false; + +static void pagestore_timeout_handler(void); + +#define START_PREFETCH_RECEIVE_WORK() \ + do { \ + readpage_reentrant_guard = true; \ + } while (false) + +#define END_PREFETCH_RECEIVE_WORK() \ + do { \ + readpage_reentrant_guard = false; \ + if (unlikely(timeout_signaled && !InterruptPending)) \ + InterruptPending = true; \ + } while (false) + +/* + * Prefetch implementation: + * + * Prefetch is performed locally by each backend. + * + * There can be up to readahead_buffer_size active IO requests registered at + * any time. Requests using smgr_prefetch are sent to the pageserver, but we + * don't wait on the response. 
Requests using smgr_read are either read from + * the buffer, or (if that's not possible) we wait on the response to arrive - + * this also will allow us to receive other prefetched pages. + * Each request is immediately written to the output buffer of the pageserver + * connection, but may not be flushed if smgr_prefetch is used: pageserver + * flushes sent requests on manual flush, or every neon.flush_output_after + * unflushed requests; which is not necessarily always and all the time. + * + * Once we have received a response, this value will be stored in the response + * buffer, indexed in a hash table. This allows us to retain our buffered + * prefetch responses even when we have cache misses. + * + * Reading of prefetch responses is delayed until them are actually needed + * (smgr_read). In case of prefetch miss or any other SMGR request other than + * smgr_read, all prefetch responses in the pipeline will need to be read from + * the connection; the responses are stored for later use. + * + * NOTE: The current implementation of the prefetch system implements a ring + * buffer of up to readahead_buffer_size requests. If there are more _read and + * _prefetch requests between the initial _prefetch and the _read of a buffer, + * the prefetch request will have been dropped from this prefetch buffer, and + * your prefetch was wasted. + */ + +/* + * State machine: + * + * not in hash : in hash + * : + * UNUSED ------> REQUESTED --> RECEIVED + * ^ : | | + * | : v | + * | : TAG_REMAINS | + * | : | | + * +----------------+------------+ + * : + */ +typedef enum PrefetchStatus +{ + PRFS_UNUSED = 0, /* unused slot */ + PRFS_REQUESTED, /* request was written to the sendbuffer to + * PS, but not necessarily flushed. 
all fields + * except response valid */ + PRFS_RECEIVED, /* all fields valid */ + PRFS_TAG_REMAINS, /* only buftag and my_ring_index are still + * valid */ +} PrefetchStatus; + +/* must fit in uint8; bits 0x1 are used */ +typedef enum { + PRFSF_NONE = 0x0, + PRFSF_LFC = 0x1 /* received prefetch result is stored in LFC */ +} PrefetchRequestFlags; + +typedef struct PrefetchRequest +{ + BufferTag buftag; /* must be first entry in the struct */ + shardno_t shard_no; + uint8 status; /* see PrefetchStatus for valid values */ + uint8 flags; /* see PrefetchRequestFlags */ + neon_request_lsns request_lsns; + NeonRequestId reqid; + NeonResponse *response; /* may be null */ + uint64 my_ring_index; +} PrefetchRequest; + +/* prefetch buffer lookup hash table */ + +typedef struct PrfHashEntry +{ + PrefetchRequest *slot; + uint32 status; + uint32 hash; +} PrfHashEntry; + +#define SH_PREFIX prfh +#define SH_ELEMENT_TYPE PrfHashEntry +#define SH_KEY_TYPE PrefetchRequest * +#define SH_KEY slot +#define SH_STORE_HASH +#define SH_GET_HASH(tb, a) ((a)->hash) +#define SH_HASH_KEY(tb, key) hash_bytes( \ + ((const unsigned char *) &(key)->buftag), \ + sizeof(BufferTag) \ +) + +#define SH_EQUAL(tb, a, b) (BufferTagsEqual(&(a)->buftag, &(b)->buftag)) +#define SH_SCOPE static inline +#define SH_DEFINE +#define SH_DECLARE +#include "lib/simplehash.h" + +/* + * PrefetchState maintains the state of (prefetch) getPage@LSN requests. + * It maintains a (ring) buffer of in-flight requests and responses. + * + * We maintain several indexes into the ring buffer: + * ring_unused >= ring_flush >= ring_receive >= ring_last >= 0 + * + * ring_unused points to the first unused slot of the buffer + * ring_receive is the next request that is to be received + * ring_last is the oldest received entry in the buffer + * + * Apart from being an entry in the ring buffer of prefetch requests, each + * PrefetchRequest that is not UNUSED is indexed in prf_hash by buftag. 
+ */ +typedef struct PrefetchState +{ + MemoryContext bufctx; /* context for prf_buffer[].response + * allocations */ + MemoryContext errctx; /* context for prf_buffer[].response + * allocations */ + MemoryContext hashctx; /* context for prf_buffer */ + + /* buffer indexes */ + uint64 ring_unused; /* first unused slot */ + uint64 ring_flush; /* next request to flush */ + uint64 ring_receive; /* next slot that is to receive a response */ + uint64 ring_last; /* min slot with a response value */ + + /* metrics / statistics */ + int n_responses_buffered; /* count of PS responses not yet in + * buffers */ + int n_requests_inflight; /* count of PS requests considered in + * flight */ + int n_unused; /* count of buffers < unused, > last, that are + * also unused */ + + /* the buffers */ + prfh_hash *prf_hash; + int max_shard_no; + /* Mark shards involved in prefetch */ + uint8 shard_bitmap[(MAX_SHARDS + 7)/8]; + PrefetchRequest prf_buffer[]; /* prefetch buffers */ +} PrefetchState; + +static PrefetchState *MyPState; + +#define GetPrfSlotNoCheck(ring_index) ( \ + &MyPState->prf_buffer[((ring_index) % readahead_buffer_size)] \ +) + +#define GetPrfSlot(ring_index) ( \ + ( \ + AssertMacro((ring_index) < MyPState->ring_unused && \ + (ring_index) >= MyPState->ring_last), \ + GetPrfSlotNoCheck(ring_index) \ + ) \ +) + +#define ReceiveBufferNeedsCompaction() (\ + (MyPState->n_responses_buffered / 8) < ( \ + MyPState->ring_receive - \ + MyPState->ring_last - \ + MyPState->n_responses_buffered \ + ) \ +) + +static process_interrupts_callback_t prev_interrupt_cb; + +static bool compact_prefetch_buffers(void); +static void consume_prefetch_responses(void); +static uint64 prefetch_register_bufferv(BufferTag tag, neon_request_lsns *frlsns, + BlockNumber nblocks, const bits8 *mask, + bool is_prefetch); +static bool prefetch_read(PrefetchRequest *slot); +static void prefetch_do_request(PrefetchRequest *slot, neon_request_lsns *force_request_lsns); +static bool prefetch_wait_for(uint64 
ring_index); +static void prefetch_cleanup_trailing_unused(void); +static inline void prefetch_set_unused(uint64 ring_index); + +static bool neon_prefetch_response_usable(neon_request_lsns *request_lsns, + PrefetchRequest *slot); +static bool communicator_processinterrupts(void); + +void +pg_init_communicator(void) +{ + prev_interrupt_cb = ProcessInterruptsCallback; + ProcessInterruptsCallback = communicator_processinterrupts; +} + +static bool +compact_prefetch_buffers(void) +{ + uint64 empty_ring_index = MyPState->ring_last; + uint64 search_ring_index = MyPState->ring_receive; + int n_moved = 0; + + if (MyPState->ring_receive == MyPState->ring_last) + return false; + + while (search_ring_index > MyPState->ring_last) + { + search_ring_index--; + if (GetPrfSlot(search_ring_index)->status == PRFS_UNUSED) + { + empty_ring_index = search_ring_index; + break; + } + } + + /* + * Here we have established: slots < search_ring_index have an unknown + * state (not scanned) slots >= search_ring_index and <= empty_ring_index + * are unused slots > empty_ring_index are in use, or outside our buffer's + * range. ... unless search_ring_index <= ring_last + * + * Therefore, there is a gap of at least one unused items between + * search_ring_index and empty_ring_index (both inclusive), which grows as + * we hit more unused items while moving backwards through the array. 
+ */ + + while (search_ring_index > MyPState->ring_last) + { + PrefetchRequest *source_slot; + PrefetchRequest *target_slot; + bool found; + + /* update search index to an unprocessed entry */ + search_ring_index--; + + source_slot = GetPrfSlot(search_ring_index); + + if (source_slot->status == PRFS_UNUSED) + continue; + + /* slot is used -- start moving slot */ + target_slot = GetPrfSlot(empty_ring_index); + + Assert(source_slot->status == PRFS_RECEIVED); + Assert(target_slot->status == PRFS_UNUSED); + + target_slot->buftag = source_slot->buftag; + target_slot->shard_no = source_slot->shard_no; + target_slot->status = source_slot->status; + target_slot->flags = source_slot->flags; + target_slot->response = source_slot->response; + target_slot->reqid = source_slot->reqid; + target_slot->request_lsns = source_slot->request_lsns; + target_slot->my_ring_index = empty_ring_index; + + prfh_delete(MyPState->prf_hash, source_slot); + prfh_insert(MyPState->prf_hash, target_slot, &found); + + Assert(!found); + + /* Adjust the location of our known-empty slot */ + empty_ring_index--; + + /* empty the moved slot */ + source_slot->status = PRFS_UNUSED; + source_slot->buftag = (BufferTag) + { + 0 + }; + source_slot->response = NULL; + source_slot->my_ring_index = 0; + source_slot->request_lsns = (neon_request_lsns) { + InvalidXLogRecPtr, InvalidXLogRecPtr, InvalidXLogRecPtr + }; + + /* update bookkeeping */ + n_moved++; + } + + /* + * Only when we've moved slots we can expect trailing unused slots, so + * only then we clean up trailing unused slots. + */ + if (n_moved > 0) + { + prefetch_cleanup_trailing_unused(); + return true; + } + + return false; +} + +/* + * If there might be responses still in the TCP buffer, then we should try to + * use those, to reduce any TCP backpressure on the OS/PS side. + * + * This procedure handles that. + * + * Note that this works because we don't pipeline non-getPage requests. 
+ * + * NOTE: This procedure is not allowed to throw errors that should be handled + * by SMGR-related code, as this can be called from every CHECK_FOR_INTERRUPTS + * point inside and outside PostgreSQL. + * + * This still does throw errors when it receives malformed responses from PS. + * + * When we're not called from CHECK_FOR_INTERRUPTS (indicated by + * IsHandlingInterrupts) we also report we've ended prefetch receive work, + * just in case state tracking was lost due to an error in the sync getPage + * response code. + */ +void +communicator_prefetch_pump_state(bool IsHandlingInterrupts) +{ + while (MyPState->ring_receive != MyPState->ring_flush) + { + NeonResponse *response; + PrefetchRequest *slot; + MemoryContext old; + + slot = GetPrfSlot(MyPState->ring_receive); + + old = MemoryContextSwitchTo(MyPState->errctx); + response = page_server->try_receive(slot->shard_no); + MemoryContextSwitchTo(old); + + if (response == NULL) + break; + + /* The slot should still be valid */ + if (slot->status != PRFS_REQUESTED || + slot->response != NULL || + slot->my_ring_index != MyPState->ring_receive) + neon_shard_log(slot->shard_no, ERROR, + "Incorrect prefetch slot state after receive: status=%d response=%p my=%lu receive=%lu", + slot->status, slot->response, + (long) slot->my_ring_index, (long) MyPState->ring_receive); + + /* update prefetch state */ + MyPState->n_responses_buffered += 1; + MyPState->n_requests_inflight -= 1; + MyPState->ring_receive += 1; + MyNeonCounters->getpage_prefetches_buffered = + MyPState->n_responses_buffered; + + /* update slot state */ + slot->status = PRFS_RECEIVED; + slot->response = response; + + if (response->tag == T_NeonGetPageResponse && !(slot->flags & PRFSF_LFC) && lfc_store_prefetch_result) + { + /* + * Store prefetched result in LFC (please read comments to lfc_prefetch + * explaining why it can be done without holding shared buffer lock + */ + if (lfc_prefetch(BufTagGetNRelFileInfo(slot->buftag), slot->buftag.forkNum, 
slot->buftag.blockNum, ((NeonGetPageResponse*)response)->page, slot->request_lsns.not_modified_since)) + { + slot->flags |= PRFSF_LFC; + } + } + } + + /* We never pump the prefetch state while handling other pages */ + if (!IsHandlingInterrupts) + END_PREFETCH_RECEIVE_WORK(); + + communicator_reconfigure_timeout_if_needed(); +} + +void +readahead_buffer_resize(int newsize, void *extra) +{ + uint64 end, + nfree = newsize; + PrefetchState *newPState; + Size newprfs_size = offsetof(PrefetchState, prf_buffer) + + (sizeof(PrefetchRequest) * newsize); + + /* don't try to re-initialize if we haven't initialized yet */ + if (MyPState == NULL) + return; + + /* + * Make sure that we don't lose track of active prefetch requests by + * ensuring we have received all but the last n requests (n = newsize). + */ + if (MyPState->n_requests_inflight > newsize) + { + prefetch_wait_for(MyPState->ring_unused - newsize - 1); + Assert(MyPState->n_requests_inflight <= newsize); + } + + /* construct the new PrefetchState, and copy over the memory contexts */ + newPState = MemoryContextAllocZero(TopMemoryContext, newprfs_size); + + newPState->bufctx = MyPState->bufctx; + newPState->errctx = MyPState->errctx; + newPState->hashctx = MyPState->hashctx; + newPState->prf_hash = prfh_create(MyPState->hashctx, newsize, NULL); + newPState->n_unused = newsize; + newPState->n_requests_inflight = 0; + newPState->n_responses_buffered = 0; + newPState->ring_last = newsize; + newPState->ring_unused = newsize; + newPState->ring_receive = newsize; + newPState->max_shard_no = MyPState->max_shard_no; + memcpy(newPState->shard_bitmap, MyPState->shard_bitmap, sizeof(MyPState->shard_bitmap)); + + /* + * Copy over the prefetches. + * + * We populate the prefetch array from the end; to retain the most recent + * prefetches, but this has the benefit of only needing to do one + * iteration on the dataset, and trivial compaction. 
+ */ + for (end = MyPState->ring_unused - 1; + end >= MyPState->ring_last && end != UINT64_MAX && nfree != 0; + end -= 1) + { + PrefetchRequest *slot = GetPrfSlot(end); + PrefetchRequest *newslot; + bool found; + + if (slot->status == PRFS_UNUSED) + continue; + + nfree -= 1; + + newslot = &newPState->prf_buffer[nfree]; + *newslot = *slot; + newslot->my_ring_index = nfree; + + prfh_insert(newPState->prf_hash, newslot, &found); + + Assert(!found); + + switch (newslot->status) + { + case PRFS_UNUSED: + pg_unreachable(); + case PRFS_REQUESTED: + newPState->n_requests_inflight += 1; + newPState->ring_receive -= 1; + newPState->ring_last -= 1; + break; + case PRFS_RECEIVED: + newPState->n_responses_buffered += 1; + newPState->ring_last -= 1; + break; + case PRFS_TAG_REMAINS: + newPState->ring_last -= 1; + break; + } + newPState->n_unused -= 1; + } + newPState->ring_flush = newPState->ring_receive; + + MyNeonCounters->getpage_prefetches_buffered = + MyPState->n_responses_buffered; + MyNeonCounters->pageserver_open_requests = + MyPState->n_requests_inflight; + + for (; end >= MyPState->ring_last && end != UINT64_MAX; end -= 1) + { + PrefetchRequest *slot = GetPrfSlot(end); + Assert(slot->status != PRFS_REQUESTED); + if (slot->status == PRFS_RECEIVED) + { + pfree(slot->response); + } + } + + prfh_destroy(MyPState->prf_hash); + pfree(MyPState); + MyPState = newPState; +} + + + +/* + * Make sure that there are no responses still in the buffer. + * + * This function may indirectly update MyPState->pfs_hash; which invalidates + * any active pointers into the hash table. 
+ */ +static void +consume_prefetch_responses(void) +{ + if (MyPState->ring_receive < MyPState->ring_unused) + prefetch_wait_for(MyPState->ring_unused - 1); +} + +static void +prefetch_cleanup_trailing_unused(void) +{ + uint64 ring_index; + PrefetchRequest *slot; + + while (MyPState->ring_last < MyPState->ring_receive) + { + ring_index = MyPState->ring_last; + slot = GetPrfSlot(ring_index); + + if (slot->status == PRFS_UNUSED) + MyPState->ring_last += 1; + else + break; + } +} + + +static bool +prefetch_flush_requests(void) +{ + for (shardno_t shard_no = 0; shard_no < MyPState->max_shard_no; shard_no++) + { + if (BITMAP_ISSET(MyPState->shard_bitmap, shard_no)) + { + if (!page_server->flush(shard_no)) + return false; + BITMAP_CLR(MyPState->shard_bitmap, shard_no); + } + } + MyPState->max_shard_no = 0; + return true; +} + +/* + * Wait for slot of ring_index to have received its response. + * The caller is responsible for making sure the request buffer is flushed. + * + * NOTE: this function may indirectly update MyPState->pfs_hash; which + * invalidates any active pointers into the hash table. + * NOTE: callers should make sure they can handle query cancellations in this + * function's call path. + */ +static bool +prefetch_wait_for(uint64 ring_index) +{ + PrefetchRequest *entry; + bool result = true; + + if (MyPState->ring_flush <= ring_index && + MyPState->ring_unused > MyPState->ring_flush) + { + if (!prefetch_flush_requests()) + return false; + MyPState->ring_flush = MyPState->ring_unused; + } + + Assert(MyPState->ring_unused > ring_index); + + while (MyPState->ring_receive <= ring_index) + { + START_PREFETCH_RECEIVE_WORK(); + entry = GetPrfSlot(MyPState->ring_receive); + + Assert(entry->status == PRFS_REQUESTED); + if (!prefetch_read(entry)) + { + result = false; + break; + } + + END_PREFETCH_RECEIVE_WORK(); + CHECK_FOR_INTERRUPTS(); + } + + return result; +} + +/* + * Read the response of a prefetch request into its slot. 
+ * + * The caller is responsible for making sure that the request for this buffer + * was flushed to the PageServer. + * + * NOTE: this function may indirectly update MyPState->pfs_hash; which + * invalidates any active pointers into the hash table. + * + * NOTE: this does IO, and can get canceled out-of-line. + */ +static bool +prefetch_read(PrefetchRequest *slot) +{ + NeonResponse *response; + MemoryContext old; + BufferTag buftag; + shardno_t shard_no; + uint64 my_ring_index; + + Assert(slot->status == PRFS_REQUESTED); + Assert(slot->response == NULL); + Assert(slot->my_ring_index == MyPState->ring_receive); + + if (slot->status != PRFS_REQUESTED || + slot->response != NULL || + slot->my_ring_index != MyPState->ring_receive) + neon_shard_log(slot->shard_no, ERROR, + "Incorrect prefetch read: status=%d response=%p my=%lu receive=%lu", + slot->status, slot->response, + (long)slot->my_ring_index, (long)MyPState->ring_receive); + + /* + * Copy the request info so that if an error happens and the prefetch + * queue is flushed during the receive call, we can print the original + * values in the error message + */ + buftag = slot->buftag; + shard_no = slot->shard_no; + my_ring_index = slot->my_ring_index; + + old = MemoryContextSwitchTo(MyPState->errctx); + response = (NeonResponse *) page_server->receive(shard_no); + MemoryContextSwitchTo(old); + if (response) + { + /* The slot should still be valid */ + if (slot->status != PRFS_REQUESTED || + slot->response != NULL || + slot->my_ring_index != MyPState->ring_receive) + neon_shard_log(shard_no, ERROR, + "Incorrect prefetch slot state after receive: status=%d response=%p my=%lu receive=%lu", + slot->status, slot->response, + (long) slot->my_ring_index, (long) MyPState->ring_receive); + + /* update prefetch state */ + MyPState->n_responses_buffered += 1; + MyPState->n_requests_inflight -= 1; + MyPState->ring_receive += 1; + MyNeonCounters->getpage_prefetches_buffered = + MyPState->n_responses_buffered; + + /* update 
slot state */ + slot->status = PRFS_RECEIVED; + slot->response = response; + + if (response->tag == T_NeonGetPageResponse && !(slot->flags & PRFSF_LFC) && lfc_store_prefetch_result) + { + /* + * Store prefetched result in LFC (please read comments to lfc_prefetch + * explaining why it can be done without holding shared buffer lock + */ + if (lfc_prefetch(BufTagGetNRelFileInfo(buftag), buftag.forkNum, buftag.blockNum, ((NeonGetPageResponse*)response)->page, slot->request_lsns.not_modified_since)) + { + slot->flags |= PRFSF_LFC; + } + } + return true; + } + else + { + /* + * Note: The slot might no longer be valid, if the connection was lost + * and the prefetch queue was flushed during the receive call + */ + neon_shard_log(shard_no, LOG, + "No response from reading prefetch entry %lu: %u/%u/%u.%u block %u. This can be caused by a concurrent disconnect", + (long) my_ring_index, + RelFileInfoFmt(BufTagGetNRelFileInfo(buftag)), + buftag.forkNum, buftag.blockNum); + return false; + } +} + +/* + * Disconnect hook - drop prefetches when the connection drops + * + * If we don't remove the failed prefetches, we'd be serving incorrect + * data to the smgr. + */ +void +prefetch_on_ps_disconnect(void) +{ + MyPState->ring_flush = MyPState->ring_unused; + + while (MyPState->ring_receive < MyPState->ring_unused) + { + PrefetchRequest *slot; + uint64 ring_index = MyPState->ring_receive; + + slot = GetPrfSlot(ring_index); + + Assert(slot->status == PRFS_REQUESTED); + Assert(slot->my_ring_index == ring_index); + + /* + * Drop connection to all shards which have prefetch requests. + * It is not a problem to call disconnect multiple times on the same connection + * because disconnect implementation in libpagestore.c will check if connection + * is alive and do nothing of connection was already dropped. 
+ */ + page_server->disconnect(slot->shard_no); + + /* clean up the request */ + slot->status = PRFS_TAG_REMAINS; + MyPState->n_requests_inflight -= 1; + MyPState->ring_receive += 1; + + prefetch_set_unused(ring_index); + pgBufferUsage.prefetch.expired += 1; + MyNeonCounters->getpage_prefetch_discards_total += 1; + } + + /* + * We can have gone into retry due to network error, so update stats with + * the latest available + */ + MyNeonCounters->pageserver_open_requests = + MyPState->n_requests_inflight; + MyNeonCounters->getpage_prefetches_buffered = + MyPState->n_responses_buffered; +} + +/* + * prefetch_set_unused() - clear a received prefetch slot + * + * The slot at ring_index must be a current member of the ring buffer, + * and may not be in the PRFS_REQUESTED state. + * + * NOTE: this function will update MyPState->pfs_hash; which invalidates any + * active pointers into the hash table. + */ +static inline void +prefetch_set_unused(uint64 ring_index) +{ + PrefetchRequest *slot; + + if (ring_index < MyPState->ring_last) + return; /* Should already be unused */ + + slot = GetPrfSlot(ring_index); + if (slot->status == PRFS_UNUSED) + return; + + Assert(slot->status == PRFS_RECEIVED || slot->status == PRFS_TAG_REMAINS); + + if (slot->status == PRFS_RECEIVED) + { + pfree(slot->response); + slot->response = NULL; + + MyPState->n_responses_buffered -= 1; + MyPState->n_unused += 1; + + MyNeonCounters->getpage_prefetches_buffered = + MyPState->n_responses_buffered; + } + else + { + Assert(slot->response == NULL); + } + + prfh_delete(MyPState->prf_hash, slot); + + /* clear all fields */ + MemSet(slot, 0, sizeof(PrefetchRequest)); + slot->status = PRFS_UNUSED; + + /* run cleanup if we're holding back ring_last */ + if (MyPState->ring_last == ring_index) + prefetch_cleanup_trailing_unused(); + + /* + * ... 
and try to store the buffered responses more compactly if > 12.5% + * of the buffer is gaps + */ + else if (ReceiveBufferNeedsCompaction()) + compact_prefetch_buffers(); +} + +/* + * Send one prefetch request to the pageserver. To wait for the response, call + * prefetch_wait_for(). + */ +static void +prefetch_do_request(PrefetchRequest *slot, neon_request_lsns *force_request_lsns) +{ + bool found; + uint64 mySlotNo PG_USED_FOR_ASSERTS_ONLY = slot->my_ring_index; + + NeonGetPageRequest request = { + .hdr.tag = T_NeonGetPageRequest, + .hdr.reqid = GENERATE_REQUEST_ID(), + /* lsn and not_modified_since are filled in below */ + .rinfo = BufTagGetNRelFileInfo(slot->buftag), + .forknum = slot->buftag.forkNum, + .blkno = slot->buftag.blockNum, + }; + + Assert(mySlotNo == MyPState->ring_unused); + + slot->reqid = request.hdr.reqid; + + if (force_request_lsns) + slot->request_lsns = *force_request_lsns; + else + neon_get_request_lsns(BufTagGetNRelFileInfo(slot->buftag), + slot->buftag.forkNum, slot->buftag.blockNum, + &slot->request_lsns, 1); + request.hdr.lsn = slot->request_lsns.request_lsn; + request.hdr.not_modified_since = slot->request_lsns.not_modified_since; + + Assert(slot->response == NULL); + Assert(slot->my_ring_index == MyPState->ring_unused); + + while (!page_server->send(slot->shard_no, (NeonRequest *) &request)) + { + Assert(mySlotNo == MyPState->ring_unused); + /* loop */ + } + + /* update prefetch state */ + MyPState->n_requests_inflight += 1; + MyPState->n_unused -= 1; + MyPState->ring_unused += 1; + BITMAP_SET(MyPState->shard_bitmap, slot->shard_no); + MyPState->max_shard_no = Max(slot->shard_no+1, MyPState->max_shard_no); + + /* update slot state */ + slot->status = PRFS_REQUESTED; + prfh_insert(MyPState->prf_hash, slot, &found); + Assert(!found); +} + +/* + * Lookup of already received prefetch requests. Only already received responses matching required LSNs are accepted. 
+ * Present pages are marked in "mask" bitmap and total number of such pages is returned. + */ +int +communicator_prefetch_lookupv(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blocknum, + neon_request_lsns *lsns, BlockNumber nblocks, + void **buffers, bits8 *mask) +{ + int hits = 0; + PrefetchRequest hashkey; + + /* + * Use an intermediate PrefetchRequest struct as the hash key to ensure + * correct alignment and that the padding bytes are cleared. + */ + memset(&hashkey.buftag, 0, sizeof(BufferTag)); + CopyNRelFileInfoToBufTag(hashkey.buftag, rinfo); + hashkey.buftag.forkNum = forknum; + + for (int i = 0; i < nblocks; i++) + { + PrfHashEntry *entry; + + hashkey.buftag.blockNum = blocknum + i; + entry = prfh_lookup(MyPState->prf_hash, &hashkey); + + if (entry != NULL) + { + PrefetchRequest *slot = entry->slot; + uint64 ring_index = slot->my_ring_index; + Assert(slot == GetPrfSlot(ring_index)); + + Assert(slot->status != PRFS_UNUSED); + Assert(MyPState->ring_last <= ring_index && + ring_index < MyPState->ring_unused); + Assert(BufferTagsEqual(&slot->buftag, &hashkey.buftag)); + + if (slot->status != PRFS_RECEIVED) + continue; + + /* + * If the caller specified a request LSN to use, only accept + * prefetch responses that satisfy that request. + */ + if (!neon_prefetch_response_usable(&lsns[i], slot)) + continue; + + /* + * Ignore errors + */ + if (slot->response->tag != T_NeonGetPageResponse) + { + if (slot->response->tag != T_NeonErrorResponse) + { + NEON_PANIC_CONNECTION_STATE(slot->shard_no, PANIC, + "Expected GetPage (0x%02x) or Error (0x%02x) response to GetPageRequest, but got 0x%02x", + T_NeonGetPageResponse, T_NeonErrorResponse, slot->response->tag); + } + continue; + } + memcpy(buffers[i], ((NeonGetPageResponse*)slot->response)->page, BLCKSZ); + + + /* + * With lfc_store_prefetch_result=true prefetch result is stored in LFC in prefetch_pump_state when response is received + * from page server. 
But if lfc_store_prefetch_result=false then it is not yet stored in LFC and we have to do it here + * under buffer lock. + */ + if (!lfc_store_prefetch_result) + lfc_write(rinfo, forknum, blocknum + i, buffers[i]); + + prefetch_set_unused(ring_index); + BITMAP_SET(mask, i); + + hits += 1; + inc_getpage_wait(0); + } + } + pgBufferUsage.prefetch.hits += hits; + return hits; +} + +/* + * prefetch_register_bufferv() - register and prefetch buffers + * + * Register that we may want the contents of BufferTag in the near future. + * This is used when issuing a speculative prefetch request, but also when + * performing a synchronous request and need the buffer right now. + * + * If force_request_lsns is not NULL, those values are sent to the + * pageserver. If NULL, we utilize the lastWrittenLsn -infrastructure + * to calculate the LSNs to send. + * + * Bits set in *mask (if present) indicate pages already read; i.e. pages we + * can skip in this process. + * + * When performing a prefetch rather than a synchronous request, + * is_prefetch==true. Currently, it only affects how the request is accounted + * in the perf counters. + * + * NOTE: this function may indirectly update MyPState->pfs_hash; which + * invalidates any active pointers into the hash table. + */ +void +communicator_prefetch_register_bufferv(BufferTag tag, neon_request_lsns *frlsns, + BlockNumber nblocks, const bits8 *mask) +{ + uint64 ring_index PG_USED_FOR_ASSERTS_ONLY; + + ring_index = prefetch_register_bufferv(tag, frlsns, nblocks, mask, true); + + Assert(ring_index < MyPState->ring_unused && + MyPState->ring_last <= ring_index); +} + +/* internal version. Returns the ring index */ +static uint64 +prefetch_register_bufferv(BufferTag tag, neon_request_lsns *frlsns, + BlockNumber nblocks, const bits8 *mask, + bool is_prefetch) +{ + uint64 min_ring_index; + PrefetchRequest hashkey; +#ifdef USE_ASSERT_CHECKING + bool any_hits = false; +#endif + /* We will never read further ahead than our buffer can store. 
*/ + nblocks = Max(1, Min(nblocks, readahead_buffer_size)); + + /* + * Use an intermediate PrefetchRequest struct as the hash key to ensure + * correct alignment and that the padding bytes are cleared. + */ + memset(&hashkey.buftag, 0, sizeof(BufferTag)); + hashkey.buftag = tag; + +Retry: + /* + * We can have gone into retry due to network error, so update stats with + * the latest available + */ + MyNeonCounters->pageserver_open_requests = + MyPState->ring_unused - MyPState->ring_receive; + MyNeonCounters->getpage_prefetches_buffered = + MyPState->n_responses_buffered; + + min_ring_index = UINT64_MAX; + for (int i = 0; i < nblocks; i++) + { + PrefetchRequest *slot = NULL; + PrfHashEntry *entry = NULL; + uint64 ring_index; + neon_request_lsns *lsns; + + if (PointerIsValid(mask) && BITMAP_ISSET(mask, i)) + continue; + + if (frlsns) + lsns = &frlsns[i]; + else + lsns = NULL; + +#ifdef USE_ASSERT_CHECKING + any_hits = true; +#endif + + slot = NULL; + entry = NULL; + + hashkey.buftag.blockNum = tag.blockNum + i; + entry = prfh_lookup(MyPState->prf_hash, &hashkey); + + if (entry != NULL) + { + slot = entry->slot; + ring_index = slot->my_ring_index; + Assert(slot == GetPrfSlot(ring_index)); + + Assert(slot->status != PRFS_UNUSED); + Assert(MyPState->ring_last <= ring_index && + ring_index < MyPState->ring_unused); + Assert(BufferTagsEqual(&slot->buftag, &hashkey.buftag)); + + /* + * If the caller specified a request LSN to use, only accept + * prefetch responses that satisfy that request. + */ + if (!is_prefetch) + { + if (!neon_prefetch_response_usable(lsns, slot)) + { + /* Wait for the old request to finish and discard it */ + if (!prefetch_wait_for(ring_index)) + goto Retry; + prefetch_set_unused(ring_index); + entry = NULL; + slot = NULL; + pgBufferUsage.prefetch.expired += 1; + MyNeonCounters->getpage_prefetch_discards_total += 1; + } + } + + if (entry != NULL) + { + /* + * We received a prefetch for a page that was recently read + * and removed from the buffers. 
Remove that request from the + * buffers. + */ + if (slot->status == PRFS_TAG_REMAINS) + { + prefetch_set_unused(ring_index); + entry = NULL; + slot = NULL; + } + else + { + min_ring_index = Min(min_ring_index, ring_index); + /* The buffered request is good enough, return that index */ + if (is_prefetch) + pgBufferUsage.prefetch.duplicates++; + continue; + } + } + } + else if (!is_prefetch) + { + pgBufferUsage.prefetch.misses += 1; + MyNeonCounters->getpage_prefetch_misses_total++; + } + /* + * We can only leave the block above by finding that there's + * no entry that can satisfy this request, either because there + * was no entry, or because the entry was invalid or didn't satisfy + * the LSNs provided. + * + * The code should've made sure to clear up the data. + */ + Assert(entry == NULL); + Assert(slot == NULL); + + /* There should be no buffer overflow */ + Assert(MyPState->ring_last + readahead_buffer_size >= MyPState->ring_unused); + + /* + * If the prefetch queue is full, we need to make room by clearing the + * oldest slot. If the oldest slot holds a buffer that was already + * received, we can just throw it away; we fetched the page + * unnecessarily in that case. If the oldest slot holds a request that + * we haven't received a response for yet, we have to wait for the + * response to that before we can continue. We might not have even + * flushed the request to the pageserver yet, it might be just sitting + * in the output buffer. In that case, we flush it and wait for the + * response. 
(We could decide not to send it, but it's hard to abort + * when the request is already in the output buffer, and 'not sending' + * a prefetch request kind of goes against the principles of + * prefetching) + */ + if (MyPState->ring_last + readahead_buffer_size == MyPState->ring_unused) + { + uint64 cleanup_index = MyPState->ring_last; + + slot = GetPrfSlot(cleanup_index); + + Assert(slot->status != PRFS_UNUSED); + + /* + * If there is good reason to run compaction on the prefetch buffers, + * try to do that. + */ + if (ReceiveBufferNeedsCompaction() && compact_prefetch_buffers()) + { + Assert(slot->status == PRFS_UNUSED); + } + else + { + /* + * We have the slot for ring_last, so that must still be in + * progress + */ + switch (slot->status) + { + case PRFS_REQUESTED: + Assert(MyPState->ring_receive == cleanup_index); + if (!prefetch_wait_for(cleanup_index)) + goto Retry; + prefetch_set_unused(cleanup_index); + pgBufferUsage.prefetch.expired += 1; + MyNeonCounters->getpage_prefetch_discards_total += 1; + break; + case PRFS_RECEIVED: + case PRFS_TAG_REMAINS: + prefetch_set_unused(cleanup_index); + pgBufferUsage.prefetch.expired += 1; + MyNeonCounters->getpage_prefetch_discards_total += 1; + break; + default: + pg_unreachable(); + } + } + } + + /* + * The next buffer pointed to by `ring_unused` is now definitely empty, so + * we can insert the new request to it. + */ + ring_index = MyPState->ring_unused; + + Assert(MyPState->ring_last <= ring_index && + ring_index <= MyPState->ring_unused); + + slot = GetPrfSlotNoCheck(ring_index); + + Assert(slot->status == PRFS_UNUSED); + + /* + * We must update the slot data before insertion, because the hash + * function reads the buffer tag from the slot. 
+ */ + slot->buftag = hashkey.buftag; + slot->shard_no = get_shard_number(&tag); + slot->my_ring_index = ring_index; + slot->flags = 0; + + min_ring_index = Min(min_ring_index, ring_index); + + if (is_prefetch) + MyNeonCounters->getpage_prefetch_requests_total++; + else + MyNeonCounters->getpage_sync_requests_total++; + + prefetch_do_request(slot, lsns); + } + + MyNeonCounters->pageserver_open_requests = + MyPState->ring_unused - MyPState->ring_receive; + + Assert(any_hits); + + Assert(GetPrfSlot(min_ring_index)->status == PRFS_REQUESTED || + GetPrfSlot(min_ring_index)->status == PRFS_RECEIVED); + Assert(MyPState->ring_last <= min_ring_index && + min_ring_index < MyPState->ring_unused); + + if (flush_every_n_requests > 0 && + MyPState->ring_unused - MyPState->ring_flush >= flush_every_n_requests) + { + if (!prefetch_flush_requests()) + { + /* + * Prefetch set is reset in case of error, so we should try to + * register our request once again + */ + goto Retry; + } + MyPState->ring_flush = MyPState->ring_unused; + } + + return min_ring_index; +} + +static bool +equal_requests(NeonRequest* a, NeonRequest* b) +{ + return a->reqid == b->reqid && a->lsn == b->lsn && a->not_modified_since == b->not_modified_since; +} + + +/* + * Note: this function can get canceled and use a long jump to the next catch + * context. Take care. 
+ */ +static NeonResponse * +page_server_request(void const *req) +{ + NeonResponse *resp; + BufferTag tag = {0}; + shardno_t shard_no; + + switch (messageTag(req)) + { + case T_NeonExistsRequest: + CopyNRelFileInfoToBufTag(tag, ((NeonExistsRequest *) req)->rinfo); + break; + case T_NeonNblocksRequest: + CopyNRelFileInfoToBufTag(tag, ((NeonNblocksRequest *) req)->rinfo); + break; + case T_NeonDbSizeRequest: + NInfoGetDbOid(BufTagGetNRelFileInfo(tag)) = ((NeonDbSizeRequest *) req)->dbNode; + break; + case T_NeonGetPageRequest: + CopyNRelFileInfoToBufTag(tag, ((NeonGetPageRequest *) req)->rinfo); + tag.blockNum = ((NeonGetPageRequest *) req)->blkno; + break; + default: + neon_log(ERROR, "Unexpected request tag: %d", messageTag(req)); + } + shard_no = get_shard_number(&tag); + + /* + * Current sharding model assumes that all metadata is present only at shard 0. + * We still need to call get_shard_no() to check if shard map is up-to-date. + */ + if (((NeonRequest *) req)->tag != T_NeonGetPageRequest) + { + shard_no = 0; + } + + do + { + PG_TRY(); + { + while (!page_server->send(shard_no, (NeonRequest *) req) + || !page_server->flush(shard_no)) + { + /* do nothing */ + } + MyNeonCounters->pageserver_open_requests++; + consume_prefetch_responses(); + resp = page_server->receive(shard_no); + MyNeonCounters->pageserver_open_requests--; + } + PG_CATCH(); + { + /* + * Cancellation in this code needs to be handled better at some + * point, but this currently seems fine for now. + */ + page_server->disconnect(shard_no); + MyNeonCounters->pageserver_open_requests = 0; + + /* + * We know for sure we're not working on any prefetch pages after + * this. 
+ */ + END_PREFETCH_RECEIVE_WORK(); + + PG_RE_THROW(); + } + PG_END_TRY(); + + } while (resp == NULL); + + return resp; +} + + +StringInfoData +nm_pack_request(NeonRequest *msg) +{ + StringInfoData s; + + initStringInfo(&s); + + pq_sendbyte(&s, msg->tag); + if (neon_protocol_version >= 3) + { + pq_sendint64(&s, msg->reqid); + } + pq_sendint64(&s, msg->lsn); + pq_sendint64(&s, msg->not_modified_since); + + switch (messageTag(msg)) + { + /* pagestore_client -> pagestore */ + case T_NeonExistsRequest: + { + NeonExistsRequest *msg_req = (NeonExistsRequest *) msg; + + pq_sendint32(&s, NInfoGetSpcOid(msg_req->rinfo)); + pq_sendint32(&s, NInfoGetDbOid(msg_req->rinfo)); + pq_sendint32(&s, NInfoGetRelNumber(msg_req->rinfo)); + pq_sendbyte(&s, msg_req->forknum); + + break; + } + case T_NeonNblocksRequest: + { + NeonNblocksRequest *msg_req = (NeonNblocksRequest *) msg; + + pq_sendint32(&s, NInfoGetSpcOid(msg_req->rinfo)); + pq_sendint32(&s, NInfoGetDbOid(msg_req->rinfo)); + pq_sendint32(&s, NInfoGetRelNumber(msg_req->rinfo)); + pq_sendbyte(&s, msg_req->forknum); + + break; + } + case T_NeonDbSizeRequest: + { + NeonDbSizeRequest *msg_req = (NeonDbSizeRequest *) msg; + + pq_sendint32(&s, msg_req->dbNode); + + break; + } + case T_NeonGetPageRequest: + { + NeonGetPageRequest *msg_req = (NeonGetPageRequest *) msg; + + pq_sendint32(&s, NInfoGetSpcOid(msg_req->rinfo)); + pq_sendint32(&s, NInfoGetDbOid(msg_req->rinfo)); + pq_sendint32(&s, NInfoGetRelNumber(msg_req->rinfo)); + pq_sendbyte(&s, msg_req->forknum); + pq_sendint32(&s, msg_req->blkno); + + break; + } + + case T_NeonGetSlruSegmentRequest: + { + NeonGetSlruSegmentRequest *msg_req = (NeonGetSlruSegmentRequest *) msg; + + pq_sendbyte(&s, msg_req->kind); + pq_sendint32(&s, msg_req->segno); + + break; + } + + /* pagestore -> pagestore_client. We never need to create these. 
*/ + case T_NeonExistsResponse: + case T_NeonNblocksResponse: + case T_NeonGetPageResponse: + case T_NeonErrorResponse: + case T_NeonDbSizeResponse: + case T_NeonGetSlruSegmentResponse: + default: + neon_log(ERROR, "unexpected neon message tag 0x%02x", msg->tag); + break; + } + return s; +} + +NeonResponse * +nm_unpack_response(StringInfo s) +{ + NeonMessageTag tag = pq_getmsgbyte(s); + NeonResponse resp_hdr = {0}; /* make valgrind happy */ + NeonResponse *resp = NULL; + + resp_hdr.tag = tag; + if (neon_protocol_version >= 3) + { + resp_hdr.reqid = pq_getmsgint64(s); + resp_hdr.lsn = pq_getmsgint64(s); + resp_hdr.not_modified_since = pq_getmsgint64(s); + } + switch (tag) + { + /* pagestore -> pagestore_client */ + case T_NeonExistsResponse: + { + NeonExistsResponse *msg_resp = palloc0(sizeof(NeonExistsResponse)); + + if (neon_protocol_version >= 3) + { + NInfoGetSpcOid(msg_resp->req.rinfo) = pq_getmsgint(s, 4); + NInfoGetDbOid(msg_resp->req.rinfo) = pq_getmsgint(s, 4); + NInfoGetRelNumber(msg_resp->req.rinfo) = pq_getmsgint(s, 4); + msg_resp->req.forknum = pq_getmsgbyte(s); + } + msg_resp->req.hdr = resp_hdr; + msg_resp->exists = pq_getmsgbyte(s); + pq_getmsgend(s); + + resp = (NeonResponse *) msg_resp; + break; + } + + case T_NeonNblocksResponse: + { + NeonNblocksResponse *msg_resp = palloc0(sizeof(NeonNblocksResponse)); + + if (neon_protocol_version >= 3) + { + NInfoGetSpcOid(msg_resp->req.rinfo) = pq_getmsgint(s, 4); + NInfoGetDbOid(msg_resp->req.rinfo) = pq_getmsgint(s, 4); + NInfoGetRelNumber(msg_resp->req.rinfo) = pq_getmsgint(s, 4); + msg_resp->req.forknum = pq_getmsgbyte(s); + } + msg_resp->req.hdr = resp_hdr; + msg_resp->n_blocks = pq_getmsgint(s, 4); + pq_getmsgend(s); + + resp = (NeonResponse *) msg_resp; + break; + } + + case T_NeonGetPageResponse: + { + NeonGetPageResponse *msg_resp; + + msg_resp = MemoryContextAllocZero(MyPState->bufctx, PS_GETPAGERESPONSE_SIZE); + if (neon_protocol_version >= 3) + { + NInfoGetSpcOid(msg_resp->req.rinfo) = 
pq_getmsgint(s, 4); + NInfoGetDbOid(msg_resp->req.rinfo) = pq_getmsgint(s, 4); + NInfoGetRelNumber(msg_resp->req.rinfo) = pq_getmsgint(s, 4); + msg_resp->req.forknum = pq_getmsgbyte(s); + msg_resp->req.blkno = pq_getmsgint(s, 4); + } + msg_resp->req.hdr = resp_hdr; + /* XXX: should be varlena */ + memcpy(msg_resp->page, pq_getmsgbytes(s, BLCKSZ), BLCKSZ); + pq_getmsgend(s); + + Assert(msg_resp->req.hdr.tag == T_NeonGetPageResponse); + + resp = (NeonResponse *) msg_resp; + break; + } + + case T_NeonDbSizeResponse: + { + NeonDbSizeResponse *msg_resp = palloc0(sizeof(NeonDbSizeResponse)); + + if (neon_protocol_version >= 3) + { + msg_resp->req.dbNode = pq_getmsgint(s, 4); + } + msg_resp->req.hdr = resp_hdr; + msg_resp->db_size = pq_getmsgint64(s); + pq_getmsgend(s); + + resp = (NeonResponse *) msg_resp; + break; + } + + case T_NeonErrorResponse: + { + NeonErrorResponse *msg_resp; + size_t msglen; + const char *msgtext; + + msgtext = pq_getmsgrawstring(s); + msglen = strlen(msgtext); + + msg_resp = palloc0(sizeof(NeonErrorResponse) + msglen + 1); + msg_resp->req = resp_hdr; + memcpy(msg_resp->message, msgtext, msglen + 1); + pq_getmsgend(s); + + resp = (NeonResponse *) msg_resp; + break; + } + + case T_NeonGetSlruSegmentResponse: + { + NeonGetSlruSegmentResponse *msg_resp; + int n_blocks; + msg_resp = palloc0(sizeof(NeonGetSlruSegmentResponse)); + + if (neon_protocol_version >= 3) + { + msg_resp->req.kind = pq_getmsgbyte(s); + msg_resp->req.segno = pq_getmsgint(s, 4); + } + msg_resp->req.hdr = resp_hdr; + + n_blocks = pq_getmsgint(s, 4); + msg_resp->n_blocks = n_blocks; + memcpy(msg_resp->data, pq_getmsgbytes(s, n_blocks * BLCKSZ), n_blocks * BLCKSZ); + pq_getmsgend(s); + + resp = (NeonResponse *) msg_resp; + break; + } + + /* + * pagestore_client -> pagestore + * + * We create these ourselves, and don't need to decode them. 
+ */ + case T_NeonExistsRequest: + case T_NeonNblocksRequest: + case T_NeonGetPageRequest: + case T_NeonDbSizeRequest: + case T_NeonGetSlruSegmentRequest: + default: + neon_log(ERROR, "unexpected neon message tag 0x%02x", tag); + break; + } + + return resp; +} + +/* dump to json for debugging / error reporting purposes */ +char * +nm_to_string(NeonMessage *msg) +{ + StringInfoData s; + + initStringInfo(&s); + + switch (messageTag(msg)) + { + /* pagestore_client -> pagestore */ + case T_NeonExistsRequest: + { + NeonExistsRequest *msg_req = (NeonExistsRequest *) msg; + + appendStringInfoString(&s, "{\"type\": \"NeonExistsRequest\""); + appendStringInfo(&s, ", \"rinfo\": \"%u/%u/%u\"", RelFileInfoFmt(msg_req->rinfo)); + appendStringInfo(&s, ", \"forknum\": %d", msg_req->forknum); + appendStringInfo(&s, ", \"lsn\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->hdr.lsn)); + appendStringInfo(&s, ", \"not_modified_since\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->hdr.not_modified_since)); + appendStringInfoChar(&s, '}'); + break; + } + + case T_NeonNblocksRequest: + { + NeonNblocksRequest *msg_req = (NeonNblocksRequest *) msg; + + appendStringInfoString(&s, "{\"type\": \"NeonNblocksRequest\""); + appendStringInfo(&s, ", \"rinfo\": \"%u/%u/%u\"", RelFileInfoFmt(msg_req->rinfo)); + appendStringInfo(&s, ", \"forknum\": %d", msg_req->forknum); + appendStringInfo(&s, ", \"lsn\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->hdr.lsn)); + appendStringInfo(&s, ", \"not_modified_since\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->hdr.not_modified_since)); + appendStringInfoChar(&s, '}'); + break; + } + + case T_NeonGetPageRequest: + { + NeonGetPageRequest *msg_req = (NeonGetPageRequest *) msg; + + appendStringInfoString(&s, "{\"type\": \"NeonGetPageRequest\""); + appendStringInfo(&s, ", \"rinfo\": \"%u/%u/%u\"", RelFileInfoFmt(msg_req->rinfo)); + appendStringInfo(&s, ", \"forknum\": %d", msg_req->forknum); + appendStringInfo(&s, ", \"blkno\": %u", msg_req->blkno); + appendStringInfo(&s, ", \"lsn\": 
\"%X/%X\"", LSN_FORMAT_ARGS(msg_req->hdr.lsn)); + appendStringInfo(&s, ", \"not_modified_since\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->hdr.not_modified_since)); + appendStringInfoChar(&s, '}'); + break; + } + case T_NeonDbSizeRequest: + { + NeonDbSizeRequest *msg_req = (NeonDbSizeRequest *) msg; + + appendStringInfoString(&s, "{\"type\": \"NeonDbSizeRequest\""); + appendStringInfo(&s, ", \"dbnode\": \"%u\"", msg_req->dbNode); + appendStringInfo(&s, ", \"lsn\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->hdr.lsn)); + appendStringInfo(&s, ", \"not_modified_since\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->hdr.not_modified_since)); + appendStringInfoChar(&s, '}'); + break; + } + case T_NeonGetSlruSegmentRequest: + { + NeonGetSlruSegmentRequest *msg_req = (NeonGetSlruSegmentRequest *) msg; + + appendStringInfoString(&s, "{\"type\": \"NeonGetSlruSegmentRequest\""); + appendStringInfo(&s, ", \"kind\": %u", msg_req->kind); + appendStringInfo(&s, ", \"segno\": %u", msg_req->segno); + appendStringInfo(&s, ", \"lsn\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->hdr.lsn)); + appendStringInfo(&s, ", \"not_modified_since\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->hdr.not_modified_since)); + appendStringInfoChar(&s, '}'); + break; + } + /* pagestore -> pagestore_client */ + case T_NeonExistsResponse: + { + NeonExistsResponse *msg_resp = (NeonExistsResponse *) msg; + + appendStringInfoString(&s, "{\"type\": \"NeonExistsResponse\""); + appendStringInfo(&s, ", \"exists\": %d}", + msg_resp->exists); + appendStringInfoChar(&s, '}'); + + break; + } + case T_NeonNblocksResponse: + { + NeonNblocksResponse *msg_resp = (NeonNblocksResponse *) msg; + + appendStringInfoString(&s, "{\"type\": \"NeonNblocksResponse\""); + appendStringInfo(&s, ", \"n_blocks\": %u}", + msg_resp->n_blocks); + appendStringInfoChar(&s, '}'); + + break; + } + case T_NeonGetPageResponse: + { +#if 0 + NeonGetPageResponse *msg_resp = (NeonGetPageResponse *) msg; +#endif + + appendStringInfoString(&s, "{\"type\": \"NeonGetPageResponse\""); 
+ appendStringInfo(&s, ", \"page\": \"XXX\"}"); + appendStringInfoChar(&s, '}'); + break; + } + case T_NeonErrorResponse: + { + NeonErrorResponse *msg_resp = (NeonErrorResponse *) msg; + + /* FIXME: escape double-quotes in the message */ + appendStringInfoString(&s, "{\"type\": \"NeonErrorResponse\""); + appendStringInfo(&s, ", \"message\": \"%s\"}", msg_resp->message); + appendStringInfoChar(&s, '}'); + break; + } + case T_NeonDbSizeResponse: + { + NeonDbSizeResponse *msg_resp = (NeonDbSizeResponse *) msg; + + appendStringInfoString(&s, "{\"type\": \"NeonDbSizeResponse\""); + appendStringInfo(&s, ", \"db_size\": %ld}", + msg_resp->db_size); + appendStringInfoChar(&s, '}'); + + break; + } + case T_NeonGetSlruSegmentResponse: + { + NeonGetSlruSegmentResponse *msg_resp = (NeonGetSlruSegmentResponse *) msg; + + appendStringInfoString(&s, "{\"type\": \"NeonGetSlruSegmentResponse\""); + appendStringInfo(&s, ", \"n_blocks\": %u}", + msg_resp->n_blocks); + appendStringInfoChar(&s, '}'); + + break; + } + + default: + appendStringInfo(&s, "{\"type\": \"unknown 0x%02x\"", msg->tag); + } + return s.data; +} + +/* + * communicator_init() -- Initialize per-backend private state + */ +void +communicator_init(void) +{ + Size prfs_size; + + if (MyPState != NULL) + return; + + /* + * Sanity check that theperf counters array is sized correctly. We got + * this wrong once, and the formula for max number of backends and aux + * processes might well change in the future, so better safe than sorry. + * This is a very cheap check so we do it even without assertions. On + * v14, this gets called before initializing MyProc, so we cannot perform + * the check here. That's OK, we don't expect the logic to change in old + * releases. 
+ */ +#if PG_VERSION_NUM>=150000 + if (MyNeonCounters >= &neon_per_backend_counters_shared[NUM_NEON_PERF_COUNTER_SLOTS]) + elog(ERROR, "MyNeonCounters points past end of array"); +#endif + + prfs_size = offsetof(PrefetchState, prf_buffer) + + sizeof(PrefetchRequest) * readahead_buffer_size; + + MyPState = MemoryContextAllocZero(TopMemoryContext, prfs_size); + + MyPState->n_unused = readahead_buffer_size; + + MyPState->bufctx = SlabContextCreate(TopMemoryContext, + "NeonSMGR/prefetch", + SLAB_DEFAULT_BLOCK_SIZE * 17, + PS_GETPAGERESPONSE_SIZE); + MyPState->errctx = AllocSetContextCreate(TopMemoryContext, + "NeonSMGR/errors", + ALLOCSET_DEFAULT_SIZES); + MyPState->hashctx = AllocSetContextCreate(TopMemoryContext, + "NeonSMGR/prefetch", + ALLOCSET_DEFAULT_SIZES); + + MyPState->prf_hash = prfh_create(MyPState->hashctx, + readahead_buffer_size, NULL); +} + +/* + * neon_prefetch_response_usable -- Can a new request be satisfied by old one? + * + * This is used to check if the response to a prefetch request can be used to + * satisfy a page read now. + */ +static bool +neon_prefetch_response_usable(neon_request_lsns *request_lsns, + PrefetchRequest *slot) +{ + /* sanity check the LSN's on the old and the new request */ + Assert(request_lsns->request_lsn >= request_lsns->not_modified_since); + Assert(request_lsns->effective_request_lsn >= request_lsns->not_modified_since); + Assert(request_lsns->effective_request_lsn <= request_lsns->request_lsn); + Assert(slot->request_lsns.request_lsn >= slot->request_lsns.not_modified_since); + Assert(slot->request_lsns.effective_request_lsn >= slot->request_lsns.not_modified_since); + Assert(slot->request_lsns.effective_request_lsn <= slot->request_lsns.request_lsn); + Assert(slot->status != PRFS_UNUSED); + + /* + * The new request's LSN should never be older than the old one. 
This + * could be an Assert, except that for testing purposes, we do provide an + * interface in neon_test_utils to fetch pages at arbitrary LSNs, which + * violates this. + * + * Similarly, the not_modified_since value calculated for a page should + * never move backwards. This assumption is a bit fragile; if we updated + * the last-written cache when we read in a page, for example, then it + * might. But as the code stands, it should not. + * + * (If two backends issue a request at the same time, they might race and + * calculate LSNs "out of order" with each other, but the prefetch queue + * is backend-private at the moment.) + */ + if (request_lsns->effective_request_lsn < slot->request_lsns.effective_request_lsn || + request_lsns->not_modified_since < slot->request_lsns.not_modified_since) + { + ereport(LOG, + (errcode(ERRCODE_IO_ERROR), + errmsg(NEON_TAG "request with unexpected LSN after prefetch"), + errdetail("Request %X/%X not_modified_since %X/%X, prefetch %X/%X not_modified_since %X/%X)", + LSN_FORMAT_ARGS(request_lsns->effective_request_lsn), + LSN_FORMAT_ARGS(request_lsns->not_modified_since), + LSN_FORMAT_ARGS(slot->request_lsns.effective_request_lsn), + LSN_FORMAT_ARGS(slot->request_lsns.not_modified_since)))); + return false; + } + + /*--- + * Each request to the pageserver has three LSN values associated with it: + * `not_modified_since`, `request_lsn`, and 'effective_request_lsn'. + * `not_modified_since` and `request_lsn` are sent to the pageserver, but + * in the primary node, we always use UINT64_MAX as the `request_lsn`, so + * we remember `effective_request_lsn` separately. In a primary, + * `effective_request_lsn` is the same as `not_modified_since`. + * See the comments in neon_get_request_lsns for why we cannot use the last flush WAL position here. 
+ * + * To determine whether a response to a GetPage request issued earlier is + * still valid to satisfy a new page read, we look at the + * (not_modified_since, effective_request_lsn] range of the request. It is + * effectively a claim that the page has not been modified between those + * LSNs. If the range of the old request in the queue overlaps with the + * new request, we know that the page hasn't been modified in the union of + * the ranges. We can use the response to old request to satisfy the new + * request in that case. For example: + * + * 100 500 + * Old request: +--------+ + * + * 400 800 + * New request: +--------+ + * + * The old request claims that the page was not modified between LSNs 100 + * and 500, and the second claims that it was not modified between 400 and + * 800. Together they mean that the page was not modified between 100 and + * 800. Therefore the response to the old request is also valid for the + * new request. + * + * This logic also holds at the boundary case that the old request's LSN + * matches the new request's not_modified_since LSN exactly: + * + * 100 500 + * Old request: +--------+ + * + * 500 900 + * New request: +--------+ + * + * The response to the old request is the page as it was at LSN 500, and + * the page hasn't been changed in the range (500, 900], therefore the + * response is valid also for the new request. + */ + + /* this follows from the checks above */ + Assert(request_lsns->effective_request_lsn >= slot->request_lsns.not_modified_since); + + return request_lsns->not_modified_since <= slot->request_lsns.effective_request_lsn; +} + +/* + * Does the physical file exist? 
+ */ +bool +communicator_exists(NRelFileInfo rinfo, ForkNumber forkNum, neon_request_lsns *request_lsns) +{ + bool exists; + NeonResponse *resp; + + { + NeonExistsRequest request = { + .hdr.tag = T_NeonExistsRequest, + .hdr.reqid = GENERATE_REQUEST_ID(), + .hdr.lsn = request_lsns->request_lsn, + .hdr.not_modified_since = request_lsns->not_modified_since, + .rinfo = rinfo, + .forknum = forkNum + }; + + resp = page_server_request(&request); + + switch (resp->tag) + { + case T_NeonExistsResponse: + { + NeonExistsResponse* exists_resp = (NeonExistsResponse *) resp; + if (neon_protocol_version >= 3) + { + if (!equal_requests(resp, &request.hdr) || + !RelFileInfoEquals(exists_resp->req.rinfo, request.rinfo) || + exists_resp->req.forknum != request.forknum) + { + NEON_PANIC_CONNECTION_STATE(-1, PANIC, + "Unexpect response {reqid=%lx,lsn=%X/%08X, since=%X/%08X, rel=%u/%u/%u.%u} to exits request {reqid=%lx,lsn=%X/%08X, since=%X/%08X, rel=%u/%u/%u.%u}", + resp->reqid, LSN_FORMAT_ARGS(resp->lsn), LSN_FORMAT_ARGS(resp->not_modified_since), RelFileInfoFmt(exists_resp->req.rinfo), exists_resp->req.forknum, + request.hdr.reqid, LSN_FORMAT_ARGS(request.hdr.lsn), LSN_FORMAT_ARGS(request.hdr.not_modified_since), RelFileInfoFmt(request.rinfo), request.forknum); + } + } + exists = exists_resp->exists; + break; + } + case T_NeonErrorResponse: + if (neon_protocol_version >= 3) + { + if (!equal_requests(resp, &request.hdr)) + { + elog(WARNING, NEON_TAG "Error message {reqid=%lx,lsn=%X/%08X, since=%X/%08X} doesn't match exists request {reqid=%lx,lsn=%X/%08X, since=%X/%08X}", + resp->reqid, LSN_FORMAT_ARGS(resp->lsn), LSN_FORMAT_ARGS(resp->not_modified_since), + request.hdr.reqid, LSN_FORMAT_ARGS(request.hdr.lsn), LSN_FORMAT_ARGS(request.hdr.not_modified_since)); + } + } + ereport(ERROR, + (errcode(ERRCODE_IO_ERROR), + errmsg(NEON_TAG "[reqid %lx] could not read relation existence of rel %u/%u/%u.%u from page server at lsn %X/%08X", + resp->reqid, + RelFileInfoFmt(rinfo), + forkNum, + 
LSN_FORMAT_ARGS(request_lsns->effective_request_lsn)), + errdetail("page server returned error: %s", + ((NeonErrorResponse *) resp)->message))); + break; + + default: + NEON_PANIC_CONNECTION_STATE(-1, PANIC, + "Expected Exists (0x%02x) or Error (0x%02x) response to ExistsRequest, but got 0x%02x", + T_NeonExistsResponse, T_NeonErrorResponse, resp->tag); + } + pfree(resp); + } + return exists; +} + +/* + * Read N pages at a specific LSN. + * + * *mask is set for pages read at a previous point in time, and which we + * should not touch, nor overwrite. + * New bits should be set in *mask for the pages we successfully read. + * + * The offsets in request_lsns, buffers, and mask are linked. + */ +void +communicator_read_at_lsnv(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber base_blockno, + neon_request_lsns *request_lsns, + void **buffers, BlockNumber nblocks, const bits8 *mask) +{ + NeonResponse *resp; + uint64 ring_index; + PrfHashEntry *entry; + PrefetchRequest *slot; + PrefetchRequest hashkey; + + Assert(PointerIsValid(request_lsns)); + Assert(nblocks >= 1); + + /* + * Use an intermediate PrefetchRequest struct as the hash key to ensure + * correct alignment and that the padding bytes are cleared. + */ + memset(&hashkey.buftag, 0, sizeof(BufferTag)); + CopyNRelFileInfoToBufTag(hashkey.buftag, rinfo); + hashkey.buftag.forkNum = forkNum; + hashkey.buftag.blockNum = base_blockno; + + /* + * The redo process does not lock pages that it needs to replay but are + * not in the shared buffers, so a concurrent process may request the page + * after redo has decided it won't redo that page and updated the LwLSN + * for that page. If we're in hot standby we need to take care that we + * don't return until after REDO has finished replaying up to that LwLSN, + * as the page should have been locked up to that point. + * + * See also the description on neon_redo_read_buffer_filter below.
+ * + * NOTE: It is possible that the WAL redo process will still do IO due to + * concurrent failed read IOs. Those IOs should never have a request_lsn + * that is as large as the WAL record we're currently replaying, if it + * weren't for the behaviour of the LwLsn cache that uses the highest + * value of the LwLsn cache when the entry is not found. + */ + (void) prefetch_register_bufferv(hashkey.buftag, request_lsns, nblocks, mask, false); + + for (int i = 0; i < nblocks; i++) + { + void *buffer = buffers[i]; + BlockNumber blockno = base_blockno + i; + neon_request_lsns *reqlsns = &request_lsns[i]; + TimestampTz start_ts, end_ts; + + if (PointerIsValid(mask) && BITMAP_ISSET(mask, i)) + continue; + + start_ts = GetCurrentTimestamp(); + + if (RecoveryInProgress() && MyBackendType != B_STARTUP) + XLogWaitForReplayOf(reqlsns->request_lsn); + + /* + * Try to find prefetched page in the list of received pages. + */ +Retry: + hashkey.buftag.blockNum = blockno; + entry = prfh_lookup(MyPState->prf_hash, &hashkey); + + if (entry != NULL) + { + slot = entry->slot; + if (neon_prefetch_response_usable(reqlsns, slot)) + { + ring_index = slot->my_ring_index; + } + else + { + /* + * Cannot use this prefetch, discard it + * + * We can't drop cache for not-yet-received requested items. It is + * unlikely this happens, but it can happen if prefetch distance + * is large enough and a backend didn't consume all prefetch + * requests. 
+ */ + if (slot->status == PRFS_REQUESTED) + { + if (!prefetch_wait_for(slot->my_ring_index)) + goto Retry; + } + /* drop caches */ + prefetch_set_unused(slot->my_ring_index); + pgBufferUsage.prefetch.expired += 1; + MyNeonCounters->getpage_prefetch_discards_total++; + /* make it look like a prefetch cache miss */ + entry = NULL; + } + } + + do + { + if (entry == NULL) + { + ring_index = prefetch_register_bufferv(hashkey.buftag, reqlsns, 1, NULL, false); + Assert(ring_index != UINT64_MAX); + slot = GetPrfSlot(ring_index); + } + else + { + /* + * Empty our reference to the prefetch buffer's hash entry. When + * we wait for prefetches, the entry reference is invalidated by + * potential updates to the hash, and when we reconnect to the + * pageserver the prefetch we're waiting for may be dropped, in + * which case we need to retry and take the branch above. + */ + entry = NULL; + } + + Assert(slot->my_ring_index == ring_index); + Assert(MyPState->ring_last <= ring_index && + MyPState->ring_unused > ring_index); + Assert(slot->status != PRFS_UNUSED); + Assert(GetPrfSlot(ring_index) == slot); + + } while (!prefetch_wait_for(ring_index)); + + Assert(slot->status == PRFS_RECEIVED); + Assert(memcmp(&hashkey.buftag, &slot->buftag, sizeof(BufferTag)) == 0); + Assert(hashkey.buftag.blockNum == base_blockno + i); + + resp = slot->response; + + switch (resp->tag) + { + case T_NeonGetPageResponse: + { + NeonGetPageResponse* getpage_resp = (NeonGetPageResponse *) resp; + if (neon_protocol_version >= 3) + { + if (resp->reqid != slot->reqid || + resp->lsn != slot->request_lsns.request_lsn || + resp->not_modified_since != slot->request_lsns.not_modified_since || + !RelFileInfoEquals(getpage_resp->req.rinfo, rinfo) || + getpage_resp->req.forknum != forkNum || + getpage_resp->req.blkno != base_blockno + i) + { + NEON_PANIC_CONNECTION_STATE(-1, PANIC, + "Unexpect response {reqid=%lx,lsn=%X/%08X, since=%X/%08X, rel=%u/%u/%u.%u, block=%u} to get page request {reqid=%lx,lsn=%X/%08X, 
since=%X/%08X, rel=%u/%u/%u.%u, block=%u}", + resp->reqid, LSN_FORMAT_ARGS(resp->lsn), LSN_FORMAT_ARGS(resp->not_modified_since), RelFileInfoFmt(getpage_resp->req.rinfo), getpage_resp->req.forknum, getpage_resp->req.blkno, + slot->reqid, LSN_FORMAT_ARGS(slot->request_lsns.request_lsn), LSN_FORMAT_ARGS(slot->request_lsns.not_modified_since), RelFileInfoFmt(rinfo), forkNum, base_blockno + i); + } + } + memcpy(buffer, getpage_resp->page, BLCKSZ); + + /* + * With lfc_store_prefetch_result=true prefetch result is stored in LFC in prefetch_pump_state when response is received + * from page server. But if lfc_store_prefetch_result=false then it is not yet stored in LFC and we have to do it here + * under buffer lock. + */ + if (!lfc_store_prefetch_result) + lfc_write(rinfo, forkNum, blockno, buffer); + break; + } + case T_NeonErrorResponse: + if (neon_protocol_version >= 3) + { + if (resp->reqid != slot->reqid || + resp->lsn != slot->request_lsns.request_lsn || + resp->not_modified_since != slot->request_lsns.not_modified_since) + { + elog(WARNING, NEON_TAG "Error message {reqid=%lx,lsn=%X/%08X, since=%X/%08X} doesn't match get relsize request {reqid=%lx,lsn=%X/%08X, since=%X/%08X}", + resp->reqid, LSN_FORMAT_ARGS(resp->lsn), LSN_FORMAT_ARGS(resp->not_modified_since), + slot->reqid, LSN_FORMAT_ARGS(slot->request_lsns.request_lsn), LSN_FORMAT_ARGS(slot->request_lsns.not_modified_since)); + } + } + ereport(ERROR, + (errcode(ERRCODE_IO_ERROR), + errmsg(NEON_TAG "[shard %d, reqid %lx] could not read block %u in rel %u/%u/%u.%u from page server at lsn %X/%08X", + slot->shard_no, resp->reqid, blockno, RelFileInfoFmt(rinfo), + forkNum, LSN_FORMAT_ARGS(reqlsns->effective_request_lsn)), + errdetail("page server returned error: %s", + ((NeonErrorResponse *) resp)->message))); + break; + default: + NEON_PANIC_CONNECTION_STATE(slot->shard_no, PANIC, + "Expected GetPage (0x%02x) or Error (0x%02x) response to GetPageRequest, but got 0x%02x", + T_NeonGetPageResponse, 
T_NeonErrorResponse, resp->tag); + } + + /* buffer was used, clean up for later reuse */ + prefetch_set_unused(ring_index); + prefetch_cleanup_trailing_unused(); + + end_ts = GetCurrentTimestamp(); + inc_getpage_wait(end_ts >= start_ts ? (end_ts - start_ts) : 0); + } +} + +/* + * neon_nblocks() -- Get the number of blocks stored in a relation. + */ +BlockNumber +communicator_nblocks(NRelFileInfo rinfo, ForkNumber forknum, neon_request_lsns *request_lsns) +{ + NeonResponse *resp; + BlockNumber n_blocks; + + { + NeonNblocksRequest request = { + .hdr.tag = T_NeonNblocksRequest, + .hdr.reqid = GENERATE_REQUEST_ID(), + .hdr.lsn = request_lsns->request_lsn, + .hdr.not_modified_since = request_lsns->not_modified_since, + .rinfo = rinfo, + .forknum = forknum, + }; + + resp = page_server_request(&request); + + switch (resp->tag) + { + case T_NeonNblocksResponse: + { + NeonNblocksResponse * relsize_resp = (NeonNblocksResponse *) resp; + if (neon_protocol_version >= 3) + { + if (!equal_requests(resp, &request.hdr) || + !RelFileInfoEquals(relsize_resp->req.rinfo, request.rinfo) || + relsize_resp->req.forknum != forknum) + { + NEON_PANIC_CONNECTION_STATE(-1, PANIC, + "Unexpect response {reqid=%lx,lsn=%X/%08X, since=%X/%08X, rel=%u/%u/%u.%u} to get relsize request {reqid=%lx,lsn=%X/%08X, since=%X/%08X, rel=%u/%u/%u.%u}", + resp->reqid, LSN_FORMAT_ARGS(resp->lsn), LSN_FORMAT_ARGS(resp->not_modified_since), RelFileInfoFmt(relsize_resp->req.rinfo), relsize_resp->req.forknum, + request.hdr.reqid, LSN_FORMAT_ARGS(request.hdr.lsn), LSN_FORMAT_ARGS(request.hdr.not_modified_since), RelFileInfoFmt(request.rinfo), forknum); + } + } + n_blocks = relsize_resp->n_blocks; + break; + } + case T_NeonErrorResponse: + if (neon_protocol_version >= 3) + { + if (!equal_requests(resp, &request.hdr)) + { + elog(WARNING, NEON_TAG "Error message {reqid=%lx,lsn=%X/%08X, since=%X/%08X} doesn't match get relsize request {reqid=%lx,lsn=%X/%08X, since=%X/%08X}", + resp->reqid, LSN_FORMAT_ARGS(resp->lsn), 
LSN_FORMAT_ARGS(resp->not_modified_since), + request.hdr.reqid, LSN_FORMAT_ARGS(request.hdr.lsn), LSN_FORMAT_ARGS(request.hdr.not_modified_since)); + } + } + ereport(ERROR, + (errcode(ERRCODE_IO_ERROR), + errmsg(NEON_TAG "[reqid %lx] could not read relation size of rel %u/%u/%u.%u from page server at lsn %X/%08X", + resp->reqid, + RelFileInfoFmt(rinfo), + forknum, + LSN_FORMAT_ARGS(request_lsns->effective_request_lsn)), + errdetail("page server returned error: %s", + ((NeonErrorResponse *) resp)->message))); + break; + + default: + NEON_PANIC_CONNECTION_STATE(-1, PANIC, + "Expected Nblocks (0x%02x) or Error (0x%02x) response to NblocksRequest, but got 0x%02x", + T_NeonNblocksResponse, T_NeonErrorResponse, resp->tag); + } + + pfree(resp); + } + return n_blocks; +} + +/* + * neon_db_size() -- Get the size of the database in bytes. + */ +int64 +communicator_dbsize(Oid dbNode, neon_request_lsns *request_lsns) +{ + NeonResponse *resp; + int64 db_size; + + { + NeonDbSizeRequest request = { + .hdr.tag = T_NeonDbSizeRequest, + .hdr.reqid = GENERATE_REQUEST_ID(), + .hdr.lsn = request_lsns->request_lsn, + .hdr.not_modified_since = request_lsns->not_modified_since, + .dbNode = dbNode, + }; + + resp = page_server_request(&request); + + switch (resp->tag) + { + case T_NeonDbSizeResponse: + { + NeonDbSizeResponse* dbsize_resp = (NeonDbSizeResponse *) resp; + if (neon_protocol_version >= 3) + { + if (!equal_requests(resp, &request.hdr) || + dbsize_resp->req.dbNode != dbNode) + { + NEON_PANIC_CONNECTION_STATE(-1, PANIC, + "Unexpect response {reqid=%lx,lsn=%X/%08X, since=%X/%08X, dbNode=%u} to get DB size request {reqid=%lx,lsn=%X/%08X, since=%X/%08X, dbNode=%u}", + resp->reqid, LSN_FORMAT_ARGS(resp->lsn), LSN_FORMAT_ARGS(resp->not_modified_since), dbsize_resp->req.dbNode, + request.hdr.reqid, LSN_FORMAT_ARGS(request.hdr.lsn), LSN_FORMAT_ARGS(request.hdr.not_modified_since), dbNode); + } + } + db_size = dbsize_resp->db_size; + break; + } + case T_NeonErrorResponse: + if 
(neon_protocol_version >= 3) + { + if (!equal_requests(resp, &request.hdr)) + { + elog(WARNING, NEON_TAG "Error message {reqid=%lx,lsn=%X/%08X, since=%X/%08X} doesn't match get DB size request {reqid=%lx,lsn=%X/%08X, since=%X/%08X}", + resp->reqid, LSN_FORMAT_ARGS(resp->lsn), LSN_FORMAT_ARGS(resp->not_modified_since), + request.hdr.reqid, LSN_FORMAT_ARGS(request.hdr.lsn), LSN_FORMAT_ARGS(request.hdr.not_modified_since)); + } + } + ereport(ERROR, + (errcode(ERRCODE_IO_ERROR), + errmsg(NEON_TAG "[reqid %lx] could not read db size of db %u from page server at lsn %X/%08X", + resp->reqid, + dbNode, LSN_FORMAT_ARGS(request_lsns->effective_request_lsn)), + errdetail("page server returned error: %s", + ((NeonErrorResponse *) resp)->message))); + break; + + default: + NEON_PANIC_CONNECTION_STATE(-1, PANIC, + "Expected DbSize (0x%02x) or Error (0x%02x) response to DbSizeRequest, but got 0x%02x", + T_NeonDbSizeResponse, T_NeonErrorResponse, resp->tag); + } + + pfree(resp); + } + return db_size; +} + +int +communicator_read_slru_segment(SlruKind kind, int64 segno, neon_request_lsns *request_lsns, + void *buffer) +{ + int n_blocks; + shardno_t shard_no = 0; /* All SLRUs are at shard 0 */ + NeonResponse *resp; + NeonGetSlruSegmentRequest request; + + request = (NeonGetSlruSegmentRequest) { + .hdr.tag = T_NeonGetSlruSegmentRequest, + .hdr.reqid = GENERATE_REQUEST_ID(), + .hdr.lsn = request_lsns->request_lsn, + .hdr.not_modified_since = request_lsns->not_modified_since, + .kind = kind, + .segno = segno + }; + + do + { + while (!page_server->send(shard_no, &request.hdr) || !page_server->flush(shard_no)); + + consume_prefetch_responses(); + + resp = page_server->receive(shard_no); + } while (resp == NULL); + + switch (resp->tag) + { + case T_NeonGetSlruSegmentResponse: + { + NeonGetSlruSegmentResponse* slru_resp = (NeonGetSlruSegmentResponse *) resp; + if (neon_protocol_version >= 3) + { + if (!equal_requests(resp, &request.hdr) || + slru_resp->req.kind != kind || + 
slru_resp->req.segno != segno) + { + NEON_PANIC_CONNECTION_STATE(-1, PANIC, + "Unexpect response {reqid=%lx,lsn=%X/%08X, since=%X/%08X, kind=%u, segno=%u} to get SLRU segment request {reqid=%lx,lsn=%X/%08X, since=%X/%08X, kind=%u, segno=%lluu}", + resp->reqid, LSN_FORMAT_ARGS(resp->lsn), LSN_FORMAT_ARGS(resp->not_modified_since), slru_resp->req.kind, slru_resp->req.segno, + request.hdr.reqid, LSN_FORMAT_ARGS(request.hdr.lsn), LSN_FORMAT_ARGS(request.hdr.not_modified_since), kind, (unsigned long long) segno); + } + } + n_blocks = slru_resp->n_blocks; + memcpy(buffer, slru_resp->data, n_blocks*BLCKSZ); + break; + } + case T_NeonErrorResponse: + if (neon_protocol_version >= 3) + { + if (!equal_requests(resp, &request.hdr)) + { + elog(WARNING, NEON_TAG "Error message {reqid=%lx,lsn=%X/%08X, since=%X/%08X} doesn't match get SLRU segment request {reqid=%lx,lsn=%X/%08X, since=%X/%08X}", + resp->reqid, LSN_FORMAT_ARGS(resp->lsn), LSN_FORMAT_ARGS(resp->not_modified_since), + request.hdr.reqid, LSN_FORMAT_ARGS(request.hdr.lsn), LSN_FORMAT_ARGS(request.hdr.not_modified_since)); + } + } + ereport(ERROR, + (errcode(ERRCODE_IO_ERROR), + errmsg(NEON_TAG "[reqid %lx] could not read SLRU %d segment %llu at lsn %X/%08X", + resp->reqid, + kind, + (unsigned long long) segno, + LSN_FORMAT_ARGS(request_lsns->request_lsn)), + errdetail("page server returned error: %s", + ((NeonErrorResponse *) resp)->message))); + break; + + default: + NEON_PANIC_CONNECTION_STATE(-1, PANIC, + "Expected GetSlruSegment (0x%02x) or Error (0x%02x) response to GetSlruSegmentRequest, but got 0x%02x", + T_NeonGetSlruSegmentResponse, T_NeonErrorResponse, resp->tag); + } + pfree(resp); + + communicator_reconfigure_timeout_if_needed(); + return n_blocks; +} + +void +communicator_reconfigure_timeout_if_needed(void) +{ + bool needs_set = MyPState->ring_receive != MyPState->ring_unused && + readahead_getpage_pull_timeout_ms > 0; + + if (needs_set != timeout_set) + { + /* The background writer doens't (shouldn't) read 
any pages */ + Assert(!AmBackgroundWriterProcess()); + /* The checkpointer doesn't (shouldn't) read any pages */ + Assert(!AmCheckpointerProcess()); + + if (unlikely(PS_TIMEOUT_ID == 0)) + { + PS_TIMEOUT_ID = RegisterTimeout(USER_TIMEOUT, pagestore_timeout_handler); + } + + if (needs_set) + { +#if PG_MAJORVERSION_NUM <= 14 + enable_timeout_after(PS_TIMEOUT_ID, readahead_getpage_pull_timeout_ms); +#else + enable_timeout_every( + PS_TIMEOUT_ID, + TimestampTzPlusMilliseconds(GetCurrentTimestamp(), + readahead_getpage_pull_timeout_ms), + readahead_getpage_pull_timeout_ms + ); +#endif + timeout_set = true; + } + else + { + Assert(timeout_set); + disable_timeout(PS_TIMEOUT_ID, false); + timeout_set = false; + } + } +} + +static void +pagestore_timeout_handler(void) +{ +#if PG_MAJORVERSION_NUM <= 14 + /* + * PG14: Setting a repeating timeout is not possible, so we record here + * that the timeout is no longer set; having been told that, the system + * will re-schedule it later if we need to. + */ + timeout_set = false; +#endif + timeout_signaled = true; + InterruptPending = true; +} + +/* + * Process new data received in our active PageStream sockets. + * + * This relies on the invariant that all pipelined yet-to-be-received requests + * are getPage requests managed by MyPState. This is currently true; any + * modification will probably require further changes to make it work again.
+ */ +static bool +communicator_processinterrupts(void) +{ + if (timeout_signaled) + { + if (!readpage_reentrant_guard && readahead_getpage_pull_timeout_ms > 0) + communicator_prefetch_pump_state(true); + + timeout_signaled = false; + communicator_reconfigure_timeout_if_needed(); + } + + if (!prev_interrupt_cb) + return false; + + return prev_interrupt_cb(); +} diff --git a/pgxn/neon/communicator.h b/pgxn/neon/communicator.h new file mode 100644 index 0000000000..72cba526c1 --- /dev/null +++ b/pgxn/neon/communicator.h @@ -0,0 +1,48 @@ +/*------------------------------------------------------------------------- + * + * communicator.h + * internal interface for communicating with remote pageservers + * + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + *------------------------------------------------------------------------- + */ +#ifndef COMMUNICATOR_h +#define COMMUNICATOR_h + +#include "neon_pgversioncompat.h" + +#include "storage/buf_internals.h" + +#include "pagestore_client.h" + +/* initialization at postmaster startup */ +extern void pg_init_communicator(void); + +/* initialization at backend startup */ +extern void communicator_init(void); + +extern bool communicator_exists(NRelFileInfo rinfo, ForkNumber forkNum, + neon_request_lsns *request_lsns); +extern BlockNumber communicator_nblocks(NRelFileInfo rinfo, ForkNumber forknum, + neon_request_lsns *request_lsns); +extern int64 communicator_dbsize(Oid dbNode, neon_request_lsns *request_lsns); +extern void communicator_read_at_lsnv(NRelFileInfo rinfo, ForkNumber forkNum, + BlockNumber base_blockno, neon_request_lsns *request_lsns, + void **buffers, BlockNumber nblocks, const bits8 *mask); +extern int communicator_prefetch_lookupv(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blocknum, + neon_request_lsns *lsns, + BlockNumber nblocks, void **buffers, bits8 *mask); +extern void 
communicator_prefetch_register_bufferv(BufferTag tag, neon_request_lsns *frlsns, + BlockNumber nblocks, const bits8 *mask); +extern int communicator_read_slru_segment(SlruKind kind, int64 segno, + neon_request_lsns *request_lsns, + void *buffer); + +extern void communicator_reconfigure_timeout_if_needed(void); +extern void communicator_prefetch_pump_state(bool IsHandlingInterrupts); + + +#endif diff --git a/pgxn/neon/control_plane_connector.c b/pgxn/neon/control_plane_connector.c index 59096a1bc8..47ed37da06 100644 --- a/pgxn/neon/control_plane_connector.c +++ b/pgxn/neon/control_plane_connector.c @@ -13,9 +13,6 @@ * accumulate changes. On subtransaction commit, the top of the stack * is merged with the table below it. * - * IDENTIFICATION - * contrib/neon/control_plane_connector.c - * *------------------------------------------------------------------------- */ diff --git a/pgxn/neon/extension_server.c b/pgxn/neon/extension_server.c index 0331f961b4..00dcb6920e 100644 --- a/pgxn/neon/extension_server.c +++ b/pgxn/neon/extension_server.c @@ -3,9 +3,6 @@ * extension_server.c * Request compute_ctl to download extension files. * - * IDENTIFICATION - * contrib/neon/extension_server.c - * *------------------------------------------------------------------------- */ #include "postgres.h" diff --git a/pgxn/neon/extension_server.h b/pgxn/neon/extension_server.h index 3e67708b85..8356d70959 100644 --- a/pgxn/neon/extension_server.h +++ b/pgxn/neon/extension_server.h @@ -3,9 +3,6 @@ * extension_server.h * Request compute_ctl to download extension files. 
* - * IDENTIFICATION - * contrib/neon/extension_server.h - * *------------------------------------------------------------------------- */ diff --git a/pgxn/neon/file_cache.c b/pgxn/neon/file_cache.c index e555e069d0..8c2990e57a 100644 --- a/pgxn/neon/file_cache.c +++ b/pgxn/neon/file_cache.c @@ -1,4 +1,4 @@ -/* +/*------------------------------------------------------------------------- * * file_cache.c * @@ -6,10 +6,6 @@ * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * - * - * IDENTIFICATION - * pgxn/neon/file_cache.c - * *------------------------------------------------------------------------- */ @@ -25,7 +21,6 @@ #include "access/xlog.h" #include "funcapi.h" #include "miscadmin.h" -#include "pagestore_client.h" #include "common/hashfn.h" #include "pgstat.h" #include "port/pg_iovec.h" @@ -47,6 +42,7 @@ #include "hll.h" #include "bitmap.h" +#include "file_cache.h" #include "neon.h" #include "neon_lwlsncache.h" #include "neon_perf_counters.h" @@ -647,18 +643,25 @@ lfc_cache_containsv(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, return found; } +#if PG_MAJORVERSION_NUM >= 16 +static PGIOAlignedBlock voidblock = {0}; +#else +static PGAlignedBlock voidblock = {0}; +#endif +#define SCRIBBLEPAGE (&voidblock.data) + /* * Try to read pages from local cache. * Returns the number of pages read from the local cache, and sets bits in - * 'read' for the pages which were read. This may scribble over buffers not - * marked in 'read', so be careful with operation ordering. + * 'mask' for the pages which were read. This may scribble over buffers not + * marked in 'mask', so be careful with operation ordering. * * In case of error local file cache is disabled (lfc->limit is set to zero), - * and -1 is returned. Note that 'read' and the buffers may be touched and in - * an otherwise invalid state. + * and -1 is returned. 
* - * If the mask argument is supplied, bits will be set at the offsets of pages - * that were present and read from the LFC. + * If the mask argument is supplied, we'll only try to read those pages which + * don't have their bits set on entry. At exit, pages which were successfully + * read from LFC will have their bits set. */ int lfc_readv_select(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, @@ -693,23 +696,43 @@ lfc_readv_select(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, while (nblocks > 0) { struct iovec iov[PG_IOV_MAX]; - int chunk_offs = blkno & (BLOCKS_PER_CHUNK - 1); + int8 chunk_mask[BLOCKS_PER_CHUNK / 8] = {0}; + int chunk_offs = (blkno & (BLOCKS_PER_CHUNK - 1)); int blocks_in_chunk = Min(nblocks, BLOCKS_PER_CHUNK - (blkno % BLOCKS_PER_CHUNK)); int iteration_hits = 0; int iteration_misses = 0; uint64 io_time_us = 0; - int n_blocks_to_read = 0; + int n_blocks_to_read = 0; + int iov_last_used = 0; + int first_block_in_chunk_read = -1; ConditionVariable* cv; Assert(blocks_in_chunk > 0); for (int i = 0; i < blocks_in_chunk; i++) { - n_blocks_to_read += (BITMAP_ISSET(mask, buf_offset + i) != 0); - iov[i].iov_base = buffers[buf_offset + i]; iov[i].iov_len = BLCKSZ; - BITMAP_CLR(mask, buf_offset + i); + /* mask not set = we must do work */ + if (!BITMAP_ISSET(mask, buf_offset + i)) + { + iov[i].iov_base = buffers[buf_offset + i]; + n_blocks_to_read++; + iov_last_used = i + 1; + + if (first_block_in_chunk_read == -1) + { + first_block_in_chunk_read = i; + } + } + /* mask set = we must do no work */ + else + { + /* don't scribble on pages we weren't requested to write to */ + iov[i].iov_base = SCRIBBLEPAGE; + } } + + /* shortcut IO */ if (n_blocks_to_read == 0) { buf_offset += blocks_in_chunk; @@ -718,6 +741,12 @@ lfc_readv_select(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, continue; } + /* + * The effective iov size must be >= the number of blocks we're about + * to read. 
+ */ + Assert(iov_last_used - first_block_in_chunk_read >= n_blocks_to_read); + tag.blockNum = blkno - chunk_offs; hash = get_hash_value(lfc_hash, &tag); cv = &lfc_ctl->cv[hash % N_COND_VARS]; @@ -762,10 +791,15 @@ lfc_readv_select(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, generation = lfc_ctl->generation; entry_offset = entry->offset; - for (int i = 0; i < blocks_in_chunk; i++) + for (int i = first_block_in_chunk_read; i < iov_last_used; i++) { FileCacheBlockState state = UNAVAILABLE; bool sleeping = false; + + /* no need to work on something we're not interested in */ + if (BITMAP_ISSET(mask, buf_offset + i)) + continue; + while (lfc_ctl->generation == generation) { state = GET_STATE(entry, chunk_offs + i); @@ -789,7 +823,7 @@ lfc_readv_select(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, } if (state == AVAILABLE) { - BITMAP_SET(mask, buf_offset + i); + BITMAP_SET(chunk_mask, i); iteration_hits++; } else @@ -801,16 +835,34 @@ lfc_readv_select(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, if (iteration_hits != 0) { + /* chunk offset (# of pages) into the LFC file */ + off_t first_read_offset = (off_t) entry_offset * BLOCKS_PER_CHUNK; + int nwrite = iov_last_used - first_block_in_chunk_read; + /* offset of first IOV */ + first_read_offset += chunk_offs + first_block_in_chunk_read; + pgstat_report_wait_start(WAIT_EVENT_NEON_LFC_READ); - rc = preadv(lfc_desc, iov, blocks_in_chunk, - ((off_t) entry_offset * BLOCKS_PER_CHUNK + chunk_offs) * BLCKSZ); + + /* Read only the blocks we're interested in, limiting */ + rc = preadv(lfc_desc, &iov[first_block_in_chunk_read], + nwrite, first_read_offset * BLCKSZ); pgstat_report_wait_end(); - if (rc != (BLCKSZ * blocks_in_chunk)) + if (rc != (BLCKSZ * nwrite)) { lfc_disable("read"); return -1; } + + /* + * We successfully read the pages we know were valid when we + * started reading; now mark those pages as read + */ + for (int i = first_block_in_chunk_read; i < iov_last_used; i++) + { + 
if (BITMAP_ISSET(chunk_mask, i)) + BITMAP_SET(mask, buf_offset + i); + } } /* Place entry to the head of LRU list */ @@ -1511,8 +1563,12 @@ local_cache_pages(PG_FUNCTION_ARGS) hash_seq_init(&status, lfc_hash); while ((entry = hash_seq_search(&status)) != NULL) { - for (int i = 0; i < BLOCKS_PER_CHUNK; i++) - n_pages += GET_STATE(entry, i) == AVAILABLE; + /* Skip hole tags */ + if (NInfoGetRelNumber(BufTagGetNRelFileInfo(entry->key)) != 0) + { + for (int i = 0; i < BLOCKS_PER_CHUNK; i++) + n_pages += GET_STATE(entry, i) == AVAILABLE; + } } } } @@ -1540,16 +1596,19 @@ local_cache_pages(PG_FUNCTION_ARGS) { for (int i = 0; i < BLOCKS_PER_CHUNK; i++) { - if (GET_STATE(entry, i) == AVAILABLE) + if (NInfoGetRelNumber(BufTagGetNRelFileInfo(entry->key)) != 0) { - fctx->record[n].pageoffs = entry->offset * BLOCKS_PER_CHUNK + i; - fctx->record[n].relfilenode = NInfoGetRelNumber(BufTagGetNRelFileInfo(entry->key)); - fctx->record[n].reltablespace = NInfoGetSpcOid(BufTagGetNRelFileInfo(entry->key)); - fctx->record[n].reldatabase = NInfoGetDbOid(BufTagGetNRelFileInfo(entry->key)); - fctx->record[n].forknum = entry->key.forkNum; - fctx->record[n].blocknum = entry->key.blockNum + i; - fctx->record[n].accesscount = entry->access_count; - n += 1; + if (GET_STATE(entry, i) == AVAILABLE) + { + fctx->record[n].pageoffs = entry->offset * BLOCKS_PER_CHUNK + i; + fctx->record[n].relfilenode = NInfoGetRelNumber(BufTagGetNRelFileInfo(entry->key)); + fctx->record[n].reltablespace = NInfoGetSpcOid(BufTagGetNRelFileInfo(entry->key)); + fctx->record[n].reldatabase = NInfoGetDbOid(BufTagGetNRelFileInfo(entry->key)); + fctx->record[n].forknum = entry->key.forkNum; + fctx->record[n].blocknum = entry->key.blockNum + i; + fctx->record[n].accesscount = entry->access_count; + n += 1; + } } } } diff --git a/pgxn/neon/file_cache.h b/pgxn/neon/file_cache.h new file mode 100644 index 0000000000..849558b83d --- /dev/null +++ b/pgxn/neon/file_cache.h @@ -0,0 +1,52 @@ 
+/*------------------------------------------------------------------------- + * + * file_cache.h + * Local File Cache definitions + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + *------------------------------------------------------------------------- + */ +#ifndef FILE_CACHE_h +#define FILE_CACHE_h + +#include "neon_pgversioncompat.h" + +/* GUCs */ +extern bool lfc_store_prefetch_result; + +/* functions for local file cache */ +extern void lfc_writev(NRelFileInfo rinfo, ForkNumber forkNum, + BlockNumber blkno, const void *const *buffers, + BlockNumber nblocks); +/* returns number of blocks read, with one bit set in *mask for each */ +extern int lfc_readv_select(NRelFileInfo rinfo, ForkNumber forkNum, + BlockNumber blkno, void **buffers, + BlockNumber nblocks, bits8 *mask); + +extern bool lfc_cache_contains(NRelFileInfo rinfo, ForkNumber forkNum, + BlockNumber blkno); +extern int lfc_cache_containsv(NRelFileInfo rinfo, ForkNumber forkNum, + BlockNumber blkno, int nblocks, bits8 *bitmap); +extern void lfc_init(void); +extern bool lfc_prefetch(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blkno, + const void* buffer, XLogRecPtr lsn); + + +static inline bool +lfc_read(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, + void *buffer) +{ + bits8 rv = 0; + return lfc_readv_select(rinfo, forkNum, blkno, &buffer, 1, &rv) == 1; +} + +static inline void +lfc_write(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, + const void *buffer) +{ + return lfc_writev(rinfo, forkNum, blkno, &buffer, 1); +} + +#endif /* FILE_CACHE_h */ diff --git a/pgxn/neon/libpagestore.c b/pgxn/neon/libpagestore.c index 20f4d462c0..dfabb6919e 100644 --- a/pgxn/neon/libpagestore.c +++ b/pgxn/neon/libpagestore.c @@ -6,10 +6,6 @@ * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California *
- * - * IDENTIFICATION - * contrib/neon/libpqpagestore.c - * *------------------------------------------------------------------------- */ #include "postgres.h" @@ -34,6 +30,7 @@ #include "storage/lwlock.h" #include "storage/pg_shmem.h" #include "utils/guc.h" +#include "utils/memutils.h" #include "neon.h" #include "neon_perf_counters.h" @@ -68,6 +65,9 @@ static const struct config_enum_entry neon_compute_modes[] = { /* GUCs */ char *neon_timeline; char *neon_tenant; +char *neon_project_id; +char *neon_branch_id; +char *neon_endpoint_id; int32 max_cluster_size; char *page_server_connstring; char *neon_auth_token; @@ -1142,37 +1142,23 @@ pageserver_try_receive(shardno_t shard_no) NeonResponse *resp; PageServer *shard = &page_servers[shard_no]; PGconn *pageserver_conn = shard->conn; - /* read response */ - int rc; + int rc; if (shard->state != PS_Connected) return NULL; Assert(pageserver_conn); - while (true) + rc = PQgetCopyData(shard->conn, &resp_buff.data, 1 /* async */); + if (rc == 0) { - if (PQisBusy(shard->conn)) + if (!PQconsumeInput(shard->conn)) { - WaitEvent event; - if (WaitEventSetWait(shard->wes_read, 0, &event, 1, - WAIT_EVENT_NEON_PS_READ) != 1 - || (event.events & WL_SOCKET_READABLE) == 0) - { - return NULL; - } + return NULL; } rc = PQgetCopyData(shard->conn, &resp_buff.data, 1 /* async */); - if (rc == 0) - { - if (!PQconsumeInput(shard->conn)) - { - return NULL; - } - } - else - break; } + if (rc == 0) return NULL; else if (rc > 0) @@ -1369,6 +1355,31 @@ pg_init_libpagestore(void) 0, /* no flags required */ check_neon_id, NULL, NULL); + DefineCustomStringVariable("neon.project_id", + "Neon project_id the server is running on", + NULL, + &neon_project_id, + "", + PGC_POSTMASTER, + 0, /* no flags required */ + check_neon_id, NULL, NULL); + DefineCustomStringVariable("neon.branch_id", + "Neon branch_id the server is running on", + NULL, + &neon_branch_id, + "", + PGC_POSTMASTER, + 0, /* no flags required */ + check_neon_id, NULL, NULL); + 
DefineCustomStringVariable("neon.endpoint_id", + "Neon endpoint_id the server is running on", + NULL, + &neon_endpoint_id, + "", + PGC_POSTMASTER, + 0, /* no flags required */ + check_neon_id, NULL, NULL); + DefineCustomIntVariable("neon.stripe_size", "sharding stripe size", NULL, @@ -1492,6 +1503,4 @@ pg_init_libpagestore(void) } memset(page_servers, 0, sizeof(page_servers)); - - lfc_init(); } diff --git a/pgxn/neon/logical_replication_monitor.c b/pgxn/neon/logical_replication_monitor.c index b94faafdfa..69426c2e83 100644 --- a/pgxn/neon/logical_replication_monitor.c +++ b/pgxn/neon/logical_replication_monitor.c @@ -1,11 +1,11 @@ +#include "postgres.h" + #include #include #include #include #include -#include "postgres.h" - #include "miscadmin.h" #include "postmaster/bgworker.h" #include "postmaster/interrupt.h" diff --git a/pgxn/neon/neon.c b/pgxn/neon/neon.c index b738b5ebd1..a6a7021756 100644 --- a/pgxn/neon/neon.c +++ b/pgxn/neon/neon.c @@ -1,10 +1,7 @@ /*------------------------------------------------------------------------- * * neon.c - * Utility functions to expose neon specific information to user - * - * IDENTIFICATION - * contrib/neon/neon.c + * Main entry point into the neon extension * *------------------------------------------------------------------------- */ @@ -31,7 +28,9 @@ #include "utils/guc.h" #include "utils/guc_tables.h" +#include "communicator.h" #include "extension_server.h" +#include "file_cache.h" #include "neon.h" #include "neon_lwlsncache.h" #include "control_plane_connector.h" @@ -437,10 +436,11 @@ _PG_init(void) #endif pg_init_libpagestore(); + lfc_init(); pg_init_walproposer(); init_lwlsncache(); - pagestore_smgr_init(); + pg_init_communicator(); Custom_XLogReaderRoutines = NeonOnDemandXLogReaderRoutines; InitUnstableExtensionsSupport(); diff --git a/pgxn/neon/neon.h b/pgxn/neon/neon.h index c9beb8c318..a2e81feb5f 100644 --- a/pgxn/neon/neon.h +++ b/pgxn/neon/neon.h @@ -3,15 +3,13 @@ * neon.h * Functions used in the initialization
of this extension. * - * IDENTIFICATION - * contrib/neon/neon.h - * *------------------------------------------------------------------------- */ #ifndef NEON_H #define NEON_H -#include "access/xlogreader.h" + +#include "access/xlogdefs.h" #include "utils/wait_event.h" /* GUCs */ @@ -49,17 +47,26 @@ extern uint32 WAIT_EVENT_NEON_WAL_DL; #define WAIT_EVENT_NEON_WAL_DL WAIT_EVENT_WAL_READ #endif + +#define NEON_TAG "[NEON_SMGR] " +#define neon_log(tag, fmt, ...) ereport(tag, \ + (errmsg(NEON_TAG fmt, ##__VA_ARGS__), \ + errhidestmt(true), errhidecontext(true), errposition(0), internalerrposition(0))) +#define neon_shard_log(shard_no, tag, fmt, ...) ereport(tag, \ + (errmsg(NEON_TAG "[shard %d] " fmt, shard_no, ##__VA_ARGS__), \ + errhidestmt(true), errhidecontext(true), errposition(0), internalerrposition(0))) + + extern void pg_init_libpagestore(void); extern void pg_init_walproposer(void); -extern void pagestore_smgr_init(void); extern uint64 BackpressureThrottlingTime(void); extern void SetNeonCurrentClusterSize(uint64 size); extern uint64 GetNeonCurrentClusterSize(void); extern void replication_feedback_get_lsns(XLogRecPtr *writeLsn, XLogRecPtr *flushLsn, XLogRecPtr *applyLsn); -extern void PGDLLEXPORT WalProposerSync(int argc, char *argv[]); -extern void PGDLLEXPORT WalProposerMain(Datum main_arg); -PGDLLEXPORT void LogicalSlotsMonitorMain(Datum main_arg); +extern PGDLLEXPORT void WalProposerSync(int argc, char *argv[]); +extern PGDLLEXPORT void WalProposerMain(Datum main_arg); +extern PGDLLEXPORT void LogicalSlotsMonitorMain(Datum main_arg); #endif /* NEON_H */ diff --git a/pgxn/neon/neon_perf_counters.h b/pgxn/neon/neon_perf_counters.h index 8edc658a30..5f5330bb69 100644 --- a/pgxn/neon/neon_perf_counters.h +++ b/pgxn/neon/neon_perf_counters.h @@ -12,8 +12,8 @@ #include "storage/procnumber.h" #else #include "storage/backendid.h" -#include "storage/proc.h" #endif +#include "storage/proc.h" static const uint64 io_wait_bucket_thresholds[] = { 2, 3, 6, 10, /* 0 us 
- 10 us */ diff --git a/pgxn/neon/neon_walreader.c b/pgxn/neon/neon_walreader.c index 5854a7ef0f..be2c4ddf79 100644 --- a/pgxn/neon/neon_walreader.c +++ b/pgxn/neon/neon_walreader.c @@ -20,6 +20,7 @@ #include "access/xlogreader.h" #include "libpq/pqformat.h" #include "storage/fd.h" +#include "utils/memutils.h" #include "utils/wait_event.h" #include "libpq-fe.h" diff --git a/pgxn/neon/pagestore_client.h b/pgxn/neon/pagestore_client.h index 475697f9c0..0ab539fe56 100644 --- a/pgxn/neon/pagestore_client.h +++ b/pgxn/neon/pagestore_client.h @@ -8,8 +8,8 @@ * *------------------------------------------------------------------------- */ -#ifndef pageserver_h -#define pageserver_h +#ifndef PAGESTORE_CLIENT_h +#define PAGESTORE_CLIENT_h #include "neon_pgversioncompat.h" @@ -17,11 +17,8 @@ #include "access/xlogdefs.h" #include RELFILEINFO_HDR #include "lib/stringinfo.h" -#include "libpq/pqformat.h" #include "storage/block.h" #include "storage/buf_internals.h" -#include "storage/smgr.h" -#include "utils/memutils.h" #define MAX_SHARDS 128 #define MAX_PAGESERVER_CONNSTRING_SIZE 256 @@ -61,14 +58,6 @@ typedef struct #define messageTag(m) (((const NeonMessage *)(m))->tag) -#define NEON_TAG "[NEON_SMGR] " -#define neon_log(tag, fmt, ...) ereport(tag, \ - (errmsg(NEON_TAG fmt, ##__VA_ARGS__), \ - errhidestmt(true), errhidecontext(true), errposition(0), internalerrposition(0))) -#define neon_shard_log(shard_no, tag, fmt, ...) 
ereport(tag, \ - (errmsg(NEON_TAG "[shard %d] " fmt, shard_no, ##__VA_ARGS__), \ - errhidestmt(true), errhidecontext(true), errposition(0), internalerrposition(0))) - /* SLRUs downloadable from page server */ typedef enum { SLRU_CLOG, @@ -237,7 +226,6 @@ extern char *neon_timeline; extern char *neon_tenant; extern int32 max_cluster_size; extern int neon_protocol_version; -extern bool lfc_store_prefetch_result; extern shardno_t get_shard_number(BufferTag* tag); @@ -245,6 +233,7 @@ extern const f_smgr *smgr_neon(ProcNumber backend, NRelFileInfo rinfo); extern void smgr_init_neon(void); extern void readahead_buffer_resize(int newsize, void *extra); + /* * LSN values associated with each request to the pageserver */ @@ -277,15 +266,14 @@ typedef struct XLogRecPtr effective_request_lsn; } neon_request_lsns; -#if PG_MAJORVERSION_NUM < 16 -extern PGDLLEXPORT void neon_read_at_lsn(NRelFileInfo rnode, ForkNumber forkNum, BlockNumber blkno, - neon_request_lsns request_lsns, char *buffer); -#else extern PGDLLEXPORT void neon_read_at_lsn(NRelFileInfo rnode, ForkNumber forkNum, BlockNumber blkno, neon_request_lsns request_lsns, void *buffer); -#endif extern int64 neon_dbsize(Oid dbNode); +extern void neon_get_request_lsns(NRelFileInfo rinfo, ForkNumber forknum, + BlockNumber blkno, neon_request_lsns *output, + BlockNumber nblocks); + /* utils for neon relsize cache */ extern void relsize_hash_init(void); extern bool get_cached_relsize(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber *size); @@ -293,37 +281,4 @@ extern void set_cached_relsize(NRelFileInfo rinfo, ForkNumber forknum, BlockNumb extern void update_cached_relsize(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber size); extern void forget_cached_relsize(NRelFileInfo rinfo, ForkNumber forknum); -/* functions for local file cache */ -extern void lfc_writev(NRelFileInfo rinfo, ForkNumber forkNum, - BlockNumber blkno, const void *const *buffers, - BlockNumber nblocks); -/* returns number of blocks read, with one bit 
set in *read for each */ -extern int lfc_readv_select(NRelFileInfo rinfo, ForkNumber forkNum, - BlockNumber blkno, void **buffers, - BlockNumber nblocks, bits8 *mask); - -extern bool lfc_cache_contains(NRelFileInfo rinfo, ForkNumber forkNum, - BlockNumber blkno); -extern int lfc_cache_containsv(NRelFileInfo rinfo, ForkNumber forkNum, - BlockNumber blkno, int nblocks, bits8 *bitmap); -extern void lfc_init(void); -extern bool lfc_prefetch(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blkno, - const void* buffer, XLogRecPtr lsn); - - -static inline bool -lfc_read(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, - void *buffer) -{ - bits8 rv = 1; - return lfc_readv_select(rinfo, forkNum, blkno, &buffer, 1, &rv) == 1; -} - -static inline void -lfc_write(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, - const void *buffer) -{ - return lfc_writev(rinfo, forkNum, blkno, &buffer, 1); -} - -#endif +#endif /* PAGESTORE_CLIENT_H */ diff --git a/pgxn/neon/pagestore_smgr.c b/pgxn/neon/pagestore_smgr.c index ddcee74ff3..ef6bd038bb 100644 --- a/pgxn/neon/pagestore_smgr.c +++ b/pgxn/neon/pagestore_smgr.c @@ -37,10 +37,6 @@ * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * - * - * IDENTIFICATION - * contrib/neon/pagestore_smgr.c - * *------------------------------------------------------------------------- */ #include "postgres.h" @@ -53,8 +49,6 @@ #include "access/xlog_internal.h" #include "access/xlogutils.h" #include "catalog/pg_class.h" -#include "common/hashfn.h" -#include "executor/instrument.h" #include "pgstat.h" #include "postmaster/autovacuum.h" #include "postmaster/interrupt.h" @@ -65,9 +59,10 @@ #include "storage/fsm_internals.h" #include "storage/md.h" #include "storage/smgr.h" -#include "utils/timeout.h" #include "bitmap.h" +#include "communicator.h" +#include "file_cache.h" #include "neon.h" #include "neon_lwlsncache.h" #include 
"neon_perf_counters.h" @@ -104,12 +99,6 @@ static char *hexdump_page(char *page); const int SmgrTrace = DEBUG5; -#define NEON_PANIC_CONNECTION_STATE(shard_no, elvl, message, ...) \ - neon_shard_log(shard_no, elvl, "Broken connection state: " message, \ - ##__VA_ARGS__) - -page_server_api *page_server; - /* unlogged relation build states */ typedef enum { @@ -127,1682 +116,6 @@ static bool (*old_redo_read_buffer_filter) (XLogReaderState *record, uint8 block static BlockNumber neon_nblocks(SMgrRelation reln, ForkNumber forknum); -static uint32 local_request_counter; -#define GENERATE_REQUEST_ID() (((NeonRequestId)MyProcPid << 32) | ++local_request_counter) - -/* - * Various settings related to prompt (fast) handling of PageStream responses - * at any CHECK_FOR_INTERRUPTS point. - */ -int readahead_getpage_pull_timeout_ms = 0; -static int PS_TIMEOUT_ID = 0; -static bool timeout_set = false; -static bool timeout_signaled = false; - -/* - * We have a CHECK_FOR_INTERRUPTS in page_server->receive(), and we don't want - * that to handle any getpage responses if we're already working on the - * backlog of those, as we'd hit issues with determining which prefetch slot - * we just got a response for. - * - * To protect against that, we have this variable that's set whenever we start - * receiving data for prefetch slots, so that we don't get confused. - * - * Note that in certain error cases during readpage we may leak r_r_g=true, - * which results in a failure to pick up further responses until we first - * actively try to receive new getpage responses. 
- */ -static bool readpage_reentrant_guard = false; - -static void reconfigure_timeout_if_needed(void); -static void pagestore_timeout_handler(void); - -#define START_PREFETCH_RECEIVE_WORK() \ - do { \ - readpage_reentrant_guard = true; \ - } while (false) - -#define END_PREFETCH_RECEIVE_WORK() \ - do { \ - readpage_reentrant_guard = false; \ - if (unlikely(timeout_signaled && !InterruptPending)) \ - InterruptPending = true; \ - } while (false) - -/* - * Prefetch implementation: - * - * Prefetch is performed locally by each backend. - * - * There can be up to readahead_buffer_size active IO requests registered at - * any time. Requests using smgr_prefetch are sent to the pageserver, but we - * don't wait on the response. Requests using smgr_read are either read from - * the buffer, or (if that's not possible) we wait on the response to arrive - - * this also will allow us to receive other prefetched pages. - * Each request is immediately written to the output buffer of the pageserver - * connection, but may not be flushed if smgr_prefetch is used: pageserver - * flushes sent requests on manual flush, or every neon.flush_output_after - * unflushed requests; which is not necessarily always and all the time. - * - * Once we have received a response, this value will be stored in the response - * buffer, indexed in a hash table. This allows us to retain our buffered - * prefetch responses even when we have cache misses. - * - * Reading of prefetch responses is delayed until them are actually needed - * (smgr_read). In case of prefetch miss or any other SMGR request other than - * smgr_read, all prefetch responses in the pipeline will need to be read from - * the connection; the responses are stored for later use. - * - * NOTE: The current implementation of the prefetch system implements a ring - * buffer of up to readahead_buffer_size requests. 
If there are more _read and - * _prefetch requests between the initial _prefetch and the _read of a buffer, - * the prefetch request will have been dropped from this prefetch buffer, and - * your prefetch was wasted. - */ - -/* - * State machine: - * - * not in hash : in hash - * : - * UNUSED ------> REQUESTED --> RECEIVED - * ^ : | | - * | : v | - * | : TAG_REMAINS | - * | : | | - * +----------------+------------+ - * : - */ -typedef enum PrefetchStatus -{ - PRFS_UNUSED = 0, /* unused slot */ - PRFS_REQUESTED, /* request was written to the sendbuffer to - * PS, but not necessarily flushed. all fields - * except response valid */ - PRFS_RECEIVED, /* all fields valid */ - PRFS_TAG_REMAINS, /* only buftag and my_ring_index are still - * valid */ -} PrefetchStatus; - -/* must fit in uint8; bits 0x1 are used */ -typedef enum { - PRFSF_NONE = 0x0, - PRFSF_LFC = 0x1 /* received prefetch result is stored in LFC */ -} PrefetchRequestFlags; - -typedef struct PrefetchRequest -{ - BufferTag buftag; /* must be first entry in the struct */ - shardno_t shard_no; - uint8 status; /* see PrefetchStatus for valid values */ - uint8 flags; /* see PrefetchRequestFlags */ - neon_request_lsns request_lsns; - NeonRequestId reqid; - NeonResponse *response; /* may be null */ - uint64 my_ring_index; -} PrefetchRequest; - -/* prefetch buffer lookup hash table */ - -typedef struct PrfHashEntry -{ - PrefetchRequest *slot; - uint32 status; - uint32 hash; -} PrfHashEntry; - -#define SH_PREFIX prfh -#define SH_ELEMENT_TYPE PrfHashEntry -#define SH_KEY_TYPE PrefetchRequest * -#define SH_KEY slot -#define SH_STORE_HASH -#define SH_GET_HASH(tb, a) ((a)->hash) -#define SH_HASH_KEY(tb, key) hash_bytes( \ - ((const unsigned char *) &(key)->buftag), \ - sizeof(BufferTag) \ -) - -#define SH_EQUAL(tb, a, b) (BufferTagsEqual(&(a)->buftag, &(b)->buftag)) -#define SH_SCOPE static inline -#define SH_DEFINE -#define SH_DECLARE -#include "lib/simplehash.h" - -/* - * PrefetchState maintains the state of 
(prefetch) getPage@LSN requests. - * It maintains a (ring) buffer of in-flight requests and responses. - * - * We maintain several indexes into the ring buffer: - * ring_unused >= ring_flush >= ring_receive >= ring_last >= 0 - * - * ring_unused points to the first unused slot of the buffer - * ring_receive is the next request that is to be received - * ring_last is the oldest received entry in the buffer - * - * Apart from being an entry in the ring buffer of prefetch requests, each - * PrefetchRequest that is not UNUSED is indexed in prf_hash by buftag. - */ -typedef struct PrefetchState -{ - MemoryContext bufctx; /* context for prf_buffer[].response - * allocations */ - MemoryContext errctx; /* context for prf_buffer[].response - * allocations */ - MemoryContext hashctx; /* context for prf_buffer */ - - /* buffer indexes */ - uint64 ring_unused; /* first unused slot */ - uint64 ring_flush; /* next request to flush */ - uint64 ring_receive; /* next slot that is to receive a response */ - uint64 ring_last; /* min slot with a response value */ - - /* metrics / statistics */ - int n_responses_buffered; /* count of PS responses not yet in - * buffers */ - int n_requests_inflight; /* count of PS requests considered in - * flight */ - int n_unused; /* count of buffers < unused, > last, that are - * also unused */ - - /* the buffers */ - prfh_hash *prf_hash; - int max_shard_no; - /* Mark shards involved in prefetch */ - uint8 shard_bitmap[(MAX_SHARDS + 7)/8]; - PrefetchRequest prf_buffer[]; /* prefetch buffers */ -} PrefetchState; - -static PrefetchState *MyPState; - -#define GetPrfSlotNoCheck(ring_index) ( \ - &MyPState->prf_buffer[((ring_index) % readahead_buffer_size)] \ -) - -#define GetPrfSlot(ring_index) ( \ - ( \ - AssertMacro((ring_index) < MyPState->ring_unused && \ - (ring_index) >= MyPState->ring_last), \ - GetPrfSlotNoCheck(ring_index) \ - ) \ -) - -#define ReceiveBufferNeedsCompaction() (\ - (MyPState->n_responses_buffered / 8) < ( \ - MyPState->ring_receive 
- \ - MyPState->ring_last - \ - MyPState->n_responses_buffered \ - ) \ -) - -static bool compact_prefetch_buffers(void); -static void consume_prefetch_responses(void); -static bool prefetch_read(PrefetchRequest *slot); -static void prefetch_do_request(PrefetchRequest *slot, neon_request_lsns *force_request_lsns); -static bool prefetch_wait_for(uint64 ring_index); -static void prefetch_cleanup_trailing_unused(void); -static inline void prefetch_set_unused(uint64 ring_index); - -static void -neon_get_request_lsns(NRelFileInfo rinfo, ForkNumber forknum, - BlockNumber blkno, neon_request_lsns *output, - BlockNumber nblocks); -static bool neon_prefetch_response_usable(neon_request_lsns *request_lsns, - PrefetchRequest *slot); - -static bool -compact_prefetch_buffers(void) -{ - uint64 empty_ring_index = MyPState->ring_last; - uint64 search_ring_index = MyPState->ring_receive; - int n_moved = 0; - - if (MyPState->ring_receive == MyPState->ring_last) - return false; - - while (search_ring_index > MyPState->ring_last) - { - search_ring_index--; - if (GetPrfSlot(search_ring_index)->status == PRFS_UNUSED) - { - empty_ring_index = search_ring_index; - break; - } - } - - /* - * Here we have established: slots < search_ring_index have an unknown - * state (not scanned) slots >= search_ring_index and <= empty_ring_index - * are unused slots > empty_ring_index are in use, or outside our buffer's - * range. ... unless search_ring_index <= ring_last - * - * Therefore, there is a gap of at least one unused items between - * search_ring_index and empty_ring_index (both inclusive), which grows as - * we hit more unused items while moving backwards through the array. 
- */ - - while (search_ring_index > MyPState->ring_last) - { - PrefetchRequest *source_slot; - PrefetchRequest *target_slot; - bool found; - - /* update search index to an unprocessed entry */ - search_ring_index--; - - source_slot = GetPrfSlot(search_ring_index); - - if (source_slot->status == PRFS_UNUSED) - continue; - - /* slot is used -- start moving slot */ - target_slot = GetPrfSlot(empty_ring_index); - - Assert(source_slot->status == PRFS_RECEIVED); - Assert(target_slot->status == PRFS_UNUSED); - - target_slot->buftag = source_slot->buftag; - target_slot->shard_no = source_slot->shard_no; - target_slot->status = source_slot->status; - target_slot->flags = source_slot->flags; - target_slot->response = source_slot->response; - target_slot->reqid = source_slot->reqid; - target_slot->request_lsns = source_slot->request_lsns; - target_slot->my_ring_index = empty_ring_index; - - prfh_delete(MyPState->prf_hash, source_slot); - prfh_insert(MyPState->prf_hash, target_slot, &found); - - Assert(!found); - - /* Adjust the location of our known-empty slot */ - empty_ring_index--; - - /* empty the moved slot */ - source_slot->status = PRFS_UNUSED; - source_slot->buftag = (BufferTag) - { - 0 - }; - source_slot->response = NULL; - source_slot->my_ring_index = 0; - source_slot->request_lsns = (neon_request_lsns) { - InvalidXLogRecPtr, InvalidXLogRecPtr, InvalidXLogRecPtr - }; - - /* update bookkeeping */ - n_moved++; - } - - /* - * Only when we've moved slots we can expect trailing unused slots, so - * only then we clean up trailing unused slots. - */ - if (n_moved > 0) - { - prefetch_cleanup_trailing_unused(); - return true; - } - - return false; -} - -/* - * If there might be responses still in the TCP buffer, then we should try to - * use those, to reduce any TCP backpressure on the OS/PS side. - * - * This procedure handles that. - * - * Note that this works because we don't pipeline non-getPage requests. 
- * - * NOTE: This procedure is not allowed to throw errors that should be handled - * by SMGR-related code, as this can be called from every CHECK_FOR_INTERRUPTS - * point inside and outside PostgreSQL. - * - * This still does throw errors when it receives malformed responses from PS. - * - * When we're not called from CHECK_FOR_INTERRUPTS (indicated by - * IsHandlingInterrupts) we also report we've ended prefetch receive work, - * just in case state tracking was lost due to an error in the sync getPage - * response code. - */ -static void -prefetch_pump_state(bool IsHandlingInterrupts) -{ - while (MyPState->ring_receive != MyPState->ring_flush) - { - NeonResponse *response; - PrefetchRequest *slot; - MemoryContext old; - - slot = GetPrfSlot(MyPState->ring_receive); - - old = MemoryContextSwitchTo(MyPState->errctx); - response = page_server->try_receive(slot->shard_no); - MemoryContextSwitchTo(old); - - if (response == NULL) - break; - - /* The slot should still be valid */ - if (slot->status != PRFS_REQUESTED || - slot->response != NULL || - slot->my_ring_index != MyPState->ring_receive) - neon_shard_log(slot->shard_no, ERROR, - "Incorrect prefetch slot state after receive: status=%d response=%p my=%lu receive=%lu", - slot->status, slot->response, - (long) slot->my_ring_index, (long) MyPState->ring_receive); - - /* update prefetch state */ - MyPState->n_responses_buffered += 1; - MyPState->n_requests_inflight -= 1; - MyPState->ring_receive += 1; - MyNeonCounters->getpage_prefetches_buffered = - MyPState->n_responses_buffered; - - /* update slot state */ - slot->status = PRFS_RECEIVED; - slot->response = response; - - if (response->tag == T_NeonGetPageResponse && !(slot->flags & PRFSF_LFC) && lfc_store_prefetch_result) - { - /* - * Store prefetched result in LFC (please read comments to lfc_prefetch - * explaining why it can be done without holding shared buffer lock - */ - if (lfc_prefetch(BufTagGetNRelFileInfo(slot->buftag), slot->buftag.forkNum, 
slot->buftag.blockNum, ((NeonGetPageResponse*)response)->page, slot->request_lsns.not_modified_since)) - { - slot->flags |= PRFSF_LFC; - } - } - } - - /* We never pump the prefetch state while handling other pages */ - if (!IsHandlingInterrupts) - END_PREFETCH_RECEIVE_WORK(); - - reconfigure_timeout_if_needed(); -} - -void -readahead_buffer_resize(int newsize, void *extra) -{ - uint64 end, - nfree = newsize; - PrefetchState *newPState; - Size newprfs_size = offsetof(PrefetchState, prf_buffer) + - (sizeof(PrefetchRequest) * newsize); - - /* don't try to re-initialize if we haven't initialized yet */ - if (MyPState == NULL) - return; - - /* - * Make sure that we don't lose track of active prefetch requests by - * ensuring we have received all but the last n requests (n = newsize). - */ - if (MyPState->n_requests_inflight > newsize) - { - prefetch_wait_for(MyPState->ring_unused - newsize - 1); - Assert(MyPState->n_requests_inflight <= newsize); - } - - /* construct the new PrefetchState, and copy over the memory contexts */ - newPState = MemoryContextAllocZero(TopMemoryContext, newprfs_size); - - newPState->bufctx = MyPState->bufctx; - newPState->errctx = MyPState->errctx; - newPState->hashctx = MyPState->hashctx; - newPState->prf_hash = prfh_create(MyPState->hashctx, newsize, NULL); - newPState->n_unused = newsize; - newPState->n_requests_inflight = 0; - newPState->n_responses_buffered = 0; - newPState->ring_last = newsize; - newPState->ring_unused = newsize; - newPState->ring_receive = newsize; - newPState->max_shard_no = MyPState->max_shard_no; - memcpy(newPState->shard_bitmap, MyPState->shard_bitmap, sizeof(MyPState->shard_bitmap)); - - /* - * Copy over the prefetches. - * - * We populate the prefetch array from the end; to retain the most recent - * prefetches, but this has the benefit of only needing to do one - * iteration on the dataset, and trivial compaction. 
- */ - for (end = MyPState->ring_unused - 1; - end >= MyPState->ring_last && end != UINT64_MAX && nfree != 0; - end -= 1) - { - PrefetchRequest *slot = GetPrfSlot(end); - PrefetchRequest *newslot; - bool found; - - if (slot->status == PRFS_UNUSED) - continue; - - nfree -= 1; - - newslot = &newPState->prf_buffer[nfree]; - *newslot = *slot; - newslot->my_ring_index = nfree; - - prfh_insert(newPState->prf_hash, newslot, &found); - - Assert(!found); - - switch (newslot->status) - { - case PRFS_UNUSED: - pg_unreachable(); - case PRFS_REQUESTED: - newPState->n_requests_inflight += 1; - newPState->ring_receive -= 1; - newPState->ring_last -= 1; - break; - case PRFS_RECEIVED: - newPState->n_responses_buffered += 1; - newPState->ring_last -= 1; - break; - case PRFS_TAG_REMAINS: - newPState->ring_last -= 1; - break; - } - newPState->n_unused -= 1; - } - newPState->ring_flush = newPState->ring_receive; - - MyNeonCounters->getpage_prefetches_buffered = - MyPState->n_responses_buffered; - MyNeonCounters->pageserver_open_requests = - MyPState->n_requests_inflight; - - for (; end >= MyPState->ring_last && end != UINT64_MAX; end -= 1) - { - PrefetchRequest *slot = GetPrfSlot(end); - Assert(slot->status != PRFS_REQUESTED); - if (slot->status == PRFS_RECEIVED) - { - pfree(slot->response); - } - } - - prfh_destroy(MyPState->prf_hash); - pfree(MyPState); - MyPState = newPState; -} - - - -/* - * Make sure that there are no responses still in the buffer. - * - * This function may indirectly update MyPState->pfs_hash; which invalidates - * any active pointers into the hash table. 
- */ -static void -consume_prefetch_responses(void) -{ - if (MyPState->ring_receive < MyPState->ring_unused) - prefetch_wait_for(MyPState->ring_unused - 1); -} - -static void -prefetch_cleanup_trailing_unused(void) -{ - uint64 ring_index; - PrefetchRequest *slot; - - while (MyPState->ring_last < MyPState->ring_receive) - { - ring_index = MyPState->ring_last; - slot = GetPrfSlot(ring_index); - - if (slot->status == PRFS_UNUSED) - MyPState->ring_last += 1; - else - break; - } -} - - -static bool -prefetch_flush_requests(void) -{ - for (shardno_t shard_no = 0; shard_no < MyPState->max_shard_no; shard_no++) - { - if (BITMAP_ISSET(MyPState->shard_bitmap, shard_no)) - { - if (!page_server->flush(shard_no)) - return false; - BITMAP_CLR(MyPState->shard_bitmap, shard_no); - } - } - MyPState->max_shard_no = 0; - return true; -} - -/* - * Wait for slot of ring_index to have received its response. - * The caller is responsible for making sure the request buffer is flushed. - * - * NOTE: this function may indirectly update MyPState->pfs_hash; which - * invalidates any active pointers into the hash table. - * NOTE: callers should make sure they can handle query cancellations in this - * function's call path. - */ -static bool -prefetch_wait_for(uint64 ring_index) -{ - PrefetchRequest *entry; - bool result = true; - - if (MyPState->ring_flush <= ring_index && - MyPState->ring_unused > MyPState->ring_flush) - { - if (!prefetch_flush_requests()) - return false; - MyPState->ring_flush = MyPState->ring_unused; - } - - Assert(MyPState->ring_unused > ring_index); - - while (MyPState->ring_receive <= ring_index) - { - START_PREFETCH_RECEIVE_WORK(); - entry = GetPrfSlot(MyPState->ring_receive); - - Assert(entry->status == PRFS_REQUESTED); - if (!prefetch_read(entry)) - { - result = false; - break; - } - - END_PREFETCH_RECEIVE_WORK(); - CHECK_FOR_INTERRUPTS(); - } - - return result; -} - -/* - * Read the response of a prefetch request into its slot. 
- * - * The caller is responsible for making sure that the request for this buffer - * was flushed to the PageServer. - * - * NOTE: this function may indirectly update MyPState->pfs_hash; which - * invalidates any active pointers into the hash table. - * - * NOTE: this does IO, and can get canceled out-of-line. - */ -static bool -prefetch_read(PrefetchRequest *slot) -{ - NeonResponse *response; - MemoryContext old; - BufferTag buftag; - shardno_t shard_no; - uint64 my_ring_index; - - Assert(slot->status == PRFS_REQUESTED); - Assert(slot->response == NULL); - Assert(slot->my_ring_index == MyPState->ring_receive); - - if (slot->status != PRFS_REQUESTED || - slot->response != NULL || - slot->my_ring_index != MyPState->ring_receive) - neon_shard_log(slot->shard_no, ERROR, - "Incorrect prefetch read: status=%d response=%p my=%lu receive=%lu", - slot->status, slot->response, - (long)slot->my_ring_index, (long)MyPState->ring_receive); - - /* - * Copy the request info so that if an error happens and the prefetch - * queue is flushed during the receive call, we can print the original - * values in the error message - */ - buftag = slot->buftag; - shard_no = slot->shard_no; - my_ring_index = slot->my_ring_index; - - old = MemoryContextSwitchTo(MyPState->errctx); - response = (NeonResponse *) page_server->receive(shard_no); - MemoryContextSwitchTo(old); - if (response) - { - /* The slot should still be valid */ - if (slot->status != PRFS_REQUESTED || - slot->response != NULL || - slot->my_ring_index != MyPState->ring_receive) - neon_shard_log(shard_no, ERROR, - "Incorrect prefetch slot state after receive: status=%d response=%p my=%lu receive=%lu", - slot->status, slot->response, - (long) slot->my_ring_index, (long) MyPState->ring_receive); - - /* update prefetch state */ - MyPState->n_responses_buffered += 1; - MyPState->n_requests_inflight -= 1; - MyPState->ring_receive += 1; - MyNeonCounters->getpage_prefetches_buffered = - MyPState->n_responses_buffered; - - /* update 
slot state */ - slot->status = PRFS_RECEIVED; - slot->response = response; - - if (response->tag == T_NeonGetPageResponse && !(slot->flags & PRFSF_LFC) && lfc_store_prefetch_result) - { - /* - * Store prefetched result in LFC (please read comments to lfc_prefetch - * explaining why it can be done without holding shared buffer lock - */ - if (lfc_prefetch(BufTagGetNRelFileInfo(buftag), buftag.forkNum, buftag.blockNum, ((NeonGetPageResponse*)response)->page, slot->request_lsns.not_modified_since)) - { - slot->flags |= PRFSF_LFC; - } - } - return true; - } - else - { - /* - * Note: The slot might no longer be valid, if the connection was lost - * and the prefetch queue was flushed during the receive call - */ - neon_shard_log(shard_no, LOG, - "No response from reading prefetch entry %lu: %u/%u/%u.%u block %u. This can be caused by a concurrent disconnect", - (long) my_ring_index, - RelFileInfoFmt(BufTagGetNRelFileInfo(buftag)), - buftag.forkNum, buftag.blockNum); - return false; - } -} - -/* - * Disconnect hook - drop prefetches when the connection drops - * - * If we don't remove the failed prefetches, we'd be serving incorrect - * data to the smgr. - */ -void -prefetch_on_ps_disconnect(void) -{ - MyPState->ring_flush = MyPState->ring_unused; - - while (MyPState->ring_receive < MyPState->ring_unused) - { - PrefetchRequest *slot; - uint64 ring_index = MyPState->ring_receive; - - slot = GetPrfSlot(ring_index); - - Assert(slot->status == PRFS_REQUESTED); - Assert(slot->my_ring_index == ring_index); - - /* - * Drop connection to all shards which have prefetch requests. - * It is not a problem to call disconnect multiple times on the same connection - * because disconnect implementation in libpagestore.c will check if connection - * is alive and do nothing of connection was already dropped. 
- */ - page_server->disconnect(slot->shard_no); - - /* clean up the request */ - slot->status = PRFS_TAG_REMAINS; - MyPState->n_requests_inflight -= 1; - MyPState->ring_receive += 1; - - prefetch_set_unused(ring_index); - pgBufferUsage.prefetch.expired += 1; - MyNeonCounters->getpage_prefetch_discards_total += 1; - } - - /* - * We can have gone into retry due to network error, so update stats with - * the latest available - */ - MyNeonCounters->pageserver_open_requests = - MyPState->n_requests_inflight; - MyNeonCounters->getpage_prefetches_buffered = - MyPState->n_responses_buffered; -} - -/* - * prefetch_set_unused() - clear a received prefetch slot - * - * The slot at ring_index must be a current member of the ring buffer, - * and may not be in the PRFS_REQUESTED state. - * - * NOTE: this function will update MyPState->pfs_hash; which invalidates any - * active pointers into the hash table. - */ -static inline void -prefetch_set_unused(uint64 ring_index) -{ - PrefetchRequest *slot; - - if (ring_index < MyPState->ring_last) - return; /* Should already be unused */ - - slot = GetPrfSlot(ring_index); - if (slot->status == PRFS_UNUSED) - return; - - Assert(slot->status == PRFS_RECEIVED || slot->status == PRFS_TAG_REMAINS); - - if (slot->status == PRFS_RECEIVED) - { - pfree(slot->response); - slot->response = NULL; - - MyPState->n_responses_buffered -= 1; - MyPState->n_unused += 1; - - MyNeonCounters->getpage_prefetches_buffered = - MyPState->n_responses_buffered; - } - else - { - Assert(slot->response == NULL); - } - - prfh_delete(MyPState->prf_hash, slot); - - /* clear all fields */ - MemSet(slot, 0, sizeof(PrefetchRequest)); - slot->status = PRFS_UNUSED; - - /* run cleanup if we're holding back ring_last */ - if (MyPState->ring_last == ring_index) - prefetch_cleanup_trailing_unused(); - - /* - * ... 
and try to store the buffered responses more compactly if > 12.5% - * of the buffer is gaps - */ - else if (ReceiveBufferNeedsCompaction()) - compact_prefetch_buffers(); -} - -/* - * Send one prefetch request to the pageserver. To wait for the response, call - * prefetch_wait_for(). - */ -static void -prefetch_do_request(PrefetchRequest *slot, neon_request_lsns *force_request_lsns) -{ - bool found; - uint64 mySlotNo PG_USED_FOR_ASSERTS_ONLY = slot->my_ring_index; - - NeonGetPageRequest request = { - .hdr.tag = T_NeonGetPageRequest, - .hdr.reqid = GENERATE_REQUEST_ID(), - /* lsn and not_modified_since are filled in below */ - .rinfo = BufTagGetNRelFileInfo(slot->buftag), - .forknum = slot->buftag.forkNum, - .blkno = slot->buftag.blockNum, - }; - - Assert(mySlotNo == MyPState->ring_unused); - - slot->reqid = request.hdr.reqid; - - if (force_request_lsns) - slot->request_lsns = *force_request_lsns; - else - neon_get_request_lsns(BufTagGetNRelFileInfo(slot->buftag), - slot->buftag.forkNum, slot->buftag.blockNum, - &slot->request_lsns, 1); - request.hdr.lsn = slot->request_lsns.request_lsn; - request.hdr.not_modified_since = slot->request_lsns.not_modified_since; - - Assert(slot->response == NULL); - Assert(slot->my_ring_index == MyPState->ring_unused); - - while (!page_server->send(slot->shard_no, (NeonRequest *) &request)) - { - Assert(mySlotNo == MyPState->ring_unused); - /* loop */ - } - - /* update prefetch state */ - MyPState->n_requests_inflight += 1; - MyPState->n_unused -= 1; - MyPState->ring_unused += 1; - BITMAP_SET(MyPState->shard_bitmap, slot->shard_no); - MyPState->max_shard_no = Max(slot->shard_no+1, MyPState->max_shard_no); - - /* update slot state */ - slot->status = PRFS_REQUESTED; - prfh_insert(MyPState->prf_hash, slot, &found); - Assert(!found); -} - -/* - * Lookup of already received prefetch requests. Only already received responses matching required LSNs are accepted. 
- * Present pages are marked in "mask" bitmap and total number of such pages is returned. - */ -static int -prefetch_lookupv(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blocknum, neon_request_lsns *lsns, - BlockNumber nblocks, void **buffers, bits8 *mask) -{ - int hits = 0; - PrefetchRequest hashkey; - - /* - * Use an intermediate PrefetchRequest struct as the hash key to ensure - * correct alignment and that the padding bytes are cleared. - */ - memset(&hashkey.buftag, 0, sizeof(BufferTag)); - CopyNRelFileInfoToBufTag(hashkey.buftag, rinfo); - hashkey.buftag.forkNum = forknum; - - for (int i = 0; i < nblocks; i++) - { - PrfHashEntry *entry; - - hashkey.buftag.blockNum = blocknum + i; - entry = prfh_lookup(MyPState->prf_hash, &hashkey); - - if (entry != NULL) - { - PrefetchRequest *slot = entry->slot; - uint64 ring_index = slot->my_ring_index; - Assert(slot == GetPrfSlot(ring_index)); - - Assert(slot->status != PRFS_UNUSED); - Assert(MyPState->ring_last <= ring_index && - ring_index < MyPState->ring_unused); - Assert(BufferTagsEqual(&slot->buftag, &hashkey.buftag)); - - if (slot->status != PRFS_RECEIVED) - continue; - - /* - * If the caller specified a request LSN to use, only accept - * prefetch responses that satisfy that request. - */ - if (!neon_prefetch_response_usable(&lsns[i], slot)) - continue; - - /* - * Ignore errors - */ - if (slot->response->tag != T_NeonGetPageResponse) - { - if (slot->response->tag != T_NeonErrorResponse) - { - NEON_PANIC_CONNECTION_STATE(slot->shard_no, PANIC, - "Expected GetPage (0x%02x) or Error (0x%02x) response to GetPageRequest, but got 0x%02x", - T_NeonGetPageResponse, T_NeonErrorResponse, slot->response->tag); - } - continue; - } - memcpy(buffers[i], ((NeonGetPageResponse*)slot->response)->page, BLCKSZ); - - - /* - * With lfc_store_prefetch_result=true prefetch result is stored in LFC in prefetch_pump_state when response is received - * from page server. 
But if lfc_store_prefetch_result=false then it is not yet stored in LFC and we have to do it here - * under buffer lock. - */ - if (!lfc_store_prefetch_result) - lfc_write(rinfo, forknum, blocknum + i, buffers[i]); - - prefetch_set_unused(ring_index); - BITMAP_SET(mask, i); - - hits += 1; - inc_getpage_wait(0); - } - } - pgBufferUsage.prefetch.hits += hits; - return hits; -} - -#if PG_MAJORVERSION_NUM < 17 -static bool -prefetch_lookup(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkn, neon_request_lsns *lsns, void *buffer) -{ - bits8 present = 0; - return prefetch_lookupv(rinfo, forkNum, blkn, lsns, 1, &buffer, &present) != 0; -} -#endif - -/* - * prefetch_register_bufferv() - register and prefetch buffers - * - * Register that we may want the contents of BufferTag in the near future. - * This is used when issuing a speculative prefetch request, but also when - * performing a synchronous request and need the buffer right now. - * - * If force_request_lsns is not NULL, those values are sent to the - * pageserver. If NULL, we utilize the lastWrittenLsn -infrastructure - * to calculate the LSNs to send. - * - * When performing a prefetch rather than a synchronous request, - * is_prefetch==true. Currently, it only affects how the request is accounted - * in the perf counters. - * - * NOTE: this function may indirectly update MyPState->pfs_hash; which - * invalidates any active pointers into the hash table. - */ -static uint64 -prefetch_register_bufferv(BufferTag tag, neon_request_lsns *frlsns, - BlockNumber nblocks, const bits8 *mask, - bool is_prefetch) -{ - uint64 min_ring_index; - PrefetchRequest hashkey; -#ifdef USE_ASSERT_CHECKING - bool any_hits = false; -#endif - /* We will never read further ahead than our buffer can store. */ - nblocks = Max(1, Min(nblocks, readahead_buffer_size)); - - /* - * Use an intermediate PrefetchRequest struct as the hash key to ensure - * correct alignment and that the padding bytes are cleared. 
- */ - memset(&hashkey.buftag, 0, sizeof(BufferTag)); - hashkey.buftag = tag; - -Retry: - /* - * We can have gone into retry due to network error, so update stats with - * the latest available - */ - MyNeonCounters->pageserver_open_requests = - MyPState->ring_unused - MyPState->ring_receive; - MyNeonCounters->getpage_prefetches_buffered = - MyPState->n_responses_buffered; - - min_ring_index = UINT64_MAX; - for (int i = 0; i < nblocks; i++) - { - PrefetchRequest *slot = NULL; - PrfHashEntry *entry = NULL; - uint64 ring_index; - neon_request_lsns *lsns; - - if (PointerIsValid(mask) && !BITMAP_ISSET(mask, i)) - continue; - - if (frlsns) - lsns = &frlsns[i]; - else - lsns = NULL; - -#ifdef USE_ASSERT_CHECKING - any_hits = true; -#endif - - slot = NULL; - entry = NULL; - - hashkey.buftag.blockNum = tag.blockNum + i; - entry = prfh_lookup(MyPState->prf_hash, &hashkey); - - if (entry != NULL) - { - slot = entry->slot; - ring_index = slot->my_ring_index; - Assert(slot == GetPrfSlot(ring_index)); - - Assert(slot->status != PRFS_UNUSED); - Assert(MyPState->ring_last <= ring_index && - ring_index < MyPState->ring_unused); - Assert(BufferTagsEqual(&slot->buftag, &hashkey.buftag)); - - /* - * If the caller specified a request LSN to use, only accept - * prefetch responses that satisfy that request. - */ - if (lsns) - { - if (!neon_prefetch_response_usable(lsns, slot)) - { - /* Wait for the old request to finish and discard it */ - if (!prefetch_wait_for(ring_index)) - goto Retry; - prefetch_set_unused(ring_index); - entry = NULL; - slot = NULL; - pgBufferUsage.prefetch.expired += 1; - MyNeonCounters->getpage_prefetch_discards_total += 1; - } - } - - if (entry != NULL) - { - /* - * We received a prefetch for a page that was recently read - * and removed from the buffers. Remove that request from the - * buffers. 
- */ - if (slot->status == PRFS_TAG_REMAINS) - { - prefetch_set_unused(ring_index); - entry = NULL; - slot = NULL; - } - else - { - min_ring_index = Min(min_ring_index, ring_index); - /* The buffered request is good enough, return that index */ - if (is_prefetch) - pgBufferUsage.prefetch.duplicates++; - continue; - } - } - } - else if (!is_prefetch) - { - pgBufferUsage.prefetch.misses += 1; - MyNeonCounters->getpage_prefetch_misses_total++; - } - /* - * We can only leave the block above by finding that there's - * no entry that can satisfy this request, either because there - * was no entry, or because the entry was invalid or didn't satisfy - * the LSNs provided. - * - * The code should've made sure to clear up the data. - */ - Assert(entry == NULL); - Assert(slot == NULL); - - /* There should be no buffer overflow */ - Assert(MyPState->ring_last + readahead_buffer_size >= MyPState->ring_unused); - - /* - * If the prefetch queue is full, we need to make room by clearing the - * oldest slot. If the oldest slot holds a buffer that was already - * received, we can just throw it away; we fetched the page - * unnecessarily in that case. If the oldest slot holds a request that - * we haven't received a response for yet, we have to wait for the - * response to that before we can continue. We might not have even - * flushed the request to the pageserver yet, it might be just sitting - * in the output buffer. In that case, we flush it and wait for the - * response. (We could decide not to send it, but it's hard to abort - * when the request is already in the output buffer, and 'not sending' - * a prefetch request kind of goes against the principles of - * prefetching) - */ - if (MyPState->ring_last + readahead_buffer_size == MyPState->ring_unused) - { - uint64 cleanup_index = MyPState->ring_last; - - slot = GetPrfSlot(cleanup_index); - - Assert(slot->status != PRFS_UNUSED); - - /* - * If there is good reason to run compaction on the prefetch buffers, - * try to do that. 
- */ - if (ReceiveBufferNeedsCompaction() && compact_prefetch_buffers()) - { - Assert(slot->status == PRFS_UNUSED); - } - else - { - /* - * We have the slot for ring_last, so that must still be in - * progress - */ - switch (slot->status) - { - case PRFS_REQUESTED: - Assert(MyPState->ring_receive == cleanup_index); - if (!prefetch_wait_for(cleanup_index)) - goto Retry; - prefetch_set_unused(cleanup_index); - pgBufferUsage.prefetch.expired += 1; - MyNeonCounters->getpage_prefetch_discards_total += 1; - break; - case PRFS_RECEIVED: - case PRFS_TAG_REMAINS: - prefetch_set_unused(cleanup_index); - pgBufferUsage.prefetch.expired += 1; - MyNeonCounters->getpage_prefetch_discards_total += 1; - break; - default: - pg_unreachable(); - } - } - } - - /* - * The next buffer pointed to by `ring_unused` is now definitely empty, so - * we can insert the new request to it. - */ - ring_index = MyPState->ring_unused; - - Assert(MyPState->ring_last <= ring_index && - ring_index <= MyPState->ring_unused); - - slot = GetPrfSlotNoCheck(ring_index); - - Assert(slot->status == PRFS_UNUSED); - - /* - * We must update the slot data before insertion, because the hash - * function reads the buffer tag from the slot. 
- */ - slot->buftag = hashkey.buftag; - slot->shard_no = get_shard_number(&tag); - slot->my_ring_index = ring_index; - slot->flags = 0; - - min_ring_index = Min(min_ring_index, ring_index); - - if (is_prefetch) - MyNeonCounters->getpage_prefetch_requests_total++; - else - MyNeonCounters->getpage_sync_requests_total++; - - prefetch_do_request(slot, lsns); - } - - MyNeonCounters->pageserver_open_requests = - MyPState->ring_unused - MyPState->ring_receive; - - Assert(any_hits); - - Assert(GetPrfSlot(min_ring_index)->status == PRFS_REQUESTED || - GetPrfSlot(min_ring_index)->status == PRFS_RECEIVED); - Assert(MyPState->ring_last <= min_ring_index && - min_ring_index < MyPState->ring_unused); - - if (flush_every_n_requests > 0 && - MyPState->ring_unused - MyPState->ring_flush >= flush_every_n_requests) - { - if (!prefetch_flush_requests()) - { - /* - * Prefetch set is reset in case of error, so we should try to - * register our request once again - */ - goto Retry; - } - MyPState->ring_flush = MyPState->ring_unused; - } - - return min_ring_index; -} - -static bool -equal_requests(NeonRequest* a, NeonRequest* b) -{ - return a->reqid == b->reqid && a->lsn == b->lsn && a->not_modified_since == b->not_modified_since; -} - - -/* - * Note: this function can get canceled and use a long jump to the next catch - * context. Take care. 
- */ -static NeonResponse * -page_server_request(void const *req) -{ - NeonResponse *resp; - BufferTag tag = {0}; - shardno_t shard_no; - - switch (messageTag(req)) - { - case T_NeonExistsRequest: - CopyNRelFileInfoToBufTag(tag, ((NeonExistsRequest *) req)->rinfo); - break; - case T_NeonNblocksRequest: - CopyNRelFileInfoToBufTag(tag, ((NeonNblocksRequest *) req)->rinfo); - break; - case T_NeonDbSizeRequest: - NInfoGetDbOid(BufTagGetNRelFileInfo(tag)) = ((NeonDbSizeRequest *) req)->dbNode; - break; - case T_NeonGetPageRequest: - CopyNRelFileInfoToBufTag(tag, ((NeonGetPageRequest *) req)->rinfo); - tag.blockNum = ((NeonGetPageRequest *) req)->blkno; - break; - default: - neon_log(ERROR, "Unexpected request tag: %d", messageTag(req)); - } - shard_no = get_shard_number(&tag); - - /* - * Current sharding model assumes that all metadata is present only at shard 0. - * We still need to call get_shard_no() to check if shard map is up-to-date. - */ - if (((NeonRequest *) req)->tag != T_NeonGetPageRequest) - { - shard_no = 0; - } - - do - { - PG_TRY(); - { - while (!page_server->send(shard_no, (NeonRequest *) req) - || !page_server->flush(shard_no)) - { - /* do nothing */ - } - MyNeonCounters->pageserver_open_requests++; - consume_prefetch_responses(); - resp = page_server->receive(shard_no); - MyNeonCounters->pageserver_open_requests--; - } - PG_CATCH(); - { - /* - * Cancellation in this code needs to be handled better at some - * point, but this currently seems fine for now. - */ - page_server->disconnect(shard_no); - MyNeonCounters->pageserver_open_requests = 0; - - /* - * We know for sure we're not working on any prefetch pages after - * this. 
- */ - END_PREFETCH_RECEIVE_WORK(); - - PG_RE_THROW(); - } - PG_END_TRY(); - - } while (resp == NULL); - - return resp; -} - - -StringInfoData -nm_pack_request(NeonRequest *msg) -{ - StringInfoData s; - - initStringInfo(&s); - - pq_sendbyte(&s, msg->tag); - if (neon_protocol_version >= 3) - { - pq_sendint64(&s, msg->reqid); - } - pq_sendint64(&s, msg->lsn); - pq_sendint64(&s, msg->not_modified_since); - - switch (messageTag(msg)) - { - /* pagestore_client -> pagestore */ - case T_NeonExistsRequest: - { - NeonExistsRequest *msg_req = (NeonExistsRequest *) msg; - - pq_sendint32(&s, NInfoGetSpcOid(msg_req->rinfo)); - pq_sendint32(&s, NInfoGetDbOid(msg_req->rinfo)); - pq_sendint32(&s, NInfoGetRelNumber(msg_req->rinfo)); - pq_sendbyte(&s, msg_req->forknum); - - break; - } - case T_NeonNblocksRequest: - { - NeonNblocksRequest *msg_req = (NeonNblocksRequest *) msg; - - pq_sendint32(&s, NInfoGetSpcOid(msg_req->rinfo)); - pq_sendint32(&s, NInfoGetDbOid(msg_req->rinfo)); - pq_sendint32(&s, NInfoGetRelNumber(msg_req->rinfo)); - pq_sendbyte(&s, msg_req->forknum); - - break; - } - case T_NeonDbSizeRequest: - { - NeonDbSizeRequest *msg_req = (NeonDbSizeRequest *) msg; - - pq_sendint32(&s, msg_req->dbNode); - - break; - } - case T_NeonGetPageRequest: - { - NeonGetPageRequest *msg_req = (NeonGetPageRequest *) msg; - - pq_sendint32(&s, NInfoGetSpcOid(msg_req->rinfo)); - pq_sendint32(&s, NInfoGetDbOid(msg_req->rinfo)); - pq_sendint32(&s, NInfoGetRelNumber(msg_req->rinfo)); - pq_sendbyte(&s, msg_req->forknum); - pq_sendint32(&s, msg_req->blkno); - - break; - } - - case T_NeonGetSlruSegmentRequest: - { - NeonGetSlruSegmentRequest *msg_req = (NeonGetSlruSegmentRequest *) msg; - - pq_sendbyte(&s, msg_req->kind); - pq_sendint32(&s, msg_req->segno); - - break; - } - - /* pagestore -> pagestore_client. We never need to create these. 
*/ - case T_NeonExistsResponse: - case T_NeonNblocksResponse: - case T_NeonGetPageResponse: - case T_NeonErrorResponse: - case T_NeonDbSizeResponse: - case T_NeonGetSlruSegmentResponse: - default: - neon_log(ERROR, "unexpected neon message tag 0x%02x", msg->tag); - break; - } - return s; -} - -NeonResponse * -nm_unpack_response(StringInfo s) -{ - NeonMessageTag tag = pq_getmsgbyte(s); - NeonResponse resp_hdr = {0}; /* make valgrind happy */ - NeonResponse *resp = NULL; - - resp_hdr.tag = tag; - if (neon_protocol_version >= 3) - { - resp_hdr.reqid = pq_getmsgint64(s); - resp_hdr.lsn = pq_getmsgint64(s); - resp_hdr.not_modified_since = pq_getmsgint64(s); - } - switch (tag) - { - /* pagestore -> pagestore_client */ - case T_NeonExistsResponse: - { - NeonExistsResponse *msg_resp = palloc0(sizeof(NeonExistsResponse)); - - if (neon_protocol_version >= 3) - { - NInfoGetSpcOid(msg_resp->req.rinfo) = pq_getmsgint(s, 4); - NInfoGetDbOid(msg_resp->req.rinfo) = pq_getmsgint(s, 4); - NInfoGetRelNumber(msg_resp->req.rinfo) = pq_getmsgint(s, 4); - msg_resp->req.forknum = pq_getmsgbyte(s); - } - msg_resp->req.hdr = resp_hdr; - msg_resp->exists = pq_getmsgbyte(s); - pq_getmsgend(s); - - resp = (NeonResponse *) msg_resp; - break; - } - - case T_NeonNblocksResponse: - { - NeonNblocksResponse *msg_resp = palloc0(sizeof(NeonNblocksResponse)); - - if (neon_protocol_version >= 3) - { - NInfoGetSpcOid(msg_resp->req.rinfo) = pq_getmsgint(s, 4); - NInfoGetDbOid(msg_resp->req.rinfo) = pq_getmsgint(s, 4); - NInfoGetRelNumber(msg_resp->req.rinfo) = pq_getmsgint(s, 4); - msg_resp->req.forknum = pq_getmsgbyte(s); - } - msg_resp->req.hdr = resp_hdr; - msg_resp->n_blocks = pq_getmsgint(s, 4); - pq_getmsgend(s); - - resp = (NeonResponse *) msg_resp; - break; - } - - case T_NeonGetPageResponse: - { - NeonGetPageResponse *msg_resp; - - msg_resp = MemoryContextAllocZero(MyPState->bufctx, PS_GETPAGERESPONSE_SIZE); - if (neon_protocol_version >= 3) - { - NInfoGetSpcOid(msg_resp->req.rinfo) = 
pq_getmsgint(s, 4); - NInfoGetDbOid(msg_resp->req.rinfo) = pq_getmsgint(s, 4); - NInfoGetRelNumber(msg_resp->req.rinfo) = pq_getmsgint(s, 4); - msg_resp->req.forknum = pq_getmsgbyte(s); - msg_resp->req.blkno = pq_getmsgint(s, 4); - } - msg_resp->req.hdr = resp_hdr; - /* XXX: should be varlena */ - memcpy(msg_resp->page, pq_getmsgbytes(s, BLCKSZ), BLCKSZ); - pq_getmsgend(s); - - Assert(msg_resp->req.hdr.tag == T_NeonGetPageResponse); - - resp = (NeonResponse *) msg_resp; - break; - } - - case T_NeonDbSizeResponse: - { - NeonDbSizeResponse *msg_resp = palloc0(sizeof(NeonDbSizeResponse)); - - if (neon_protocol_version >= 3) - { - msg_resp->req.dbNode = pq_getmsgint(s, 4); - } - msg_resp->req.hdr = resp_hdr; - msg_resp->db_size = pq_getmsgint64(s); - pq_getmsgend(s); - - resp = (NeonResponse *) msg_resp; - break; - } - - case T_NeonErrorResponse: - { - NeonErrorResponse *msg_resp; - size_t msglen; - const char *msgtext; - - msgtext = pq_getmsgrawstring(s); - msglen = strlen(msgtext); - - msg_resp = palloc0(sizeof(NeonErrorResponse) + msglen + 1); - msg_resp->req = resp_hdr; - memcpy(msg_resp->message, msgtext, msglen + 1); - pq_getmsgend(s); - - resp = (NeonResponse *) msg_resp; - break; - } - - case T_NeonGetSlruSegmentResponse: - { - NeonGetSlruSegmentResponse *msg_resp; - int n_blocks; - msg_resp = palloc0(sizeof(NeonGetSlruSegmentResponse)); - - if (neon_protocol_version >= 3) - { - msg_resp->req.kind = pq_getmsgbyte(s); - msg_resp->req.segno = pq_getmsgint(s, 4); - } - msg_resp->req.hdr = resp_hdr; - - n_blocks = pq_getmsgint(s, 4); - msg_resp->n_blocks = n_blocks; - memcpy(msg_resp->data, pq_getmsgbytes(s, n_blocks * BLCKSZ), n_blocks * BLCKSZ); - pq_getmsgend(s); - - resp = (NeonResponse *) msg_resp; - break; - } - - /* - * pagestore_client -> pagestore - * - * We create these ourselves, and don't need to decode them. 
- */ - case T_NeonExistsRequest: - case T_NeonNblocksRequest: - case T_NeonGetPageRequest: - case T_NeonDbSizeRequest: - case T_NeonGetSlruSegmentRequest: - default: - neon_log(ERROR, "unexpected neon message tag 0x%02x", tag); - break; - } - - return resp; -} - -/* dump to json for debugging / error reporting purposes */ -char * -nm_to_string(NeonMessage *msg) -{ - StringInfoData s; - - initStringInfo(&s); - - switch (messageTag(msg)) - { - /* pagestore_client -> pagestore */ - case T_NeonExistsRequest: - { - NeonExistsRequest *msg_req = (NeonExistsRequest *) msg; - - appendStringInfoString(&s, "{\"type\": \"NeonExistsRequest\""); - appendStringInfo(&s, ", \"rinfo\": \"%u/%u/%u\"", RelFileInfoFmt(msg_req->rinfo)); - appendStringInfo(&s, ", \"forknum\": %d", msg_req->forknum); - appendStringInfo(&s, ", \"lsn\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->hdr.lsn)); - appendStringInfo(&s, ", \"not_modified_since\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->hdr.not_modified_since)); - appendStringInfoChar(&s, '}'); - break; - } - - case T_NeonNblocksRequest: - { - NeonNblocksRequest *msg_req = (NeonNblocksRequest *) msg; - - appendStringInfoString(&s, "{\"type\": \"NeonNblocksRequest\""); - appendStringInfo(&s, ", \"rinfo\": \"%u/%u/%u\"", RelFileInfoFmt(msg_req->rinfo)); - appendStringInfo(&s, ", \"forknum\": %d", msg_req->forknum); - appendStringInfo(&s, ", \"lsn\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->hdr.lsn)); - appendStringInfo(&s, ", \"not_modified_since\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->hdr.not_modified_since)); - appendStringInfoChar(&s, '}'); - break; - } - - case T_NeonGetPageRequest: - { - NeonGetPageRequest *msg_req = (NeonGetPageRequest *) msg; - - appendStringInfoString(&s, "{\"type\": \"NeonGetPageRequest\""); - appendStringInfo(&s, ", \"rinfo\": \"%u/%u/%u\"", RelFileInfoFmt(msg_req->rinfo)); - appendStringInfo(&s, ", \"forknum\": %d", msg_req->forknum); - appendStringInfo(&s, ", \"blkno\": %u", msg_req->blkno); - appendStringInfo(&s, ", \"lsn\": 
\"%X/%X\"", LSN_FORMAT_ARGS(msg_req->hdr.lsn)); - appendStringInfo(&s, ", \"not_modified_since\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->hdr.not_modified_since)); - appendStringInfoChar(&s, '}'); - break; - } - case T_NeonDbSizeRequest: - { - NeonDbSizeRequest *msg_req = (NeonDbSizeRequest *) msg; - - appendStringInfoString(&s, "{\"type\": \"NeonDbSizeRequest\""); - appendStringInfo(&s, ", \"dbnode\": \"%u\"", msg_req->dbNode); - appendStringInfo(&s, ", \"lsn\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->hdr.lsn)); - appendStringInfo(&s, ", \"not_modified_since\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->hdr.not_modified_since)); - appendStringInfoChar(&s, '}'); - break; - } - case T_NeonGetSlruSegmentRequest: - { - NeonGetSlruSegmentRequest *msg_req = (NeonGetSlruSegmentRequest *) msg; - - appendStringInfoString(&s, "{\"type\": \"NeonGetSlruSegmentRequest\""); - appendStringInfo(&s, ", \"kind\": %u", msg_req->kind); - appendStringInfo(&s, ", \"segno\": %u", msg_req->segno); - appendStringInfo(&s, ", \"lsn\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->hdr.lsn)); - appendStringInfo(&s, ", \"not_modified_since\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->hdr.not_modified_since)); - appendStringInfoChar(&s, '}'); - break; - } - /* pagestore -> pagestore_client */ - case T_NeonExistsResponse: - { - NeonExistsResponse *msg_resp = (NeonExistsResponse *) msg; - - appendStringInfoString(&s, "{\"type\": \"NeonExistsResponse\""); - appendStringInfo(&s, ", \"exists\": %d}", - msg_resp->exists); - appendStringInfoChar(&s, '}'); - - break; - } - case T_NeonNblocksResponse: - { - NeonNblocksResponse *msg_resp = (NeonNblocksResponse *) msg; - - appendStringInfoString(&s, "{\"type\": \"NeonNblocksResponse\""); - appendStringInfo(&s, ", \"n_blocks\": %u}", - msg_resp->n_blocks); - appendStringInfoChar(&s, '}'); - - break; - } - case T_NeonGetPageResponse: - { -#if 0 - NeonGetPageResponse *msg_resp = (NeonGetPageResponse *) msg; -#endif - - appendStringInfoString(&s, "{\"type\": \"NeonGetPageResponse\""); 
- appendStringInfo(&s, ", \"page\": \"XXX\"}"); - appendStringInfoChar(&s, '}'); - break; - } - case T_NeonErrorResponse: - { - NeonErrorResponse *msg_resp = (NeonErrorResponse *) msg; - - /* FIXME: escape double-quotes in the message */ - appendStringInfoString(&s, "{\"type\": \"NeonErrorResponse\""); - appendStringInfo(&s, ", \"message\": \"%s\"}", msg_resp->message); - appendStringInfoChar(&s, '}'); - break; - } - case T_NeonDbSizeResponse: - { - NeonDbSizeResponse *msg_resp = (NeonDbSizeResponse *) msg; - - appendStringInfoString(&s, "{\"type\": \"NeonDbSizeResponse\""); - appendStringInfo(&s, ", \"db_size\": %ld}", - msg_resp->db_size); - appendStringInfoChar(&s, '}'); - - break; - } - case T_NeonGetSlruSegmentResponse: - { - NeonGetSlruSegmentResponse *msg_resp = (NeonGetSlruSegmentResponse *) msg; - - appendStringInfoString(&s, "{\"type\": \"NeonGetSlruSegmentResponse\""); - appendStringInfo(&s, ", \"n_blocks\": %u}", - msg_resp->n_blocks); - appendStringInfoChar(&s, '}'); - - break; - } - - default: - appendStringInfo(&s, "{\"type\": \"unknown 0x%02x\"", msg->tag); - } - return s.data; -} - /* * Wrapper around log_newpage() that makes a temporary copy of the block and * WAL-logs that. This makes it safe to use while holding only a shared lock @@ -1900,7 +213,6 @@ neon_wallog_pagev(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, log_pages = true; } else if (XLogInsertAllowed() && - !ShutdownRequestPending && (forknum == FSM_FORKNUM || forknum == VISIBILITYMAP_FORKNUM)) { log_pages = true; @@ -2149,11 +461,6 @@ neon_wallog_page(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, co static void neon_init(void) { - Size prfs_size; - - if (MyPState != NULL) - return; - /* * Sanity check that theperf counters array is sized correctly. 
We got * this wrong once, and the formula for max number of backends and aux @@ -2168,27 +475,6 @@ neon_init(void) elog(ERROR, "MyNeonCounters points past end of array"); #endif - prfs_size = offsetof(PrefetchState, prf_buffer) + - sizeof(PrefetchRequest) * readahead_buffer_size; - - MyPState = MemoryContextAllocZero(TopMemoryContext, prfs_size); - - MyPState->n_unused = readahead_buffer_size; - - MyPState->bufctx = SlabContextCreate(TopMemoryContext, - "NeonSMGR/prefetch", - SLAB_DEFAULT_BLOCK_SIZE * 17, - PS_GETPAGERESPONSE_SIZE); - MyPState->errctx = AllocSetContextCreate(TopMemoryContext, - "NeonSMGR/errors", - ALLOCSET_DEFAULT_SIZES); - MyPState->hashctx = AllocSetContextCreate(TopMemoryContext, - "NeonSMGR/prefetch", - ALLOCSET_DEFAULT_SIZES); - - MyPState->prf_hash = prfh_create(MyPState->hashctx, - readahead_buffer_size, NULL); - old_redo_read_buffer_filter = redo_read_buffer_filter; redo_read_buffer_filter = neon_redo_read_buffer_filter; @@ -2225,8 +511,10 @@ nm_adjust_lsn(XLogRecPtr lsn) /* * Return LSN for requesting pages and number of blocks from page server + * + * XXX: exposed so that prefetch_do_request() can call back here. */ -static void +void neon_get_request_lsns(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blkno, neon_request_lsns *output, BlockNumber nblocks) { @@ -2381,7 +669,6 @@ neon_get_request_lsns(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blkno, LSN_FORMAT_ARGS(last_written_lsn), LSN_FORMAT_ARGS(flushlsn)); XLogFlush(last_written_lsn); - flushlsn = last_written_lsn; } /* @@ -2397,131 +684,45 @@ neon_get_request_lsns(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blkno, * requesting the latest page, by setting request LSN to * UINT64_MAX. * - * Remember the current LSN, however, so that we can later - * correctly determine if the response to the request is still - * valid. 
The most up-to-date LSN we could use for that purpose - * would be the current insert LSN, but to avoid the overhead of - * looking it up, use 'flushlsn' instead. This relies on the - * assumption that if the page was modified since the last WAL - * flush, it should still be in the buffer cache, and we - * wouldn't be requesting it. + * effective_request_lsn is used to check that received response is still valid. + * In case of primary node it is last written LSN. Originally we used flush_lsn here, + * but it is not correct. Consider the following scenario: + * 1. Backend A wants to prefetch block X + * 2. Backend A checks that block X is not present in the shared buffer cache + * 3. Backend A calls prefetch_do_request, which calls neon_get_request_lsns + * 4. neon_get_request_lsns obtains LwLSN=11 for the block + * 5. Backend B downloads block X, updates and wallogs it with LSN=13 + * 6. Block X is once again evicted from shared buffers, its LwLSN is set to LSN=13 + * 7. Backend A is still executing in neon_get_request_lsns(). It calls 'flushlsn = GetFlushRecPtr();'. + * Let's say that it is LSN=14 + * 8. Backend A uses LSN=14 as effective_lsn in the prefetch slot. The request stored in the slot is + * [not_modified_since=11, effective_request_lsn=14] + * 9. Backend A sends the prefetch request, pageserver processes it, and sends response. + * The last LSN that the pageserver had processed was LSN=12, so the page image in the response is valid at LSN=12. + * 10. Backend A calls smgrread() for page X with LwLSN=13 + * 11. Backend A finds in prefetch ring the response for the prefetch request with [not_modified_since=11, effective_lsn=Lsn14], + * so it satisfies neon_prefetch_response_usable condition. + * + * Things go wrong in step 7-8, when [not_modified_since=11, effective_request_lsn=14] is determined for the request. + * That is incorrect, because the page has in fact been modified at LSN=13. 
The invariant is that for any request, + * there should not be any modifications to a page between its not_modified_since and (effective_)request_lsn values. + * + * The problem can be fixed by calling GetFlushRecPtr() before checking if the page is in the buffer cache. + * But you can't do that within smgrprefetch(), would need to modify the caller. */ result->request_lsn = UINT64_MAX; result->not_modified_since = last_written_lsn; - result->effective_request_lsn = flushlsn; + result->effective_request_lsn = last_written_lsn; } } } -/* - * neon_prefetch_response_usable -- Can a new request be satisfied by old one? - * - * This is used to check if the response to a prefetch request can be used to - * satisfy a page read now. - */ -static bool -neon_prefetch_response_usable(neon_request_lsns *request_lsns, - PrefetchRequest *slot) -{ - /* sanity check the LSN's on the old and the new request */ - Assert(request_lsns->request_lsn >= request_lsns->not_modified_since); - Assert(request_lsns->effective_request_lsn >= request_lsns->not_modified_since); - Assert(request_lsns->effective_request_lsn <= request_lsns->request_lsn); - Assert(slot->request_lsns.request_lsn >= slot->request_lsns.not_modified_since); - Assert(slot->request_lsns.effective_request_lsn >= slot->request_lsns.not_modified_since); - Assert(slot->request_lsns.effective_request_lsn <= slot->request_lsns.request_lsn); - Assert(slot->status != PRFS_UNUSED); - - /* - * The new request's LSN should never be older than the old one. This - * could be an Assert, except that for testing purposes, we do provide an - * interface in neon_test_utils to fetch pages at arbitary LSNs, which - * violates this. - * - * Similarly, the not_modified_since value calculated for a page should - * never move backwards. This assumption is a bit fragile; if we updated - * the last-written cache when we read in a page, for example, then it - * might. But as the code stands, it should not.
- * - * (If two backends issue a request at the same time, they might race and - * calculate LSNs "out of order" with each other, but the prefetch queue - * is backend-private at the moment.) - */ - if (request_lsns->effective_request_lsn < slot->request_lsns.effective_request_lsn || - request_lsns->not_modified_since < slot->request_lsns.not_modified_since) - { - ereport(LOG, - (errcode(ERRCODE_IO_ERROR), - errmsg(NEON_TAG "request with unexpected LSN after prefetch"), - errdetail("Request %X/%X not_modified_since %X/%X, prefetch %X/%X not_modified_since %X/%X)", - LSN_FORMAT_ARGS(request_lsns->effective_request_lsn), - LSN_FORMAT_ARGS(request_lsns->not_modified_since), - LSN_FORMAT_ARGS(slot->request_lsns.effective_request_lsn), - LSN_FORMAT_ARGS(slot->request_lsns.not_modified_since)))); - return false; - } - - /*--- - * Each request to the pageserver has three LSN values associated with it: - * `not_modified_since`, `request_lsn`, and 'effective_request_lsn'. - * `not_modified_since` and `request_lsn` are sent to the pageserver, but - * in the primary node, we always use UINT64_MAX as the `request_lsn`, so - * we remember `effective_request_lsn` separately. In a primary, - * `effective_request_lsn` is the last flush WAL position when the request - * was sent to the pageserver. That's logically the LSN that we are - * requesting the page at, but we send UINT64_MAX to the pageserver so - * that if the GC horizon advances past that position, we still get a - * valid response instead of an error. - * - * To determine whether a response to a GetPage request issued earlier is - * still valid to satisfy a new page read, we look at the - * (not_modified_since, effective_request_lsn] range of the request. It is - * effectively a claim that the page has not been modified between those - * LSNs. If the range of the old request in the queue overlaps with the - * new request, we know that the page hasn't been modified in the union of - * the ranges. 
We can use the response to old request to satisfy the new - * request in that case. For example: - * - * 100 500 - * Old request: +--------+ - * - * 400 800 - * New request: +--------+ - * - * The old request claims that the page was not modified between LSNs 100 - * and 500, and the second claims that it was not modified between 400 and - * 800. Together they mean that the page was not modified between 100 and - * 800. Therefore the response to the old request is also valid for the - * new request. - * - * This logic also holds at the boundary case that the old request's LSN - * matches the new request's not_modified_since LSN exactly: - * - * 100 500 - * Old request: +--------+ - * - * 500 900 - * New request: +--------+ - * - * The response to the old request is the page as it was at LSN 500, and - * the page hasn't been changed in the range (500, 900], therefore the - * response is valid also for the new request. - */ - - /* this follows from the checks above */ - Assert(request_lsns->effective_request_lsn >= slot->request_lsns.not_modified_since); - - return request_lsns->not_modified_since <= slot->request_lsns.effective_request_lsn; -} - /* * neon_exists() -- Does the physical file exist? 
*/ static bool neon_exists(SMgrRelation reln, ForkNumber forkNum) { - bool exists; - NeonResponse *resp; BlockNumber n_blocks; neon_request_lsns request_lsns; @@ -2580,67 +781,8 @@ neon_exists(SMgrRelation reln, ForkNumber forkNum) neon_get_request_lsns(InfoFromSMgrRel(reln), forkNum, REL_METADATA_PSEUDO_BLOCKNO, &request_lsns, 1); - { - NeonExistsRequest request = { - .hdr.tag = T_NeonExistsRequest, - .hdr.reqid = GENERATE_REQUEST_ID(), - .hdr.lsn = request_lsns.request_lsn, - .hdr.not_modified_since = request_lsns.not_modified_since, - .rinfo = InfoFromSMgrRel(reln), - .forknum = forkNum - }; - resp = page_server_request(&request); - - switch (resp->tag) - { - case T_NeonExistsResponse: - { - NeonExistsResponse* exists_resp = (NeonExistsResponse *) resp; - if (neon_protocol_version >= 3) - { - if (!equal_requests(resp, &request.hdr) || - !RelFileInfoEquals(exists_resp->req.rinfo, request.rinfo) || - exists_resp->req.forknum != request.forknum) - { - NEON_PANIC_CONNECTION_STATE(-1, PANIC, - "Unexpect response {reqid=%lx,lsn=%X/%08X, since=%X/%08X, rel=%u/%u/%u.%u} to exits request {reqid=%lx,lsn=%X/%08X, since=%X/%08X, rel=%u/%u/%u.%u}", - resp->reqid, LSN_FORMAT_ARGS(resp->lsn), LSN_FORMAT_ARGS(resp->not_modified_since), RelFileInfoFmt(exists_resp->req.rinfo), exists_resp->req.forknum, - request.hdr.reqid, LSN_FORMAT_ARGS(request.hdr.lsn), LSN_FORMAT_ARGS(request.hdr.not_modified_since), RelFileInfoFmt(request.rinfo), request.forknum); - } - } - exists = exists_resp->exists; - break; - } - case T_NeonErrorResponse: - if (neon_protocol_version >= 3) - { - if (!equal_requests(resp, &request.hdr)) - { - elog(WARNING, NEON_TAG "Error message {reqid=%lx,lsn=%X/%08X, since=%X/%08X} doesn't match exists request {reqid=%lx,lsn=%X/%08X, since=%X/%08X}", - resp->reqid, LSN_FORMAT_ARGS(resp->lsn), LSN_FORMAT_ARGS(resp->not_modified_since), - request.hdr.reqid, LSN_FORMAT_ARGS(request.hdr.lsn), LSN_FORMAT_ARGS(request.hdr.not_modified_since)); - } - } - ereport(ERROR, - 
(errcode(ERRCODE_IO_ERROR), - errmsg(NEON_TAG "[reqid %lx] could not read relation existence of rel %u/%u/%u.%u from page server at lsn %X/%08X", - resp->reqid, - RelFileInfoFmt(InfoFromSMgrRel(reln)), - forkNum, - LSN_FORMAT_ARGS(request_lsns.effective_request_lsn)), - errdetail("page server returned error: %s", - ((NeonErrorResponse *) resp)->message))); - break; - - default: - NEON_PANIC_CONNECTION_STATE(-1, PANIC, - "Expected Exists (0x%02x) or Error (0x%02x) response to ExistsRequest, but got 0x%02x", - T_NeonExistsResponse, T_NeonErrorResponse, resp->tag); - } - pfree(resp); - } - return exists; + return communicator_exists(InfoFromSMgrRel(reln), forkNum, &request_lsns); } /* @@ -2989,7 +1131,6 @@ static bool neon_prefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, int nblocks) { - uint64 ring_index PG_USED_FOR_ASSERTS_ONLY; BufferTag tag; switch (reln->smgr_relpersistence) @@ -3026,20 +1167,13 @@ neon_prefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, tag.blockNum = blocknum; - for (int i = 0; i < PG_IOV_MAX / 8; i++) - lfc_present[i] = ~(lfc_present[i]); - - ring_index = prefetch_register_bufferv(tag, NULL, iterblocks, - lfc_present, true); + communicator_prefetch_register_bufferv(tag, NULL, iterblocks, lfc_present); nblocks -= iterblocks; blocknum += iterblocks; - - Assert(ring_index < MyPState->ring_unused && - MyPState->ring_last <= ring_index); } - prefetch_pump_state(false); + communicator_prefetch_pump_state(false); return false; } @@ -3052,7 +1186,6 @@ neon_prefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, static bool neon_prefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum) { - uint64 ring_index PG_USED_FOR_ASSERTS_ONLY; BufferTag tag; switch (reln->smgr_relpersistence) @@ -3077,12 +1210,9 @@ neon_prefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum) CopyNRelFileInfoToBufTag(tag, InfoFromSMgrRel(reln)); - ring_index = prefetch_register_bufferv(tag, NULL, 1, 
NULL, true); + communicator_prefetch_register_bufferv(tag, NULL, 1, NULL); - Assert(ring_index < MyPState->ring_unused && - MyPState->ring_last <= ring_index); - - prefetch_pump_state(false); + communicator_prefetch_pump_state(false); return false; } @@ -3126,7 +1256,7 @@ neon_writeback(SMgrRelation reln, ForkNumber forknum, */ neon_log(SmgrTrace, "writeback noop"); - prefetch_pump_state(false); + communicator_prefetch_pump_state(false); #ifdef DEBUG_COMPARE_LOCAL if (IS_LOCAL_REL(reln)) @@ -3134,218 +1264,15 @@ neon_writeback(SMgrRelation reln, ForkNumber forknum, #endif } -static void -#if PG_MAJORVERSION_NUM < 16 -neon_read_at_lsnv(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber base_blockno, neon_request_lsns *request_lsns, - char **buffers, BlockNumber nblocks, const bits8 *mask) -#else -neon_read_at_lsnv(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber base_blockno, neon_request_lsns *request_lsns, - void **buffers, BlockNumber nblocks, const bits8 *mask) -#endif -{ - NeonResponse *resp; - uint64 ring_index; - PrfHashEntry *entry; - PrefetchRequest *slot; - PrefetchRequest hashkey; - - Assert(PointerIsValid(request_lsns)); - Assert(nblocks >= 1); - - /* - * Use an intermediate PrefetchRequest struct as the hash key to ensure - * correct alignment and that the padding bytes are cleared. - */ - memset(&hashkey.buftag, 0, sizeof(BufferTag)); - CopyNRelFileInfoToBufTag(hashkey.buftag, rinfo); - hashkey.buftag.forkNum = forkNum; - hashkey.buftag.blockNum = base_blockno; - - /* - * The redo process does not lock pages that it needs to replay but are - * not in the shared buffers, so a concurrent process may request the page - * after redo has decided it won't redo that page and updated the LwLSN - * for that page. If we're in hot standby we need to take care that we - * don't return until after REDO has finished replaying up to that LwLSN, - * as the page should have been locked up to that point. 
- * - * See also the description on neon_redo_read_buffer_filter below. - * - * NOTE: It is possible that the WAL redo process will still do IO due to - * concurrent failed read IOs. Those IOs should never have a request_lsn - * that is as large as the WAL record we're currently replaying, if it - * weren't for the behaviour of the LwLsn cache that uses the highest - * value of the LwLsn cache when the entry is not found. - */ - prefetch_register_bufferv(hashkey.buftag, request_lsns, nblocks, mask, false); - - for (int i = 0; i < nblocks; i++) - { - void *buffer = buffers[i]; - BlockNumber blockno = base_blockno + i; - neon_request_lsns *reqlsns = &request_lsns[i]; - TimestampTz start_ts, end_ts; - - if (PointerIsValid(mask) && !BITMAP_ISSET(mask, i)) - continue; - - start_ts = GetCurrentTimestamp(); - - if (RecoveryInProgress() && MyBackendType != B_STARTUP) - XLogWaitForReplayOf(reqlsns->request_lsn); - - /* - * Try to find prefetched page in the list of received pages. - */ -Retry: - hashkey.buftag.blockNum = blockno; - entry = prfh_lookup(MyPState->prf_hash, &hashkey); - - if (entry != NULL) - { - slot = entry->slot; - if (neon_prefetch_response_usable(reqlsns, slot)) - { - ring_index = slot->my_ring_index; - } - else - { - /* - * Cannot use this prefetch, discard it - * - * We can't drop cache for not-yet-received requested items. It is - * unlikely this happens, but it can happen if prefetch distance - * is large enough and a backend didn't consume all prefetch - * requests. 
- */ - if (slot->status == PRFS_REQUESTED) - { - if (!prefetch_wait_for(slot->my_ring_index)) - goto Retry; - } - /* drop caches */ - prefetch_set_unused(slot->my_ring_index); - pgBufferUsage.prefetch.expired += 1; - MyNeonCounters->getpage_prefetch_discards_total++; - /* make it look like a prefetch cache miss */ - entry = NULL; - } - } - - do - { - if (entry == NULL) - { - ring_index = prefetch_register_bufferv(hashkey.buftag, reqlsns, 1, NULL, false); - Assert(ring_index != UINT64_MAX); - slot = GetPrfSlot(ring_index); - } - else - { - /* - * Empty our reference to the prefetch buffer's hash entry. When - * we wait for prefetches, the entry reference is invalidated by - * potential updates to the hash, and when we reconnect to the - * pageserver the prefetch we're waiting for may be dropped, in - * which case we need to retry and take the branch above. - */ - entry = NULL; - } - - Assert(slot->my_ring_index == ring_index); - Assert(MyPState->ring_last <= ring_index && - MyPState->ring_unused > ring_index); - Assert(slot->status != PRFS_UNUSED); - Assert(GetPrfSlot(ring_index) == slot); - - } while (!prefetch_wait_for(ring_index)); - - Assert(slot->status == PRFS_RECEIVED); - Assert(memcmp(&hashkey.buftag, &slot->buftag, sizeof(BufferTag)) == 0); - Assert(hashkey.buftag.blockNum == base_blockno + i); - - resp = slot->response; - - switch (resp->tag) - { - case T_NeonGetPageResponse: - { - NeonGetPageResponse* getpage_resp = (NeonGetPageResponse *) resp; - if (neon_protocol_version >= 3) - { - if (resp->reqid != slot->reqid || - resp->lsn != slot->request_lsns.request_lsn || - resp->not_modified_since != slot->request_lsns.not_modified_since || - !RelFileInfoEquals(getpage_resp->req.rinfo, rinfo) || - getpage_resp->req.forknum != forkNum || - getpage_resp->req.blkno != base_blockno + i) - { - NEON_PANIC_CONNECTION_STATE(-1, PANIC, - "Unexpect response {reqid=%lx,lsn=%X/%08X, since=%X/%08X, rel=%u/%u/%u.%u, block=%u} to get page request {reqid=%lx,lsn=%X/%08X, 
since=%X/%08X, rel=%u/%u/%u.%u, block=%u}", - resp->reqid, LSN_FORMAT_ARGS(resp->lsn), LSN_FORMAT_ARGS(resp->not_modified_since), RelFileInfoFmt(getpage_resp->req.rinfo), getpage_resp->req.forknum, getpage_resp->req.blkno, - slot->reqid, LSN_FORMAT_ARGS(slot->request_lsns.request_lsn), LSN_FORMAT_ARGS(slot->request_lsns.not_modified_since), RelFileInfoFmt(rinfo), forkNum, base_blockno + i); - } - } - memcpy(buffer, getpage_resp->page, BLCKSZ); - - /* - * With lfc_store_prefetch_result=true prefetch result is stored in LFC in prefetch_pump_state when response is received - * from page server. But if lfc_store_prefetch_result=false then it is not yet stored in LFC and we have to do it here - * under buffer lock. - */ - if (!lfc_store_prefetch_result) - lfc_write(rinfo, forkNum, blockno, buffer); - break; - } - case T_NeonErrorResponse: - if (neon_protocol_version >= 3) - { - if (resp->reqid != slot->reqid || - resp->lsn != slot->request_lsns.request_lsn || - resp->not_modified_since != slot->request_lsns.not_modified_since) - { - elog(WARNING, NEON_TAG "Error message {reqid=%lx,lsn=%X/%08X, since=%X/%08X} doesn't match get relsize request {reqid=%lx,lsn=%X/%08X, since=%X/%08X}", - resp->reqid, LSN_FORMAT_ARGS(resp->lsn), LSN_FORMAT_ARGS(resp->not_modified_since), - slot->reqid, LSN_FORMAT_ARGS(slot->request_lsns.request_lsn), LSN_FORMAT_ARGS(slot->request_lsns.not_modified_since)); - } - } - ereport(ERROR, - (errcode(ERRCODE_IO_ERROR), - errmsg(NEON_TAG "[shard %d, reqid %lx] could not read block %u in rel %u/%u/%u.%u from page server at lsn %X/%08X", - slot->shard_no, resp->reqid, blockno, RelFileInfoFmt(rinfo), - forkNum, LSN_FORMAT_ARGS(reqlsns->effective_request_lsn)), - errdetail("page server returned error: %s", - ((NeonErrorResponse *) resp)->message))); - break; - default: - NEON_PANIC_CONNECTION_STATE(slot->shard_no, PANIC, - "Expected GetPage (0x%02x) or Error (0x%02x) response to GetPageRequest, but got 0x%02x", - T_NeonGetPageResponse, 
T_NeonErrorResponse, resp->tag); - } - - /* buffer was used, clean up for later reuse */ - prefetch_set_unused(ring_index); - prefetch_cleanup_trailing_unused(); - - end_ts = GetCurrentTimestamp(); - inc_getpage_wait(end_ts >= start_ts ? (end_ts - start_ts) : 0); - } -} - /* * While function is defined in the neon extension it's used within neon_test_utils directly. * To avoid breaking tests in the runtime please keep function signature in sync. */ void -#if PG_MAJORVERSION_NUM < 16 -neon_read_at_lsn(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, - neon_request_lsns request_lsns, char *buffer) -#else neon_read_at_lsn(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, neon_request_lsns request_lsns, void *buffer) -#endif { - neon_read_at_lsnv(rinfo, forkNum, blkno, &request_lsns, &buffer, 1, NULL); + communicator_read_at_lsnv(rinfo, forkNum, blkno, &request_lsns, &buffer, 1, NULL); } #if PG_MAJORVERSION_NUM < 17 @@ -3361,6 +1288,8 @@ neon_read(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, void *buffer #endif { neon_request_lsns request_lsns; + bits8 present; + void *bufferp; switch (reln->smgr_relpersistence) { @@ -3380,11 +1309,13 @@ neon_read(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, void *buffer } /* Try to read PS results if they are available */ - prefetch_pump_state(false); + communicator_prefetch_pump_state(false); neon_get_request_lsns(InfoFromSMgrRel(reln), forkNum, blkno, &request_lsns, 1); - if (prefetch_lookup(InfoFromSMgrRel(reln), forkNum, blkno, &request_lsns, buffer)) + present = 0; + bufferp = buffer; + if (communicator_prefetch_lookupv(InfoFromSMgrRel(reln), forkNum, blkno, &request_lsns, 1, &bufferp, &present)) { /* Prefetch hit */ return; @@ -3402,7 +1333,7 @@ neon_read(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, void *buffer /* * Try to receive prefetch results once again just to make sure we don't leave the smgr code while the OS might still have buffered bytes. 
*/ - prefetch_pump_state(false); + communicator_prefetch_pump_state(false); #ifdef DEBUG_COMPARE_LOCAL if (forkNum == MAIN_FORKNUM && IS_LOCAL_REL(reln)) @@ -3485,9 +1416,7 @@ static void neon_readv(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, void **buffers, BlockNumber nblocks) { - bits8 prefetch_hits[PG_IOV_MAX / 8] = {0}; - bits8 lfc_hits[PG_IOV_MAX / 8]; - bits8 read[PG_IOV_MAX / 8]; + bits8 read_pages[PG_IOV_MAX / 8]; neon_request_lsns request_lsns[PG_IOV_MAX]; int lfc_result; int prefetch_result; @@ -3514,24 +1443,23 @@ neon_readv(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, nblocks, PG_IOV_MAX); /* Try to read PS results if they are available */ - prefetch_pump_state(false); + communicator_prefetch_pump_state(false); neon_get_request_lsns(InfoFromSMgrRel(reln), forknum, blocknum, request_lsns, nblocks); + memset(read_pages, 0, sizeof(read_pages)); - prefetch_result = prefetch_lookupv(InfoFromSMgrRel(reln), forknum, blocknum, request_lsns, nblocks, buffers, prefetch_hits); + prefetch_result = communicator_prefetch_lookupv(InfoFromSMgrRel(reln), forknum, + blocknum, request_lsns, nblocks, + buffers, read_pages); if (prefetch_result == nblocks) return; - /* invert the result: exclude prefetched blocks */ - for (int i = 0; i < PG_IOV_MAX / 8; i++) - lfc_hits[i] = ~prefetch_hits[i]; - /* Try to read from local file cache */ lfc_result = lfc_readv_select(InfoFromSMgrRel(reln), forknum, blocknum, buffers, - nblocks, lfc_hits); + nblocks, read_pages); if (lfc_result > 0) MyNeonCounters->file_cache_hits_total += lfc_result; @@ -3540,26 +1468,13 @@ neon_readv(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, if (prefetch_result + lfc_result == nblocks) return; - if (lfc_result <= 0) - { - /* can't use the LFC result, so read all blocks from PS */ - for (int i = 0; i < PG_IOV_MAX / 8; i++) - read[i] = ~prefetch_hits[i]; - } - else - { - /* invert the result: exclude blocks read from lfc */ - for (int i = 0; i < PG_IOV_MAX / 
8; i++) - read[i] = ~(prefetch_hits[i] | lfc_hits[i]); - } - - neon_read_at_lsnv(InfoFromSMgrRel(reln), forknum, blocknum, request_lsns, - buffers, nblocks, read); + communicator_read_at_lsnv(InfoFromSMgrRel(reln), forknum, blocknum, request_lsns, + buffers, nblocks, read_pages); /* * Try to receive prefetch results once again just to make sure we don't leave the smgr code while the OS might still have buffered bytes. */ - prefetch_pump_state(false); + communicator_prefetch_pump_state(false); #ifdef DEBUG_COMPARE_LOCAL if (forknum == MAIN_FORKNUM && IS_LOCAL_REL(reln)) @@ -3572,7 +1487,7 @@ neon_readv(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, for (int i = 0; i < nblocks; i++) { BlockNumber blkno = blocknum + i; - if (!BITMAP_ISSET(read, i)) + if (!BITMAP_ISSET(read_pages, i)) continue; #if PG_MAJORVERSION_NUM >= 17 @@ -3695,6 +1610,9 @@ neon_write(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, const vo #ifndef DEBUG_COMPARE_LOCAL /* This is a bit tricky. Check if the relation exists locally */ if (mdexists(reln, forknum)) +#else + if (mdexists(reln, INIT_FORKNUM)) +#endif { /* It exists locally. Guess it's unlogged then. */ #if PG_MAJORVERSION_NUM >= 17 @@ -3711,7 +1629,6 @@ neon_write(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, const vo */ return; } -#endif break; case RELPERSISTENCE_PERMANENT: @@ -3742,7 +1659,7 @@ neon_write(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, const vo lfc_write(InfoFromSMgrRel(reln), forknum, blocknum, buffer); - prefetch_pump_state(false); + communicator_prefetch_pump_state(false); #ifdef DEBUG_COMPARE_LOCAL if (IS_LOCAL_REL(reln)) @@ -3768,6 +1685,9 @@ neon_writev(SMgrRelation reln, ForkNumber forknum, BlockNumber blkno, #ifndef DEBUG_COMPARE_LOCAL /* This is a bit tricky. Check if the relation exists locally */ if (mdexists(reln, forknum)) +#else + if (mdexists(reln, INIT_FORKNUM)) +#endif { /* It exists locally. Guess it's unlogged then. 
*/ mdwritev(reln, forknum, blkno, buffers, nblocks, skipFsync); @@ -3781,7 +1701,6 @@ neon_writev(SMgrRelation reln, ForkNumber forknum, BlockNumber blkno, */ return; } -#endif break; case RELPERSISTENCE_PERMANENT: @@ -3802,7 +1721,7 @@ neon_writev(SMgrRelation reln, ForkNumber forknum, BlockNumber blkno, lfc_writev(InfoFromSMgrRel(reln), forknum, blkno, buffers, nblocks); - prefetch_pump_state(false); + communicator_prefetch_pump_state(false); #ifdef DEBUG_COMPARE_LOCAL if (IS_LOCAL_REL(reln)) @@ -3818,7 +1737,6 @@ neon_writev(SMgrRelation reln, ForkNumber forknum, BlockNumber blkno, static BlockNumber neon_nblocks(SMgrRelation reln, ForkNumber forknum) { - NeonResponse *resp; BlockNumber n_blocks; neon_request_lsns request_lsns; @@ -3850,74 +1768,15 @@ neon_nblocks(SMgrRelation reln, ForkNumber forknum) neon_get_request_lsns(InfoFromSMgrRel(reln), forknum, REL_METADATA_PSEUDO_BLOCKNO, &request_lsns, 1); - { - NeonNblocksRequest request = { - .hdr.tag = T_NeonNblocksRequest, - .hdr.reqid = GENERATE_REQUEST_ID(), - .hdr.lsn = request_lsns.request_lsn, - .hdr.not_modified_since = request_lsns.not_modified_since, - .rinfo = InfoFromSMgrRel(reln), - .forknum = forknum, - }; + n_blocks = communicator_nblocks(InfoFromSMgrRel(reln), forknum, &request_lsns); + update_cached_relsize(InfoFromSMgrRel(reln), forknum, n_blocks); - resp = page_server_request(&request); + neon_log(SmgrTrace, "neon_nblocks: rel %u/%u/%u fork %u (request LSN %X/%08X): %u blocks", + RelFileInfoFmt(InfoFromSMgrRel(reln)), + forknum, + LSN_FORMAT_ARGS(request_lsns.effective_request_lsn), + n_blocks); - switch (resp->tag) - { - case T_NeonNblocksResponse: - { - NeonNblocksResponse * relsize_resp = (NeonNblocksResponse *) resp; - if (neon_protocol_version >= 3) - { - if (!equal_requests(resp, &request.hdr) || - !RelFileInfoEquals(relsize_resp->req.rinfo, request.rinfo) || - relsize_resp->req.forknum != forknum) - { - NEON_PANIC_CONNECTION_STATE(-1, PANIC, - "Unexpect response {reqid=%lx,lsn=%X/%08X, 
since=%X/%08X, rel=%u/%u/%u.%u} to get relsize request {reqid=%lx,lsn=%X/%08X, since=%X/%08X, rel=%u/%u/%u.%u}", - resp->reqid, LSN_FORMAT_ARGS(resp->lsn), LSN_FORMAT_ARGS(resp->not_modified_since), RelFileInfoFmt(relsize_resp->req.rinfo), relsize_resp->req.forknum, - request.hdr.reqid, LSN_FORMAT_ARGS(request.hdr.lsn), LSN_FORMAT_ARGS(request.hdr.not_modified_since), RelFileInfoFmt(request.rinfo), forknum); - } - } - n_blocks = relsize_resp->n_blocks; - break; - } - case T_NeonErrorResponse: - if (neon_protocol_version >= 3) - { - if (!equal_requests(resp, &request.hdr)) - { - elog(WARNING, NEON_TAG "Error message {reqid=%lx,lsn=%X/%08X, since=%X/%08X} doesn't match get relsize request {reqid=%lx,lsn=%X/%08X, since=%X/%08X}", - resp->reqid, LSN_FORMAT_ARGS(resp->lsn), LSN_FORMAT_ARGS(resp->not_modified_since), - request.hdr.reqid, LSN_FORMAT_ARGS(request.hdr.lsn), LSN_FORMAT_ARGS(request.hdr.not_modified_since)); - } - } - ereport(ERROR, - (errcode(ERRCODE_IO_ERROR), - errmsg(NEON_TAG "[reqid %lx] could not read relation size of rel %u/%u/%u.%u from page server at lsn %X/%08X", - resp->reqid, - RelFileInfoFmt(InfoFromSMgrRel(reln)), - forknum, - LSN_FORMAT_ARGS(request_lsns.effective_request_lsn)), - errdetail("page server returned error: %s", - ((NeonErrorResponse *) resp)->message))); - break; - - default: - NEON_PANIC_CONNECTION_STATE(-1, PANIC, - "Expected Nblocks (0x%02x) or Error (0x%02x) response to NblocksRequest, but got 0x%02x", - T_NeonNblocksResponse, T_NeonErrorResponse, resp->tag); - } - update_cached_relsize(InfoFromSMgrRel(reln), forknum, n_blocks); - - neon_log(SmgrTrace, "neon_nblocks: rel %u/%u/%u fork %u (request LSN %X/%08X): %u blocks", - RelFileInfoFmt(InfoFromSMgrRel(reln)), - forknum, - LSN_FORMAT_ARGS(request_lsns.effective_request_lsn), - n_blocks); - - pfree(resp); - } return n_blocks; } @@ -3927,7 +1786,6 @@ neon_nblocks(SMgrRelation reln, ForkNumber forknum) int64 neon_dbsize(Oid dbNode) { - NeonResponse *resp; int64 db_size; 
neon_request_lsns request_lsns; NRelFileInfo dummy_node = {0}; @@ -3935,66 +1793,11 @@ neon_dbsize(Oid dbNode) neon_get_request_lsns(dummy_node, MAIN_FORKNUM, REL_METADATA_PSEUDO_BLOCKNO, &request_lsns, 1); - { - NeonDbSizeRequest request = { - .hdr.tag = T_NeonDbSizeRequest, - .hdr.reqid = GENERATE_REQUEST_ID(), - .hdr.lsn = request_lsns.request_lsn, - .hdr.not_modified_since = request_lsns.not_modified_since, - .dbNode = dbNode, - }; + db_size = communicator_dbsize(dbNode, &request_lsns); - resp = page_server_request(&request); + neon_log(SmgrTrace, "neon_dbsize: db %u (request LSN %X/%08X): %ld bytes", + dbNode, LSN_FORMAT_ARGS(request_lsns.effective_request_lsn), db_size); - switch (resp->tag) - { - case T_NeonDbSizeResponse: - { - NeonDbSizeResponse* dbsize_resp = (NeonDbSizeResponse *) resp; - if (neon_protocol_version >= 3) - { - if (!equal_requests(resp, &request.hdr) || - dbsize_resp->req.dbNode != dbNode) - { - NEON_PANIC_CONNECTION_STATE(-1, PANIC, - "Unexpect response {reqid=%lx,lsn=%X/%08X, since=%X/%08X, dbNode=%u} to get DB size request {reqid=%lx,lsn=%X/%08X, since=%X/%08X, dbNode=%u}", - resp->reqid, LSN_FORMAT_ARGS(resp->lsn), LSN_FORMAT_ARGS(resp->not_modified_since), dbsize_resp->req.dbNode, - request.hdr.reqid, LSN_FORMAT_ARGS(request.hdr.lsn), LSN_FORMAT_ARGS(request.hdr.not_modified_since), dbNode); - } - } - db_size = dbsize_resp->db_size; - break; - } - case T_NeonErrorResponse: - if (neon_protocol_version >= 3) - { - if (!equal_requests(resp, &request.hdr)) - { - elog(WARNING, NEON_TAG "Error message {reqid=%lx,lsn=%X/%08X, since=%X/%08X} doesn't match get DB size request {reqid=%lx,lsn=%X/%08X, since=%X/%08X}", - resp->reqid, LSN_FORMAT_ARGS(resp->lsn), LSN_FORMAT_ARGS(resp->not_modified_since), - request.hdr.reqid, LSN_FORMAT_ARGS(request.hdr.lsn), LSN_FORMAT_ARGS(request.hdr.not_modified_since)); - } - } - ereport(ERROR, - (errcode(ERRCODE_IO_ERROR), - errmsg(NEON_TAG "[reqid %lx] could not read db size of db %u from page server at lsn 
%X/%08X", - resp->reqid, - dbNode, LSN_FORMAT_ARGS(request_lsns.effective_request_lsn)), - errdetail("page server returned error: %s", - ((NeonErrorResponse *) resp)->message))); - break; - - default: - NEON_PANIC_CONNECTION_STATE(-1, PANIC, - "Expected DbSize (0x%02x) or Error (0x%02x) response to DbSizeRequest, but got 0x%02x", - T_NeonDbSizeResponse, T_NeonErrorResponse, resp->tag); - } - - neon_log(SmgrTrace, "neon_dbsize: db %u (request LSN %X/%08X): %ld bytes", - dbNode, LSN_FORMAT_ARGS(request_lsns.effective_request_lsn), db_size); - - pfree(resp); - } return db_size; } @@ -4093,7 +1896,7 @@ neon_immedsync(SMgrRelation reln, ForkNumber forknum) neon_log(SmgrTrace, "[NEON_SMGR] immedsync noop"); - prefetch_pump_state(false); + communicator_prefetch_pump_state(false); #ifdef DEBUG_COMPARE_LOCAL if (IS_LOCAL_REL(reln)) @@ -4195,6 +1998,8 @@ neon_start_unlogged_build(SMgrRelation reln) #ifndef DEBUG_COMPARE_LOCAL if (!IsParallelWorker()) mdcreate(reln, MAIN_FORKNUM, false); +#else + mdcreate(reln, INIT_FORKNUM, false); #endif } @@ -4273,6 +2078,8 @@ neon_end_unlogged_build(SMgrRelation reln) #ifndef DEBUG_COMPARE_LOCAL /* use isRedo == true, so that we drop it immediately */ mdunlink(rinfob, forknum, true); +#else + mdunlink(rinfob, INIT_FORKNUM, true); #endif } } @@ -4290,9 +2097,7 @@ neon_read_slru_segment(SMgrRelation reln, const char* path, int segno, void* buf not_modified_since; SlruKind kind; int n_blocks; - shardno_t shard_no = 0; /* All SLRUs are at shard 0 */ - NeonResponse *resp; - NeonGetSlruSegmentRequest request; + neon_request_lsns request_lsns; /* * Compute a request LSN to use, similar to neon_get_request_lsns() but the @@ -4331,74 +2136,12 @@ neon_read_slru_segment(SMgrRelation reln, const char* path, int segno, void* buf else return -1; - request = (NeonGetSlruSegmentRequest) { - .hdr.tag = T_NeonGetSlruSegmentRequest, - .hdr.reqid = GENERATE_REQUEST_ID(), - .hdr.lsn = request_lsn, - .hdr.not_modified_since = not_modified_since, - .kind = 
kind, - .segno = segno - }; + request_lsns.request_lsn = request_lsn; + request_lsns.not_modified_since = not_modified_since; + request_lsns.effective_request_lsn = request_lsn; - do - { - while (!page_server->send(shard_no, &request.hdr) || !page_server->flush(shard_no)); + n_blocks = communicator_read_slru_segment(kind, segno, &request_lsns, buffer); - consume_prefetch_responses(); - - resp = page_server->receive(shard_no); - } while (resp == NULL); - - switch (resp->tag) - { - case T_NeonGetSlruSegmentResponse: - { - NeonGetSlruSegmentResponse* slru_resp = (NeonGetSlruSegmentResponse *) resp; - if (neon_protocol_version >= 3) - { - if (!equal_requests(resp, &request.hdr) || - slru_resp->req.kind != kind || - slru_resp->req.segno != segno) - { - NEON_PANIC_CONNECTION_STATE(-1, PANIC, - "Unexpect response {reqid=%lx,lsn=%X/%08X, since=%X/%08X, kind=%u, segno=%u} to get SLRU segment request {reqid=%lx,lsn=%X/%08X, since=%X/%08X, kind=%u, segno=%u}", - resp->reqid, LSN_FORMAT_ARGS(resp->lsn), LSN_FORMAT_ARGS(resp->not_modified_since), slru_resp->req.kind, slru_resp->req.segno, - request.hdr.reqid, LSN_FORMAT_ARGS(request.hdr.lsn), LSN_FORMAT_ARGS(request.hdr.not_modified_since), kind, segno); - } - } - n_blocks = slru_resp->n_blocks; - memcpy(buffer, slru_resp->data, n_blocks*BLCKSZ); - break; - } - case T_NeonErrorResponse: - if (neon_protocol_version >= 3) - { - if (!equal_requests(resp, &request.hdr)) - { - elog(WARNING, NEON_TAG "Error message {reqid=%lx,lsn=%X/%08X, since=%X/%08X} doesn't match get SLRU segment request {reqid=%lx,lsn=%X/%08X, since=%X/%08X}", - resp->reqid, LSN_FORMAT_ARGS(resp->lsn), LSN_FORMAT_ARGS(resp->not_modified_since), - request.hdr.reqid, LSN_FORMAT_ARGS(request.hdr.lsn), LSN_FORMAT_ARGS(request.hdr.not_modified_since)); - } - } - ereport(ERROR, - (errcode(ERRCODE_IO_ERROR), - errmsg(NEON_TAG "[reqid %lx] could not read SLRU %d segment %d at lsn %X/%08X", - resp->reqid, - kind, - segno, - LSN_FORMAT_ARGS(request_lsn)), - 
errdetail("page server returned error: %s", - ((NeonErrorResponse *) resp)->message))); - break; - - default: - NEON_PANIC_CONNECTION_STATE(-1, PANIC, - "Expected GetSlruSegment (0x%02x) or Error (0x%02x) response to GetSlruSegmentRequest, but got 0x%02x", - T_NeonGetSlruSegmentResponse, T_NeonErrorResponse, resp->tag); - } - pfree(resp); - - reconfigure_timeout_if_needed(); return n_blocks; } @@ -4434,7 +2177,7 @@ AtEOXact_neon(XactEvent event, void *arg) } break; } - reconfigure_timeout_if_needed(); + communicator_reconfigure_timeout_if_needed(); } static const struct f_smgr neon_smgr = @@ -4492,6 +2235,7 @@ smgr_init_neon(void) smgr_init_standard(); neon_init(); + communicator_init(); } @@ -4521,25 +2265,14 @@ neon_extend_rel_size(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blkno, * This length is later reused when we open the smgr to read the * block, which is fine and expected. */ - NeonResponse *response; - NeonNblocksResponse *nbresponse; - NeonNblocksRequest request = { - .hdr = (NeonRequest) { - .tag = T_NeonNblocksRequest, - .reqid = GENERATE_REQUEST_ID(), - .lsn = end_recptr, - .not_modified_since = end_recptr, - }, - .rinfo = rinfo, - .forknum = forknum, - }; + neon_request_lsns request_lsns; - response = page_server_request(&request); + neon_get_request_lsns(rinfo, forknum, + REL_METADATA_PSEUDO_BLOCKNO, &request_lsns, 1); - Assert(response->tag == T_NeonNblocksResponse); - nbresponse = (NeonNblocksResponse *) response; + relsize = communicator_nblocks(rinfo, forknum, &request_lsns); - relsize = Max(nbresponse->n_blocks, blkno + 1); + relsize = Max(relsize, blkno + 1); set_cached_relsize(rinfo, forknum, relsize); neon_set_lwlsn_relation(end_recptr, rinfo, forknum); @@ -4691,94 +2424,3 @@ neon_redo_read_buffer_filter(XLogReaderState *record, uint8 block_id) } return no_redo_needed; } - -static void -reconfigure_timeout_if_needed(void) -{ - bool needs_set = MyPState->ring_receive != MyPState->ring_unused && - readahead_getpage_pull_timeout_ms > 
0; - - if (needs_set != timeout_set) - { - /* The background writer doens't (shouldn't) read any pages */ - Assert(!AmBackgroundWriterProcess()); - /* The checkpointer doens't (shouldn't) read any pages */ - Assert(!AmCheckpointerProcess()); - - if (unlikely(PS_TIMEOUT_ID == 0)) - { - PS_TIMEOUT_ID = RegisterTimeout(USER_TIMEOUT, pagestore_timeout_handler); - } - - if (needs_set) - { -#if PG_MAJORVERSION_NUM <= 14 - enable_timeout_after(PS_TIMEOUT_ID, readahead_getpage_pull_timeout_ms); -#else - enable_timeout_every( - PS_TIMEOUT_ID, - TimestampTzPlusMilliseconds(GetCurrentTimestamp(), - readahead_getpage_pull_timeout_ms), - readahead_getpage_pull_timeout_ms - ); -#endif - timeout_set = true; - } - else - { - Assert(timeout_set); - disable_timeout(PS_TIMEOUT_ID, false); - timeout_set = false; - } - } -} - -static void -pagestore_timeout_handler(void) -{ -#if PG_MAJORVERSION_NUM <= 14 - /* - * PG14: Setting a repeating timeout is not possible, so we signal here - * that the timeout has already been reset, and by telling the system - * that system will re-schedule it later if we need to. - */ - timeout_set = false; -#endif - timeout_signaled = true; - InterruptPending = true; -} - -static process_interrupts_callback_t prev_interrupt_cb; - -/* - * Process new data received in our active PageStream sockets. - * - * This relies on the invariant that all pipelined yet-to-be-received requests - * are getPage requests managed by MyPState. This is currently true, any - * modification will probably require some stuff to make it work again. 
- */ -static bool -pagestore_smgr_processinterrupts(void) -{ - if (timeout_signaled) - { - if (!readpage_reentrant_guard && readahead_getpage_pull_timeout_ms > 0) - prefetch_pump_state(true); - - timeout_signaled = false; - reconfigure_timeout_if_needed(); - } - - if (!prev_interrupt_cb) - return false; - - return prev_interrupt_cb(); -} - - -void -pagestore_smgr_init(void) -{ - prev_interrupt_cb = ProcessInterruptsCallback; - ProcessInterruptsCallback = pagestore_smgr_processinterrupts; -} diff --git a/pgxn/neon/relsize_cache.c b/pgxn/neon/relsize_cache.c index 2a4c2dc799..60ca1675d9 100644 --- a/pgxn/neon/relsize_cache.c +++ b/pgxn/neon/relsize_cache.c @@ -6,10 +6,6 @@ * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * - * - * IDENTIFICATION - * contrib/neon/relsize_cache.c - * *------------------------------------------------------------------------- */ #include "postgres.h" diff --git a/pgxn/neon/walproposer.c b/pgxn/neon/walproposer.c index 0336d63e8d..b95b1451e4 100644 --- a/pgxn/neon/walproposer.c +++ b/pgxn/neon/walproposer.c @@ -99,6 +99,9 @@ WalProposerCreate(WalProposerConfig *config, walproposer_api api) wp->config = config; wp->api = api; wp->state = WPS_COLLECTING_TERMS; + wp->mconf.generation = INVALID_GENERATION; + wp->mconf.members.len = 0; + wp->mconf.new_members.len = 0; wp_log(LOG, "neon.safekeepers=%s", wp->config->safekeepers_list); @@ -170,6 +173,8 @@ WalProposerCreate(WalProposerConfig *config, walproposer_api api) if (wp->config->proto_version != 2 && wp->config->proto_version != 3) wp_log(FATAL, "unsupported safekeeper protocol version %d", wp->config->proto_version); + if (wp->safekeepers_generation > INVALID_GENERATION && wp->config->proto_version < 3) + wp_log(FATAL, "enabling generations requires protocol version 3"); wp_log(LOG, "using safekeeper protocol version %d", wp->config->proto_version); /* Fill the greeting package */ @@ -214,7 
+219,7 @@ WalProposerFree(WalProposer *wp) static bool WalProposerGenerationsEnabled(WalProposer *wp) { - return wp->safekeepers_generation != 0; + return wp->safekeepers_generation != INVALID_GENERATION; } /* @@ -723,13 +728,176 @@ SendProposerGreeting(Safekeeper *sk) BlockingWrite(sk, sk->outbuf.data, sk->outbuf.len, SS_HANDSHAKE_RECV); } +/* + * Assuming `sk` sent its node id, find such member(s) in wp->mconf and set ptr in + * members_safekeepers & new_members_safekeepers to sk. + */ +static void +UpdateMemberSafekeeperPtr(WalProposer *wp, Safekeeper *sk) +{ + /* members_safekeepers etc are fixed size, sanity check mconf size */ + if (wp->mconf.members.len > MAX_SAFEKEEPERS) + wp_log(FATAL, "too many members %d in mconf", wp->mconf.members.len); + if (wp->mconf.new_members.len > MAX_SAFEKEEPERS) + wp_log(FATAL, "too many new_members %d in mconf", wp->mconf.new_members.len); + + /* node id is not known until greeting is received */ + if (sk->state < SS_WAIT_VOTING) + return; + + /* 0 is assumed to be invalid node id, should never happen */ + if (sk->greetResponse.nodeId == 0) + { + wp_log(WARNING, "safekeeper %s:%s sent zero node id", sk->host, sk->port); + return; + } + + for (uint32 i = 0; i < wp->mconf.members.len; i++) + { + SafekeeperId *sk_id = &wp->mconf.members.m[i]; + + if (wp->mconf.members.m[i].node_id == sk->greetResponse.nodeId) + { + /* + * If mconf or list of safekeepers to connect to changed (the + * latter always currently goes through restart though), + * ResetMemberSafekeeperPtrs is expected to be called before + * UpdateMemberSafekeeperPtr. So, other value suggests that we are + * connected to the same sk under different host name, complain + * about that. 
+ */ + if (wp->members_safekeepers[i] != NULL && wp->members_safekeepers[i] != sk) + { + wp_log(WARNING, "safekeeper {id = %lu, ep = %s:%u } in members[%u] is already mapped to connection slot %lu", + sk_id->node_id, sk_id->host, sk_id->port, i, wp->members_safekeepers[i] - wp->safekeeper); + } + wp_log(LOG, "safekeeper {id = %lu, ep = %s:%u } in members[%u] mapped to connection slot %lu", + sk_id->node_id, sk_id->host, sk_id->port, i, sk - wp->safekeeper); + wp->members_safekeepers[i] = sk; + } + } + /* repeat for new_members */ + for (uint32 i = 0; i < wp->mconf.new_members.len; i++) + { + SafekeeperId *sk_id = &wp->mconf.new_members.m[i]; + + if (wp->mconf.new_members.m[i].node_id == sk->greetResponse.nodeId) + { + if (wp->new_members_safekeepers[i] != NULL && wp->new_members_safekeepers[i] != sk) + { + wp_log(WARNING, "safekeeper {id = %lu, ep = %s:%u } in new_members[%u] is already mapped to connection slot %lu", + sk_id->node_id, sk_id->host, sk_id->port, i, wp->new_members_safekeepers[i] - wp->safekeeper); + } + wp_log(LOG, "safekeeper {id = %lu, ep = %s:%u } in new_members[%u] mapped to connection slot %lu", + sk_id->node_id, sk_id->host, sk_id->port, i, sk - wp->safekeeper); + wp->new_members_safekeepers[i] = sk; + } + } +} + +/* + * Reset wp->members_safekeepers & new_members_safekeepers and refill them. + * Called after wp changes mconf. + */ +static void +ResetMemberSafekeeperPtrs(WalProposer *wp) +{ + memset(&wp->members_safekeepers, 0, sizeof(Safekeeper *) * MAX_SAFEKEEPERS); + memset(&wp->new_members_safekeepers, 0, sizeof(Safekeeper *) * MAX_SAFEKEEPERS); + for (int i = 0; i < wp->n_safekeepers; i++) + { + if (wp->safekeeper[i].state >= SS_WAIT_VOTING) + UpdateMemberSafekeeperPtr(wp, &wp->safekeeper[i]); + } +} + +static uint32 +MsetQuorum(MemberSet *mset) +{ + Assert(mset->len > 0); + return mset->len / 2 + 1; +} + +/* Does n forms quorum in mset? 
*/ +static bool +MsetHasQuorum(MemberSet *mset, uint32 n) +{ + return n >= MsetQuorum(mset); +} + +/* + * TermsCollected helper for a single member set `mset`. + * + * `msk` is the member -> safekeeper mapping for mset, i.e. members_safekeepers + * or new_members_safekeepers. + */ +static bool +TermsCollectedMset(WalProposer *wp, MemberSet *mset, Safekeeper **msk, StringInfo s) +{ + uint32 n_greeted = 0; + + for (uint32 i = 0; i < wp->mconf.members.len; i++) + { + Safekeeper *sk = msk[i]; + + if (sk != NULL && sk->state == SS_WAIT_VOTING) + { + if (n_greeted > 0) + appendStringInfoString(s, ", "); + appendStringInfo(s, "{id = %lu, ep = %s:%s}", sk->greetResponse.nodeId, sk->host, sk->port); + n_greeted++; + } + } + appendStringInfo(s, ", %u/%u total", n_greeted, mset->len); + return MsetHasQuorum(mset, n_greeted); +} + /* * Have we received greeting from enough (quorum) safekeepers to start voting? */ static bool TermsCollected(WalProposer *wp) { - return wp->n_connected >= wp->quorum; + StringInfoData s; /* str for logging */ + bool collected = false; + + /* legacy: generations disabled */ + if (!WalProposerGenerationsEnabled(wp) && wp->mconf.generation == INVALID_GENERATION) + { + collected = wp->n_connected >= wp->quorum; + if (collected) + { + wp->propTerm++; + wp_log(LOG, "walproposer connected to quorum (%d) safekeepers, propTerm=" INT64_FORMAT ", starting voting", wp->quorum, wp->propTerm); + } + return collected; + } + + /* + * With generations enabled, we start campaign only when 1) some mconf is + * actually received 2) we have greetings from majority of members as well + * as from majority of new_members if it exists. 
+ */ + if (wp->mconf.generation == INVALID_GENERATION) + return false; + + initStringInfo(&s); + appendStringInfoString(&s, "mset greeters: "); + if (!TermsCollectedMset(wp, &wp->mconf.members, wp->members_safekeepers, &s)) + goto res; + if (wp->mconf.new_members.len > 0) + { + appendStringInfoString(&s, ", new_mset greeters: "); + if (!TermsCollectedMset(wp, &wp->mconf.new_members, wp->new_members_safekeepers, &s)) + goto res; + } + wp->propTerm++; + wp_log(LOG, "walproposer connected to quorum of safekeepers: %s, propTerm=" INT64_FORMAT ", starting voting", s.data, wp->propTerm); + collected = true; + +res: + pfree(s.data); + return collected; } static void @@ -753,13 +921,41 @@ RecvAcceptorGreeting(Safekeeper *sk) pfree(mconf_toml); /* - * Adopt mconf of safekeepers if it is higher. TODO: mconf change should - * restart wp if it started voting. + * Adopt mconf of safekeepers if it is higher. */ if (sk->greetResponse.mconf.generation > wp->mconf.generation) { + /* sanity check before adopting, should never happen */ + if (sk->greetResponse.mconf.members.len == 0) + { + wp_log(FATAL, "mconf %u has zero members", sk->greetResponse.mconf.generation); + } + + /* + * If we at least started campaign, restart wp to get elected in the + * new mconf. Note: in principle once wp is already elected + * re-election is not required, but being conservative here is not + * bad. + * + * TODO: put mconf to shmem to immediately pick it up on start, + * otherwise if some safekeeper(s) misses latest mconf and gets + * connected the first, it may cause redundant restarts here. + * + * More generally, it would be nice to restart walproposer (wiping + * election state) without restarting the process. In particular, that + * would allow sync-safekeepers not to die here if it intersected with + * sk migration (as well as remove 1s delay). + * + * Note that assign_neon_safekeepers also currently restarts the + * process, so during normal migration walproposer may restart twice. 
+ */ + if (wp->state >= WPS_CAMPAIGN) + { + wp_log(FATAL, "restarting to adopt mconf generation %d", sk->greetResponse.mconf.generation); + } MembershipConfigurationFree(&wp->mconf); MembershipConfigurationCopy(&sk->greetResponse.mconf, &wp->mconf); + ResetMemberSafekeeperPtrs(wp); /* full conf was just logged above */ wp_log(LOG, "changed mconf to generation %u", wp->mconf.generation); } @@ -767,6 +963,9 @@ RecvAcceptorGreeting(Safekeeper *sk) /* Protocol is all good, move to voting. */ sk->state = SS_WAIT_VOTING; + /* In greeting safekeeper sent its id; update mappings accordingly. */ + UpdateMemberSafekeeperPtr(wp, sk); + /* * Note: it would be better to track the counter on per safekeeper basis, * but at worst walproposer would restart with 'term rejected', so leave @@ -778,12 +977,9 @@ RecvAcceptorGreeting(Safekeeper *sk) /* We're still collecting terms from the majority. */ wp->propTerm = Max(sk->greetResponse.term, wp->propTerm); - /* Quorum is acquried, prepare the vote request. */ + /* Quorum is acquired, prepare the vote request. 
*/ if (TermsCollected(wp)) { - wp->propTerm++; - wp_log(LOG, "proposer connected to quorum (%d) safekeepers, propTerm=" INT64_FORMAT, wp->quorum, wp->propTerm); - wp->state = WPS_CAMPAIGN; wp->voteRequest.pam.tag = 'v'; wp->voteRequest.generation = wp->mconf.generation; @@ -832,8 +1028,8 @@ SendVoteRequest(Safekeeper *sk) &sk->outbuf, wp->config->proto_version); /* We have quorum for voting, send our vote request */ - wp_log(LOG, "requesting vote from %s:%s for generation %u term " UINT64_FORMAT, sk->host, sk->port, - wp->voteRequest.generation, wp->voteRequest.term); + wp_log(LOG, "requesting vote from sk {id = %lu, ep = %s:%s} for generation %u term " UINT64_FORMAT, + sk->greetResponse.nodeId, sk->host, sk->port, wp->voteRequest.generation, wp->voteRequest.term); /* On failure, logging & resetting is handled */ BlockingWrite(sk, sk->outbuf.data, sk->outbuf.len, SS_WAIT_VERDICT); /* If successful, wait for read-ready with SS_WAIT_VERDICT */ @@ -851,8 +1047,8 @@ RecvVoteResponse(Safekeeper *sk) return; wp_log(LOG, - "got VoteResponse from acceptor %s:%s, generation=%u, term=%lu, voteGiven=%u, last_log_term=" UINT64_FORMAT ", flushLsn=%X/%X, truncateLsn=%X/%X", - sk->host, sk->port, sk->voteResponse.generation, sk->voteResponse.term, + "got VoteResponse from sk {id = %lu, ep = %s:%s}, generation=%u, term=%lu, voteGiven=%u, last_log_term=" UINT64_FORMAT ", flushLsn=%X/%X, truncateLsn=%X/%X", + sk->greetResponse.nodeId, sk->host, sk->port, sk->voteResponse.generation, sk->voteResponse.term, sk->voteResponse.voteGiven, GetHighestTerm(&sk->voteResponse.termHistory), LSN_FORMAT_ARGS(sk->voteResponse.flushLsn), @@ -899,6 +1095,53 @@ RecvVoteResponse(Safekeeper *sk) } } +/* + * VotesCollected helper for a single member set `mset`. + * + * `msk` is the member -> safekeeper mapping for mset, i.e. members_safekeepers + * or new_members_safekeepers. 
+ */ +static bool +VotesCollectedMset(WalProposer *wp, MemberSet *mset, Safekeeper **msk, StringInfo s) +{ + uint32 n_votes = 0; + + for (uint32 i = 0; i < wp->mconf.members.len; i++) + { + Safekeeper *sk = msk[i]; + + if (sk != NULL && sk->state == SS_WAIT_ELECTED) + { + Assert(sk->voteResponse.voteGiven); + + /* + * Find the highest vote. NULL check is for the legacy case where + * safekeeper might be not initialized with LSN at all and return + * 0 LSN in the vote response; we still want to set donor to + * something in this case. + */ + if (GetLastLogTerm(sk) > wp->donorLastLogTerm || + (GetLastLogTerm(sk) == wp->donorLastLogTerm && + sk->voteResponse.flushLsn > wp->propTermStartLsn) || + wp->donor == NULL) + { + wp->donorLastLogTerm = GetLastLogTerm(sk); + wp->propTermStartLsn = sk->voteResponse.flushLsn; + wp->donor = sk; + } + wp->truncateLsn = Max(wp->safekeeper[i].voteResponse.truncateLsn, wp->truncateLsn); + + if (n_votes > 0) + appendStringInfoString(s, ", "); + appendStringInfo(s, "{id = %lu, ep = %s:%s}", sk->greetResponse.nodeId, sk->host, sk->port); + n_votes++; + } + } + appendStringInfo(s, ", %u/%u total", n_votes, mset->len); + return MsetHasQuorum(mset, n_votes); +} + + /* * Checks if enough votes has been collected to get elected and if that's the * case finds the highest vote, setting donor, donorLastLogTerm, @@ -907,7 +1150,8 @@ RecvVoteResponse(Safekeeper *sk) static bool VotesCollected(WalProposer *wp) { - int n_ready = 0; + StringInfoData s; /* str for logging */ + bool collected = false; /* assumed to be called only when not elected yet */ Assert(wp->state == WPS_CAMPAIGN); @@ -916,25 +1160,62 @@ VotesCollected(WalProposer *wp) wp->donorLastLogTerm = 0; wp->truncateLsn = InvalidXLogRecPtr; - for (int i = 0; i < wp->n_safekeepers; i++) + /* legacy: generations disabled */ + if (!WalProposerGenerationsEnabled(wp) && wp->mconf.generation == INVALID_GENERATION) { - if (wp->safekeeper[i].state == SS_WAIT_ELECTED) - { - n_ready++; + int n_ready 
= 0; - if (GetLastLogTerm(&wp->safekeeper[i]) > wp->donorLastLogTerm || - (GetLastLogTerm(&wp->safekeeper[i]) == wp->donorLastLogTerm && - wp->safekeeper[i].voteResponse.flushLsn > wp->propTermStartLsn)) + for (int i = 0; i < wp->n_safekeepers; i++) + { + if (wp->safekeeper[i].state == SS_WAIT_ELECTED) { - wp->donorLastLogTerm = GetLastLogTerm(&wp->safekeeper[i]); - wp->propTermStartLsn = wp->safekeeper[i].voteResponse.flushLsn; - wp->donor = i; + n_ready++; + + if (GetLastLogTerm(&wp->safekeeper[i]) > wp->donorLastLogTerm || + (GetLastLogTerm(&wp->safekeeper[i]) == wp->donorLastLogTerm && + wp->safekeeper[i].voteResponse.flushLsn > wp->propTermStartLsn) || + wp->donor == NULL) + { + wp->donorLastLogTerm = GetLastLogTerm(&wp->safekeeper[i]); + wp->propTermStartLsn = wp->safekeeper[i].voteResponse.flushLsn; + wp->donor = &wp->safekeeper[i]; + } + wp->truncateLsn = Max(wp->safekeeper[i].voteResponse.truncateLsn, wp->truncateLsn); } - wp->truncateLsn = Max(wp->safekeeper[i].voteResponse.truncateLsn, wp->truncateLsn); } + collected = n_ready >= wp->quorum; + if (collected) + { + wp_log(LOG, "walproposer elected with %d/%d votes", n_ready, wp->n_safekeepers); + } + return collected; } - return n_ready >= wp->quorum; + /* + * if generations are enabled we're expected to get to voting only when + * mconf is established. + */ + Assert(wp->mconf.generation != INVALID_GENERATION); + + /* + * We must get votes from both msets if both are present. 
+ */ + initStringInfo(&s); + appendStringInfoString(&s, "mset voters: "); + if (!VotesCollectedMset(wp, &wp->mconf.members, wp->members_safekeepers, &s)) + goto res; + if (wp->mconf.new_members.len > 0) + { + appendStringInfoString(&s, ", new_mset voters: "); + if (!VotesCollectedMset(wp, &wp->mconf.new_members, wp->new_members_safekeepers, &s)) + goto res; + } + wp_log(LOG, "walproposer elected, %s", s.data); + collected = true; + +res: + pfree(s.data); + return collected; } /* @@ -955,7 +1236,7 @@ HandleElectedProposer(WalProposer *wp) * that only for logical replication (and switching logical walsenders to * neon_walreader is a todo.) */ - if (!wp->api.recovery_download(wp, &wp->safekeeper[wp->donor])) + if (!wp->api.recovery_download(wp, wp->donor)) { wp_log(FATAL, "failed to download WAL for logical replicaiton"); } @@ -1078,7 +1359,7 @@ ProcessPropStartPos(WalProposer *wp) /* * Proposer's term history is the donor's + its own entry. */ - dth = &wp->safekeeper[wp->donor].voteResponse.termHistory; + dth = &wp->donor->voteResponse.termHistory; wp->propTermHistory.n_entries = dth->n_entries + 1; wp->propTermHistory.entries = palloc(sizeof(TermSwitchEntry) * wp->propTermHistory.n_entries); if (dth->n_entries > 0) @@ -1086,11 +1367,10 @@ ProcessPropStartPos(WalProposer *wp) wp->propTermHistory.entries[wp->propTermHistory.n_entries - 1].term = wp->propTerm; wp->propTermHistory.entries[wp->propTermHistory.n_entries - 1].lsn = wp->propTermStartLsn; - wp_log(LOG, "got votes from majority (%d) of nodes, term " UINT64_FORMAT ", epochStartLsn %X/%X, donor %s:%s, truncate_lsn %X/%X", - wp->quorum, + wp_log(LOG, "walproposer elected in term " UINT64_FORMAT ", epochStartLsn %X/%X, donor %s:%s, truncate_lsn %X/%X", wp->propTerm, LSN_FORMAT_ARGS(wp->propTermStartLsn), - wp->safekeeper[wp->donor].host, wp->safekeeper[wp->donor].port, + wp->donor->host, wp->donor->port, LSN_FORMAT_ARGS(wp->truncateLsn)); /* @@ -1508,6 +1788,14 @@ RecvAppendResponses(Safekeeper *sk) readAnything 
= true; + /* should never happen: sk is expected to send ERROR instead */ + if (sk->appendResponse.generation != wp->mconf.generation) + { + wp_log(FATAL, "safekeeper {id = %lu, ep = %s:%s} sent response with generation %u, expected %u", + sk->greetResponse.nodeId, sk->host, sk->port, + sk->appendResponse.generation, wp->mconf.generation); + } + if (sk->appendResponse.term > wp->propTerm) { /* @@ -1624,30 +1912,101 @@ CalculateMinFlushLsn(WalProposer *wp) } /* - * Calculate WAL position acknowledged by quorum + * GetAcknowledgedByQuorumWALPosition for a single member set `mset`. + * + * `msk` is the member -> safekeeper mapping for mset, i.e. members_safekeepers + * or new_members_safekeepers. */ static XLogRecPtr -GetAcknowledgedByQuorumWALPosition(WalProposer *wp) +GetCommittedMset(WalProposer *wp, MemberSet *mset, Safekeeper **msk) { XLogRecPtr responses[MAX_SAFEKEEPERS]; /* - * Sort acknowledged LSNs + * Ascending sort acknowledged LSNs. */ - for (int i = 0; i < wp->n_safekeepers; i++) + Assert(mset->len <= MAX_SAFEKEEPERS); + for (uint32 i = 0; i < mset->len; i++) { + Safekeeper *sk = msk[i]; + /* * Like in Raft, we aren't allowed to commit entries from previous - * terms, so ignore reported LSN until it gets to epochStartLsn. + * terms, so ignore reported LSN until it gets to propTermStartLsn. + * + * Note: we ignore sk state, which is ok: before first ack flushLsn is + * 0, and later we just preserve value across reconnections. It would + * be ok to check for SS_ACTIVE as well. */ - responses[i] = wp->safekeeper[i].appendResponse.flushLsn >= wp->propTermStartLsn ? 
wp->safekeeper[i].appendResponse.flushLsn : 0; + if (sk != NULL && sk->appendResponse.flushLsn >= wp->propTermStartLsn) + { + responses[i] = sk->appendResponse.flushLsn; + } + else + { + responses[i] = 0; + } } - qsort(responses, wp->n_safekeepers, sizeof(XLogRecPtr), CompareLsn); + qsort(responses, mset->len, sizeof(XLogRecPtr), CompareLsn); /* - * Get the smallest LSN committed by quorum + * And get value committed by the quorum. A way to view this: to get the + * highest value committed on the quorum, in the ordered array we skip n - + * n_quorum elements to get to the first (lowest) value present on all sks + * of the highest quorum. */ - return responses[wp->n_safekeepers - wp->quorum]; + return responses[mset->len - MsetQuorum(mset)]; +} + +/* + * Calculate WAL position acknowledged by quorum, i.e. which may be regarded + * committed. + * + * Zero may be returned when there is no quorum of nodes recovered to term start + * lsn which sent feedback yet. + */ +static XLogRecPtr +GetAcknowledgedByQuorumWALPosition(WalProposer *wp) +{ + XLogRecPtr committed; + + /* legacy: generations disabled */ + if (!WalProposerGenerationsEnabled(wp) && wp->mconf.generation == INVALID_GENERATION) + { + XLogRecPtr responses[MAX_SAFEKEEPERS]; + + /* + * Sort acknowledged LSNs + */ + for (int i = 0; i < wp->n_safekeepers; i++) + { + /* + * Like in Raft, we aren't allowed to commit entries from previous + * terms, so ignore reported LSN until it gets to + * propTermStartLsn. + * + * Note: we ignore sk state, which is ok: before first ack + * flushLsn is 0, and later we just preserve value across + * reconnections. It would be ok to check for SS_ACTIVE as well. + */ + responses[i] = wp->safekeeper[i].appendResponse.flushLsn >= wp->propTermStartLsn ? 
wp->safekeeper[i].appendResponse.flushLsn : 0; + } + qsort(responses, wp->n_safekeepers, sizeof(XLogRecPtr), CompareLsn); + + /* + * Get the smallest LSN committed by quorum + */ + return responses[wp->n_safekeepers - wp->quorum]; + } + + committed = GetCommittedMset(wp, &wp->mconf.members, wp->members_safekeepers); + if (wp->mconf.new_members.len > 0) + { + XLogRecPtr new_mset_committed = GetCommittedMset(wp, &wp->mconf.new_members, wp->new_members_safekeepers); + + committed = Min(committed, new_mset_committed); + } + return committed; } /* @@ -1662,7 +2021,7 @@ UpdateDonorShmem(WalProposer *wp) int i; XLogRecPtr donor_lsn = InvalidXLogRecPtr; - if (wp->n_votes < wp->quorum) + if (wp->state < WPS_ELECTED) { wp_log(WARNING, "UpdateDonorShmem called before elections are won"); return; @@ -1673,9 +2032,9 @@ UpdateDonorShmem(WalProposer *wp) * about its position immediately after election before any feedbacks are * sent. */ - if (wp->safekeeper[wp->donor].state >= SS_WAIT_ELECTED) + if (wp->donor->state >= SS_WAIT_ELECTED) { - donor = &wp->safekeeper[wp->donor]; + donor = wp->donor; donor_lsn = wp->propTermStartLsn; } @@ -1746,22 +2105,19 @@ HandleSafekeeperResponse(WalProposer *wp, Safekeeper *fromsk) } /* - * Generally sync is done when majority switched the epoch so we committed - * epochStartLsn and made the majority aware of it, ensuring they are - * ready to give all WAL to pageserver. It would mean whichever majority - * is alive, there will be at least one safekeeper who is able to stream - * WAL to pageserver to make basebackup possible. However, since at the - * moment we don't have any good mechanism of defining the healthy and - * most advanced safekeeper who should push the wal into pageserver and + * Generally sync is done when majority reached propTermStartLsn so we + * committed it and made the majority aware of it, ensuring they are ready + * to give all WAL to pageserver. 
It would mean whichever majority is + * alive, there will be at least one safekeeper who is able to stream WAL + * to pageserver to make basebackup possible. However, since at the moment + * we don't have any good mechanism of defining the healthy and most + * advanced safekeeper who should push the wal into pageserver and * basically the random one gets connected, to prevent hanging basebackup * (due to pageserver connecting to not-synced-safekeeper) we currently * wait for all seemingly alive safekeepers to get synced. */ if (wp->config->syncSafekeepers) { - int n_synced; - - n_synced = 0; for (int i = 0; i < wp->n_safekeepers; i++) { Safekeeper *sk = &wp->safekeeper[i]; @@ -1770,11 +2126,9 @@ HandleSafekeeperResponse(WalProposer *wp, Safekeeper *fromsk) /* alive safekeeper which is not synced yet; wait for it */ if (sk->state != SS_OFFLINE && !synced) return; - if (synced) - n_synced++; } - if (n_synced >= wp->quorum) + if (newCommitLsn >= wp->propTermStartLsn) { /* A quorum of safekeepers has been synced! */ diff --git a/pgxn/neon/walproposer.h b/pgxn/neon/walproposer.h index d116bce806..648b0015ad 100644 --- a/pgxn/neon/walproposer.h +++ b/pgxn/neon/walproposer.h @@ -145,6 +145,7 @@ typedef uint64 NNodeId; * This and following structs pair ones in membership.rs. */ typedef uint32 Generation; +#define INVALID_GENERATION 0 typedef struct SafekeeperId { @@ -771,7 +772,17 @@ typedef struct WalProposer /* Current walproposer membership configuration */ MembershipConfiguration mconf; - /* (n_safekeepers / 2) + 1 */ + /* + * Parallels mconf.members with pointers to the member's slot in + * safekeepers array of connections, or NULL if such member is not + * connected. Helps to avoid looking slot per id through all + * .safekeepers[] when doing quorum checks. + */ + Safekeeper *members_safekeepers[MAX_SAFEKEEPERS]; + /* As above, but for new_members. */ + Safekeeper *new_members_safekeepers[MAX_SAFEKEEPERS]; + + /* (n_safekeepers / 2) + 1. 
Used for static pre-generations quorum checks. */ int quorum; /* @@ -829,7 +840,7 @@ typedef struct WalProposer term_t donorLastLogTerm; /* Most advanced acceptor */ - int donor; + Safekeeper *donor; /* timeline globally starts at this LSN */ XLogRecPtr timelineStartLsn; diff --git a/pgxn/neon/walproposer_compat.c b/pgxn/neon/walproposer_compat.c index a986160224..b9460feb21 100644 --- a/pgxn/neon/walproposer_compat.c +++ b/pgxn/neon/walproposer_compat.c @@ -7,6 +7,7 @@ #include +#include "libpq/pqformat.h" #include "miscadmin.h" #include "utils/datetime.h" #include "walproposer.h" diff --git a/pgxn/neon_test_utils/neontest.c b/pgxn/neon_test_utils/neontest.c index 0b5499ca53..d37412f674 100644 --- a/pgxn/neon_test_utils/neontest.c +++ b/pgxn/neon_test_utils/neontest.c @@ -50,13 +50,8 @@ PG_FUNCTION_INFO_V1(trigger_segfault); * Linkage to functions in neon module. * The signature here would need to be updated whenever function parameters change in pagestore_smgr.c */ -#if PG_MAJORVERSION_NUM < 16 -typedef void (*neon_read_at_lsn_type) (NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, - neon_request_lsns request_lsns, char *buffer); -#else typedef void (*neon_read_at_lsn_type) (NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, neon_request_lsns request_lsns, void *buffer); -#endif static neon_read_at_lsn_type neon_read_at_lsn_ptr; diff --git a/poetry.lock b/poetry.lock index 96c65fdf05..08732fd641 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1,4 +1,4 @@ -# This file is automatically @generated by Poetry 2.1.1 and should not be changed by hand. +# This file is automatically @generated by Poetry 2.1.2 and should not be changed by hand. 
[[package]] name = "aiohappyeyeballs" @@ -1286,24 +1286,20 @@ files = [ [[package]] name = "h2" -version = "4.1.0" +version = "4.2.0" description = "Pure-Python HTTP/2 protocol implementation" optional = false python-versions = ">=3.9" groups = ["main"] -files = [] -develop = false +files = [ + {file = "h2-4.2.0-py3-none-any.whl", hash = "sha256:479a53ad425bb29af087f3458a61d30780bc818e4ebcf01f0b536ba916462ed0"}, + {file = "h2-4.2.0.tar.gz", hash = "sha256:c8a52129695e88b1a0578d8d2cc6842bbd79128ac685463b887ee278126ad01f"}, +] [package.dependencies] hpack = ">=4.1,<5" hyperframe = ">=6.1,<7" -[package.source] -type = "git" -url = "https://github.com/python-hyper/h2" -reference = "HEAD" -resolved_reference = "0b98b244b5fd1fe96100ac14905417a3b70a4286" - [[package]] name = "hpack" version = "4.1.0" @@ -3844,4 +3840,4 @@ cffi = ["cffi (>=1.11)"] [metadata] lock-version = "2.1" python-versions = "^3.11" -content-hash = "fb50cb6b291169dce3188560cdb31a14af95647318f8f0f0d718131dbaf1817a" +content-hash = "7ab1e7b975af34b3271b7c6018fa22a261d3f73c7c0a0403b6b2bb86b5fbd36e" diff --git a/proxy/src/binary/proxy.rs b/proxy/src/binary/proxy.rs index 62fdc18207..e03f2f33d9 100644 --- a/proxy/src/binary/proxy.rs +++ b/proxy/src/binary/proxy.rs @@ -509,7 +509,14 @@ pub async fn run() -> anyhow::Result<()> { if let Some(mut redis_kv_client) = redis_kv_client { maintenance_tasks.spawn(async move { redis_kv_client.try_connect().await?; - handle_cancel_messages(&mut redis_kv_client, rx_cancel).await + handle_cancel_messages(&mut redis_kv_client, rx_cancel).await?; + + drop(redis_kv_client); + + // `handle_cancel_messages` was terminated due to the tx_cancel + // being dropped. this is not worthy of an error, and this task can only return `Err`, + // so let's wait forever instead. 
+ std::future::pending().await }); } diff --git a/proxy/src/cancellation.rs b/proxy/src/cancellation.rs index 8263e5aa2a..c5ba04eb8c 100644 --- a/proxy/src/cancellation.rs +++ b/proxy/src/cancellation.rs @@ -1,16 +1,17 @@ -use std::convert::Infallible; use std::net::{IpAddr, SocketAddr}; use std::sync::Arc; +use anyhow::{Context, anyhow}; use ipnet::{IpNet, Ipv4Net, Ipv6Net}; use postgres_client::CancelToken; use postgres_client::tls::MakeTlsConnect; use pq_proto::CancelKeyData; +use redis::{FromRedisValue, Pipeline, Value, pipe}; use serde::{Deserialize, Serialize}; use thiserror::Error; use tokio::net::TcpStream; use tokio::sync::{mpsc, oneshot}; -use tracing::{debug, info}; +use tracing::{debug, info, warn}; use crate::auth::backend::ComputeUserInfo; use crate::auth::{AuthError, check_peer_addr_is_in_list}; @@ -30,6 +31,7 @@ type IpSubnetKey = IpNet; const CANCEL_KEY_TTL: i64 = 1_209_600; // 2 weeks cancellation key expire time const REDIS_SEND_TIMEOUT: std::time::Duration = std::time::Duration::from_millis(10); +const BATCH_SIZE: usize = 8; // Message types for sending through mpsc channel pub enum CancelKeyOp { @@ -54,78 +56,168 @@ pub enum CancelKeyOp { }, } +impl CancelKeyOp { + fn register(self, pipe: &mut Pipeline) -> Option { + #[allow(clippy::used_underscore_binding)] + match self { + CancelKeyOp::StoreCancelKey { + key, + field, + value, + resp_tx, + _guard, + expire, + } => { + pipe.hset(&key, field, value); + pipe.expire(key, expire); + let resp_tx = resp_tx?; + Some(CancelReplyOp::StoreCancelKey { resp_tx, _guard }) + } + CancelKeyOp::GetCancelData { + key, + resp_tx, + _guard, + } => { + pipe.hgetall(key); + Some(CancelReplyOp::GetCancelData { resp_tx, _guard }) + } + CancelKeyOp::RemoveCancelKey { + key, + field, + resp_tx, + _guard, + } => { + pipe.hdel(key, field); + let resp_tx = resp_tx?; + Some(CancelReplyOp::RemoveCancelKey { resp_tx, _guard }) + } + } + } +} + +// Message types for sending through mpsc channel +pub enum CancelReplyOp { + 
StoreCancelKey { + resp_tx: oneshot::Sender>, + _guard: CancelChannelSizeGuard<'static>, + }, + GetCancelData { + resp_tx: oneshot::Sender>>, + _guard: CancelChannelSizeGuard<'static>, + }, + RemoveCancelKey { + resp_tx: oneshot::Sender>, + _guard: CancelChannelSizeGuard<'static>, + }, +} + +impl CancelReplyOp { + fn send_err(self, e: anyhow::Error) { + match self { + CancelReplyOp::StoreCancelKey { resp_tx, _guard } => { + resp_tx + .send(Err(e)) + .inspect_err(|_| tracing::debug!("could not send reply")) + .ok(); + } + CancelReplyOp::GetCancelData { resp_tx, _guard } => { + resp_tx + .send(Err(e)) + .inspect_err(|_| tracing::debug!("could not send reply")) + .ok(); + } + CancelReplyOp::RemoveCancelKey { resp_tx, _guard } => { + resp_tx + .send(Err(e)) + .inspect_err(|_| tracing::debug!("could not send reply")) + .ok(); + } + } + } + + fn send_value(self, v: redis::Value) { + match self { + CancelReplyOp::StoreCancelKey { resp_tx, _guard } => { + let send = + FromRedisValue::from_owned_redis_value(v).context("could not parse value"); + resp_tx + .send(send) + .inspect_err(|_| tracing::debug!("could not send reply")) + .ok(); + } + CancelReplyOp::GetCancelData { resp_tx, _guard } => { + let send = + FromRedisValue::from_owned_redis_value(v).context("could not parse value"); + resp_tx + .send(send) + .inspect_err(|_| tracing::debug!("could not send reply")) + .ok(); + } + CancelReplyOp::RemoveCancelKey { resp_tx, _guard } => { + let send = + FromRedisValue::from_owned_redis_value(v).context("could not parse value"); + resp_tx + .send(send) + .inspect_err(|_| tracing::debug!("could not send reply")) + .ok(); + } + } + } +} + // Running as a separate task to accept messages through the rx channel -// In case of problems with RTT: switch to recv_many() + redis pipeline pub async fn handle_cancel_messages( client: &mut RedisKVClient, mut rx: mpsc::Receiver, -) -> anyhow::Result { +) -> anyhow::Result<()> { + let mut batch = Vec::new(); + let mut replies = vec![]; + loop 
{ - if let Some(msg) = rx.recv().await { - match msg { - CancelKeyOp::StoreCancelKey { - key, - field, - value, - resp_tx, - _guard, - expire, - } => { - let res = client.hset(&key, field, value).await; - if let Some(resp_tx) = resp_tx { - if res.is_ok() { - resp_tx - .send(client.expire(key, expire).await) - .inspect_err(|e| { - tracing::debug!( - "failed to send StoreCancelKey response: {:?}", - e - ); - }) - .ok(); - } else { - resp_tx - .send(res) - .inspect_err(|e| { - tracing::debug!( - "failed to send StoreCancelKey response: {:?}", - e - ); - }) - .ok(); - } - } else if res.is_ok() { - drop(client.expire(key, expire).await); - } else { - tracing::warn!("failed to store cancel key: {:?}", res); - } + if rx.recv_many(&mut batch, BATCH_SIZE).await == 0 { + warn!("shutting down cancellation queue"); + break Ok(()); + } + + let batch_size = batch.len(); + debug!(batch_size, "running cancellation jobs"); + + let mut pipe = pipe(); + for msg in batch.drain(..) { + if let Some(reply) = msg.register(&mut pipe) { + replies.push(reply); + } else { + pipe.ignore(); + } + } + + let responses = replies.len(); + + match client.query(pipe).await { + // for each reply, we expect that many values. + Ok(Value::Array(values)) if values.len() == responses => { + debug!( + batch_size, + responses, "successfully completed cancellation jobs", + ); + for (value, reply) in std::iter::zip(values, replies.drain(..)) { + reply.send_value(value); } - CancelKeyOp::GetCancelData { - key, - resp_tx, - _guard, - } => { - drop(resp_tx.send(client.hget_all(key).await)); + } + Ok(value) => { + debug!(?value, "unexpected redis return value"); + for reply in replies.drain(..) 
{ + reply.send_err(anyhow!("incorrect response type from redis")); } - CancelKeyOp::RemoveCancelKey { - key, - field, - resp_tx, - _guard, - } => { - if let Some(resp_tx) = resp_tx { - resp_tx - .send(client.hdel(key, field).await) - .inspect_err(|e| { - tracing::debug!("failed to send StoreCancelKey response: {:?}", e); - }) - .ok(); - } else { - drop(client.hdel(key, field).await); - } + } + Err(err) => { + for reply in replies.drain(..) { + reply.send_err(anyhow!("could not send cmd to redis: {err}")); } } } + + replies.clear(); } } @@ -425,12 +517,7 @@ impl CancelClosure { &mut mk_tls, &self.hostname, ) - .map_err(|e| { - CancelError::IO(std::io::Error::new( - std::io::ErrorKind::Other, - e.to_string(), - )) - })?; + .map_err(|e| CancelError::IO(std::io::Error::other(e.to_string())))?; self.cancel_token.cancel_query_raw(socket, tls).await?; debug!("query was cancelled"); diff --git a/proxy/src/proxy/tests/mod.rs b/proxy/src/proxy/tests/mod.rs index 2c3e70138d..2268e60d25 100644 --- a/proxy/src/proxy/tests/mod.rs +++ b/proxy/src/proxy/tests/mod.rs @@ -568,7 +568,7 @@ fn helper_create_cached_node_info(cache: &'static NodeInfoCache) -> CachedNodeIn fn helper_create_connect_info( mechanism: &TestConnectMechanism, ) -> auth::Backend<'static, ComputeCredentials> { - let user_info = auth::Backend::ControlPlane( + auth::Backend::ControlPlane( MaybeOwned::Owned(ControlPlaneClient::Test(Box::new(mechanism.clone()))), ComputeCredentials { info: ComputeUserInfo { @@ -578,8 +578,7 @@ fn helper_create_connect_info( }, keys: ComputeCredentialKeys::Password("password".into()), }, - ); - user_info + ) } fn config() -> ComputeConfig { diff --git a/proxy/src/redis/kv_ops.rs b/proxy/src/redis/kv_ops.rs index 3689bf7ae2..aa627b29a6 100644 --- a/proxy/src/redis/kv_ops.rs +++ b/proxy/src/redis/kv_ops.rs @@ -1,4 +1,5 @@ -use redis::{AsyncCommands, ToRedisArgs}; +use redis::aio::ConnectionLike; +use redis::{Cmd, FromRedisValue, Pipeline, RedisResult}; use 
super::connection_with_credentials_provider::ConnectionWithCredentialsProvider; use crate::rate_limiter::{GlobalRateLimiter, RateBucketInfo}; @@ -8,6 +9,23 @@ pub struct RedisKVClient { limiter: GlobalRateLimiter, } +#[allow(async_fn_in_trait)] +pub trait Queryable { + async fn query(&self, conn: &mut impl ConnectionLike) -> RedisResult; +} + +impl Queryable for Pipeline { + async fn query(&self, conn: &mut impl ConnectionLike) -> RedisResult { + self.query_async(conn).await + } +} + +impl Queryable for Cmd { + async fn query(&self, conn: &mut impl ConnectionLike) -> RedisResult { + self.query_async(conn).await + } +} + impl RedisKVClient { pub fn new(client: ConnectionWithCredentialsProvider, info: &'static [RateBucketInfo]) -> Self { Self { @@ -27,158 +45,24 @@ impl RedisKVClient { Ok(()) } - pub(crate) async fn hset(&mut self, key: K, field: F, value: V) -> anyhow::Result<()> - where - K: ToRedisArgs + Send + Sync, - F: ToRedisArgs + Send + Sync, - V: ToRedisArgs + Send + Sync, - { - if !self.limiter.check() { - tracing::info!("Rate limit exceeded. Skipping hset"); - return Err(anyhow::anyhow!("Rate limit exceeded")); - } - - match self.client.hset(&key, &field, &value).await { - Ok(()) => return Ok(()), - Err(e) => { - tracing::error!("failed to set a key-value pair: {e}"); - } - } - - tracing::info!("Redis client is disconnected. Reconnectiong..."); - self.try_connect().await?; - self.client - .hset(key, field, value) - .await - .map_err(anyhow::Error::new) - } - - #[allow(dead_code)] - pub(crate) async fn hset_multiple( + pub(crate) async fn query( &mut self, - key: &str, - items: &[(K, V)], - ) -> anyhow::Result<()> - where - K: ToRedisArgs + Send + Sync, - V: ToRedisArgs + Send + Sync, - { + q: impl Queryable, + ) -> anyhow::Result { if !self.limiter.check() { - tracing::info!("Rate limit exceeded. Skipping hset_multiple"); + tracing::info!("Rate limit exceeded. 
Skipping query"); return Err(anyhow::anyhow!("Rate limit exceeded")); } - match self.client.hset_multiple(key, items).await { - Ok(()) => return Ok(()), + match q.query(&mut self.client).await { + Ok(t) => return Ok(t), Err(e) => { - tracing::error!("failed to set a key-value pair: {e}"); + tracing::error!("failed to run query: {e}"); } } - tracing::info!("Redis client is disconnected. Reconnectiong..."); + tracing::info!("Redis client is disconnected. Reconnecting..."); self.try_connect().await?; - self.client - .hset_multiple(key, items) - .await - .map_err(anyhow::Error::new) - } - - #[allow(dead_code)] - pub(crate) async fn expire(&mut self, key: K, seconds: i64) -> anyhow::Result<()> - where - K: ToRedisArgs + Send + Sync, - { - if !self.limiter.check() { - tracing::info!("Rate limit exceeded. Skipping expire"); - return Err(anyhow::anyhow!("Rate limit exceeded")); - } - - match self.client.expire(&key, seconds).await { - Ok(()) => return Ok(()), - Err(e) => { - tracing::error!("failed to set a key-value pair: {e}"); - } - } - - tracing::info!("Redis client is disconnected. Reconnectiong..."); - self.try_connect().await?; - self.client - .expire(key, seconds) - .await - .map_err(anyhow::Error::new) - } - - #[allow(dead_code)] - pub(crate) async fn hget(&mut self, key: K, field: F) -> anyhow::Result - where - K: ToRedisArgs + Send + Sync, - F: ToRedisArgs + Send + Sync, - V: redis::FromRedisValue, - { - if !self.limiter.check() { - tracing::info!("Rate limit exceeded. Skipping hget"); - return Err(anyhow::anyhow!("Rate limit exceeded")); - } - - match self.client.hget(&key, &field).await { - Ok(value) => return Ok(value), - Err(e) => { - tracing::error!("failed to get a value: {e}"); - } - } - - tracing::info!("Redis client is disconnected. 
Reconnectiong..."); - self.try_connect().await?; - self.client - .hget(key, field) - .await - .map_err(anyhow::Error::new) - } - - pub(crate) async fn hget_all(&mut self, key: K) -> anyhow::Result - where - K: ToRedisArgs + Send + Sync, - V: redis::FromRedisValue, - { - if !self.limiter.check() { - tracing::info!("Rate limit exceeded. Skipping hgetall"); - return Err(anyhow::anyhow!("Rate limit exceeded")); - } - - match self.client.hgetall(&key).await { - Ok(value) => return Ok(value), - Err(e) => { - tracing::error!("failed to get a value: {e}"); - } - } - - tracing::info!("Redis client is disconnected. Reconnectiong..."); - self.try_connect().await?; - self.client.hgetall(key).await.map_err(anyhow::Error::new) - } - - pub(crate) async fn hdel(&mut self, key: K, field: F) -> anyhow::Result<()> - where - K: ToRedisArgs + Send + Sync, - F: ToRedisArgs + Send + Sync, - { - if !self.limiter.check() { - tracing::info!("Rate limit exceeded. Skipping hdel"); - return Err(anyhow::anyhow!("Rate limit exceeded")); - } - - match self.client.hdel(&key, &field).await { - Ok(()) => return Ok(()), - Err(e) => { - tracing::error!("failed to delete a key-value pair: {e}"); - } - } - - tracing::info!("Redis client is disconnected. Reconnectiong..."); - self.try_connect().await?; - self.client - .hdel(key, field) - .await - .map_err(anyhow::Error::new) + Ok(q.query(&mut self.client).await?) 
} } diff --git a/proxy/src/serverless/conn_pool_lib.rs b/proxy/src/serverless/conn_pool_lib.rs index 77b548cc43..42a3ea17a2 100644 --- a/proxy/src/serverless/conn_pool_lib.rs +++ b/proxy/src/serverless/conn_pool_lib.rs @@ -47,6 +47,7 @@ impl ConnInfo { } #[derive(Clone)] +#[allow(clippy::large_enum_variant, reason = "TODO")] pub(crate) enum ClientDataEnum { Remote(ClientDataRemote), Local(ClientDataLocal), diff --git a/proxy/src/serverless/local_conn_pool.rs b/proxy/src/serverless/local_conn_pool.rs index c958d077fc..3282c0ebde 100644 --- a/proxy/src/serverless/local_conn_pool.rs +++ b/proxy/src/serverless/local_conn_pool.rs @@ -41,7 +41,7 @@ use crate::control_plane::messages::{ColdStartInfo, MetricsAuxInfo}; use crate::metrics::Metrics; pub(crate) const EXT_NAME: &str = "pg_session_jwt"; -pub(crate) const EXT_VERSION: &str = "0.2.0"; +pub(crate) const EXT_VERSION: &str = "0.3.0"; pub(crate) const EXT_SCHEMA: &str = "auth"; #[derive(Clone)] diff --git a/pyproject.toml b/pyproject.toml index c5129fac35..c6dfdc223c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -43,7 +43,7 @@ websockets = "^12.0" clickhouse-connect = "^0.7.16" kafka-python = "^2.0.2" jwcrypto = "^1.5.6" -h2 = {git = "https://github.com/python-hyper/h2"} +h2 = "^4.2.0" types-jwcrypto = "^1.5.0.20240925" pyyaml = "^6.0.2" types-pyyaml = "^6.0.12.20240917" diff --git a/rust-toolchain.toml b/rust-toolchain.toml index 591d60ea79..a0d5970bd5 100644 --- a/rust-toolchain.toml +++ b/rust-toolchain.toml @@ -1,5 +1,5 @@ [toolchain] -channel = "1.85.0" +channel = "1.86.0" profile = "default" # The default profile includes rustc, rust-std, cargo, rust-docs, rustfmt and clippy. 
# https://rust-lang.github.io/rustup/concepts/profiles.html diff --git a/safekeeper/client/src/mgmt_api.rs b/safekeeper/client/src/mgmt_api.rs index afef5e792e..5849df0343 100644 --- a/safekeeper/client/src/mgmt_api.rs +++ b/safekeeper/client/src/mgmt_api.rs @@ -115,13 +115,17 @@ impl Client { "{}/v1/tenant/{}/timeline/{}", self.mgmt_api_endpoint, tenant_id, timeline_id ); - let resp = self.request(Method::DELETE, &uri, ()).await?; + let resp = self + .request_maybe_body(Method::DELETE, &uri, None::<()>) + .await?; resp.json().await.map_err(Error::ReceiveBody) } - pub async fn delete_tenant(&self, tenant_id: TenantId) -> Result { + pub async fn delete_tenant(&self, tenant_id: TenantId) -> Result { let uri = format!("{}/v1/tenant/{}", self.mgmt_api_endpoint, tenant_id); - let resp = self.request(Method::DELETE, &uri, ()).await?; + let resp = self + .request_maybe_body(Method::DELETE, &uri, None::<()>) + .await?; resp.json().await.map_err(Error::ReceiveBody) } @@ -197,6 +201,16 @@ impl Client { method: Method, uri: U, body: B, + ) -> Result { + self.request_maybe_body(method, uri, Some(body)).await + } + + /// Send the request and check that the status code is good, with an optional body. 
+ async fn request_maybe_body( + &self, + method: Method, + uri: U, + body: Option, ) -> Result { let res = self.request_noerror(method, uri, body).await?; let response = res.error_from_body().await?; @@ -208,12 +222,15 @@ impl Client { &self, method: Method, uri: U, - body: B, + body: Option, ) -> Result { let mut req = self.client.request(method, uri); if let Some(value) = &self.authorization_header { req = req.header(reqwest::header::AUTHORIZATION, value.get_contents()) } - req.json(&body).send().await.map_err(Error::ReceiveBody) + if let Some(body) = body { + req = req.json(&body); + } + req.send().await.map_err(Error::ReceiveBody) } } diff --git a/safekeeper/src/bin/safekeeper.rs b/safekeeper/src/bin/safekeeper.rs index 6ce43815a6..b8c122ea72 100644 --- a/safekeeper/src/bin/safekeeper.rs +++ b/safekeeper/src/bin/safekeeper.rs @@ -219,7 +219,13 @@ struct Args { pub ssl_cert_reload_period: Duration, /// Trusted root CA certificates to use in https APIs. #[arg(long)] - ssl_ca_file: Option, + pub ssl_ca_file: Option, + /// Flag to use https for requests to peer's safekeeper API. + #[arg(long)] + pub use_https_safekeeper_api: bool, + /// Path to the JWT auth token used to authenticate with other safekeepers. + #[arg(long)] + auth_token_path: Option, } // Like PathBufValueParser, but allows empty string. @@ -338,14 +344,24 @@ async fn main() -> anyhow::Result<()> { }; // Load JWT auth token to connect to other safekeepers for pull_timeline. + // First check if the env var is present, then check the arg with the path. + // We want to deprecate and remove the env var method in the future. 
let sk_auth_token = match var("SAFEKEEPER_AUTH_TOKEN") { Ok(v) => { info!("loaded JWT token for authentication with safekeepers"); Some(SecretString::from(v)) } Err(VarError::NotPresent) => { - info!("no JWT token for authentication with safekeepers detected"); - None + if let Some(auth_token_path) = args.auth_token_path.as_ref() { + info!( + "loading JWT token for authentication with safekeepers from {auth_token_path}" + ); + let auth_token = tokio::fs::read_to_string(auth_token_path).await?; + Some(SecretString::from(auth_token.trim().to_owned())) + } else { + info!("no JWT token for authentication with safekeepers detected"); + None + } } Err(_) => { warn!("JWT token for authentication with safekeepers is not unicode"); @@ -399,6 +415,7 @@ async fn main() -> anyhow::Result<()> { ssl_cert_file: args.ssl_cert_file, ssl_cert_reload_period: args.ssl_cert_reload_period, ssl_ca_certs, + use_https_safekeeper_api: args.use_https_safekeeper_api, }); // initialize sentry if SENTRY_DSN is provided diff --git a/safekeeper/src/http/mod.rs b/safekeeper/src/http/mod.rs index 003a75faa6..6e7c5d971d 100644 --- a/safekeeper/src/http/mod.rs +++ b/safekeeper/src/http/mod.rs @@ -31,6 +31,7 @@ pub async fn task_main_https( global_timelines: Arc, ) -> anyhow::Result<()> { let cert_resolver = ReloadingCertificateResolver::new( + "main", &conf.ssl_key_file, &conf.ssl_cert_file, conf.ssl_cert_reload_period, diff --git a/safekeeper/src/http/routes.rs b/safekeeper/src/http/routes.rs index b264fe8a1c..312456e5b2 100644 --- a/safekeeper/src/http/routes.rs +++ b/safekeeper/src/http/routes.rs @@ -16,9 +16,9 @@ use http_utils::{RequestExt, RouterBuilder}; use hyper::{Body, Request, Response, StatusCode}; use postgres_ffi::WAL_SEGMENT_SIZE; use safekeeper_api::models::{ - AcceptorStateStatus, PullTimelineRequest, SafekeeperStatus, SkTimelineInfo, TermSwitchApiEntry, - TimelineCopyRequest, TimelineCreateRequest, TimelineDeleteResult, TimelineStatus, - TimelineTermBumpRequest, + 
AcceptorStateStatus, PullTimelineRequest, SafekeeperStatus, SkTimelineInfo, TenantDeleteResult, + TermSwitchApiEntry, TimelineCopyRequest, TimelineCreateRequest, TimelineDeleteResult, + TimelineStatus, TimelineTermBumpRequest, }; use safekeeper_api::{ServerInfo, membership, models}; use storage_broker::proto::{SafekeeperTimelineInfo, TenantTimelineId as ProtoTenantTimelineId}; @@ -83,13 +83,11 @@ async fn tenant_delete_handler(mut request: Request) -> Result>(), - ) + let response_body: TenantDeleteResult = delete_info + .iter() + .map(|(ttid, resp)| (format!("{}", ttid.timeline_id), *resp)) + .collect::>(); + json_response(StatusCode::OK, response_body) } async fn timeline_create_handler(mut request: Request) -> Result, ApiError> { @@ -538,6 +536,7 @@ async fn record_safekeeper_info(mut request: Request) -> Result, + pub use_https_safekeeper_api: bool, } impl SafeKeeperConf { @@ -170,6 +171,7 @@ impl SafeKeeperConf { ssl_cert_file: Utf8PathBuf::from(defaults::DEFAULT_SSL_CERT_FILE), ssl_cert_reload_period: Duration::from_secs(60), ssl_ca_certs: Vec::new(), + use_https_safekeeper_api: false, } } } diff --git a/safekeeper/src/receive_wal.rs b/safekeeper/src/receive_wal.rs index 7967acde3f..9975153f6c 100644 --- a/safekeeper/src/receive_wal.rs +++ b/safekeeper/src/receive_wal.rs @@ -94,10 +94,10 @@ impl WalReceivers { /// Get reference to locked slot contents. Slot must exist (registered /// earlier). 
- fn get_slot<'a>( - self: &'a Arc, + fn get_slot( + self: &Arc, id: WalReceiverId, - ) -> MappedMutexGuard<'a, WalReceiverState> { + ) -> MappedMutexGuard<'_, WalReceiverState> { MutexGuard::map(self.mutex.lock(), |locked| { locked.slots[id] .as_mut() diff --git a/safekeeper/src/recovery.rs b/safekeeper/src/recovery.rs index c2760792b8..25b40f5d2e 100644 --- a/safekeeper/src/recovery.rs +++ b/safekeeper/src/recovery.rs @@ -176,6 +176,7 @@ pub struct Donor { pub flush_lsn: Lsn, pub pg_connstr: String, pub http_connstr: String, + pub https_connstr: Option, } impl From<&PeerInfo> for Donor { @@ -186,6 +187,7 @@ impl From<&PeerInfo> for Donor { flush_lsn: p.flush_lsn, pg_connstr: p.pg_connstr.clone(), http_connstr: p.http_connstr.clone(), + https_connstr: p.https_connstr.clone(), } } } @@ -236,11 +238,33 @@ async fn recover( conf: &SafeKeeperConf, ) -> anyhow::Result { // Learn donor term switch history to figure out starting point. - let client = reqwest::Client::new(); + + let mut client = reqwest::Client::builder(); + for cert in &conf.ssl_ca_certs { + client = client.add_root_certificate(cert.clone()); + } + let client = client + .build() + .context("Failed to build http client for recover")?; + + let url = if conf.use_https_safekeeper_api { + if let Some(https_connstr) = donor.https_connstr.as_ref() { + format!("https://{https_connstr}") + } else { + anyhow::bail!( + "cannot recover from donor {}: \ + https is enabled, but https_connstr is not specified", + donor.sk_id + ); + } + } else { + format!("http://{}", donor.http_connstr) + }; + let timeline_info: TimelineStatus = client .get(format!( - "http://{}/v1/tenant/{}/timeline/{}", - donor.http_connstr, tli.ttid.tenant_id, tli.ttid.timeline_id + "{}/v1/tenant/{}/timeline/{}", + url, tli.ttid.tenant_id, tli.ttid.timeline_id )) .send() .await? 
diff --git a/safekeeper/src/timeline.rs b/safekeeper/src/timeline.rs index d3c841ec09..b7ba28f435 100644 --- a/safekeeper/src/timeline.rs +++ b/safekeeper/src/timeline.rs @@ -50,6 +50,7 @@ fn peer_info_from_sk_info(sk_info: &SafekeeperTimelineInfo, ts: Instant) -> Peer local_start_lsn: Lsn(sk_info.local_start_lsn), pg_connstr: sk_info.safekeeper_connstr.clone(), http_connstr: sk_info.http_connstr.clone(), + https_connstr: sk_info.https_connstr.clone(), ts, } } @@ -137,6 +138,7 @@ impl Drop for WriteGuardSharedState<'_> { /// Usually it holds SafeKeeper, but it also supports offloaded timeline state. In this /// case, SafeKeeper is not available (because WAL is not present on disk) and all /// operations can be done only with control file. +#[allow(clippy::large_enum_variant, reason = "TODO")] pub enum StateSK { Loaded(SafeKeeper), Offloaded(Box>), @@ -363,6 +365,7 @@ impl SharedState { .to_owned() .unwrap_or(conf.listen_pg_addr.clone()), http_connstr: conf.listen_http_addr.to_owned(), + https_connstr: conf.listen_https_addr.to_owned(), backup_lsn: self.sk.state().inmem.backup_lsn.0, local_start_lsn: self.sk.state().local_start_lsn.0, availability_zone: conf.availability_zone.clone(), @@ -699,7 +702,7 @@ impl Timeline { } /// Take a writing mutual exclusive lock on timeline shared_state. 
- pub async fn write_shared_state<'a>(self: &'a Arc) -> WriteGuardSharedState<'a> { + pub async fn write_shared_state(self: &Arc) -> WriteGuardSharedState<'_> { WriteGuardSharedState::new(self.clone(), self.mutex.write().await) } diff --git a/safekeeper/src/timeline_eviction.rs b/safekeeper/src/timeline_eviction.rs index 06ccb32d03..84c636daf6 100644 --- a/safekeeper/src/timeline_eviction.rs +++ b/safekeeper/src/timeline_eviction.rs @@ -35,7 +35,7 @@ impl Manager { next_event: &Option, state: &StateSnapshot, ) -> bool { - let ready = self.backup_task.is_none() + self.backup_task.is_none() && self.recovery_task.is_none() && self.wal_removal_task.is_none() && self.partial_backup_task.is_none() @@ -61,8 +61,7 @@ impl Manager { .unwrap() .flush_lsn .segment_number(self.wal_seg_size) - == self.last_removed_segno + 1; - ready + == self.last_removed_segno + 1 } /// Evict the timeline to remote storage. Returns whether the eviction was successful. diff --git a/safekeeper/tests/misc_test.rs b/safekeeper/tests/misc_test.rs index 8e54d2bb86..3acf9f72c4 100644 --- a/safekeeper/tests/misc_test.rs +++ b/safekeeper/tests/misc_test.rs @@ -116,7 +116,7 @@ fn test_many_tx() -> anyhow::Result<()> { } None }) - .last() + .next_back() .unwrap(); let initdb_lsn = 21623024; diff --git a/safekeeper/tests/walproposer_sim/safekeeper.rs b/safekeeper/tests/walproposer_sim/safekeeper.rs index 58913537aa..b3f088d31c 100644 --- a/safekeeper/tests/walproposer_sim/safekeeper.rs +++ b/safekeeper/tests/walproposer_sim/safekeeper.rs @@ -184,6 +184,7 @@ pub fn run_server(os: NodeOs, disk: Arc) -> Result<()> { ssl_cert_file: Utf8PathBuf::from(""), ssl_cert_reload_period: Duration::ZERO, ssl_ca_certs: Vec::new(), + use_https_safekeeper_api: false, }; let mut global = GlobalMap::new(disk, conf.clone())?; diff --git a/storage_broker/benches/rps.rs b/storage_broker/benches/rps.rs index 86f2dd9a6c..0fef6a58e0 100644 --- a/storage_broker/benches/rps.rs +++ b/storage_broker/benches/rps.rs @@ -141,6 +141,7 @@ 
async fn publish(client: Option, n_keys: u64) { peer_horizon_lsn: 5, safekeeper_connstr: "zenith-1-sk-1.local:7676".to_owned(), http_connstr: "zenith-1-sk-1.local:7677".to_owned(), + https_connstr: Some("zenith-1-sk-1.local:7678".to_owned()), local_start_lsn: 0, availability_zone: None, standby_horizon: 0, diff --git a/storage_broker/proto/broker.proto b/storage_broker/proto/broker.proto index a420fd9c66..3891685589 100644 --- a/storage_broker/proto/broker.proto +++ b/storage_broker/proto/broker.proto @@ -45,8 +45,10 @@ message SafekeeperTimelineInfo { uint64 standby_horizon = 14; // A connection string to use for WAL receiving. string safekeeper_connstr = 10; - // HTTP endpoint connection string + // HTTP endpoint connection string. string http_connstr = 13; + // HTTPS endpoint connection string. + optional string https_connstr = 15; // Availability zone of a safekeeper. optional string availability_zone = 11; } diff --git a/storage_broker/src/bin/storage_broker.rs b/storage_broker/src/bin/storage_broker.rs index cc33ec20ff..a7e0c986e6 100644 --- a/storage_broker/src/bin/storage_broker.rs +++ b/storage_broker/src/bin/storage_broker.rs @@ -96,6 +96,7 @@ enum Message { impl Message { /// Convert proto message to internal message. + #[allow(clippy::result_large_err, reason = "TODO")] pub fn from(proto_msg: TypedMessage) -> Result { match proto_msg.r#type() { MessageType::SafekeeperTimelineInfo => Ok(Message::SafekeeperTimelineInfo( @@ -127,6 +128,7 @@ impl Message { } /// Get the tenant_timeline_id from the message. + #[allow(clippy::result_large_err, reason = "TODO")] pub fn tenant_timeline_id(&self) -> Result, Status> { match self { Message::SafekeeperTimelineInfo(msg) => Ok(msg @@ -185,6 +187,7 @@ enum SubscriptionKey { impl SubscriptionKey { /// Parse protobuf subkey (protobuf doesn't have fixed size bytes, we get vectors). 
+ #[allow(clippy::result_large_err, reason = "TODO")] pub fn from_proto_subscription_key(key: ProtoSubscriptionKey) -> Result { match key { ProtoSubscriptionKey::All(_) => Ok(SubscriptionKey::All), @@ -195,6 +198,7 @@ impl SubscriptionKey { } /// Parse from FilterTenantTimelineId + #[allow(clippy::result_large_err, reason = "TODO")] pub fn from_proto_filter_tenant_timeline_id( opt: Option<&FilterTenantTimelineId>, ) -> Result { @@ -385,6 +389,7 @@ impl Registry { } /// Send msg to relevant subscribers. + #[allow(clippy::result_large_err, reason = "TODO")] pub fn send_msg(&self, msg: &Message) -> Result<(), Status> { PROCESSED_MESSAGES_TOTAL.inc(); @@ -436,6 +441,7 @@ struct Publisher { impl Publisher { /// Send msg to relevant subscribers. + #[allow(clippy::result_large_err, reason = "TODO")] pub fn send_msg(&mut self, msg: &Message) -> Result<(), Status> { self.registry.send_msg(msg) } @@ -764,6 +770,7 @@ mod tests { peer_horizon_lsn: 5, safekeeper_connstr: "neon-1-sk-1.local:7676".to_owned(), http_connstr: "neon-1-sk-1.local:7677".to_owned(), + https_connstr: Some("neon-1-sk-1.local:7678".to_owned()), local_start_lsn: 0, availability_zone: None, standby_horizon: 0, diff --git a/storage_broker/src/lib.rs b/storage_broker/src/lib.rs index 55d411f607..7b36f5e948 100644 --- a/storage_broker/src/lib.rs +++ b/storage_broker/src/lib.rs @@ -79,6 +79,7 @@ impl BrokerClientChannel { } // parse variable length bytes from protobuf +#[allow(clippy::result_large_err, reason = "TODO")] pub fn parse_proto_ttid(proto_ttid: &ProtoTenantTimelineId) -> Result { let tenant_id = TenantId::from_slice(&proto_ttid.tenant_id) .map_err(|e| Status::new(Code::InvalidArgument, format!("malformed tenant_id: {}", e)))?; diff --git a/storage_controller/client/src/control_api.rs b/storage_controller/client/src/control_api.rs index 7888b18aa7..7afc835675 100644 --- a/storage_controller/client/src/control_api.rs +++ b/storage_controller/client/src/control_api.rs @@ -10,13 +10,11 @@ pub struct 
Client { } impl Client { - pub fn new(base_url: Url, jwt_token: Option) -> Self { + pub fn new(http_client: reqwest::Client, base_url: Url, jwt_token: Option) -> Self { Self { base_url, jwt_token, - client: reqwest::ClientBuilder::new() - .build() - .expect("Failed to construct http client"), + client: http_client, } } diff --git a/storage_controller/src/compute_hook.rs b/storage_controller/src/compute_hook.rs index 0da35d6545..57709302e1 100644 --- a/storage_controller/src/compute_hook.rs +++ b/storage_controller/src/compute_hook.rs @@ -4,6 +4,7 @@ use std::error::Error as _; use std::sync::Arc; use std::time::Duration; +use anyhow::Context; use control_plane::endpoint::{ComputeControlPlane, EndpointStatus}; use control_plane::local_env::LocalEnv; use futures::StreamExt; @@ -364,25 +365,28 @@ pub(crate) struct ShardUpdate<'a> { } impl ComputeHook { - pub(super) fn new(config: Config) -> Self { + pub(super) fn new(config: Config) -> anyhow::Result { let authorization_header = config .control_plane_jwt_token .clone() .map(|jwt| format!("Bearer {}", jwt)); - let client = reqwest::ClientBuilder::new() - .timeout(NOTIFY_REQUEST_TIMEOUT) + let mut client = reqwest::ClientBuilder::new().timeout(NOTIFY_REQUEST_TIMEOUT); + for cert in &config.ssl_ca_certs { + client = client.add_root_certificate(cert.clone()); + } + let client = client .build() - .expect("Failed to construct HTTP client"); + .context("Failed to build http client for compute hook")?; - Self { + Ok(Self { state: Default::default(), config, authorization_header, neon_local_lock: Default::default(), api_concurrency: tokio::sync::Semaphore::new(API_CONCURRENCY), client, - } + }) } /// For test environments: use neon_local's LocalEnv to update compute @@ -625,15 +629,13 @@ impl ComputeHook { }; let result = if !self.config.use_local_compute_notifications { - let compute_hook_url = if let Some(control_plane_url) = &self.config.control_plane_url { - Some(if control_plane_url.ends_with('/') { - 
format!("{control_plane_url}notify-attach") - } else { - format!("{control_plane_url}/notify-attach") - }) - } else { - self.config.compute_hook_url.clone() - }; + let compute_hook_url = + self.config + .control_plane_url + .as_ref() + .map(|control_plane_url| { + format!("{}/notify-attach", control_plane_url.trim_end_matches('/')) + }); // We validate this at startup let notify_url = compute_hook_url.as_ref().unwrap(); @@ -796,7 +798,7 @@ impl ComputeHook { #[cfg(test)] pub(crate) mod tests { - use pageserver_api::shard::{ShardCount, ShardNumber}; + use pageserver_api::shard::{DEFAULT_STRIPE_SIZE, ShardCount, ShardNumber}; use utils::id::TenantId; use super::*; @@ -804,6 +806,7 @@ pub(crate) mod tests { #[test] fn tenant_updates() -> anyhow::Result<()> { let tenant_id = TenantId::generate(); + let stripe_size = DEFAULT_STRIPE_SIZE; let mut tenant_state = ComputeHookTenant::new( TenantShardId { tenant_id, @@ -844,7 +847,7 @@ pub(crate) mod tests { shard_count: ShardCount::new(2), shard_number: ShardNumber(1), }, - stripe_size: ShardStripeSize(32768), + stripe_size, preferred_az: None, node_id: NodeId(1), }); @@ -860,7 +863,7 @@ pub(crate) mod tests { shard_count: ShardCount::new(2), shard_number: ShardNumber(0), }, - stripe_size: ShardStripeSize(32768), + stripe_size, preferred_az: None, node_id: NodeId(1), }); @@ -870,7 +873,7 @@ pub(crate) mod tests { anyhow::bail!("Wrong send result"); }; assert_eq!(request.shards.len(), 2); - assert_eq!(request.stripe_size, Some(ShardStripeSize(32768))); + assert_eq!(request.stripe_size, Some(stripe_size)); // Simulate successful send *guard = Some(ComputeRemoteState { diff --git a/storage_controller/src/heartbeater.rs b/storage_controller/src/heartbeater.rs index 524225c14a..fe916aa36a 100644 --- a/storage_controller/src/heartbeater.rs +++ b/storage_controller/src/heartbeater.rs @@ -12,6 +12,7 @@ use safekeeper_api::models::SafekeeperUtilization; use safekeeper_client::mgmt_api; use thiserror::Error; use 
tokio_util::sync::CancellationToken; +use tracing::Instrument; use utils::id::NodeId; use utils::logging::SecretString; @@ -227,6 +228,7 @@ impl HeartBeat for HeartbeaterTask Some((*node_id, status)) } + .instrument(tracing::info_span!("heartbeat_ps", %node_id)) }); } @@ -253,7 +255,7 @@ impl HeartBeat for HeartbeaterTask PageserverState::WarmingUp { .. } => { warming_up += 1; } - PageserverState::Offline { .. } => offline += 1, + PageserverState::Offline => offline += 1, PageserverState::Available { .. } => {} } } @@ -369,6 +371,7 @@ impl HeartBeat for HeartbeaterTask for HeartbeaterTask offline += 1, + SafekeeperState::Offline => offline += 1, SafekeeperState::Available { .. } => {} } } diff --git a/storage_controller/src/http.rs b/storage_controller/src/http.rs index a5e00e18e8..fb4530d0d2 100644 --- a/storage_controller/src/http.rs +++ b/storage_controller/src/http.rs @@ -22,11 +22,12 @@ use pageserver_api::controller_api::{ MetadataHealthListUnhealthyResponse, MetadataHealthUpdateRequest, MetadataHealthUpdateResponse, NodeAvailability, NodeConfigureRequest, NodeRegisterRequest, SafekeeperSchedulingPolicyRequest, ShardsPreferredAzsRequest, TenantCreateRequest, TenantPolicyRequest, TenantShardMigrateRequest, + TimelineImportRequest, }; use pageserver_api::models::{ - DetachBehavior, TenantConfigPatchRequest, TenantConfigRequest, TenantLocationConfigRequest, - TenantShardSplitRequest, TenantTimeTravelRequest, TimelineArchivalConfigRequest, - TimelineCreateRequest, + DetachBehavior, LsnLeaseRequest, TenantConfigPatchRequest, TenantConfigRequest, + TenantLocationConfigRequest, TenantShardSplitRequest, TenantTimeTravelRequest, + TimelineArchivalConfigRequest, TimelineCreateRequest, }; use pageserver_api::shard::TenantShardId; use pageserver_api::upcall_api::{ReAttachRequest, ValidateRequest}; @@ -582,6 +583,32 @@ async fn handle_tenant_timeline_download_heatmap_layers( json_response(StatusCode::OK, ()) } +async fn handle_tenant_timeline_lsn_lease( + service: Arc, + 
req: Request, +) -> Result, ApiError> { + let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?; + let timeline_id: TimelineId = parse_request_param(&req, "timeline_id")?; + + check_permissions(&req, Scope::PageServerApi)?; + maybe_rate_limit(&req, tenant_id).await; + + let mut req = match maybe_forward(req).await { + ForwardOutcome::Forwarded(res) => { + return res; + } + ForwardOutcome::NotForwarded(req) => req, + }; + + let lsn_lease_request = json_request::(&mut req).await?; + + service + .tenant_timeline_lsn_lease(tenant_id, timeline_id, lsn_lease_request.lsn) + .await?; + + json_response(StatusCode::OK, ()) +} + // For metric labels where we would like to include the approximate path, but exclude high-cardinality fields like query parameters // and tenant/timeline IDs. Since we are proxying to arbitrary paths, we don't have routing templates to // compare to, so we can just filter out our well known ID format with regexes. @@ -613,6 +640,15 @@ async fn handle_tenant_timeline_passthrough( return Err(ApiError::BadRequest(anyhow::anyhow!("Missing path"))); }; + let method = match *req.method() { + hyper::Method::GET => reqwest::Method::GET, + hyper::Method::POST => reqwest::Method::POST, + hyper::Method::PUT => reqwest::Method::PUT, + hyper::Method::DELETE => reqwest::Method::DELETE, + hyper::Method::PATCH => reqwest::Method::PATCH, + _ => return Err(ApiError::BadRequest(anyhow::anyhow!("Unsupported method"))), + }; + tracing::info!( "Proxying request for tenant {} ({})", tenant_or_shard_id.tenant_id, @@ -660,7 +696,7 @@ async fn handle_tenant_timeline_passthrough( node.base_url(), service.get_config().pageserver_jwt_token.as_deref(), ); - let resp = client.get_raw(path).await.map_err(|e| + let resp = client.op_raw(method, path).await.map_err(|e| // We return 503 here because if we can't successfully send a request to the pageserver, // either we aren't available or the pageserver is unavailable. 
ApiError::ResourceUnavailable(format!("Error sending pageserver API request to {node}: {e}").into()))?; @@ -864,7 +900,7 @@ async fn handle_node_status(req: Request) -> Result, ApiErr let state = get_state(&req); let node_id: NodeId = parse_request_param(&req, "node_id")?; - let node_status = state.service.get_node(node_id).await?; + let node_status = state.service.get_node(node_id).await?.describe(); json_response(StatusCode::OK, node_status) } @@ -1200,8 +1236,18 @@ async fn handle_step_down(req: Request) -> Result, ApiError ForwardOutcome::NotForwarded(req) => req, }; - let state = get_state(&req); - json_response(StatusCode::OK, state.service.step_down().await) + // Spawn a background task: once we start stepping down, we must finish: if the client drops + // their request we should avoid stopping in some part-stepped-down state. + let handle = tokio::spawn(async move { + let state = get_state(&req); + state.service.step_down().await + }); + + let result = handle + .await + .map_err(|e| ApiError::InternalServerError(e.into()))?; + + json_response(StatusCode::OK, result) } async fn handle_tenant_drop(req: Request) -> Result, ApiError> { @@ -1241,6 +1287,37 @@ async fn handle_tenant_import(req: Request) -> Result, ApiE ) } +async fn handle_timeline_import(req: Request) -> Result, ApiError> { + let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?; + let timeline_id: TimelineId = parse_request_param(&req, "timeline_id")?; + check_permissions(&req, Scope::PageServerApi)?; + maybe_rate_limit(&req, tenant_id).await; + + let mut req = match maybe_forward(req).await { + ForwardOutcome::Forwarded(res) => { + return res; + } + ForwardOutcome::NotForwarded(req) => req, + }; + + let import_req = json_request::(&mut req).await?; + + let state = get_state(&req); + + if import_req.tenant_id != tenant_id || import_req.timeline_id != timeline_id { + return Err(ApiError::BadRequest(anyhow::anyhow!( + "tenant id or timeline id mismatch: url={tenant_id}/{timeline_id}, 
body={}/{}", + import_req.tenant_id, + import_req.timeline_id + ))); + } + + json_response( + StatusCode::OK, + state.service.timeline_import(import_req).await?, + ) +} + async fn handle_tenants_dump(req: Request) -> Result, ApiError> { check_permissions(&req, Scope::Admin)?; @@ -1381,6 +1458,12 @@ async fn handle_upsert_safekeeper(mut req: Request) -> Result { return res; @@ -1692,9 +1775,9 @@ async fn maybe_forward(req: Request) -> ForwardOutcome { }; if *self_addr == leader_addr { - return ForwardOutcome::Forwarded(Err(ApiError::InternalServerError(anyhow::anyhow!( - "Leader is stepped down instance" - )))); + return ForwardOutcome::Forwarded(Err(ApiError::ResourceUnavailable( + "Leader is stepped down instance".into(), + ))); } } @@ -1703,19 +1786,17 @@ async fn maybe_forward(req: Request) -> ForwardOutcome { // Use [`RECONCILE_TIMEOUT`] as the max amount of time a request should block for and // include some leeway to get the timeout for proxied requests. const PROXIED_REQUEST_TIMEOUT: Duration = Duration::from_secs(RECONCILE_TIMEOUT.as_secs() + 10); - let client = reqwest::ClientBuilder::new() - .timeout(PROXIED_REQUEST_TIMEOUT) - .build(); - let client = match client { - Ok(client) => client, - Err(err) => { - return ForwardOutcome::Forwarded(Err(ApiError::InternalServerError(anyhow::anyhow!( - "Failed to build leader client for forwarding while in stepped down state: {err}" - )))); - } - }; - let request: reqwest::Request = match convert_request(req, &client, leader.address).await { + let client = state.service.get_http_client().clone(); + + let request: reqwest::Request = match convert_request( + req, + &client, + leader.address, + PROXIED_REQUEST_TIMEOUT, + ) + .await + { Ok(r) => r, Err(err) => { return ForwardOutcome::Forwarded(Err(ApiError::InternalServerError(anyhow::anyhow!( @@ -1773,6 +1854,7 @@ async fn convert_request( req: hyper::Request, client: &reqwest::Client, to_address: String, + timeout: Duration, ) -> Result { use std::str::FromStr; @@ 
-1827,6 +1909,7 @@ async fn convert_request( .request(method, uri) .headers(headers) .body(body) + .timeout(timeout) .build() .map_err(|err| { ApiError::InternalServerError(anyhow::anyhow!("Request conversion failed: {err}")) @@ -1908,6 +1991,16 @@ pub fn make_router( RequestName("debug_v1_tenant_locate"), ) }) + .post( + "/debug/v1/tenant/:tenant_id/timeline/:timeline_id/import", + |r| { + named_request_span( + r, + handle_timeline_import, + RequestName("debug_v1_timeline_import"), + ) + }, + ) .get("/debug/v1/scheduler", |r| { named_request_span(r, handle_scheduler_dump, RequestName("debug_v1_scheduler")) }) @@ -2192,6 +2285,17 @@ pub fn make_router( ) }, ) + // LSN lease passthrough to all shards + .post( + "/v1/tenant/:tenant_id/timeline/:timeline_id/lsn_lease", + |r| { + tenant_service_handler( + r, + handle_tenant_timeline_lsn_lease, + RequestName("v1_tenant_timeline_lsn_lease"), + ) + }, + ) // Tenant detail GET passthrough to shard zero: .get("/v1/tenant/:tenant_id", |r| { tenant_service_handler( @@ -2210,6 +2314,17 @@ pub fn make_router( RequestName("v1_tenant_passthrough"), ) }) + // Tenant timeline mark_invisible passthrough to shard zero + .put( + "/v1/tenant/:tenant_id/timeline/:timeline_id/mark_invisible", + |r| { + tenant_service_handler( + r, + handle_tenant_timeline_passthrough, + RequestName("v1_tenant_timeline_mark_invisible_passthrough"), + ) + }, + ) } #[cfg(test)] diff --git a/storage_controller/src/leadership.rs b/storage_controller/src/leadership.rs index 5e1d6f3ec9..39c28d60a9 100644 --- a/storage_controller/src/leadership.rs +++ b/storage_controller/src/leadership.rs @@ -110,7 +110,20 @@ impl Leadership { ) -> Option { tracing::info!("Sending step down request to {leader:?}"); + let mut http_client = reqwest::Client::builder(); + for cert in &self.config.ssl_ca_certs { + http_client = http_client.add_root_certificate(cert.clone()); + } + let http_client = match http_client.build() { + Ok(http_client) => http_client, + Err(err) => { + 
tracing::error!("Failed to build client for leader step-down request: {err}"); + return None; + } + }; + let client = PeerClient::new( + http_client, Uri::try_from(leader.address.as_str()).expect("Failed to build leader URI"), self.config.peer_jwt_token.clone(), ); diff --git a/storage_controller/src/main.rs b/storage_controller/src/main.rs index 1a7f9a2366..a924e5b6c5 100644 --- a/storage_controller/src/main.rs +++ b/storage_controller/src/main.rs @@ -86,10 +86,6 @@ struct Cli { #[arg(long)] peer_jwt_token: Option, - /// URL to control plane compute notification endpoint - #[arg(long)] - compute_hook_url: Option, - /// URL to control plane storage API prefix #[arg(long)] control_plane_url: Option, @@ -115,19 +111,17 @@ struct Cli { #[arg(long)] split_threshold: Option, - /// Maximum number of shards during autosplits. 0 disables autosplits. - // TODO: defaults to 8 for backwards compatibility, should default to 255. - #[arg(long, default_value = "8")] + /// Maximum number of shards during autosplits. 0 disables autosplits. Defaults + /// to 16 as a safety to avoid too many shards by accident. + #[arg(long, default_value = "16")] max_split_shards: u8, /// Size threshold for initial shard splits of unsharded tenants. 0 disables initial splits. - // TODO: defaults to 64 GB for backwards compatibility. Should default to None. - #[arg(long, default_value = "68719476736")] - initial_split_threshold: u64, + #[arg(long)] + initial_split_threshold: Option, - /// Number of target shards for initial splits. 0 or 1 disables initial splits. - // TODO: defaults to 8 for backwards compatibility. Should default to 2. - #[arg(long, default_value = "8")] + /// Number of target shards for initial splits. 0 or 1 disables initial splits. Defaults to 2. 
+ #[arg(long, default_value = "2")] initial_split_shards: u8, /// Maximum number of normal-priority reconcilers that may run in parallel @@ -285,10 +279,8 @@ impl Secrets { fn load_secret(cli: &Option, env_name: &str) -> Option { if let Some(v) = cli { Some(v.clone()) - } else if let Ok(v) = std::env::var(env_name) { - Some(v) } else { - None + std::env::var(env_name).ok() } } } @@ -364,13 +356,11 @@ async fn async_main() -> anyhow::Result<()> { "Insecure config! One or more secrets is not set. This is only permitted in `--dev` mode" ); } - StrictMode::Strict - if args.compute_hook_url.is_none() && args.control_plane_url.is_none() => - { + StrictMode::Strict if args.control_plane_url.is_none() => { // Production systems should always have a control plane URL set, to prevent falling // back to trying to use neon_local. anyhow::bail!( - "neither `--compute-hook-url` nor `--control-plane-url` are set: this is only permitted in `--dev` mode" + "`--control-plane-url` is not set: this is only permitted in `--dev` mode" ); } StrictMode::Strict if args.use_local_compute_notifications => { @@ -398,7 +388,6 @@ async fn async_main() -> anyhow::Result<()> { safekeeper_jwt_token: secrets.safekeeper_jwt_token, control_plane_jwt_token: secrets.control_plane_jwt_token, peer_jwt_token: secrets.peer_jwt_token, - compute_hook_url: args.compute_hook_url, control_plane_url: args.control_plane_url, max_offline_interval: args .max_offline_interval @@ -417,7 +406,7 @@ async fn async_main() -> anyhow::Result<()> { tenant_rate_limit: args.tenant_rate_limit, split_threshold: args.split_threshold, max_split_shards: args.max_split_shards, - initial_split_threshold: Some(args.initial_split_threshold), + initial_split_threshold: args.initial_split_threshold, initial_split_shards: args.initial_split_shards, neon_local_repo_dir: args.neon_local_repo_dir, max_secondary_lag_bytes: args.max_secondary_lag_bytes, @@ -476,6 +465,7 @@ async fn async_main() -> anyhow::Result<()> { let https_listener = 
tcp_listener::bind(https_addr)?; let resolver = ReloadingCertificateResolver::new( + "main", &args.ssl_key_file, &args.ssl_cert_file, *args.ssl_cert_reload_period, diff --git a/storage_controller/src/metrics.rs b/storage_controller/src/metrics.rs index ea390df726..5ce2fb65e4 100644 --- a/storage_controller/src/metrics.rs +++ b/storage_controller/src/metrics.rs @@ -44,6 +44,15 @@ pub(crate) struct StorageControllerMetricGroup { /// Size of the in-memory map of pageserver_nodes pub(crate) storage_controller_pageserver_nodes: measured::Gauge, + /// Count of how many pageserver nodes from in-memory map have https configured + pub(crate) storage_controller_https_pageserver_nodes: measured::Gauge, + + /// Size of the in-memory map of safekeeper_nodes + pub(crate) storage_controller_safekeeper_nodes: measured::Gauge, + + /// Count of how many safekeeper nodes from in-memory map have https configured + pub(crate) storage_controller_https_safekeeper_nodes: measured::Gauge, + /// Reconciler tasks completed, broken down by success/failure/cancelled pub(crate) storage_controller_reconcile_complete: measured::CounterVec, diff --git a/storage_controller/src/node.rs b/storage_controller/src/node.rs index f667514517..e180c49b43 100644 --- a/storage_controller/src/node.rs +++ b/storage_controller/src/node.rs @@ -89,6 +89,10 @@ impl Node { self.scheduling = scheduling } + pub(crate) fn has_https_port(&self) -> bool { + self.listen_https_port.is_some() + } + /// Does this registration request match `self`? This is used when deciding whether a registration /// request should be allowed to update an existing record with the same node ID. 
pub(crate) fn registration_match(&self, register_req: &NodeRegisterRequest) -> bool { diff --git a/storage_controller/src/pageserver_client.rs b/storage_controller/src/pageserver_client.rs index c6c21107f1..d14fc35b39 100644 --- a/storage_controller/src/pageserver_client.rs +++ b/storage_controller/src/pageserver_client.rs @@ -1,6 +1,6 @@ use pageserver_api::models::detach_ancestor::AncestorDetached; use pageserver_api::models::{ - DetachBehavior, LocationConfig, LocationConfigListResponse, PageserverUtilization, + DetachBehavior, LocationConfig, LocationConfigListResponse, LsnLease, PageserverUtilization, SecondaryProgress, TenantScanRemoteStorageResponse, TenantShardSplitRequest, TenantShardSplitResponse, TenantWaitLsnRequest, TimelineArchivalConfigRequest, TimelineCreateRequest, TimelineInfo, TopTenantShardsRequest, TopTenantShardsResponse, @@ -10,6 +10,7 @@ use pageserver_client::BlockUnblock; use pageserver_client::mgmt_api::{Client, Result}; use reqwest::StatusCode; use utils::id::{NodeId, TenantId, TimelineId}; +use utils::lsn::Lsn; /// Thin wrapper around [`pageserver_client::mgmt_api::Client`]. It allows the storage /// controller to collect metrics in a non-intrusive manner. 
@@ -195,6 +196,22 @@ impl PageserverClient { ) } + pub(crate) async fn timeline_lease_lsn( + &self, + tenant_shard_id: TenantShardId, + timeline_id: TimelineId, + lsn: Lsn, + ) -> Result { + measured_request!( + "timeline_lease_lsn", + crate::metrics::Method::Post, + &self.node_id_label, + self.inner + .timeline_init_lsn_lease(tenant_shard_id, timeline_id, lsn) + .await + ) + } + pub(crate) async fn tenant_shard_split( &self, tenant_shard_id: TenantShardId, diff --git a/storage_controller/src/peer_client.rs b/storage_controller/src/peer_client.rs index f3f275dee0..604d1024ba 100644 --- a/storage_controller/src/peer_client.rs +++ b/storage_controller/src/peer_client.rs @@ -59,11 +59,11 @@ impl ResponseErrorMessageExt for reqwest::Response { pub(crate) struct GlobalObservedState(pub(crate) HashMap); impl PeerClient { - pub(crate) fn new(uri: Uri, jwt: Option) -> Self { + pub(crate) fn new(http_client: reqwest::Client, uri: Uri, jwt: Option) -> Self { Self { uri, jwt, - client: reqwest::Client::new(), + client: http_client, } } diff --git a/storage_controller/src/persistence.rs b/storage_controller/src/persistence.rs index c927b7c366..d25448718f 100644 --- a/storage_controller/src/persistence.rs +++ b/storage_controller/src/persistence.rs @@ -1369,6 +1369,65 @@ impl Persistence { Ok(timeline_from_db) } + /// Set `delete_at` for the given timeline + pub(crate) async fn timeline_set_deleted_at( + &self, + tenant_id: TenantId, + timeline_id: TimelineId, + ) -> DatabaseResult<()> { + use crate::schema::timelines; + + let deletion_time = chrono::Local::now().to_utc(); + self.with_measured_conn(DatabaseOperation::InsertTimeline, move |conn| { + Box::pin(async move { + let updated = diesel::update(timelines::table) + .filter(timelines::tenant_id.eq(tenant_id.to_string())) + .filter(timelines::timeline_id.eq(timeline_id.to_string())) + .set(timelines::deleted_at.eq(Some(deletion_time))) + .execute(conn) + .await?; + + match updated { + 0 => Ok(()), + 1 => Ok(()), + _ => 
Err(DatabaseError::Logical(format!( + "unexpected number of rows ({})", + updated + ))), + } + }) + }) + .await + } + + /// Load timeline from db. Returns `None` if not present. + /// + /// Only works if `deleted_at` is set, so you should call [`Self::timeline_set_deleted_at`] before. + pub(crate) async fn delete_timeline( + &self, + tenant_id: TenantId, + timeline_id: TimelineId, + ) -> DatabaseResult<()> { + use crate::schema::timelines::dsl; + + let tenant_id = &tenant_id; + let timeline_id = &timeline_id; + self.with_measured_conn(DatabaseOperation::GetTimeline, move |conn| { + Box::pin(async move { + diesel::delete(dsl::timelines) + .filter(dsl::tenant_id.eq(&tenant_id.to_string())) + .filter(dsl::timeline_id.eq(&timeline_id.to_string())) + .filter(dsl::deleted_at.is_not_null()) + .execute(conn) + .await?; + Ok(()) + }) + }) + .await?; + + Ok(()) + } + /// Loads a list of all timelines from database. pub(crate) async fn list_timelines_for_tenant( &self, @@ -1465,22 +1524,39 @@ impl Persistence { /// Load pending operations from db. 
pub(crate) async fn list_pending_ops( &self, - filter_for_sk: Option, ) -> DatabaseResult> { use crate::schema::safekeeper_timeline_pending_ops::dsl; - const FILTER_VAL_1: i64 = 1; - const FILTER_VAL_2: i64 = 2; - let filter_opt = filter_for_sk.map(|id| id.0 as i64); let timeline_from_db = self + .with_measured_conn(DatabaseOperation::ListTimelineReconcile, move |conn| { + Box::pin(async move { + let from_db: Vec = + dsl::safekeeper_timeline_pending_ops.load(conn).await?; + Ok(from_db) + }) + }) + .await?; + + Ok(timeline_from_db) + } + /// List pending operations for a given timeline (including tenant-global ones) + pub(crate) async fn list_pending_ops_for_timeline( + &self, + tenant_id: TenantId, + timeline_id: TimelineId, + ) -> DatabaseResult> { + use crate::schema::safekeeper_timeline_pending_ops::dsl; + + let timelines_from_db = self .with_measured_conn(DatabaseOperation::ListTimelineReconcile, move |conn| { Box::pin(async move { let from_db: Vec = dsl::safekeeper_timeline_pending_ops + .filter(dsl::tenant_id.eq(tenant_id.to_string())) .filter( - dsl::sk_id - .eq(filter_opt.unwrap_or(FILTER_VAL_1)) - .and(dsl::sk_id.eq(filter_opt.unwrap_or(FILTER_VAL_2))), + dsl::timeline_id + .eq(timeline_id.to_string()) + .or(dsl::timeline_id.eq("")), ) .load(conn) .await?; @@ -1489,7 +1565,7 @@ impl Persistence { }) .await?; - Ok(timeline_from_db) + Ok(timelines_from_db) } /// Delete all pending ops for the given timeline. 
@@ -1974,7 +2050,7 @@ impl ToSql for LsnWrapper { } } -#[derive(Insertable, AsChangeset, Queryable, Selectable, Clone)] +#[derive(Insertable, AsChangeset, Clone)] #[diesel(table_name = crate::schema::timelines)] pub(crate) struct TimelinePersistence { pub(crate) tenant_id: String, diff --git a/storage_controller/src/reconciler.rs b/storage_controller/src/reconciler.rs index 9f6f385dc9..b03a6dae04 100644 --- a/storage_controller/src/reconciler.rs +++ b/storage_controller/src/reconciler.rs @@ -686,6 +686,8 @@ impl Reconciler { .await?, ); + pausable_failpoint!("reconciler-live-migrate-post-generation-inc"); + let dest_conf = build_location_config( &self.shard, &self.config, @@ -760,7 +762,9 @@ impl Reconciler { Ok(()) } - async fn maybe_refresh_observed(&mut self) -> Result<(), ReconcileError> { + /// Returns true if the observed state of the attached location was refreshed + /// and false otherwise. + async fn maybe_refresh_observed(&mut self) -> Result { // If the attached node has uncertain state, read it from the pageserver before proceeding: this // is important to avoid spurious generation increments. // @@ -770,7 +774,7 @@ impl Reconciler { let Some(attached_node) = self.intent.attached.as_ref() else { // Nothing to do - return Ok(()); + return Ok(false); }; if matches!( @@ -815,7 +819,7 @@ impl Reconciler { } } - Ok(()) + Ok(true) } /// Reconciling a tenant makes API calls to pageservers until the observed state @@ -831,7 +835,7 @@ impl Reconciler { /// state where it still requires later reconciliation. 
pub(crate) async fn reconcile(&mut self) -> Result<(), ReconcileError> { // Prepare: if we have uncertain `observed` state for our would-be attachement location, then refresh it - self.maybe_refresh_observed().await?; + let refreshed = self.maybe_refresh_observed().await?; // Special case: live migration self.maybe_live_migrate().await?; @@ -855,8 +859,14 @@ impl Reconciler { ); match self.observed.locations.get(&node.get_id()) { Some(conf) if conf.conf.as_ref() == Some(&wanted_conf) => { - // Nothing to do - tracing::info!(node_id=%node.get_id(), "Observed configuration already correct.") + if refreshed { + tracing::info!( + node_id=%node.get_id(), "Observed configuration correct after refresh. Notifying compute."); + self.compute_notify().await?; + } else { + // Nothing to do + tracing::info!(node_id=%node.get_id(), "Observed configuration already correct."); + } } observed => { // In all cases other than a matching observed configuration, we will diff --git a/storage_controller/src/safekeeper.rs b/storage_controller/src/safekeeper.rs index 3b731acf7e..5a13ef750e 100644 --- a/storage_controller/src/safekeeper.rs +++ b/storage_controller/src/safekeeper.rs @@ -89,6 +89,9 @@ impl Safekeeper { pub(crate) fn availability(&self) -> SafekeeperState { self.availability.clone() } + pub(crate) fn has_https_port(&self) -> bool { + self.listen_https_port.is_some() + } /// Perform an operation (which is given a [`SafekeeperClient`]) with retries #[allow(clippy::too_many_arguments)] pub(crate) async fn with_client_retries( diff --git a/storage_controller/src/safekeeper_client.rs b/storage_controller/src/safekeeper_client.rs index 98e3f74071..988159af4a 100644 --- a/storage_controller/src/safekeeper_client.rs +++ b/storage_controller/src/safekeeper_client.rs @@ -101,7 +101,7 @@ impl SafekeeperClient { pub(crate) async fn delete_tenant( &self, tenant_id: TenantId, - ) -> Result { + ) -> Result { measured_request!( "delete_tenant", crate::metrics::Method::Delete, diff --git 
a/storage_controller/src/service.rs b/storage_controller/src/service.rs index d3c8cad0bd..a021313474 100644 --- a/storage_controller/src/service.rs +++ b/storage_controller/src/service.rs @@ -12,7 +12,7 @@ use std::ops::{Deref, DerefMut}; use std::path::PathBuf; use std::str::FromStr; use std::sync::Arc; -use std::time::{Duration, Instant}; +use std::time::{Duration, Instant, SystemTime}; use anyhow::Context; use context_iterator::TenantShardContextIterator; @@ -34,7 +34,7 @@ use pageserver_api::controller_api::{ TenantShardMigrateRequest, TenantShardMigrateResponse, }; use pageserver_api::models::{ - self, DetachBehavior, LocationConfig, LocationConfigListResponse, LocationConfigMode, + self, DetachBehavior, LocationConfig, LocationConfigListResponse, LocationConfigMode, LsnLease, PageserverUtilization, SecondaryProgress, ShardParameters, TenantConfig, TenantConfigPatchRequest, TenantConfigRequest, TenantLocationConfigRequest, TenantLocationConfigResponse, TenantShardLocation, TenantShardSplitRequest, @@ -43,7 +43,7 @@ use pageserver_api::models::{ TimelineInfo, TopTenantShardItem, TopTenantShardsRequest, }; use pageserver_api::shard::{ - ShardCount, ShardIdentity, ShardNumber, ShardStripeSize, TenantShardId, + DEFAULT_STRIPE_SIZE, ShardCount, ShardIdentity, ShardNumber, ShardStripeSize, TenantShardId, }; use pageserver_api::upcall_api::{ ReAttachRequest, ReAttachResponse, ReAttachResponseTenant, ValidateRequest, ValidateResponse, @@ -60,7 +60,8 @@ use tracing::{Instrument, debug, error, info, info_span, instrument, warn}; use utils::completion::Barrier; use utils::generation::Generation; use utils::id::{NodeId, TenantId, TimelineId}; -use utils::sync::gate::Gate; +use utils::lsn::Lsn; +use utils::sync::gate::{Gate, GateGuard}; use utils::{failpoint_support, pausable_failpoint}; use crate::background_node_operations::{ @@ -152,6 +153,7 @@ enum TenantOperations { TimelineGcBlockUnblock, DropDetached, DownloadHeatmapLayers, + TimelineLsnLease, } #[derive(Clone, 
strum_macros::Display)] @@ -355,18 +357,10 @@ pub struct Config { // This JWT token will be used to authenticate with other storage controller instances pub peer_jwt_token: Option, - /// Where the compute hook should send notifications of pageserver attachment locations - /// (this URL points to the control plane in prod). If this is None, the compute hook will - /// assume it is running in a test environment and try to update neon_local. - pub compute_hook_url: Option, - /// Prefix for storage API endpoints of the control plane. We use this prefix to compute /// URLs that we use to send pageserver and safekeeper attachment locations. /// If this is None, the compute hook will assume it is running in a test environment /// and try to invoke neon_local instead. - /// - /// For now, there is also `compute_hook_url` which allows configuration of the pageserver - /// specific endpoint, but it is in the process of being phased out. pub control_plane_url: Option, /// Grace period within which a pageserver does not respond to heartbeats, but is still @@ -592,6 +586,8 @@ struct TenantShardSplitAbort { new_stripe_size: Option, /// Until this abort op is complete, no other operations may be done on the tenant _tenant_lock: TracingExclusiveGuard, + /// The reconciler gate for the duration of the split operation, and any included abort. + _gate: GateGuard, } #[derive(thiserror::Error, Debug)] @@ -1458,7 +1454,7 @@ impl Service { // Retry until shutdown: we must keep this request object alive until it is properly // processed, as it holds a lock guard that prevents other operations trying to do things // to the tenant while it is in a weird part-split state. - while !self.cancel.is_cancelled() { + while !self.reconcilers_cancel.is_cancelled() { match self.abort_tenant_shard_split(&op).await { Ok(_) => break, Err(e) => { @@ -1471,9 +1467,12 @@ impl Service { // when we retry, so that the abort op will succeed. 
If the abort op is failing // for some other reason, we will keep retrying forever, or until a human notices // and does something about it (either fixing a pageserver or restarting the controller). - tokio::time::timeout(Duration::from_secs(5), self.cancel.cancelled()) - .await - .ok(); + tokio::time::timeout( + Duration::from_secs(5), + self.reconcilers_cancel.cancelled(), + ) + .await + .ok(); } } } @@ -1507,6 +1506,10 @@ impl Service { .metrics_group .storage_controller_pageserver_nodes .set(nodes.len() as i64); + metrics::METRICS_REGISTRY + .metrics_group + .storage_controller_https_pageserver_nodes + .set(nodes.values().filter(|n| n.has_https_port()).count() as i64); tracing::info!("Loading safekeepers from database..."); let safekeepers = persistence @@ -1524,6 +1527,14 @@ impl Service { let safekeepers: HashMap = safekeepers.into_iter().map(|n| (n.get_id(), n)).collect(); tracing::info!("Loaded {} safekeepers from database.", safekeepers.len()); + metrics::METRICS_REGISTRY + .metrics_group + .storage_controller_safekeeper_nodes + .set(safekeepers.len() as i64); + metrics::METRICS_REGISTRY + .metrics_group + .storage_controller_https_safekeeper_nodes + .set(safekeepers.values().filter(|s| s.has_https_port()).count() as i64); tracing::info!("Loading shards from database..."); let mut tenant_shard_persistence = persistence.load_active_tenant_shards().await?; @@ -1709,7 +1720,7 @@ impl Service { ))), config: config.clone(), persistence, - compute_hook: Arc::new(ComputeHook::new(config.clone())), + compute_hook: Arc::new(ComputeHook::new(config.clone())?), result_tx, heartbeater_ps, heartbeater_sk, @@ -1833,6 +1844,7 @@ impl Service { }; if insert { + let config = attach_req.config.clone().unwrap_or_default(); let tsp = TenantShardPersistence { tenant_id: attach_req.tenant_shard_id.tenant_id.to_string(), shard_number: attach_req.tenant_shard_id.shard_number.0 as i32, @@ -1841,7 +1853,7 @@ impl Service { generation: attach_req.generation_override.or(Some(0)), 
generation_pageserver: None, placement_policy: serde_json::to_string(&PlacementPolicy::Attached(0)).unwrap(), - config: serde_json::to_string(&TenantConfig::default()).unwrap(), + config: serde_json::to_string(&config).unwrap(), splitting: SplitState::default(), scheduling_policy: serde_json::to_string(&ShardSchedulingPolicy::default()) .unwrap(), @@ -1864,16 +1876,16 @@ impl Service { Ok(()) => { tracing::info!("Inserted shard {} in database", attach_req.tenant_shard_id); - let mut locked = self.inner.write().unwrap(); - locked.tenants.insert( + let mut shard = TenantShard::new( attach_req.tenant_shard_id, - TenantShard::new( - attach_req.tenant_shard_id, - ShardIdentity::unsharded(), - PlacementPolicy::Attached(0), - None, - ), + ShardIdentity::unsharded(), + PlacementPolicy::Attached(0), + None, ); + shard.config = config; + + let mut locked = self.inner.write().unwrap(); + locked.tenants.insert(attach_req.tenant_shard_id, shard); tracing::info!("Inserted shard {} in memory", attach_req.tenant_shard_id); } } @@ -1958,11 +1970,12 @@ impl Service { .set_attached(scheduler, attach_req.node_id); tracing::info!( - "attach_hook: tenant {} set generation {:?}, pageserver {}", + "attach_hook: tenant {} set generation {:?}, pageserver {}, config {:?}", attach_req.tenant_shard_id, tenant_shard.generation, // TODO: this is an odd number of 0xf's - attach_req.node_id.unwrap_or(utils::id::NodeId(0xfffffff)) + attach_req.node_id.unwrap_or(utils::id::NodeId(0xfffffff)), + attach_req.config, ); // Trick the reconciler into not doing anything for this tenant: this helps @@ -2740,7 +2753,7 @@ impl Service { count: tenant_shard_id.shard_count, // We only import un-sharded or single-sharded tenants, so stripe // size can be made up arbitrarily here. 
- stripe_size: ShardParameters::DEFAULT_STRIPE_SIZE, + stripe_size: DEFAULT_STRIPE_SIZE, }, placement_policy: Some(placement_policy), config: req.config.tenant_conf, @@ -3987,6 +4000,75 @@ impl Service { Ok(()) } + pub(crate) async fn tenant_timeline_lsn_lease( + &self, + tenant_id: TenantId, + timeline_id: TimelineId, + lsn: Lsn, + ) -> Result { + let _tenant_lock = trace_shared_lock( + &self.tenant_op_locks, + tenant_id, + TenantOperations::TimelineLsnLease, + ) + .await; + + let targets = { + let locked = self.inner.read().unwrap(); + let mut targets = Vec::new(); + + // If the request got an unsharded tenant id, then apply + // the operation to all shards. Otherwise, apply it to a specific shard. + let shards_range = TenantShardId::tenant_range(tenant_id); + + for (tenant_shard_id, shard) in locked.tenants.range(shards_range) { + if let Some(node_id) = shard.intent.get_attached() { + let node = locked + .nodes + .get(node_id) + .expect("Pageservers may not be deleted while referenced"); + + targets.push((*tenant_shard_id, node.clone())); + } + } + targets + }; + + let res = self + .tenant_for_shards_api( + targets, + |tenant_shard_id, client| async move { + client + .timeline_lease_lsn(tenant_shard_id, timeline_id, lsn) + .await + }, + 1, + 1, + SHORT_RECONCILE_TIMEOUT, + &self.cancel, + ) + .await; + + let mut valid_until = None; + for r in res { + match r { + Ok(lease) => { + if let Some(ref mut valid_until) = valid_until { + *valid_until = std::cmp::min(*valid_until, lease.valid_until); + } else { + valid_until = Some(lease.valid_until); + } + } + Err(e) => { + return Err(ApiError::InternalServerError(anyhow::anyhow!(e))); + } + } + } + Ok(LsnLease { + valid_until: valid_until.unwrap_or_else(SystemTime::now), + }) + } + pub(crate) async fn tenant_timeline_download_heatmap_layers( &self, tenant_shard_id: TenantShardId, @@ -4827,7 +4909,7 @@ impl Service { 1, 10, Duration::from_secs(5), - &self.cancel, + &self.reconcilers_cancel, ) .await { @@ -5078,6 +5160,11 
@@ impl Service { ) .await; + let _gate = self + .reconcilers_gate + .enter() + .map_err(|_| ApiError::ShuttingDown)?; + let new_shard_count = ShardCount::new(split_req.new_shard_count); let new_stripe_size = split_req.new_stripe_size; @@ -5105,6 +5192,7 @@ impl Service { new_shard_count, new_stripe_size, _tenant_lock, + _gate, }) // Ignore error sending: that just means we're shutting down: aborts are ephemeral so it's fine to drop it. .ok(); @@ -5444,7 +5532,10 @@ impl Service { "failpoint".to_string() ))); - failpoint_support::sleep_millis_async!("shard-split-post-remote-sleep", &self.cancel); + failpoint_support::sleep_millis_async!( + "shard-split-post-remote-sleep", + &self.reconcilers_cancel + ); tracing::info!( "Split {} into {}", @@ -5502,7 +5593,7 @@ impl Service { stripe_size, preferred_az: preferred_az_id.as_ref().map(Cow::Borrowed), }, - &self.cancel, + &self.reconcilers_cancel, ) .await { @@ -5943,9 +6034,21 @@ impl Service { .max() .expect("We already validated >0 shards"); - // FIXME: we have no way to recover the shard stripe size from contents of remote storage: this will - // only work if they were using the default stripe size. - let stripe_size = ShardParameters::DEFAULT_STRIPE_SIZE; + // Find the tenant's stripe size. This wasn't always persisted in the tenant manifest, so + // fall back to the original default stripe size of 32768 (256 MB) if it's not specified. 
+ const ORIGINAL_STRIPE_SIZE: ShardStripeSize = ShardStripeSize(32768); + let stripe_size = scan_result + .shards + .iter() + .find(|s| s.tenant_shard_id.shard_count == shard_count && s.generation == generation) + .expect("we validated >0 shards above") + .stripe_size + .unwrap_or_else(|| { + if shard_count.count() > 1 { + warn!("unknown stripe size, assuming {ORIGINAL_STRIPE_SIZE}"); + } + ORIGINAL_STRIPE_SIZE + }); let (response, waiters) = self .do_tenant_create(TenantCreateRequest { @@ -6171,6 +6274,10 @@ impl Service { .metrics_group .storage_controller_pageserver_nodes .set(locked.nodes.len() as i64); + metrics::METRICS_REGISTRY + .metrics_group + .storage_controller_https_pageserver_nodes + .set(locked.nodes.values().filter(|n| n.has_https_port()).count() as i64); locked.scheduler.node_remove(node_id); @@ -6262,6 +6369,10 @@ impl Service { .metrics_group .storage_controller_pageserver_nodes .set(nodes.len() as i64); + metrics::METRICS_REGISTRY + .metrics_group + .storage_controller_https_pageserver_nodes + .set(nodes.values().filter(|n| n.has_https_port()).count() as i64); } } @@ -6486,6 +6597,10 @@ impl Service { .metrics_group .storage_controller_pageserver_nodes .set(locked.nodes.len() as i64); + metrics::METRICS_REGISTRY + .metrics_group + .storage_controller_https_pageserver_nodes + .set(locked.nodes.values().filter(|n| n.has_https_port()).count() as i64); match registration_status { RegistrationStatus::New => { @@ -7199,7 +7314,7 @@ impl Service { } // Eventual consistency: if an earlier reconcile job failed, and the shard is still - // dirty, spawn another rone + // dirty, spawn another one if self .maybe_reconcile_shard(shard, &pageservers, ReconcilerPriority::Normal) .is_some() @@ -7758,7 +7873,7 @@ impl Service { // old, persisted stripe size. let new_stripe_size = match candidate.id.shard_count.count() { 0 => panic!("invalid shard count 0"), - 1 => Some(ShardParameters::DEFAULT_STRIPE_SIZE), + 1 => Some(DEFAULT_STRIPE_SIZE), 2.. 
=> None, }; @@ -8563,9 +8678,24 @@ impl Service { failpoint_support::sleep_millis_async!("sleep-on-step-down-handling"); self.inner.write().unwrap().step_down(); - // TODO: would it make sense to have a time-out for this? - self.stop_reconciliations(StopReconciliationsReason::SteppingDown) - .await; + + // Wait for reconciliations to stop, or terminate this process if they + // fail to stop in time (this indicates a bug in shutdown) + tokio::select! { + _ = self.stop_reconciliations(StopReconciliationsReason::SteppingDown) => { + tracing::info!("Reconciliations stopped, proceeding with step down"); + } + _ = async { + failpoint_support::sleep_millis_async!("step-down-delay-timeout"); + tokio::time::sleep(Duration::from_secs(10)).await + } => { + tracing::warn!("Step down timed out while waiting for reconciliation gate, terminating process"); + + // The caller may proceed to act as leader when it sees this request fail: reduce the chance + // of a split-brain situation by terminating this controller instead of leaving it up in a partially-shut-down state. 
+ std::process::exit(1); + } + } let mut global_observed = GlobalObservedState::default(); let locked = self.inner.read().unwrap(); diff --git a/storage_controller/src/service/chaos_injector.rs b/storage_controller/src/service/chaos_injector.rs index a0419e0205..9c7a9e3798 100644 --- a/storage_controller/src/service/chaos_injector.rs +++ b/storage_controller/src/service/chaos_injector.rs @@ -4,7 +4,7 @@ use std::time::Duration; use pageserver_api::controller_api::ShardSchedulingPolicy; use rand::seq::SliceRandom; -use rand::thread_rng; +use rand::{Rng, thread_rng}; use tokio_util::sync::CancellationToken; use utils::id::NodeId; use utils::shard::TenantShardId; @@ -64,17 +64,22 @@ impl ChaosInjector { let mut interval = tokio::time::interval(self.interval); #[derive(Debug)] enum ChaosEvent { - ShuffleTenant, - ForceKill, + MigrationsToSecondary, + ForceKillController, + GracefulMigrationsAnywhere, } loop { let cron_interval = self.get_cron_interval_sleep_future(); let chaos_type = tokio::select! 
{ _ = interval.tick() => { - ChaosEvent::ShuffleTenant + if thread_rng().gen_bool(0.5) { + ChaosEvent::MigrationsToSecondary + } else { + ChaosEvent::GracefulMigrationsAnywhere + } } Some(_) = maybe_sleep(cron_interval) => { - ChaosEvent::ForceKill + ChaosEvent::ForceKillController } _ = cancel.cancelled() => { tracing::info!("Shutting down"); @@ -83,16 +88,29 @@ impl ChaosInjector { }; tracing::info!("Chaos iteration: {chaos_type:?}..."); match chaos_type { - ChaosEvent::ShuffleTenant => { - self.inject_chaos().await; + ChaosEvent::MigrationsToSecondary => { + self.inject_migrations_to_secondary(); } - ChaosEvent::ForceKill => { + ChaosEvent::GracefulMigrationsAnywhere => { + self.inject_graceful_migrations_anywhere(); + } + ChaosEvent::ForceKillController => { self.force_kill().await; } } } } + fn is_shard_eligible_for_chaos(&self, shard: &TenantShard) -> bool { + // - Skip non-active scheduling policies, so that a shard with a policy like Pause can + // be pinned without being disrupted by us. + // - Skip shards doing a graceful migration already, so that we allow these to run to + // completion rather than only exercising the first part and then cancelling with + // some other chaos. + matches!(shard.get_scheduling_policy(), ShardSchedulingPolicy::Active) + && shard.get_preferred_node().is_none() + } + /// If a shard has a secondary and attached location, then re-assign the secondary to be /// attached and the attached to be secondary. /// @@ -108,13 +126,7 @@ impl ChaosInjector { .get_mut(&tenant_shard_id) .expect("Held lock between choosing ID and this get"); - if !matches!(shard.get_scheduling_policy(), ShardSchedulingPolicy::Active) { - // Skip non-active scheduling policies, so that a shard with a policy like Pause can - // be pinned without being disrupted by us. 
- tracing::info!( - "Skipping shard {tenant_shard_id}: scheduling policy is {:?}", - shard.get_scheduling_policy() - ); + if !self.is_shard_eligible_for_chaos(shard) { return; } @@ -152,7 +164,77 @@ impl ChaosInjector { std::process::exit(1); } - async fn inject_chaos(&mut self) { + // Unlike [`Self::inject_migrations_to_secondary`], this function will not only cut over to secondary, it + // will migrate a tenant to a random node in its home AZ using a graceful migration of the same type + // that may be initiated by an API caller using prewarm=true. + // + // This is a much more expensive operation in terms of I/O and time, as we will fully warm up + // some new location in order to migrate the tenant there. For that reason we do far fewer of these. + fn inject_graceful_migrations_anywhere(&mut self) { + let batch_size = 1; + let mut inner = self.service.inner.write().unwrap(); + let (nodes, tenants, _scheduler) = inner.parts_mut(); + + let mut candidates = tenants + .values_mut() + .filter(|shard| self.is_shard_eligible_for_chaos(shard)) + .collect::>(); + + tracing::info!( + "Injecting chaos: found {} candidates for graceful migrations anywhere", + candidates.len() + ); + + let mut victims: Vec<&mut TenantShard> = Vec::new(); + + // Pick our victims: use a hand-rolled loop rather than choose_multiple() because we want + // to take the mutable refs from our candidates rather than ref'ing them. + while !candidates.is_empty() && victims.len() < batch_size { + let i = thread_rng().gen_range(0..candidates.len()); + victims.push(candidates.swap_remove(i)); + } + + for victim in victims.into_iter() { + // Find a node in the same AZ as the shard, or if the shard has no AZ preference, which + // is not where they are currently attached. 
+ let candidate_nodes = nodes + .values() + .filter(|node| { + if let Some(preferred_az) = victim.preferred_az() { + node.get_availability_zone_id() == preferred_az + } else if let Some(attached) = *victim.intent.get_attached() { + node.get_id() != attached + } else { + true + } + }) + .collect::>(); + + let Some(victim_node) = candidate_nodes.choose(&mut thread_rng()) else { + // This can happen if e.g. we are in a small region with only one pageserver per AZ. + tracing::info!( + "no candidate nodes found for migrating shard {tenant_shard_id} within its home AZ", + tenant_shard_id = victim.tenant_shard_id + ); + continue; + }; + + // This doesn't change intent immediately: next iteration of Service::optimize_all should do that. We avoid + // doing it here because applying optimizations requires dropping lock to do some async work to check the optimisation + // is valid given remote state, and it would be a shame to duplicate that dance here. + tracing::info!( + "Injecting chaos: migrate {} to {}", + victim.tenant_shard_id, + victim_node + ); + victim.set_preferred_node(Some(victim_node.get_id())); + } + } + + /// Migrations of attached locations to their secondary location. This exercises reconciliation in general, + /// live migration in particular, and the pageserver code for cleanly shutting down and starting up tenants + /// during such migrations. 
+ fn inject_migrations_to_secondary(&mut self) { // Pick some shards to interfere with let batch_size = 128; let mut inner = self.service.inner.write().unwrap(); diff --git a/storage_controller/src/service/safekeeper_reconciler.rs b/storage_controller/src/service/safekeeper_reconciler.rs index a60aa6ca53..76e3162617 100644 --- a/storage_controller/src/service/safekeeper_reconciler.rs +++ b/storage_controller/src/service/safekeeper_reconciler.rs @@ -35,6 +35,10 @@ impl SafekeeperReconcilers { service: &Arc, reqs: Vec, ) { + tracing::info!( + "Scheduling {} pending safekeeper ops loaded from db", + reqs.len() + ); for req in reqs { self.schedule_request(service, req); } @@ -74,7 +78,7 @@ pub(crate) async fn load_schedule_requests( service: &Arc, safekeepers: &HashMap, ) -> anyhow::Result> { - let pending_ops = service.persistence.list_pending_ops(None).await?; + let pending_ops = service.persistence.list_pending_ops().await?; let mut res = Vec::with_capacity(pending_ops.len()); for op_persist in pending_ops { let node_id = NodeId(op_persist.sk_id as u64); @@ -160,9 +164,8 @@ pub(crate) struct ScheduleRequest { } struct ReconcilerHandle { - tx: UnboundedSender<(ScheduleRequest, Arc)>, - #[allow(clippy::type_complexity)] - ongoing_tokens: Arc), Arc>>, + tx: UnboundedSender<(ScheduleRequest, CancellationToken)>, + ongoing_tokens: Arc), CancellationToken>>, cancel: CancellationToken, } @@ -172,13 +175,13 @@ impl ReconcilerHandle { &self, tenant_id: TenantId, timeline_id: Option, - ) -> Arc { + ) -> CancellationToken { let entry = self.ongoing_tokens.entry((tenant_id, timeline_id)); if let Entry::Occupied(entry) = &entry { let cancel: &CancellationToken = entry.get(); cancel.cancel(); } - entry.insert(Arc::new(self.cancel.child_token())).clone() + entry.insert(self.cancel.child_token()).clone() } /// Cancel an ongoing reconciliation fn cancel_reconciliation(&self, tenant_id: TenantId, timeline_id: Option) { @@ -197,7 +200,7 @@ impl ReconcilerHandle { pub(crate) struct 
SafekeeperReconciler { service: Arc, - rx: UnboundedReceiver<(ScheduleRequest, Arc)>, + rx: UnboundedReceiver<(ScheduleRequest, CancellationToken)>, cancel: CancellationToken, } @@ -233,17 +236,19 @@ impl SafekeeperReconciler { let kind = req.kind; let tenant_id = req.tenant_id; let timeline_id = req.timeline_id; + let node_id = req.safekeeper.skp.id; self.reconcile_one(req, req_cancel) .instrument(tracing::info_span!( "reconcile_one", ?kind, %tenant_id, - ?timeline_id + ?timeline_id, + %node_id, )) .await; } } - async fn reconcile_one(&self, req: ScheduleRequest, req_cancel: Arc) { + async fn reconcile_one(&self, req: ScheduleRequest, req_cancel: CancellationToken) { let req_host = req.safekeeper.skp.host.clone(); match req.kind { SafekeeperTimelineOpKind::Pull => { @@ -300,36 +305,96 @@ impl SafekeeperReconciler { SafekeeperTimelineOpKind::Delete => { let tenant_id = req.tenant_id; if let Some(timeline_id) = req.timeline_id { - self.reconcile_inner( + let deleted = self.reconcile_inner( req, async |client| client.delete_timeline(tenant_id, timeline_id).await, |_resp| { - tracing::info!("deleted timeline from {req_host}"); + tracing::info!(%tenant_id, %timeline_id, "deleted timeline from {req_host}"); }, req_cancel, ) .await; + if deleted { + self.delete_timeline_from_db(tenant_id, timeline_id).await; + } } else { - self.reconcile_inner( - req, - async |client| client.delete_tenant(tenant_id).await, - |_resp| { - tracing::info!("deleted tenant from {req_host}"); - }, - req_cancel, - ) - .await; + let deleted = self + .reconcile_inner( + req, + async |client| client.delete_tenant(tenant_id).await, + |_resp| { + tracing::info!(%tenant_id, "deleted tenant from {req_host}"); + }, + req_cancel, + ) + .await; + if deleted { + self.delete_tenant_timelines_from_db(tenant_id).await; + } } } } } + async fn delete_timeline_from_db(&self, tenant_id: TenantId, timeline_id: TimelineId) { + match self + .service + .persistence + .list_pending_ops_for_timeline(tenant_id, 
timeline_id) + .await + { + Ok(list) => { + if !list.is_empty() { + tracing::info!(%tenant_id, %timeline_id, "not deleting timeline from db as there is {} open reconciles", list.len()); + return; + } + } + Err(e) => { + tracing::warn!(%tenant_id, %timeline_id, "couldn't query pending ops: {e}"); + return; + } + } + tracing::info!(%tenant_id, %timeline_id, "deleting timeline from db after all reconciles succeeded"); + // In theory we could crash right after deleting the op from the db and right before reaching this, + // but then we'll boot up with a timeline that has deleted_at set, so hopefully we'll issue deletion ops for it again. + if let Err(err) = self + .service + .persistence + .delete_timeline(tenant_id, timeline_id) + .await + { + tracing::warn!(%tenant_id, %timeline_id, "couldn't delete timeline from db: {err}"); + } + } + async fn delete_tenant_timelines_from_db(&self, tenant_id: TenantId) { + let timeline_list = match self + .service + .persistence + .list_timelines_for_tenant(tenant_id) + .await + { + Ok(timeline_list) => timeline_list, + Err(e) => { + tracing::warn!(%tenant_id, "couldn't query timelines: {e}"); + return; + } + }; + for timeline in timeline_list { + let Ok(timeline_id) = TimelineId::from_str(&timeline.timeline_id) else { + tracing::warn!("Invalid timeline ID in database {}", timeline.timeline_id); + continue; + }; + self.delete_timeline_from_db(tenant_id, timeline_id).await; + } + } + /// Returns whether the reconciliation happened successfully async fn reconcile_inner( &self, req: ScheduleRequest, closure: impl Fn(SafekeeperClient) -> F, log_success: impl FnOnce(T) -> U, - req_cancel: Arc, - ) where + req_cancel: CancellationToken, + ) -> bool + where F: Future>, { let jwt = self @@ -373,11 +438,11 @@ impl SafekeeperReconciler { req.safekeeper.skp.host ); } - return; + return true; } Err(mgmt_api::Error::Cancelled) => { // On cancellation, the code that issued it will take care of removing db entries (if needed) - return; + return 
false; } Err(e) => { tracing::info!( diff --git a/storage_controller/src/service/safekeeper_service.rs b/storage_controller/src/service/safekeeper_service.rs index 557c684f6b..a23b9a4a02 100644 --- a/storage_controller/src/service/safekeeper_service.rs +++ b/storage_controller/src/service/safekeeper_service.rs @@ -1,23 +1,27 @@ -use std::collections::{HashMap, HashSet}; +use std::collections::HashSet; use std::str::FromStr; use std::sync::Arc; use std::time::Duration; use super::safekeeper_reconciler::ScheduleRequest; use crate::heartbeater::SafekeeperState; +use crate::metrics; use crate::persistence::{ DatabaseError, SafekeeperTimelineOpKind, TimelinePendingOpPersistence, TimelinePersistence, }; use crate::safekeeper::Safekeeper; use anyhow::Context; use http_utils::error::ApiError; -use pageserver_api::controller_api::{SafekeeperDescribeResponse, SkSchedulingPolicy}; +use pageserver_api::controller_api::{ + SafekeeperDescribeResponse, SkSchedulingPolicy, TimelineImportRequest, +}; use pageserver_api::models::{self, SafekeeperInfo, SafekeepersInfo, TimelineInfo}; use safekeeper_api::membership::{MemberSet, SafekeeperId}; use tokio::task::JoinSet; use tokio_util::sync::CancellationToken; use utils::id::{NodeId, TenantId, TimelineId}; use utils::logging::SecretString; +use utils::lsn::Lsn; use super::Service; @@ -297,6 +301,31 @@ impl Service { timeline_id, }) } + + /// Directly insert the timeline into the database without reconciling it with safekeepers. + /// + /// Useful if the timeline already exists on the specified safekeepers, + /// but we want to make it storage controller managed. 
+ pub(crate) async fn timeline_import(&self, req: TimelineImportRequest) -> Result<(), ApiError> { + let persistence = TimelinePersistence { + tenant_id: req.tenant_id.to_string(), + timeline_id: req.timeline_id.to_string(), + start_lsn: Lsn::INVALID.into(), + generation: 1, + sk_set: req.sk_set.iter().map(|sk_id| sk_id.0 as i64).collect(), + new_sk_set: None, + cplane_notified_generation: 1, + deleted_at: None, + }; + let inserted = self.persistence.insert_timeline(persistence).await?; + if inserted { + tracing::info!("imported timeline into db"); + } else { + tracing::info!("didn't import timeline into db, as it is already present in db"); + } + Ok(()) + } + /// Perform timeline deletion on safekeepers. Will return success: we persist the deletion into the reconciler. pub(super) async fn tenant_timeline_delete_safekeepers( self: &Arc, @@ -313,25 +342,32 @@ impl Service { ); return Ok(()); }; + self.persistence + .timeline_set_deleted_at(tenant_id, timeline_id) + .await?; let all_sks = tl .new_sk_set .iter() - .flat_map(|sks| { - sks.iter() - .map(|sk| (*sk, SafekeeperTimelineOpKind::Exclude)) - }) - .chain( - tl.sk_set - .iter() - .map(|v| (*v, SafekeeperTimelineOpKind::Delete)), - ) - .collect::>(); + .flatten() + .chain(tl.sk_set.iter()) + .collect::>(); // Schedule reconciliations + for &sk_id in all_sks.iter() { + let pending_op = TimelinePendingOpPersistence { + tenant_id: tenant_id.to_string(), + timeline_id: timeline_id.to_string(), + generation: tl.generation, + op_kind: SafekeeperTimelineOpKind::Delete, + sk_id: *sk_id, + }; + tracing::info!("writing pending op for sk id {sk_id}"); + self.persistence.insert_pending_op(pending_op).await?; + } { let mut locked = self.inner.write().unwrap(); - for (sk_id, kind) in all_sks { - let sk_id = NodeId(sk_id as u64); + for sk_id in all_sks { + let sk_id = NodeId(*sk_id as u64); let Some(sk) = locked.safekeepers.get(&sk_id) else { return Err(ApiError::InternalServerError(anyhow::anyhow!( "Couldn't find safekeeper 
with id {sk_id}" @@ -345,7 +381,7 @@ impl Service { tenant_id, timeline_id: Some(timeline_id), generation: tl.generation as u32, - kind, + kind: SafekeeperTimelineOpKind::Delete, }; locked.safekeeper_reconcilers.schedule_request(self, req); } @@ -379,32 +415,50 @@ impl Service { }) .collect::, ApiError>>()?; - // Remove pending ops from db. + // Remove pending ops from db, and set `deleted_at`. // We cancel them in a later iteration once we hold the state lock. for (timeline_id, _timeline) in timeline_list.iter() { self.persistence .remove_pending_ops_for_timeline(tenant_id, Some(*timeline_id)) .await?; + self.persistence + .timeline_set_deleted_at(tenant_id, *timeline_id) + .await?; } - let mut locked = self.inner.write().unwrap(); - // The list of safekeepers that have any of the timelines let mut sk_list = HashSet::new(); // List all pending ops for all timelines, cancel them - for (timeline_id, timeline) in timeline_list.iter() { + for (_timeline_id, timeline) in timeline_list.iter() { let sk_iter = timeline .sk_set .iter() .chain(timeline.new_sk_set.iter().flatten()) .map(|id| NodeId(*id as u64)); - for sk_id in sk_iter.clone() { + sk_list.extend(sk_iter); + } + + for &sk_id in sk_list.iter() { + let pending_op = TimelinePendingOpPersistence { + tenant_id: tenant_id.to_string(), + timeline_id: String::new(), + generation: i32::MAX, + op_kind: SafekeeperTimelineOpKind::Delete, + sk_id: sk_id.0 as i64, + }; + tracing::info!("writing pending op for sk id {sk_id}"); + self.persistence.insert_pending_op(pending_op).await?; + } + + let mut locked = self.inner.write().unwrap(); + + for (timeline_id, _timeline) in timeline_list.iter() { + for sk_id in sk_list.iter() { locked .safekeeper_reconcilers - .cancel_reconciles_for_timeline(sk_id, tenant_id, Some(*timeline_id)); + .cancel_reconciles_for_timeline(*sk_id, tenant_id, Some(*timeline_id)); } - sk_list.extend(sk_iter); } // unwrap is safe: we return above for an empty timeline list @@ -565,6 +619,20 @@ impl Service 
{ } } locked.safekeepers = Arc::new(safekeepers); + metrics::METRICS_REGISTRY + .metrics_group + .storage_controller_safekeeper_nodes + .set(locked.safekeepers.len() as i64); + metrics::METRICS_REGISTRY + .metrics_group + .storage_controller_https_safekeeper_nodes + .set( + locked + .safekeepers + .values() + .filter(|s| s.has_https_port()) + .count() as i64, + ); } Ok(()) } diff --git a/storage_controller/src/tenant_shard.rs b/storage_controller/src/tenant_shard.rs index f6b748844a..3a75e96cb2 100644 --- a/storage_controller/src/tenant_shard.rs +++ b/storage_controller/src/tenant_shard.rs @@ -622,7 +622,7 @@ impl TenantShard { .collect::>(); attached_locs.sort_by_key(|i| i.1); - if let Some((node_id, _gen)) = attached_locs.into_iter().last() { + if let Some((node_id, _gen)) = attached_locs.into_iter().next_back() { self.intent.set_attached(scheduler, Some(*node_id)); } @@ -2000,7 +2000,7 @@ pub(crate) mod tests { use std::rc::Rc; use pageserver_api::controller_api::NodeAvailability; - use pageserver_api::shard::{ShardCount, ShardNumber}; + use pageserver_api::shard::{DEFAULT_STRIPE_SIZE, ShardCount, ShardNumber}; use rand::SeedableRng; use rand::rngs::StdRng; use utils::id::TenantId; @@ -2012,6 +2012,7 @@ pub(crate) mod tests { let tenant_id = TenantId::generate(); let shard_number = ShardNumber(0); let shard_count = ShardCount::new(1); + let stripe_size = DEFAULT_STRIPE_SIZE; let tenant_shard_id = TenantShardId { tenant_id, @@ -2020,12 +2021,7 @@ pub(crate) mod tests { }; TenantShard::new( tenant_shard_id, - ShardIdentity::new( - shard_number, - shard_count, - pageserver_api::shard::ShardStripeSize(32768), - ) - .unwrap(), + ShardIdentity::new(shard_number, shard_count, stripe_size).unwrap(), policy, None, ) @@ -2045,6 +2041,7 @@ pub(crate) mod tests { shard_count: ShardCount, preferred_az: Option, ) -> Vec { + let stripe_size = DEFAULT_STRIPE_SIZE; (0..shard_count.count()) .map(|i| { let shard_number = ShardNumber(i); @@ -2056,12 +2053,7 @@ pub(crate) mod tests 
{ }; TenantShard::new( tenant_shard_id, - ShardIdentity::new( - shard_number, - shard_count, - pageserver_api::shard::ShardStripeSize(32768), - ) - .unwrap(), + ShardIdentity::new(shard_number, shard_count, stripe_size).unwrap(), policy.clone(), preferred_az.clone(), ) diff --git a/storage_scrubber/src/find_large_objects.rs b/storage_scrubber/src/find_large_objects.rs index efb05fb55e..a4ca68d378 100644 --- a/storage_scrubber/src/find_large_objects.rs +++ b/storage_scrubber/src/find_large_objects.rs @@ -18,7 +18,7 @@ enum LargeObjectKind { impl LargeObjectKind { fn from_key(key: &str) -> Self { - let fname = key.split('/').last().unwrap(); + let fname = key.split('/').next_back().unwrap(); let Ok((layer_name, _generation)) = parse_layer_object_name(fname) else { return LargeObjectKind::Other; diff --git a/storage_scrubber/src/lib.rs b/storage_scrubber/src/lib.rs index 34e43fcc0b..071f0b9756 100644 --- a/storage_scrubber/src/lib.rs +++ b/storage_scrubber/src/lib.rs @@ -295,8 +295,8 @@ pub struct ControllerClientConfig { } impl ControllerClientConfig { - pub fn build_client(self) -> control_api::Client { - control_api::Client::new(self.controller_api, Some(self.controller_jwt)) + pub fn build_client(self, http_client: reqwest::Client) -> control_api::Client { + control_api::Client::new(http_client, self.controller_api, Some(self.controller_jwt)) } } diff --git a/storage_scrubber/src/main.rs b/storage_scrubber/src/main.rs index fb2ab02565..4823c43e10 100644 --- a/storage_scrubber/src/main.rs +++ b/storage_scrubber/src/main.rs @@ -3,7 +3,7 @@ use camino::Utf8PathBuf; use clap::{Parser, Subcommand}; use pageserver_api::controller_api::{MetadataHealthUpdateRequest, MetadataHealthUpdateResponse}; use pageserver_api::shard::TenantShardId; -use reqwest::{Method, Url}; +use reqwest::{Certificate, Method, Url}; use storage_controller_client::control_api; use storage_scrubber::garbage::{PurgeMode, find_garbage, purge_garbage}; use 
storage_scrubber::pageserver_physical_gc::{GcMode, pageserver_physical_gc}; @@ -41,6 +41,10 @@ struct Cli { /// If set to true, the scrubber will exit with error code on fatal error. #[arg(long, default_value_t = false)] exit_code: bool, + + /// Trusted root CA certificates to use in https APIs. + #[arg(long)] + ssl_ca_file: Option, } #[derive(Subcommand, Debug)] @@ -146,13 +150,28 @@ async fn main() -> anyhow::Result<()> { tracing::info!("version: {}, build_tag {}", GIT_VERSION, BUILD_TAG); + let ssl_ca_certs = match cli.ssl_ca_file.as_ref() { + Some(ssl_ca_file) => { + tracing::info!("Using ssl root CA file: {ssl_ca_file:?}"); + let buf = tokio::fs::read(ssl_ca_file).await?; + Certificate::from_pem_bundle(&buf)? + } + None => Vec::new(), + }; + + let mut http_client = reqwest::Client::builder(); + for cert in ssl_ca_certs { + http_client = http_client.add_root_certificate(cert); + } + let http_client = http_client.build()?; + let controller_client = cli.controller_api.map(|controller_api| { ControllerClientConfig { controller_api, // Default to no key: this is a convenience when working in a development environment controller_jwt: cli.controller_jwt.unwrap_or("".to_owned()), } - .build_client() + .build_client(http_client) }); match cli.command { diff --git a/test_runner/cloud_regress/README.md b/test_runner/cloud_regress/README.md index 9c460e2764..f341f3c818 100644 --- a/test_runner/cloud_regress/README.md +++ b/test_runner/cloud_regress/README.md @@ -3,19 +3,35 @@ * Create a Neon project on staging. * Grant the superuser privileges to the DB user. 
* (Optional) create a branch for testing -* Configure the endpoint by updating the control-plane database with the following settings: +* Add the following settings to the `pg_settings` section of the default endpoint configuration for the project using the admin interface: * `Timeone`: `America/Los_Angeles` * `DateStyle`: `Postgres,MDY` * `compute_query_id`: `off` +* Add the following section to the project configuration: +```json +"preload_libraries": { + "use_defaults": false, + "enabled_libraries": [] + } +``` * Checkout the actual `Neon` sources * Patch the sql and expected files for the specific PostgreSQL version, e.g. for v17: ```bash $ cd vendor/postgres-v17 $ patch -p1 <../../compute/patches/cloud_regress_pg17.patch ``` +* Set the environment variables (please modify according to your configuration): +```bash +$ export DEFAULT_PG_VERSION=17 +$ export BUILD_TYPE=release +``` +* Build the Neon binaries; see [README.md](../../README.md) * Set the environment variable `BENCHMARK_CONNSTR` to the connection URI of your project. -* Set the environment variable `PG_VERSION` to the version of your project. +* Update poetry, run +```bash +$ scripts/pysync +``` * Run ```bash -$ pytest -m remote_cluster -k cloud_regress +$ scripts/pytest -m remote_cluster -k cloud_regress ``` \ No newline at end of file diff --git a/test_runner/fixtures/endpoint/http.py b/test_runner/fixtures/endpoint/http.py index 9b28246f58..4073ebc3b9 100644 --- a/test_runner/fixtures/endpoint/http.py +++ b/test_runner/fixtures/endpoint/http.py @@ -5,6 +5,8 @@ import urllib.parse import requests from requests.adapters import HTTPAdapter +from fixtures.log_helper import log + class EndpointHttpClient(requests.Session): def __init__( @@ -51,6 +53,7 @@ class EndpointHttpClient(requests.Session): def metrics(self) -> str: res = self.get(f"http://localhost:{self.external_port}/metrics") res.raise_for_status() + log.debug("raw compute metrics: %s", res.text) return res.text # Current compute status. 
diff --git a/test_runner/fixtures/metrics.py b/test_runner/fixtures/metrics.py index 106a588711..879808b7ba 100644 --- a/test_runner/fixtures/metrics.py +++ b/test_runner/fixtures/metrics.py @@ -1,7 +1,7 @@ from __future__ import annotations from collections import defaultdict -from typing import TYPE_CHECKING +from typing import TYPE_CHECKING, Literal from prometheus_client.parser import text_string_to_metric_families @@ -46,14 +46,26 @@ class MetricsGetter: def get_metrics(self) -> Metrics: raise NotImplementedError() - def get_metric_value(self, name: str, filter: dict[str, str] | None = None) -> float | None: + def get_metric_value( + self, + name: str, + filter: dict[str, str] | None = None, + aggregate: Literal["sum"] | None = None, + ) -> float | None: metrics = self.get_metrics() results = metrics.query_all(name, filter=filter) if not results: log.info(f'could not find metric "{name}"') return None - assert len(results) == 1, f"metric {name} with given filters is not unique, got: {results}" - return results[0].value + if aggregate is None: + assert len(results) == 1, ( + f"metric {name} with given filters is not unique, got: {results}" + ) + return results[0].value + elif aggregate == "sum": + return sum(sample.value for sample in results) + else: + raise RuntimeError(f"unknown aggregate function {aggregate}") def get_metrics_values( self, names: list[str], filter: dict[str, str] | None = None, absence_ok: bool = False @@ -132,7 +144,7 @@ PAGESERVER_GLOBAL_METRICS: tuple[str, ...] 
= ( *[f"pageserver_basebackup_query_seconds_{x}" for x in ["bucket", "count", "sum"]], *histogram("pageserver_smgr_query_seconds_global"), *histogram("pageserver_wait_lsn_seconds"), - *histogram("pageserver_remote_operation_seconds"), + *histogram("pageserver_remote_timeline_client_seconds_global"), *histogram("pageserver_io_operations_seconds"), "pageserver_smgr_query_started_global_count_total", "pageserver_tenant_states_count", @@ -143,6 +155,7 @@ PAGESERVER_GLOBAL_METRICS: tuple[str, ...] = ( counter("pageserver_tenant_throttling_wait_usecs_sum_global"), counter("pageserver_tenant_throttling_count_global"), *histogram("pageserver_tokio_epoll_uring_slots_submission_queue_depth"), + *histogram("pageserver_wait_ondemand_download_seconds_global"), ) PAGESERVER_PER_TENANT_METRICS: tuple[str, ...] = ( @@ -180,6 +193,8 @@ PAGESERVER_PER_TENANT_METRICS: tuple[str, ...] = ( counter("pageserver_wait_lsn_in_progress_micros"), counter("pageserver_wait_lsn_started_count"), counter("pageserver_wait_lsn_finished_count"), + counter("pageserver_wait_ondemand_download_seconds_sum"), + counter("pageserver_page_service_batch_break_reason"), *histogram("pageserver_page_service_batch_size"), *histogram("pageserver_page_service_pagestream_batch_wait_time_seconds"), *PAGESERVER_PER_TENANT_REMOTE_TIMELINE_CLIENT_METRICS, diff --git a/test_runner/fixtures/neon_cli.py b/test_runner/fixtures/neon_cli.py index d555ee2989..5f5626fb98 100644 --- a/test_runner/fixtures/neon_cli.py +++ b/test_runner/fixtures/neon_cli.py @@ -417,6 +417,18 @@ class NeonLocalCli(AbstractNeonCli): cmd.append(f"--instance-id={instance_id}") return self.raw_cli(cmd) + def object_storage_start(self, timeout_in_seconds: int | None = None): + cmd = ["object-storage", "start"] + if timeout_in_seconds is not None: + cmd.append(f"--start-timeout={timeout_in_seconds}s") + return self.raw_cli(cmd) + + def object_storage_stop(self, immediate: bool): + cmd = ["object-storage", "stop"] + if immediate: + cmd.extend(["-m", 
"immediate"]) + return self.raw_cli(cmd) + def pageserver_start( self, id: int, diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index d3cb35fe49..10bbb7020b 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -14,6 +14,7 @@ import threading import time import uuid from collections import defaultdict +from collections.abc import Mapping from contextlib import closing, contextmanager from dataclasses import dataclass from datetime import datetime @@ -79,7 +80,12 @@ from fixtures.remote_storage import ( default_remote_storage, remote_storage_to_toml_dict, ) -from fixtures.safekeeper.http import SafekeeperHttpClient +from fixtures.safekeeper.http import ( + MembershipConfiguration, + SafekeeperHttpClient, + SafekeeperId, + TimelineCreateRequest, +) from fixtures.safekeeper.utils import wait_walreceivers_absent from fixtures.utils import ( ATTACHMENT_NAME_REGEX, @@ -376,6 +382,28 @@ class PageserverWalReceiverProtocol(StrEnum): raise ValueError(f"Unknown protocol type: {proto}") +@dataclass +class PageserverTracingConfig: + sampling_ratio: tuple[int, int] + endpoint: str + protocol: str + timeout: str + + def to_config_key_value(self) -> tuple[str, dict[str, Any]]: + value = { + "sampling_ratio": { + "numerator": self.sampling_ratio[0], + "denominator": self.sampling_ratio[1], + }, + "export_config": { + "endpoint": self.endpoint, + "protocol": self.protocol, + "timeout": self.timeout, + }, + } + return ("tracing", value) + + class NeonEnvBuilder: """ Builder object to create a Neon runtime environment @@ -425,6 +453,7 @@ class NeonEnvBuilder: pageserver_virtual_file_io_mode: str | None = None, pageserver_wal_receiver_protocol: PageserverWalReceiverProtocol | None = None, pageserver_get_vectored_concurrent_io: str | None = None, + pageserver_tracing_config: PageserverTracingConfig | None = None, ): self.repo_dir = repo_dir self.rust_log_override = rust_log_override @@ -478,6 
+507,8 @@ class NeonEnvBuilder: pageserver_get_vectored_concurrent_io ) + self.pageserver_tracing_config = pageserver_tracing_config + self.pageserver_default_tenant_config_compaction_algorithm: dict[str, Any] | None = ( pageserver_default_tenant_config_compaction_algorithm ) @@ -916,6 +947,8 @@ class NeonEnvBuilder: continue if SMALL_DB_FILE_NAME_REGEX.fullmatch(test_file.name): continue + if FINAL_METRICS_FILE_NAME == test_file.name: + continue log.debug(f"Removing large database {test_file} file") test_file.unlink() elif test_entry.is_dir(): @@ -998,6 +1031,8 @@ class NeonEnvBuilder: self.env.broker.assert_no_errors() + self.env.object_storage.assert_no_errors() + try: self.overlay_cleanup_teardown() except Exception as e: @@ -1093,6 +1128,8 @@ class NeonEnv: pagectl_env_vars["RUST_LOG"] = self.rust_log_override self.pagectl = Pagectl(extra_env=pagectl_env_vars, binpath=self.neon_binpath) + self.object_storage = ObjectStorage(self) + # The URL for the pageserver to use as its control_plane_api config if config.storage_controller_port_override is not None: log.info( @@ -1138,6 +1175,7 @@ class NeonEnv: self.pageserver_virtual_file_io_mode = config.pageserver_virtual_file_io_mode self.pageserver_wal_receiver_protocol = config.pageserver_wal_receiver_protocol self.pageserver_get_vectored_concurrent_io = config.pageserver_get_vectored_concurrent_io + self.pageserver_tracing_config = config.pageserver_tracing_config # Create the neon_local's `NeonLocalInitConf` cfg: dict[str, Any] = { @@ -1147,6 +1185,7 @@ class NeonEnv: }, "safekeepers": [], "pageservers": [], + "object_storage": {"port": self.port_distributor.get_port()}, "generate_local_ssl_certs": self.generate_local_ssl_certs, } @@ -1218,6 +1257,7 @@ class NeonEnv: "mode": "pipelined", "execution": "concurrent-futures", "max_batch_size": 32, + "batching": "scattered-lsn", } get_vectored_concurrent_io = self.pageserver_get_vectored_concurrent_io @@ -1262,10 +1302,29 @@ class NeonEnv: if key not in ps_cfg: 
ps_cfg[key] = value + if self.pageserver_tracing_config is not None: + key, value = self.pageserver_tracing_config.to_config_key_value() + + if key not in ps_cfg: + ps_cfg[key] = value + + ps_cfg[key] = value + # Create a corresponding NeonPageserver object - self.pageservers.append( - NeonPageserver(self, ps_id, port=pageserver_port, az_id=ps_cfg["availability_zone"]) + ps = NeonPageserver( + self, ps_id, port=pageserver_port, az_id=ps_cfg["availability_zone"] ) + + if config.test_may_use_compatibility_snapshot_binaries: + # New features gated by pageserver config usually get rolled out in the + # test suite first, by enabling it in the `ps_cfg` abve. + # Compatibility tests run with old binaries that predate feature code & config. + # So, old binaries will warn about the flag's presence. + # Silence those warnings categorically. + log.info("test may use old binaries, ignoring warnings about unknown config items") + ps.allowed_errors.append(".*ignoring unknown configuration item.*") + + self.pageservers.append(ps) cfg["pageservers"].append(ps_cfg) # Create config and a Safekeeper object for each safekeeper @@ -1284,6 +1343,7 @@ class NeonEnv: "http_port": port.http, "https_port": port.https, "sync": config.safekeepers_enable_fsync, + "use_https_safekeeper_api": config.use_https_safekeeper_api, } if config.auth_enabled: sk_cfg["auth_enabled"] = True @@ -1344,6 +1404,8 @@ class NeonEnv: and self.storage_controller_config.get("timelines_onto_safekeepers") is True ): for sk_id, sk in enumerate(self.safekeepers): + # 0 is an invalid safekeeper id + sk_id = sk_id + 1 body = { "id": sk_id, "created_at": "2023-10-25T09:11:25Z", @@ -1360,6 +1422,8 @@ class NeonEnv: self.storage_controller.on_safekeeper_deploy(sk_id, body) self.storage_controller.safekeeper_scheduling_policy(sk_id, "Active") + self.object_storage.start(timeout_in_seconds=timeout_in_seconds) + def stop(self, immediate=False, ps_assert_metric_no_errors=False, fail_on_endpoint_errors=True): """ After this 
method returns, there should be no child processes running. @@ -1377,6 +1441,8 @@ class NeonEnv: except Exception as e: raise_later = e + self.object_storage.stop(immediate=immediate) + # Stop storage controller before pageservers: we don't want it to spuriously # detect a pageserver "failure" during test teardown self.storage_controller.stop(immediate=immediate) @@ -1393,6 +1459,12 @@ class NeonEnv: except Exception as e: metric_errors.append(e) log.error(f"metric validation failed on {pageserver.id}: {e}") + + try: + pageserver.snapshot_final_metrics() + except Exception as e: + log.error(f"metric snapshot failed on {pageserver.id}: {e}") + try: pageserver.stop(immediate=immediate) except RuntimeError: @@ -1923,10 +1995,13 @@ class NeonStorageController(MetricsGetter, LogUtils): tenant_shard_id: TenantId | TenantShardId, pageserver_id: int, generation_override: int | None = None, + config: None | dict[str, Any] = None, ) -> int: body = {"tenant_shard_id": str(tenant_shard_id), "node_id": pageserver_id} if generation_override is not None: body["generation_override"] = generation_override + if config is not None: + body["config"] = config response = self.request( "POST", @@ -2587,6 +2662,26 @@ class NeonStorageController(MetricsGetter, LogUtils): self.stop(immediate=True) +class ObjectStorage(LogUtils): + def __init__(self, env: NeonEnv): + service_dir = env.repo_dir / "object_storage" + super().__init__(logfile=service_dir / "object_storage.log") + self.conf_path = service_dir / "object_storage.json" + self.env = env + + def base_url(self): + return json.loads(self.conf_path.read_text())["listen"] + + def start(self, timeout_in_seconds: int | None = None): + self.env.neon_cli.object_storage_start(timeout_in_seconds) + + def stop(self, immediate: bool = False): + self.env.neon_cli.object_storage_stop(immediate) + + def assert_no_errors(self): + assert_no_errors(self.logfile, "object_storage", []) + + class NeonProxiedStorageController(NeonStorageController): def 
__init__(self, env: NeonEnv, proxy_port: int, auth_enabled: bool, use_https: bool): super().__init__(env, proxy_port, auth_enabled, use_https) @@ -2801,13 +2896,14 @@ class NeonPageserver(PgProtocol, LogUtils): self, immediate: bool = False, timeout_in_seconds: int | None = None, + extra_env_vars: dict[str, str] | None = None, ): """ High level wrapper for restart: restarts the process, and waits for tenant state to stabilize. """ self.stop(immediate=immediate) - self.start(timeout_in_seconds=timeout_in_seconds) + self.start(timeout_in_seconds=timeout_in_seconds, extra_env_vars=extra_env_vars) self.quiesce_tenants() def quiesce_tenants(self): @@ -2884,6 +2980,20 @@ class NeonPageserver(PgProtocol, LogUtils): value = self.http_client().get_metric_value(metric) assert value == 0, f"Nonzero {metric} == {value}" + def snapshot_final_metrics(self): + """ + Take a snapshot of this pageserver's metrics and stash in its work directory. + """ + if not self.running: + log.info(f"Skipping metrics snapshot on pageserver {self.id}, it is not running") + return + + metrics = self.http_client().get_metrics_str() + metrics_snapshot_path = self.workdir / FINAL_METRICS_FILE_NAME + + with open(metrics_snapshot_path, "w") as f: + f.write(metrics) + def tenant_attach( self, tenant_id: TenantId, @@ -2896,11 +3006,12 @@ class NeonPageserver(PgProtocol, LogUtils): to call into the pageserver HTTP client. 
""" client = self.http_client() - if generation is None: - generation = self.env.storage_controller.attach_hook_issue(tenant_id, self.id) - elif override_storage_controller_generation: + if generation is None or override_storage_controller_generation: generation = self.env.storage_controller.attach_hook_issue( - tenant_id, self.id, generation + tenant_id, + self.id, + generation_override=generation if override_storage_controller_generation else None, + config=config, ) return client.tenant_attach( tenant_id, @@ -4209,37 +4320,38 @@ class Endpoint(PgProtocol, LogUtils): # Write it back updated with open(config_path, "w") as file: - log.info(json.dumps(dict(data_dict, **kwargs))) + log.debug(json.dumps(dict(data_dict, **kwargs))) json.dump(dict(data_dict, **kwargs), file, indent=4) def respec_deep(self, **kwargs: Any) -> None: """ Update the endpoint.json file taking into account nested keys. - It does one level deep update. Should enough for most cases. Distinct method from respec() to do not break existing functionality. - NOTE: This method also updates the spec.json file, not endpoint.json. - We need it because neon_local also writes to spec.json, so intended + NOTE: This method also updates the config.json file, not endpoint.json. + We need it because neon_local also writes to config.json, so intended use-case is i) start endpoint with some config, ii) respec_deep(), iii) call reconfigure() to apply the changes. 
""" - config_path = os.path.join(self.endpoint_path(), "spec.json") - with open(config_path) as f: - data_dict: dict[str, Any] = json.load(f) - log.info("Current compute spec: %s", json.dumps(data_dict, indent=4)) - - for key, value in kwargs.items(): - if isinstance(value, dict): - if key not in data_dict: - data_dict[key] = value + def update(curr, patch): + for k, v in patch.items(): + if isinstance(v, Mapping): + curr[k] = update(curr.get(k, {}), v) else: - data_dict[key] = {**data_dict[key], **value} - else: - data_dict[key] = value + curr[k] = v + return curr + + config_path = os.path.join(self.endpoint_path(), "config.json") + with open(config_path) as f: + config: dict[str, Any] = json.load(f) + + log.debug("Current compute config: %s", json.dumps(config, indent=4)) + + update(config, kwargs) with open(config_path, "w") as file: - log.info("Updating compute spec to: %s", json.dumps(data_dict, indent=4)) - json.dump(data_dict, file, indent=4) + log.debug("Updating compute config to: %s", json.dumps(config, indent=4)) + json.dump(config, file, indent=4) def wait_for_migrations(self, wait_for: int = NUM_COMPUTE_MIGRATIONS) -> None: """ @@ -4256,7 +4368,7 @@ class Endpoint(PgProtocol, LogUtils): wait_until(check_migrations_done) # Mock the extension part of spec passed from control plane for local testing - # endpooint.rs adds content of this file as a part of the spec.json + # endpooint.rs adds content of this file as a part of the config.json def create_remote_extension_spec(self, spec: dict[str, Any]): """Create a remote extension spec file for the endpoint.""" remote_extensions_spec_path = os.path.join( @@ -4762,6 +4874,50 @@ class Safekeeper(LogUtils): wait_until(paused) + @staticmethod + def sks_to_safekeeper_ids(sks: list[Safekeeper]) -> list[SafekeeperId]: + return [SafekeeperId(sk.id, "localhost", sk.port.pg_tenant_only) for sk in sks] + + @staticmethod + def mconf_sks(env: NeonEnv, mconf: MembershipConfiguration) -> list[Safekeeper]: + """ + List of 
Safekeepers which are members in `mconf`. + """ + members_ids = [m.id for m in mconf.members] + new_members_ids = [m.id for m in mconf.new_members] if mconf.new_members is not None else [] + return [sk for sk in env.safekeepers if sk.id in members_ids or sk.id in new_members_ids] + + @staticmethod + def create_timeline( + tenant_id: TenantId, + timeline_id: TimelineId, + ps: NeonPageserver, + mconf: MembershipConfiguration, + members_sks: list[Safekeeper], + ): + """ + Manually create timeline on safekeepers with given (presumably inital) + mconf: figure out LSN from pageserver, bake request and execute it on + given safekeepers. + + Normally done by storcon, but some tests want to do it manually so far. + """ + ps_http_cli = ps.http_client() + # figure out initial LSN. + ps_timeline_detail = ps_http_cli.timeline_detail(tenant_id, timeline_id) + init_lsn = ps_timeline_detail["last_record_lsn"] + log.info(f"initial LSN: {init_lsn}") + # sk timeline creation request expects minor version + pg_version = ps_timeline_detail["pg_version"] * 10000 + # create inital mconf + create_r = TimelineCreateRequest( + tenant_id, timeline_id, mconf, pg_version, Lsn(init_lsn), commit_lsn=None + ) + log.info(f"sending timeline create: {create_r.to_json()}") + + for sk in members_sks: + sk.http_client().timeline_create(create_r) + class NeonBroker(LogUtils): """An object managing storage_broker instance""" @@ -5000,6 +5156,8 @@ SMALL_DB_FILE_NAME_REGEX: re.Pattern[str] = re.compile( r"config-v1|heatmap-v1|tenant-manifest|metadata|.+\.(?:toml|pid|json|sql|conf)" ) +FINAL_METRICS_FILE_NAME: str = "final_metrics.txt" + SKIP_DIRS = frozenset( ( diff --git a/test_runner/fixtures/pageserver/allowed_errors.py b/test_runner/fixtures/pageserver/allowed_errors.py index c1c5f470cc..24c856e279 100755 --- a/test_runner/fixtures/pageserver/allowed_errors.py +++ b/test_runner/fixtures/pageserver/allowed_errors.py @@ -110,6 +110,7 @@ DEFAULT_PAGESERVER_ALLOWED_ERRORS = ( ".*delaying layer flush by 
\\S+ for compaction backpressure.*", ".*stalling layer flushes for compaction backpressure.*", ".*layer roll waiting for flush due to compaction backpressure.*", + ".*BatchSpanProcessor.*", ) @@ -118,6 +119,7 @@ DEFAULT_STORAGE_CONTROLLER_ALLOWED_ERRORS = [ # failing to connect to them. ".*Call to node.*management API.*failed.*receive body.*", ".*Call to node.*management API.*failed.*ReceiveBody.*", + ".*Call to node.*management API.*failed.*Timeout.*", ".*Failed to update node .+ after heartbeat round.*error sending request for url.*", ".*background_reconcile: failed to fetch top tenants:.*client error \\(Connect\\).*", # Many tests will start up with a node offline diff --git a/test_runner/fixtures/pageserver/common_types.py b/test_runner/fixtures/pageserver/common_types.py index 0e068db593..0a92883e96 100644 --- a/test_runner/fixtures/pageserver/common_types.py +++ b/test_runner/fixtures/pageserver/common_types.py @@ -105,7 +105,7 @@ def parse_layer_file_name(file_name: str) -> LayerName: except InvalidFileName: pass - raise InvalidFileName("neither image nor delta layer") + raise InvalidFileName(f"neither image nor delta layer: {file_name}") def is_future_layer(layer_file_name: LayerName, disk_consistent_lsn: Lsn): diff --git a/test_runner/fixtures/pageserver/http.py b/test_runner/fixtures/pageserver/http.py index 347bceb785..c2d176bf5a 100644 --- a/test_runner/fixtures/pageserver/http.py +++ b/test_runner/fixtures/pageserver/http.py @@ -853,6 +853,25 @@ class PageserverHttpClient(requests.Session, MetricsGetter): res_json = res.json() return res_json + def timeline_mark_invisible( + self, + tenant_id: TenantId | TenantShardId, + timeline_id: TimelineId, + is_visible: bool | None = None, + ): + data = { + "is_visible": is_visible, + } + + log.info( + f"Requesting marking timeline invisible for {is_visible=}, {tenant_id=}, {timeline_id=}" + ) + res = self.put( + f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/mark_invisible", + 
json=data, + ) + self.verbose_error(res) + def timeline_get_timestamp_of_lsn( self, tenant_id: TenantId | TenantShardId, timeline_id: TimelineId, lsn: Lsn ): @@ -1173,3 +1192,28 @@ class PageserverHttpClient(requests.Session, MetricsGetter): log.info(f"Got perf info response code: {res.status_code}") self.verbose_error(res) return res.json() + + def ingest_aux_files( + self, + tenant_id: TenantId | TenantShardId, + timeline_id: TimelineId, + aux_files: dict[str, bytes], + ): + res = self.post( + f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/ingest_aux_files", + json={ + "aux_files": aux_files, + }, + ) + self.verbose_error(res) + return res.json() + + def list_aux_files( + self, tenant_id: TenantId | TenantShardId, timeline_id: TimelineId, lsn: Lsn + ) -> Any: + res = self.post( + f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/list_aux_files", + json={"lsn": str(lsn)}, + ) + self.verbose_error(res) + return res.json() diff --git a/test_runner/fixtures/pageserver/many_tenants.py b/test_runner/fixtures/pageserver/many_tenants.py index 4b066d6cf3..71c750b9eb 100644 --- a/test_runner/fixtures/pageserver/many_tenants.py +++ b/test_runner/fixtures/pageserver/many_tenants.py @@ -43,7 +43,7 @@ def single_timeline( f"template tenant is template_tenant={template_tenant} template_timeline={template_timeline}" ) - log.info("detach template tenant form pageserver") + log.info("detach template tenant from pageserver") env.pageserver.tenant_detach(template_tenant) log.info(f"duplicating template tenant {ncopies} times in remote storage") @@ -67,7 +67,7 @@ def single_timeline( def attach(tenant): env.pageserver.tenant_attach( tenant, - config=template_config.copy(), + config=template_config, generation=100, override_storage_controller_generation=True, ) diff --git a/test_runner/fixtures/pageserver/utils.py b/test_runner/fixtures/pageserver/utils.py index bc5076758d..8f5234a2fa 100644 --- 
a/test_runner/fixtures/pageserver/utils.py +++ b/test_runner/fixtures/pageserver/utils.py @@ -199,7 +199,7 @@ def wait_for_last_record_lsn( """waits for pageserver to catch up to a certain lsn, returns the last observed lsn.""" current_lsn = Lsn(0) - for i in range(1000): + for i in range(2000): current_lsn = last_record_lsn(pageserver_http, tenant, timeline) if current_lsn >= lsn: return current_lsn diff --git a/test_runner/fixtures/safekeeper/http.py b/test_runner/fixtures/safekeeper/http.py index e409151b76..839e985419 100644 --- a/test_runner/fixtures/safekeeper/http.py +++ b/test_runner/fixtures/safekeeper/http.py @@ -25,7 +25,7 @@ class Walreceiver: @dataclass class SafekeeperTimelineStatus: - mconf: Configuration | None + mconf: MembershipConfiguration | None term: int last_log_term: int pg_version: int # Not exactly a PgVersion, safekeeper returns version as int, for example 150002 for 15.2 @@ -78,17 +78,17 @@ class SafekeeperId: @dataclass -class Configuration: +class MembershipConfiguration: generation: int members: list[SafekeeperId] new_members: list[SafekeeperId] | None @classmethod - def from_json(cls, d: dict[str, Any]) -> Configuration: + def from_json(cls, d: dict[str, Any]) -> MembershipConfiguration: generation = d["generation"] members = d["members"] new_members = d.get("new_members") - return Configuration(generation, members, new_members) + return MembershipConfiguration(generation, members, new_members) def to_json(self) -> str: return json.dumps(self, cls=EnhancedJSONEncoder) @@ -98,7 +98,7 @@ class Configuration: class TimelineCreateRequest: tenant_id: TenantId timeline_id: TimelineId - mconf: Configuration + mconf: MembershipConfiguration # not exactly PgVersion, for example 150002 for 15.2 pg_version: int start_lsn: Lsn @@ -110,13 +110,13 @@ class TimelineCreateRequest: @dataclass class TimelineMembershipSwitchResponse: - previous_conf: Configuration - current_conf: Configuration + previous_conf: MembershipConfiguration + current_conf: 
MembershipConfiguration @classmethod def from_json(cls, d: dict[str, Any]) -> TimelineMembershipSwitchResponse: - previous_conf = Configuration.from_json(d["previous_conf"]) - current_conf = Configuration.from_json(d["current_conf"]) + previous_conf = MembershipConfiguration.from_json(d["previous_conf"]) + current_conf = MembershipConfiguration.from_json(d["current_conf"]) return TimelineMembershipSwitchResponse(previous_conf, current_conf) @@ -194,7 +194,7 @@ class SafekeeperHttpClient(requests.Session, MetricsGetter): resj = res.json() walreceivers = [Walreceiver(wr["conn_id"], wr["status"]) for wr in resj["walreceivers"]] # It is always normally not None, it is allowed only to make forward compat tests happy. - mconf = Configuration.from_json(resj["mconf"]) if "mconf" in resj else None + mconf = MembershipConfiguration.from_json(resj["mconf"]) if "mconf" in resj else None return SafekeeperTimelineStatus( mconf=mconf, term=resj["acceptor_state"]["term"], @@ -223,7 +223,9 @@ class SafekeeperHttpClient(requests.Session, MetricsGetter): return self.timeline_status(tenant_id, timeline_id).commit_lsn # Get timeline membership configuration. 
- def get_membership(self, tenant_id: TenantId, timeline_id: TimelineId) -> Configuration: + def get_membership( + self, tenant_id: TenantId, timeline_id: TimelineId + ) -> MembershipConfiguration: # make mypy happy return self.timeline_status(tenant_id, timeline_id).mconf # type: ignore @@ -275,7 +277,7 @@ class SafekeeperHttpClient(requests.Session, MetricsGetter): return res_json def timeline_exclude( - self, tenant_id: TenantId, timeline_id: TimelineId, to: Configuration + self, tenant_id: TenantId, timeline_id: TimelineId, to: MembershipConfiguration ) -> dict[str, Any]: res = self.put( f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/exclude", @@ -287,7 +289,7 @@ class SafekeeperHttpClient(requests.Session, MetricsGetter): return res_json def membership_switch( - self, tenant_id: TenantId, timeline_id: TimelineId, to: Configuration + self, tenant_id: TenantId, timeline_id: TimelineId, to: MembershipConfiguration ) -> TimelineMembershipSwitchResponse: res = self.put( f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/membership", diff --git a/test_runner/fixtures/utils.py b/test_runner/fixtures/utils.py index 4ece6e89a8..13c2d320d1 100644 --- a/test_runner/fixtures/utils.py +++ b/test_runner/fixtures/utils.py @@ -724,3 +724,20 @@ def skip_on_ci(reason: str): os.getenv("CI", "false") == "true", reason=reason, ) + + +def shared_buffers_for_max_cu(max_cu: float) -> str: + """ + Returns the string value of shared_buffers for the given max CU. + Use shared_buffers size like in production for max CU compute. + See https://github.com/neondatabase/cloud/blob/877e33b4289a471b8f0a35c84009846358f3e5a3/goapp/controlplane/internal/pkg/compute/computespec/pg_settings.go#L405 + + e.g. 
// 2 CU: 225mb; 4 CU: 450mb; 8 CU: 900mb + """ + ramBytes = int(4096 * max_cu * 1024 * 1024) + maxConnections = max(100, min(int(ramBytes / 9531392), 4000)) + maxWorkerProcesses = 12 + int(max_cu * 2) + maxBackends = 1 + maxConnections + maxWorkerProcesses + sharedBuffersMb = int(max(128, (1023 + maxBackends * 256) / 1024)) + sharedBuffers = int(sharedBuffersMb * 1024 / 8) + return str(sharedBuffers) diff --git a/test_runner/performance/pageserver/pagebench/test_large_slru_basebackup.py b/test_runner/performance/pageserver/pagebench/test_large_slru_basebackup.py index efd423104d..8af52dcbd0 100644 --- a/test_runner/performance/pageserver/pagebench/test_large_slru_basebackup.py +++ b/test_runner/performance/pageserver/pagebench/test_large_slru_basebackup.py @@ -66,11 +66,11 @@ def test_basebackup_with_high_slru_count( n_txns = 500000 - def setup_wrapper(env: NeonEnv): - return setup_tenant_template(env, n_txns) - env = setup_pageserver_with_tenants( - neon_env_builder, f"large_slru_count-{n_tenants}-{n_txns}", n_tenants, setup_wrapper + neon_env_builder, + f"large_slru_count-{n_tenants}-{n_txns}", + n_tenants, + lambda env: setup_tenant_template(env, n_txns), ) run_benchmark(env, pg_bin, record, duration) @@ -80,10 +80,6 @@ def setup_tenant_template(env: NeonEnv, n_txns: int): "gc_period": "0s", # disable periodic gc "checkpoint_timeout": "10 years", "compaction_period": "0s", # disable periodic compaction - "compaction_threshold": 10, - "compaction_target_size": 134217728, - "checkpoint_distance": 268435456, - "image_creation_threshold": 3, } template_tenant, template_timeline = env.create_tenant(set_default=True) diff --git a/test_runner/performance/pageserver/pagebench/test_pageserver_max_throughput_getpage_at_latest_lsn.py b/test_runner/performance/pageserver/pagebench/test_pageserver_max_throughput_getpage_at_latest_lsn.py index 3dbbb197f4..8874fe663b 100644 --- 
a/test_runner/performance/pageserver/pagebench/test_pageserver_max_throughput_getpage_at_latest_lsn.py +++ b/test_runner/performance/pageserver/pagebench/test_pageserver_max_throughput_getpage_at_latest_lsn.py @@ -10,14 +10,13 @@ from fixtures.log_helper import log from fixtures.neon_fixtures import ( NeonEnv, NeonEnvBuilder, + PageserverTracingConfig, PgBin, wait_for_last_flush_lsn, ) from fixtures.utils import get_scale_for_db, humantime_to_ms, skip_on_ci -from performance.pageserver.util import ( - setup_pageserver_with_tenants, -) +from performance.pageserver.util import setup_pageserver_with_tenants if TYPE_CHECKING: from typing import Any @@ -113,6 +112,15 @@ def setup_and_run_pagebench_benchmark( neon_env_builder.pageserver_config_override = ( f"page_cache_size={page_cache_size}; max_file_descriptors={max_file_descriptors}" ) + + tracing_config = PageserverTracingConfig( + sampling_ratio=(0, 1000), + endpoint="http://localhost:4318/v1/traces", + protocol="http-binary", + timeout="10s", + ) + neon_env_builder.pageserver_tracing_config = tracing_config + ratio = tracing_config.sampling_ratio[0] / tracing_config.sampling_ratio[1] params.update( { "pageserver_config_override.page_cache_size": ( @@ -120,20 +128,18 @@ def setup_and_run_pagebench_benchmark( {"unit": "byte"}, ), "pageserver_config_override.max_file_descriptors": (max_file_descriptors, {"unit": ""}), + "pageserver_config_override.sampling_ratio": (ratio, {"unit": ""}), } ) for param, (value, kwargs) in params.items(): record(param, metric_value=value, report=MetricReport.TEST_PARAM, **kwargs) - def setup_wrapper(env: NeonEnv): - return setup_tenant_template(env, pg_bin, pgbench_scale) - env = setup_pageserver_with_tenants( neon_env_builder, f"max_throughput_latest_lsn-{n_tenants}-{pgbench_scale}", n_tenants, - setup_wrapper, + lambda env: setup_tenant_template(env, pg_bin, pgbench_scale), # https://github.com/neondatabase/neon/issues/8070 timeout_in_seconds=60, ) @@ -160,14 +166,8 @@ def 
setup_tenant_template(env: NeonEnv, pg_bin: PgBin, scale: int): "gc_period": "0s", # disable periodic gc "checkpoint_timeout": "10 years", "compaction_period": "0s", # disable periodic compaction - "compaction_threshold": 10, - "compaction_target_size": 134217728, - "checkpoint_distance": 268435456, - "image_creation_threshold": 3, } - template_tenant, template_timeline = env.create_tenant(set_default=True) - env.pageserver.tenant_detach(template_tenant) - env.pageserver.tenant_attach(template_tenant, config) + template_tenant, template_timeline = env.create_tenant(set_default=True, conf=config) ps_http = env.pageserver.http_client() with env.endpoints.create_start("main", tenant_id=template_tenant) as ep: pg_bin.run_capture(["pgbench", "-i", f"-s{scale}", "-I", "dtGvp", ep.connstr()]) diff --git a/test_runner/performance/pageserver/test_page_service_batching.py b/test_runner/performance/pageserver/test_page_service_batching.py index 2c27368001..b17ca772c9 100644 --- a/test_runner/performance/pageserver/test_page_service_batching.py +++ b/test_runner/performance/pageserver/test_page_service_batching.py @@ -1,5 +1,7 @@ +import concurrent.futures import dataclasses import json +import threading import time from dataclasses import dataclass from pathlib import Path @@ -28,38 +30,33 @@ class PageServicePipeliningConfigSerial(PageServicePipeliningConfig): class PageServicePipeliningConfigPipelined(PageServicePipeliningConfig): max_batch_size: int execution: str + batching: str mode: str = "pipelined" -EXECUTION = ["concurrent-futures", "tasks"] +EXECUTION = ["concurrent-futures"] +BATCHING = ["uniform-lsn", "scattered-lsn"] NON_BATCHABLE: list[PageServicePipeliningConfig] = [PageServicePipeliningConfigSerial()] for max_batch_size in [1, 32]: for execution in EXECUTION: - NON_BATCHABLE.append(PageServicePipeliningConfigPipelined(max_batch_size, execution)) + for batching in BATCHING: + NON_BATCHABLE.append( + PageServicePipeliningConfigPipelined(max_batch_size, 
execution, batching) + ) -BATCHABLE: list[PageServicePipeliningConfig] = [PageServicePipeliningConfigSerial()] -for max_batch_size in [1, 2, 4, 8, 16, 32]: +BATCHABLE: list[PageServicePipeliningConfig] = [] +for max_batch_size in [32]: for execution in EXECUTION: - BATCHABLE.append(PageServicePipeliningConfigPipelined(max_batch_size, execution)) + for batching in BATCHING: + BATCHABLE.append( + PageServicePipeliningConfigPipelined(max_batch_size, execution, batching) + ) @pytest.mark.parametrize( "tablesize_mib, pipelining_config, target_runtime, effective_io_concurrency, readhead_buffer_size, name", [ - # non-batchable workloads - # (A separate benchmark will consider latency). - *[ - ( - 50, - config, - TARGET_RUNTIME, - 1, - 128, - f"not batchable {dataclasses.asdict(config)}", - ) - for config in NON_BATCHABLE - ], # batchable workloads should show throughput and CPU efficiency improvements *[ ( @@ -137,7 +134,14 @@ def test_throughput( env = neon_env_builder.init_start() ps_http = env.pageserver.http_client() - endpoint = env.endpoints.create_start("main") + endpoint = env.endpoints.create_start( + "main", + config_lines=[ + # minimal lfc & small shared buffers to force requests to pageserver + "neon.max_file_cache_size=1MB", + "shared_buffers=10MB", + ], + ) conn = endpoint.connect() cur = conn.cursor() @@ -155,7 +159,6 @@ def test_throughput( tablesize = tablesize_mib * 1024 * 1024 npages = tablesize // (8 * 1024) cur.execute("INSERT INTO t SELECT generate_series(1, %s)", (npages,)) - # TODO: can we force postgres to do sequential scans? # # Run the workload, collect `Metrics` before and after, calculate difference, normalize. 
@@ -166,6 +169,7 @@ def test_throughput( time: float pageserver_batch_size_histo_sum: float pageserver_batch_size_histo_count: float + pageserver_batch_breaks_reason_count: dict[str, int] compute_getpage_count: float pageserver_cpu_seconds_total: float @@ -179,6 +183,10 @@ def test_throughput( compute_getpage_count=self.compute_getpage_count - other.compute_getpage_count, pageserver_cpu_seconds_total=self.pageserver_cpu_seconds_total - other.pageserver_cpu_seconds_total, + pageserver_batch_breaks_reason_count={ + reason: count - other.pageserver_batch_breaks_reason_count.get(reason, 0) + for reason, count in self.pageserver_batch_breaks_reason_count.items() + }, ) def normalize(self, by) -> "Metrics": @@ -188,6 +196,10 @@ def test_throughput( pageserver_batch_size_histo_count=self.pageserver_batch_size_histo_count / by, compute_getpage_count=self.compute_getpage_count / by, pageserver_cpu_seconds_total=self.pageserver_cpu_seconds_total / by, + pageserver_batch_breaks_reason_count={ + reason: count / by + for reason, count in self.pageserver_batch_breaks_reason_count.items() + }, ) def get_metrics() -> Metrics: @@ -197,6 +209,20 @@ def test_throughput( ) compute_getpage_count = cur.fetchall()[0][0] pageserver_metrics = ps_http.get_metrics() + for name, samples in pageserver_metrics.metrics.items(): + for sample in samples: + log.info(f"{name=} labels={sample.labels} {sample.value}") + + raw_batch_break_reason_count = pageserver_metrics.query_all( + "pageserver_page_service_batch_break_reason_total", + filter={"timeline_id": str(env.initial_timeline)}, + ) + + batch_break_reason_count = { + sample.labels["reason"]: int(sample.value) + for sample in raw_batch_break_reason_count + } + return Metrics( time=time.time(), pageserver_batch_size_histo_sum=pageserver_metrics.query_one( @@ -205,34 +231,58 @@ def test_throughput( pageserver_batch_size_histo_count=pageserver_metrics.query_one( "pageserver_page_service_batch_size_count" ).value, + 
pageserver_batch_breaks_reason_count=batch_break_reason_count, compute_getpage_count=compute_getpage_count, pageserver_cpu_seconds_total=pageserver_metrics.query_one( "libmetrics_process_cpu_seconds_highres" ).value, ) - def workload() -> Metrics: + def workload(disruptor_started: threading.Event) -> Metrics: + disruptor_started.wait() start = time.time() iters = 0 while time.time() - start < target_runtime or iters < 2: - log.info("Seqscan %d", iters) if iters == 1: # round zero for warming up before = get_metrics() - cur.execute( - "select clear_buffer_cache()" - ) # TODO: what about LFC? doesn't matter right now because LFC isn't enabled by default in tests cur.execute("select sum(data::bigint) from t") assert cur.fetchall()[0][0] == npages * (npages + 1) // 2 iters += 1 after = get_metrics() return (after - before).normalize(iters - 1) + def disruptor(disruptor_started: threading.Event, stop_disruptor: threading.Event): + conn = endpoint.connect() + cur = conn.cursor() + iters = 0 + while True: + cur.execute("SELECT pg_logical_emit_message(true, 'test', 'advancelsn')") + if stop_disruptor.is_set(): + break + disruptor_started.set() + iters += 1 + time.sleep(0.001) + return iters + env.pageserver.patch_config_toml_nonrecursive( {"page_service_pipelining": dataclasses.asdict(pipelining_config)} ) - env.pageserver.restart() - metrics = workload() + + # set trace for log analysis below + env.pageserver.restart(extra_env_vars={"RUST_LOG": "info,pageserver::page_service=trace"}) + + log.info("Starting workload") + + with concurrent.futures.ThreadPoolExecutor() as executor: + disruptor_started = threading.Event() + stop_disruptor = threading.Event() + disruptor_fut = executor.submit(disruptor, disruptor_started, stop_disruptor) + workload_fut = executor.submit(workload, disruptor_started) + metrics = workload_fut.result() + stop_disruptor.set() + ndisruptions = disruptor_fut.result() + log.info("Disruptor issued %d disrupting requests", ndisruptions) 
log.info("Results: %s", metrics) @@ -249,7 +299,16 @@ def test_throughput( # for metric, value in dataclasses.asdict(metrics).items(): - zenbenchmark.record(f"counters.{metric}", value, unit="", report=MetricReport.TEST_PARAM) + if metric == "pageserver_batch_breaks_reason_count": + assert isinstance(value, dict) + for reason, count in value.items(): + zenbenchmark.record( + f"counters.{metric}_{reason}", count, unit="", report=MetricReport.TEST_PARAM + ) + else: + zenbenchmark.record( + f"counters.{metric}", value, unit="", report=MetricReport.TEST_PARAM + ) zenbenchmark.record( "perfmetric.batching_factor", @@ -262,7 +321,10 @@ def test_throughput( PRECISION_CONFIGS: list[PageServicePipeliningConfig] = [PageServicePipeliningConfigSerial()] for max_batch_size in [1, 32]: for execution in EXECUTION: - PRECISION_CONFIGS.append(PageServicePipeliningConfigPipelined(max_batch_size, execution)) + for batching in BATCHING: + PRECISION_CONFIGS.append( + PageServicePipeliningConfigPipelined(max_batch_size, execution, batching) + ) @pytest.mark.parametrize( diff --git a/test_runner/performance/pageserver/util.py b/test_runner/performance/pageserver/util.py index 7a6d88f79c..b50659defc 100644 --- a/test_runner/performance/pageserver/util.py +++ b/test_runner/performance/pageserver/util.py @@ -40,6 +40,8 @@ def ensure_pageserver_ready_for_benchmarking(env: NeonEnv, n_tenants: int): for layer in info.historic_layers: assert not layer.remote + env.storage_controller.reconcile_until_idle(timeout_secs=60) + log.info("ready") diff --git a/test_runner/performance/test_branch_creation.py b/test_runner/performance/test_branch_creation.py index b2bd94fae7..a3ee30cda2 100644 --- a/test_runner/performance/test_branch_creation.py +++ b/test_runner/performance/test_branch_creation.py @@ -97,6 +97,7 @@ def test_branch_creation_heavy_write(neon_compare: NeonCompare, n_branches: int) _record_branch_creation_durations(neon_compare, branch_creation_durations) +@pytest.mark.timeout(1000) 
@pytest.mark.parametrize("n_branches", [500, 1024]) @pytest.mark.parametrize("shape", ["one_ancestor", "random"]) def test_branch_creation_many(neon_compare: NeonCompare, n_branches: int, shape: str): @@ -205,7 +206,7 @@ def wait_and_record_startup_metrics( assert len(matching) == len(expected_labels) return matching - samples = wait_until(metrics_are_filled) + samples = wait_until(metrics_are_filled, timeout=60) for sample in samples: phase = sample.labels["phase"] diff --git a/test_runner/performance/test_bulk_update.py b/test_runner/performance/test_bulk_update.py index 6946bc66f2..16606268f4 100644 --- a/test_runner/performance/test_bulk_update.py +++ b/test_runner/performance/test_bulk_update.py @@ -2,6 +2,7 @@ from __future__ import annotations import pytest from fixtures.neon_fixtures import NeonEnvBuilder, wait_for_last_flush_lsn +from fixtures.utils import shared_buffers_for_max_cu # @@ -20,7 +21,10 @@ def test_bulk_update(neon_env_builder: NeonEnvBuilder, zenbenchmark, fillfactor) timeline_id = env.create_branch("test_bulk_update") tenant_id = env.initial_tenant - endpoint = env.endpoints.create_start("test_bulk_update") + # use shared_buffers size like in production for 8 CU compute + endpoint = env.endpoints.create_start( + "test_bulk_update", config_lines=[f"shared_buffers={shared_buffers_for_max_cu(8.0)}"] + ) cur = endpoint.connect().cursor() cur.execute("set statement_timeout=0") diff --git a/test_runner/performance/test_ingest_insert_bulk.py b/test_runner/performance/test_ingest_insert_bulk.py index 283bcada31..ed0a6c70bd 100644 --- a/test_runner/performance/test_ingest_insert_bulk.py +++ b/test_runner/performance/test_ingest_insert_bulk.py @@ -17,9 +17,10 @@ from fixtures.pageserver.utils import ( wait_for_upload_queue_empty, ) from fixtures.remote_storage import s3_storage +from fixtures.utils import shared_buffers_for_max_cu -@pytest.mark.timeout(900) +@pytest.mark.timeout(1800) @pytest.mark.parametrize("size", [8, 1024, 8192]) 
@pytest.mark.parametrize("s3", [True, False], ids=["s3", "local"]) @pytest.mark.parametrize("backpressure", [True, False], ids=["backpressure", "nobackpressure"]) @@ -51,6 +52,8 @@ def test_ingest_insert_bulk( # would compete with Pageserver for bandwidth. # neon_env_builder.enable_safekeeper_remote_storage(s3_storage()) + neon_env_builder.pageserver_config_override = "wait_lsn_timeout='600 s'" + neon_env_builder.disable_scrub_on_exit() # immediate shutdown may leave stray layers env = neon_env_builder.init_start() @@ -60,6 +63,8 @@ def test_ingest_insert_bulk( f"fsync = {fsync}", "max_replication_apply_lag = 0", f"max_replication_flush_lag = {'10GB' if backpressure else '0'}", + # use shared_buffers size like in production for 8 CU compute + f"shared_buffers={shared_buffers_for_max_cu(8.0)}", # NB: neon_local defaults to 15MB, which is too slow -- production uses 500MB. f"max_replication_write_lag = {'500MB' if backpressure else '0'}", ], @@ -89,7 +94,18 @@ def test_ingest_insert_bulk( worker_rows = rows / CONCURRENCY pool.submit(insert_rows, endpoint, f"table{i}", worker_rows, value) - end_lsn = Lsn(endpoint.safe_psql("select pg_current_wal_lsn()")[0][0]) + for attempt in range(5): + try: + end_lsn = Lsn(endpoint.safe_psql("select pg_current_wal_lsn()")[0][0]) + break + except Exception as e: + # if we disable backpressure, postgres can become unresponsive for longer than a minute + # and new connection attempts time out in postgres after 1 minute + # so if this happens we retry new connection + log.error(f"Attempt {attempt + 1}/5: Failed to select current wal lsn: {e}") + if attempt == 4: + log.error("Exceeded maximum retry attempts for selecting current wal lsn") + raise # Wait for pageserver to ingest the WAL. 
client = env.pageserver.http_client() diff --git a/test_runner/performance/test_ingest_logical_message.py b/test_runner/performance/test_ingest_logical_message.py index b55cb68b64..bc16e3964d 100644 --- a/test_runner/performance/test_ingest_logical_message.py +++ b/test_runner/performance/test_ingest_logical_message.py @@ -12,7 +12,7 @@ from fixtures.neon_fixtures import ( from fixtures.pageserver.utils import wait_for_last_record_lsn -@pytest.mark.timeout(600) +@pytest.mark.timeout(1200) @pytest.mark.parametrize("size", [1024, 8192, 131072]) @pytest.mark.parametrize("fsync", [True, False], ids=["fsync", "nofsync"]) def test_ingest_logical_message( diff --git a/test_runner/performance/test_parallel_copy.py b/test_runner/performance/test_parallel_copy.py index f1d1c1904b..f7f20bd33e 100644 --- a/test_runner/performance/test_parallel_copy.py +++ b/test_runner/performance/test_parallel_copy.py @@ -7,6 +7,8 @@ from typing import TYPE_CHECKING if TYPE_CHECKING: from fixtures.neon_fixtures import Endpoint, NeonEnv +from fixtures.utils import shared_buffers_for_max_cu + async def repeat_bytes(buf, repetitions: int): for _ in range(repetitions): @@ -45,7 +47,10 @@ async def parallel_load_same_table(endpoint: Endpoint, n_parallel: int): # Load data into one table with COPY TO from 5 parallel connections def test_parallel_copy(neon_simple_env: NeonEnv, n_parallel=5): env = neon_simple_env - endpoint = env.endpoints.create_start("main") + # use shared_buffers size like in production for 8 CU compute + endpoint = env.endpoints.create_start( + "main", config_lines=[f"shared_buffers={shared_buffers_for_max_cu(8.0)}"] + ) # Create test table conn = endpoint.connect() diff --git a/test_runner/performance/test_perf_many_relations.py b/test_runner/performance/test_perf_many_relations.py index e2f0a79018..81dae53759 100644 --- a/test_runner/performance/test_perf_many_relations.py +++ b/test_runner/performance/test_perf_many_relations.py @@ -6,6 +6,7 @@ from fixtures.benchmark_fixture 
import NeonBenchmarker from fixtures.compare_fixtures import RemoteCompare from fixtures.log_helper import log from fixtures.neon_fixtures import NeonEnvBuilder +from fixtures.utils import shared_buffers_for_max_cu def get_num_relations(default: int = 1000) -> list[int]: @@ -78,7 +79,8 @@ def test_perf_simple_many_relations_reldir_v2( ep = env.endpoints.create_start( "main", config_lines=[ - "shared_buffers=1000MB", + # use shared_buffers size like in production for 8 CU compute + f"shared_buffers={shared_buffers_for_max_cu(8.0)}", "max_locks_per_transaction=16384", ], ) diff --git a/test_runner/performance/test_perf_oltp_large_tenant.py b/test_runner/performance/test_perf_oltp_large_tenant.py index 957a4ec796..b45394d627 100644 --- a/test_runner/performance/test_perf_oltp_large_tenant.py +++ b/test_runner/performance/test_perf_oltp_large_tenant.py @@ -145,11 +145,14 @@ def run_database_maintenance(env: PgCompare): END $$; """ ) - - log.info("start REINDEX TABLE CONCURRENTLY transaction.transaction") - with env.zenbenchmark.record_duration("reindex concurrently"): - cur.execute("REINDEX TABLE CONCURRENTLY transaction.transaction;") - log.info("finished REINDEX TABLE CONCURRENTLY transaction.transaction") + # in production a customer would likely use reindex concurrently + # but for our test we don't care about the downtime + # and it would just about double the time we report in the test + # because we need one more table scan for each index + log.info("start REINDEX TABLE transaction.transaction") + with env.zenbenchmark.record_duration("reindex"): + cur.execute("REINDEX TABLE transaction.transaction;") + log.info("finished REINDEX TABLE transaction.transaction") @pytest.mark.parametrize("custom_scripts", get_custom_scripts()) diff --git a/test_runner/performance/test_physical_replication.py b/test_runner/performance/test_physical_replication.py index 6351f03e08..16cdab155a 100644 --- a/test_runner/performance/test_physical_replication.py +++ 
b/test_runner/performance/test_physical_replication.py @@ -7,7 +7,6 @@ import traceback from typing import TYPE_CHECKING import psycopg2 -import psycopg2.extras import pytest from fixtures.benchmark_fixture import MetricReport from fixtures.common_types import Lsn @@ -26,7 +25,11 @@ if TYPE_CHECKING: # Granularity of ~0.5 sec -def measure_replication_lag(master, replica, timeout_sec=600): +def measure_replication_lag( + master: psycopg2.extensions.cursor, + replica: psycopg2.extensions.cursor, + timeout_sec: int = 600, +): start = time.time() master.execute("SELECT pg_current_wal_flush_lsn()") master_lsn = Lsn(master.fetchall()[0][0]) @@ -40,7 +43,7 @@ def measure_replication_lag(master, replica, timeout_sec=600): raise TimeoutError(f"Replication sync took more than {timeout_sec} sec") -def check_pgbench_still_running(pgbench): +def check_pgbench_still_running(pgbench: subprocess.Popen[str]): rc = pgbench.poll() if rc is not None: raise RuntimeError(f"Pgbench terminated early with return code {rc}") @@ -61,6 +64,8 @@ def test_ro_replica_lag( project = neon_api.create_project(pg_version) project_id = project["project"]["id"] + log.info("Project ID: %s", project_id) + log.info("Primary endpoint ID: %s", project["project"]["endpoints"][0]["id"]) neon_api.wait_for_operation_to_finish(project_id) error_occurred = False try: @@ -76,6 +81,7 @@ def test_ro_replica_lag( endpoint_type="read_only", settings={"pg_settings": {"hot_standby_feedback": "on"}}, ) + log.info("Replica endpoint ID: %s", replica["endpoint"]["id"]) replica_env = master_env.copy() replica_env["PGHOST"] = replica["endpoint"]["host"] neon_api.wait_for_operation_to_finish(project_id) @@ -191,6 +197,8 @@ def test_replication_start_stop( project = neon_api.create_project(pg_version) project_id = project["project"]["id"] + log.info("Project ID: %s", project_id) + log.info("Primary endpoint ID: %s", project["project"]["endpoints"][0]["id"]) neon_api.wait_for_operation_to_finish(project_id) try: branch_id = 
project["branch"]["id"] @@ -200,15 +208,15 @@ def test_replication_start_stop( ) replicas = [] - for _ in range(num_replicas): - replicas.append( - neon_api.create_endpoint( - project_id, - branch_id, - endpoint_type="read_only", - settings={"pg_settings": {"hot_standby_feedback": "on"}}, - ) + for i in range(num_replicas): + replica = neon_api.create_endpoint( + project_id, + branch_id, + endpoint_type="read_only", + settings={"pg_settings": {"hot_standby_feedback": "on"}}, ) + log.info("Replica %d endpoint ID: %s", i + 1, replica["endpoint"]["id"]) + replicas.append(replica) neon_api.wait_for_operation_to_finish(project_id) replica_connstr = [ diff --git a/test_runner/performance/test_sharded_ingest.py b/test_runner/performance/test_sharded_ingest.py index 94fd54bade..293026d40a 100644 --- a/test_runner/performance/test_sharded_ingest.py +++ b/test_runner/performance/test_sharded_ingest.py @@ -13,7 +13,7 @@ from fixtures.neon_fixtures import ( ) -@pytest.mark.timeout(600) +@pytest.mark.timeout(1200) @pytest.mark.parametrize("shard_count", [1, 8, 32]) @pytest.mark.parametrize( "wal_receiver_protocol", diff --git a/test_runner/pg_clients/rust/tokio-postgres/Cargo.lock b/test_runner/pg_clients/rust/tokio-postgres/Cargo.lock index 0b138bf167..22c0e461b5 100644 --- a/test_runner/pg_clients/rust/tokio-postgres/Cargo.lock +++ b/test_runner/pg_clients/rust/tokio-postgres/Cargo.lock @@ -1,6 +1,6 @@ # This file is automatically @generated by Cargo. # It is not intended for manual editing. 
-version = 3 +version = 4 [[package]] name = "addr2line" @@ -421,9 +421,9 @@ checksum = "3fdb12b2476b595f9358c5161aa467c2438859caa136dec86c26fdd2efe17b92" [[package]] name = "openssl" -version = "0.10.70" +version = "0.10.72" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "61cfb4e166a8bb8c9b55c500bc2308550148ece889be90f609377e58140f42c6" +checksum = "fedfea7d58a1f73118430a55da6a286e7b044961736ce96a16a17068ea25e5da" dependencies = [ "bitflags 2.6.0", "cfg-if", @@ -453,9 +453,9 @@ checksum = "ff011a302c396a5197692431fc1948019154afc178baf7d8e37367442a4601cf" [[package]] name = "openssl-sys" -version = "0.9.105" +version = "0.9.107" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8b22d5b84be05a8d6947c7cb71f7c849aa0f112acd4bf51c2a7c1c988ac0a9dc" +checksum = "8288979acd84749c744a9014b4382d42b8f7b2592847b5afb2ed29e5d16ede07" dependencies = [ "cc", "libc", @@ -808,9 +808,9 @@ checksum = "1f3ccbac311fea05f86f61904b462b55fb3df8837a366dfc601a0161d0532f20" [[package]] name = "tokio" -version = "1.38.0" +version = "1.38.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ba4f4a02a7a80d6f274636f0aa95c7e383b912d41fe721a31f29e29698585a4a" +checksum = "68722da18b0fc4a05fdc1120b302b82051265792a1e1b399086e9b204b10ad3d" dependencies = [ "backtrace", "bytes", diff --git a/test_runner/regress/test_attach_tenant_config.py b/test_runner/regress/test_attach_tenant_config.py index 22dfcbda92..9b6930695c 100644 --- a/test_runner/regress/test_attach_tenant_config.py +++ b/test_runner/regress/test_attach_tenant_config.py @@ -187,9 +187,14 @@ def test_fully_custom_config(positive_env: NeonEnv): }, "rel_size_v2_enabled": False, # test suite enables it by default as of https://github.com/neondatabase/neon/issues/11081, so, custom config means disabling it "gc_compaction_enabled": True, + "gc_compaction_verification": False, "gc_compaction_initial_threshold_kb": 1024000, "gc_compaction_ratio_percent": 200, 
"image_creation_preempt_threshold": 5, + "sampling_ratio": { + "numerator": 0, + "denominator": 10, + }, } vps_http = env.storage_controller.pageserver_api() diff --git a/test_runner/regress/test_compaction.py b/test_runner/regress/test_compaction.py index e5f5b80d2d..84d37de9f1 100644 --- a/test_runner/regress/test_compaction.py +++ b/test_runner/regress/test_compaction.py @@ -38,12 +38,34 @@ PREEMPT_COMPACTION_TENANT_CONF = { "compaction_target_size": 1024**2, "image_creation_threshold": 1, "image_creation_preempt_threshold": 1, - # compact more frequently + # Compact more frequently "compaction_threshold": 3, "compaction_upper_limit": 6, "lsn_lease_length": "0s", } +PREEMPT_GC_COMPACTION_TENANT_CONF = { + "gc_period": "5s", + "compaction_period": "5s", + # Small checkpoint distance to create many layers + "checkpoint_distance": 1024**2, + # Compact small layers + "compaction_target_size": 1024**2, + "image_creation_threshold": 10000, # Do not create image layers at all + "image_creation_preempt_threshold": 10000, + # Compact more frequently + "compaction_threshold": 3, + "compaction_upper_limit": 6, + "lsn_lease_length": "0s", + # Enable gc-compaction + "gc_compaction_enabled": "true", + "gc_compaction_initial_threshold_kb": 1024, # At a small threshold + "gc_compaction_ratio_percent": 1, + # No PiTR interval and small GC horizon + "pitr_interval": "0s", + "gc_horizon": f"{1024**2}", +} + @skip_in_debug_build("only run with release build") @pytest.mark.parametrize( @@ -140,6 +162,8 @@ def test_pageserver_compaction_preempt( conf = PREEMPT_COMPACTION_TENANT_CONF.copy() env = neon_env_builder.init_start(initial_tenant_conf=conf) + env.pageserver.allowed_errors.append(".*The timeline or pageserver is shutting down.*") + tenant_id = env.initial_tenant timeline_id = env.initial_timeline @@ -166,6 +190,42 @@ def test_pageserver_compaction_preempt( @skip_in_debug_build("only run with release build") +def test_pageserver_gc_compaction_preempt( + neon_env_builder: 
NeonEnvBuilder, +): + # Ideally we should be able to do unit tests for this, but we need real Postgres + # WALs in order to do unit testing... + + conf = PREEMPT_GC_COMPACTION_TENANT_CONF.copy() + env = neon_env_builder.init_start(initial_tenant_conf=conf) + + tenant_id = env.initial_tenant + timeline_id = env.initial_timeline + + row_count = 200000 + churn_rounds = 10 + + ps_http = env.pageserver.http_client() + + workload = Workload(env, tenant_id, timeline_id) + workload.init(env.pageserver.id) + + log.info("Writing initial data ...") + workload.write_rows(row_count, env.pageserver.id) + + for i in range(1, churn_rounds + 1): + log.info(f"Running churn round {i}/{churn_rounds} ...") + workload.churn_rows(row_count, env.pageserver.id, upload=False) + workload.validate(env.pageserver.id) + ps_http.timeline_compact(tenant_id, timeline_id, wait_until_uploaded=True) + log.info("Validating at workload end ...") + workload.validate(env.pageserver.id) + # ensure gc_compaction gets preempted and then resumed + env.pageserver.assert_log_contains("preempt gc-compaction") + + +@skip_in_debug_build("only run with release build") +@pytest.mark.timeout(900) # This test is slow with sanitizers enabled, especially on ARM @pytest.mark.parametrize( "with_branches", ["with_branches", "no_branches"], diff --git a/test_runner/regress/test_compatibility.py b/test_runner/regress/test_compatibility.py index f61778e4c5..e23b1e0bca 100644 --- a/test_runner/regress/test_compatibility.py +++ b/test_runner/regress/test_compatibility.py @@ -101,7 +101,7 @@ if TYPE_CHECKING: # export CHECK_ONDISK_DATA_COMPATIBILITY=true # export COMPATIBILITY_NEON_BIN=neon_previous/target/${BUILD_TYPE} # export COMPATIBILITY_POSTGRES_DISTRIB_DIR=neon_previous/pg_install -# export NEON_BIN=target/release +# export NEON_BIN=target/${BUILD_TYPE} # export POSTGRES_DISTRIB_DIR=pg_install # # # Build previous version of binaries and store them somewhere: @@ -148,9 +148,9 @@ def test_create_snapshot( env = 
neon_env_builder.init_start( initial_tenant_conf={ # Miniature layers to enable generating non-trivial layer map without writing lots of data. - "checkpoint_distance": f"{128 * 1024}", - "compaction_threshold": "1", - "compaction_target_size": f"{128 * 1024}", + "checkpoint_distance": f"{256 * 1024}", + "compaction_threshold": "5", + "compaction_target_size": f"{256 * 1024}", } ) endpoint = env.endpoints.create_start("main") @@ -249,6 +249,7 @@ def test_forward_compatibility( top_output_dir: Path, pg_version: PgVersion, compatibility_snapshot_dir: Path, + compute_reconfigure_listener: ComputeReconfigure, ): """ Test that the old binaries can read new data @@ -257,6 +258,7 @@ def test_forward_compatibility( os.environ.get("ALLOW_FORWARD_COMPATIBILITY_BREAKAGE", "false").lower() == "true" ) + neon_env_builder.control_plane_hooks_api = compute_reconfigure_listener.control_plane_hooks_api neon_env_builder.test_may_use_compatibility_snapshot_binaries = True try: @@ -490,6 +492,13 @@ HISTORIC_DATA_SETS = [ PgVersion.V17, "https://neon-github-public-dev.s3.eu-central-1.amazonaws.com/compatibility-data-snapshots/2025-02-07-pgv17-nogenerations.tar.zst", ), + # Tenant manifest v1. + HistoricDataSet( + "2025-04-08-tenant-manifest-v1", + TenantId("c547c28588abf1d7b7139ff1f1158345"), + PgVersion.V17, + "https://neon-github-public-dev.s3.eu-central-1.amazonaws.com/compatibility-data-snapshots/2025-04-08-pgv17-tenant-manifest-v1.tar.zst", + ), ] diff --git a/test_runner/regress/test_compute_catalog.py b/test_runner/regress/test_compute_catalog.py index c1f05830b7..37208c9fff 100644 --- a/test_runner/regress/test_compute_catalog.py +++ b/test_runner/regress/test_compute_catalog.py @@ -90,10 +90,12 @@ def test_compute_catalog(neon_simple_env: NeonEnv): # and reconfigure the endpoint to create some test databases. 
endpoint.respec_deep( **{ - "skip_pg_catalog_updates": False, - "cluster": { - "roles": TEST_ROLE_NAMES, - "databases": TEST_DB_NAMES, + "spec": { + "skip_pg_catalog_updates": False, + "cluster": { + "roles": TEST_ROLE_NAMES, + "databases": TEST_DB_NAMES, + }, }, } ) @@ -155,10 +157,12 @@ def test_compute_create_drop_dbs_and_roles(neon_simple_env: NeonEnv): # and reconfigure the endpoint to apply the changes. endpoint.respec_deep( **{ - "skip_pg_catalog_updates": False, - "cluster": { - "roles": TEST_ROLE_NAMES, - "databases": TEST_DB_NAMES, + "spec": { + "skip_pg_catalog_updates": False, + "cluster": { + "roles": TEST_ROLE_NAMES, + "databases": TEST_DB_NAMES, + }, }, } ) @@ -196,12 +200,14 @@ def test_compute_create_drop_dbs_and_roles(neon_simple_env: NeonEnv): endpoint.respec_deep( **{ - "skip_pg_catalog_updates": False, - "cluster": { - "roles": [], - "databases": [], + "spec": { + "skip_pg_catalog_updates": False, + "cluster": { + "roles": [], + "databases": [], + }, + "delta_operations": delta_operations, }, - "delta_operations": delta_operations, } ) endpoint.reconfigure() @@ -250,9 +256,11 @@ def test_dropdb_with_subscription(neon_simple_env: NeonEnv): # and reconfigure the endpoint to apply the changes. endpoint.respec_deep( **{ - "skip_pg_catalog_updates": False, - "cluster": { - "databases": TEST_DB_NAMES, + "spec": { + "skip_pg_catalog_updates": False, + "cluster": { + "databases": TEST_DB_NAMES, + }, }, } ) @@ -306,17 +314,19 @@ def test_dropdb_with_subscription(neon_simple_env: NeonEnv): # and reconfigure the endpoint to apply the changes. 
endpoint.respec_deep( **{ - "skip_pg_catalog_updates": False, - "cluster": { - "databases": TEST_DB_NAMES_NEW, + "spec": { + "skip_pg_catalog_updates": False, + "cluster": { + "databases": TEST_DB_NAMES_NEW, + }, + "delta_operations": [ + {"action": "delete_db", "name": SUB_DB_NAME}, + # also test the case when we try to delete a non-existent database + # shouldn't happen in normal operation, + # but can occur when failed operations are retried + {"action": "delete_db", "name": "nonexistent_db"}, + ], }, - "delta_operations": [ - {"action": "delete_db", "name": SUB_DB_NAME}, - # also test the case when we try to delete a non-existent database - # shouldn't happen in normal operation, - # but can occur when failed operations are retried - {"action": "delete_db", "name": "nonexistent_db"}, - ], } ) @@ -354,25 +364,27 @@ def test_drop_role_with_table_privileges_from_neon_superuser(neon_simple_env: Ne endpoint.respec_deep( **{ - "skip_pg_catalog_updates": False, - "cluster": { - "roles": [ - { - # We need to create role via compute_ctl, because in this case it will receive - # additional grants equivalent to our real environment, so we can repro some - # issues. - "name": "neon", - # Some autocomplete-suggested hash, no specific meaning. - "encrypted_password": "SCRAM-SHA-256$4096:hBT22QjqpydQWqEulorfXA==$miBogcoj68JWYdsNB5PW1X6PjSLBEcNuctuhtGkb4PY=:hxk2gxkwxGo6P7GCtfpMlhA9zwHvPMsCz+NQf2HfvWk=", - "options": [], - }, - ], - "databases": [ - { - "name": TEST_DB_NAME, - "owner": "neon", - }, - ], + "spec": { + "skip_pg_catalog_updates": False, + "cluster": { + "roles": [ + { + # We need to create role via compute_ctl, because in this case it will receive + # additional grants equivalent to our real environment, so we can repro some + # issues. + "name": "neon", + # Some autocomplete-suggested hash, no specific meaning. 
+ "encrypted_password": "SCRAM-SHA-256$4096:hBT22QjqpydQWqEulorfXA==$miBogcoj68JWYdsNB5PW1X6PjSLBEcNuctuhtGkb4PY=:hxk2gxkwxGo6P7GCtfpMlhA9zwHvPMsCz+NQf2HfvWk=", + "options": [], + }, + ], + "databases": [ + { + "name": TEST_DB_NAME, + "owner": "neon", + }, + ], + }, }, } ) @@ -415,13 +427,15 @@ def test_drop_role_with_table_privileges_from_neon_superuser(neon_simple_env: Ne # Drop role via compute_ctl endpoint.respec_deep( **{ - "skip_pg_catalog_updates": False, - "delta_operations": [ - { - "action": "delete_role", - "name": TEST_GRANTEE, - }, - ], + "spec": { + "skip_pg_catalog_updates": False, + "delta_operations": [ + { + "action": "delete_role", + "name": TEST_GRANTEE, + }, + ], + }, } ) endpoint.reconfigure() @@ -444,13 +458,15 @@ def test_drop_role_with_table_privileges_from_neon_superuser(neon_simple_env: Ne endpoint.respec_deep( **{ - "skip_pg_catalog_updates": False, - "delta_operations": [ - { - "action": "delete_role", - "name": "readonly2", - }, - ], + "spec": { + "skip_pg_catalog_updates": False, + "delta_operations": [ + { + "action": "delete_role", + "name": "readonly2", + }, + ], + }, } ) endpoint.reconfigure() @@ -475,25 +491,27 @@ def test_drop_role_with_table_privileges_from_non_neon_superuser(neon_simple_env endpoint = env.endpoints.create_start("main") endpoint.respec_deep( **{ - "skip_pg_catalog_updates": False, - "cluster": { - "roles": [ - { - # We need to create role via compute_ctl, because in this case it will receive - # additional grants equivalent to our real environment, so we can repro some - # issues. - "name": TEST_GRANTOR, - # Some autocomplete-suggested hash, no specific meaning. 
- "encrypted_password": "SCRAM-SHA-256$4096:hBT22QjqpydQWqEulorfXA==$miBogcoj68JWYdsNB5PW1X6PjSLBEcNuctuhtGkb4PY=:hxk2gxkwxGo6P7GCtfpMlhA9zwHvPMsCz+NQf2HfvWk=", - "options": [], - }, - ], - "databases": [ - { - "name": TEST_DB_NAME, - "owner": TEST_GRANTOR, - }, - ], + "spec": { + "skip_pg_catalog_updates": False, + "cluster": { + "roles": [ + { + # We need to create role via compute_ctl, because in this case it will receive + # additional grants equivalent to our real environment, so we can repro some + # issues. + "name": TEST_GRANTOR, + # Some autocomplete-suggested hash, no specific meaning. + "encrypted_password": "SCRAM-SHA-256$4096:hBT22QjqpydQWqEulorfXA==$miBogcoj68JWYdsNB5PW1X6PjSLBEcNuctuhtGkb4PY=:hxk2gxkwxGo6P7GCtfpMlhA9zwHvPMsCz+NQf2HfvWk=", + "options": [], + }, + ], + "databases": [ + { + "name": TEST_DB_NAME, + "owner": TEST_GRANTOR, + }, + ], + }, }, } ) @@ -507,13 +525,15 @@ def test_drop_role_with_table_privileges_from_non_neon_superuser(neon_simple_env endpoint.respec_deep( **{ - "skip_pg_catalog_updates": False, - "delta_operations": [ - { - "action": "delete_role", - "name": TEST_GRANTEE, - }, - ], + "spec": { + "skip_pg_catalog_updates": False, + "delta_operations": [ + { + "action": "delete_role", + "name": TEST_GRANTEE, + }, + ], + }, } ) endpoint.reconfigure() diff --git a/test_runner/regress/test_compute_reconfigure.py b/test_runner/regress/test_compute_reconfigure.py index ed453f3f8d..b533d45b1e 100644 --- a/test_runner/regress/test_compute_reconfigure.py +++ b/test_runner/regress/test_compute_reconfigure.py @@ -1,7 +1,9 @@ from __future__ import annotations +import os from typing import TYPE_CHECKING +from fixtures.metrics import parse_metrics from fixtures.utils import wait_until if TYPE_CHECKING: @@ -29,15 +31,17 @@ def test_compute_reconfigure(neon_simple_env: NeonEnv): endpoint.respec_deep( **{ - "skip_pg_catalog_updates": True, - "cluster": { - "settings": [ - { - "name": "log_line_prefix", - "vartype": "string", - "value": 
TEST_LOG_LINE_PREFIX, - } - ] + "spec": { + "skip_pg_catalog_updates": True, + "cluster": { + "settings": [ + { + "name": "log_line_prefix", + "vartype": "string", + "value": TEST_LOG_LINE_PREFIX, + } + ] + }, }, } ) @@ -64,3 +68,20 @@ def test_compute_reconfigure(neon_simple_env: NeonEnv): row = cursor.fetchone() assert row is not None assert row[0] == TEST_LOG_LINE_PREFIX + + # Check that even after reconfigure and state transitions we still report + # only the current status. + client = endpoint.http_client() + raw_metrics = client.metrics() + metrics = parse_metrics(raw_metrics) + samples = metrics.query_all("compute_ctl_up") + assert len(samples) == 1 + assert samples[0].value == 1 + samples = metrics.query_all("compute_ctl_up", {"status": "running"}) + assert len(samples) == 1 + assert samples[0].value == 1 + # Check that build tag is reported + build_tag = os.environ.get("BUILD_TAG", "latest") + samples = metrics.query_all("compute_ctl_up", {"build_tag": build_tag}) + assert len(samples) == 1 + assert samples[0].value == 1 diff --git a/test_runner/regress/test_gc_aggressive.py b/test_runner/regress/test_gc_aggressive.py index 4c196a099b..c83004583a 100644 --- a/test_runner/regress/test_gc_aggressive.py +++ b/test_runner/regress/test_gc_aggressive.py @@ -126,7 +126,7 @@ def test_gc_index_upload(neon_env_builder: NeonEnvBuilder): ps_metrics = env.pageserver.http_client().get_metrics() total = 0.0 for sample in ps_metrics.query_all( - name="pageserver_remote_operation_seconds_count", + name="pageserver_remote_timeline_client_seconds_global_count", filter={ "file_kind": str(file_kind), "op_kind": str(op_kind), diff --git a/test_runner/regress/test_import_pgdata.py b/test_runner/regress/test_import_pgdata.py index ca794f6685..6b3b71f29c 100644 --- a/test_runner/regress/test_import_pgdata.py +++ b/test_runner/regress/test_import_pgdata.py @@ -19,6 +19,7 @@ from fixtures.pageserver.http import ( from fixtures.pg_version import PgVersion from 
fixtures.port_distributor import PortDistributor from fixtures.remote_storage import MockS3Server, RemoteStorageKind +from fixtures.utils import shared_buffers_for_max_cu from mypy_boto3_kms import KMSClient from mypy_boto3_kms.type_defs import EncryptResponseTypeDef from mypy_boto3_s3 import S3Client @@ -80,7 +81,8 @@ def test_pgdata_import_smoke( # doesn't allow any prefetching on v17 and above, where the new streaming # read machinery keeps buffers pinned while prefetching them. Use a higher # setting to enable prefetching and speed up the tests - ep_config = ["shared_buffers=64MB"] + # use shared_buffers size like in production for 8 CU compute + ep_config = [f"shared_buffers={shared_buffers_for_max_cu(8.0)}"] # # Put data in vanilla pg diff --git a/test_runner/regress/test_lfc_prefetch.py b/test_runner/regress/test_lfc_prefetch.py index 27a5416eff..2885c0e17b 100644 --- a/test_runner/regress/test_lfc_prefetch.py +++ b/test_runner/regress/test_lfc_prefetch.py @@ -100,5 +100,5 @@ def test_lfc_prefetch(neon_simple_env: NeonEnv): prefetch_expired = cur.fetchall()[0][0][0]["Plan"]["Prefetch Expired Requests"] log.info(f"Unused prefetches: {prefetch_expired}") - # No redundant prefethc requrests if prefetch results are stored in LFC + # No redundant prefetch requests if prefetch results are stored in LFC assert prefetch_expired == 0 diff --git a/test_runner/regress/test_lfc_resize.py b/test_runner/regress/test_lfc_resize.py index 51074751e0..83fd3aa719 100644 --- a/test_runner/regress/test_lfc_resize.py +++ b/test_runner/regress/test_lfc_resize.py @@ -49,6 +49,8 @@ def test_lfc_resize(neon_simple_env: NeonEnv, pg_bin: PgBin): conn = endpoint.connect() cur = conn.cursor() + cur.execute("create extension neon") + def get_lfc_size() -> tuple[int, int]: lfc_file_path = endpoint.lfc_path() lfc_file_size = lfc_file_path.stat().st_size @@ -103,3 +105,23 @@ def test_lfc_resize(neon_simple_env: NeonEnv, pg_bin: PgBin): time.sleep(1) assert int(lfc_file_blocks) <= 128 * 1024 
+ + # Now test that the number of rows returned by local_cache is the same as file_cache_used_pages. + # Perform several iterations to let the cache content stabilize. + nretries = 10 + while True: + cur.execute("select count(*) from local_cache") + local_cache_size = cur.fetchall()[0][0] + + cur.execute( + "select lfc_value::bigint FROM neon_lfc_stats where lfc_key='file_cache_used_pages'" + ) + used_pages = cur.fetchall()[0][0] + + if local_cache_size == used_pages or nretries == 0: + break + + nretries = nretries - 1 + time.sleep(1) + + assert local_cache_size == used_pages diff --git a/test_runner/regress/test_lsn_mapping.py b/test_runner/regress/test_lsn_mapping.py index 7280a91a12..c5a1bf0d16 100644 --- a/test_runner/regress/test_lsn_mapping.py +++ b/test_runner/regress/test_lsn_mapping.py @@ -276,3 +276,34 @@ def test_ts_of_lsn_api(neon_env_builder: NeonEnvBuilder): if i > 1: before_timestamp = tbl[i - step_size][1] assert timestamp >= before_timestamp, "before_timestamp before timestamp" + + +def test_timestamp_of_lsn_empty_branch(neon_env_builder: NeonEnvBuilder): + """ + Test that getting the timestamp of the head LSN of a newly created branch works. + This verifies that we don't get a 404 error when trying to get the timestamp + of the head LSN of a branch that was just created. + We now return a special status code 412 to indicate if there is no timestamp found for lsn. 
+ + Reproducer for https://github.com/neondatabase/neon/issues/11439 + """ + env = neon_env_builder.init_start() + + # Create a new branch + new_timeline_id = env.create_branch("test_timestamp_of_lsn_empty_branch") + + # Retrieve the commit LSN of the empty branch, which we have never run postgres on + detail = env.pageserver.http_client().timeline_detail( + tenant_id=env.initial_tenant, timeline_id=new_timeline_id + ) + head_lsn = detail["last_record_lsn"] + + # Verify that we get 412 status code + with env.pageserver.http_client() as client: + with pytest.raises(PageserverApiException) as err: + client.timeline_get_timestamp_of_lsn( + env.initial_tenant, + new_timeline_id, + head_lsn, + ) + assert err.value.status_code == 412 diff --git a/test_runner/regress/test_neon_cli.py b/test_runner/regress/test_neon_cli.py index 8bd0662ef8..e6bcdf8e67 100644 --- a/test_runner/regress/test_neon_cli.py +++ b/test_runner/regress/test_neon_cli.py @@ -134,10 +134,11 @@ def test_cli_start_stop(neon_env_builder: NeonEnvBuilder): """ env = neon_env_builder.init_start() - # Stop default ps/sk + # Stop default services env.neon_cli.pageserver_stop(env.pageserver.id) env.neon_cli.safekeeper_stop() env.neon_cli.storage_controller_stop(False) + env.neon_cli.object_storage_stop(False) env.neon_cli.storage_broker_stop() # Keep NeonEnv state up to date, it usually owns starting/stopping services @@ -179,11 +180,13 @@ def test_cli_start_stop_multi(neon_env_builder: NeonEnvBuilder): # Using the single-pageserver shortcut property throws when there are multiple pageservers with pytest.raises(AssertionError): - _drop = env.pageserver + _ = env.pageserver env.neon_cli.safekeeper_stop(neon_env_builder.safekeepers_id_start + 1) env.neon_cli.safekeeper_stop(neon_env_builder.safekeepers_id_start + 2) + env.neon_cli.object_storage_stop(False) + # Stop this to get out of the way of the following `start` env.neon_cli.storage_controller_stop(False) env.neon_cli.storage_broker_stop() diff --git 
a/test_runner/regress/test_object_storage.py b/test_runner/regress/test_object_storage.py new file mode 100644 index 0000000000..0b1cfa344f --- /dev/null +++ b/test_runner/regress/test_object_storage.py @@ -0,0 +1,56 @@ +from time import time + +import pytest +from aiohttp import ClientSession +from fixtures.log_helper import log +from fixtures.neon_fixtures import NeonEnv +from jwcrypto import jwk, jwt + + +@pytest.mark.asyncio +async def test_object_storage_insert_retrieve_delete(neon_simple_env: NeonEnv): + """ + Inserts, retrieves, and deletes test file using a JWT token + """ + env = neon_simple_env + ep = env.endpoints.create_start(branch_name="main") + tenant_id = str(ep.tenant_id) + timeline_id = str(ep.show_timeline_id()) + endpoint_id = ep.endpoint_id + + key_path = env.repo_dir / "auth_private_key.pem" + key = jwk.JWK.from_pem(key_path.read_bytes()) + claims = { + "tenant_id": tenant_id, + "timeline_id": timeline_id, + "endpoint_id": endpoint_id, + "exp": round(time()) + 99, + } + log.info(f"key path {key_path}\nclaims {claims}") + token = jwt.JWT(header={"alg": "EdDSA"}, claims=claims) + token.make_signed_token(key) + token = token.serialize() + + base_url = env.object_storage.base_url() + key = f"http://{base_url}/{tenant_id}/{timeline_id}/{endpoint_id}/key" + headers = {"Authorization": f"Bearer {token}"} + log.info(f"cache key url {key}") + log.info(f"token {token}") + + async with ClientSession(headers=headers) as session: + async with session.get(key) as res: + assert res.status == 404, f"Non-existing file is present: {res}" + + data = b"cheburash" + async with session.put(key, data=data) as res: + assert res.status == 200, f"Error writing file: {res}" + + async with session.get(key) as res: + read_data = await res.read() + assert data == read_data + + async with session.delete(key) as res: + assert res.status == 200, f"Error removing file {res}" + + async with session.get(key) as res: + assert res.status == 404, f"File was not deleted: {res}" diff 
--git a/test_runner/regress/test_ondemand_download.py b/test_runner/regress/test_ondemand_download.py index b292d08b60..2590a3fe9d 100644 --- a/test_runner/regress/test_ondemand_download.py +++ b/test_runner/regress/test_ondemand_download.py @@ -38,12 +38,13 @@ def get_num_downloaded_layers(client: PageserverHttpClient): This assumes that the pageserver only has a single tenant. """ value = client.get_metric_value( - "pageserver_remote_operation_seconds_count", + "pageserver_remote_timeline_client_seconds_global_count", { "file_kind": "layer", "op_kind": "download", "status": "success", }, + "sum", ) if value is None: return 0 diff --git a/test_runner/regress/test_page_service_batching_regressions.py b/test_runner/regress/test_page_service_batching_regressions.py index fa85e1210b..50303a4986 100644 --- a/test_runner/regress/test_page_service_batching_regressions.py +++ b/test_runner/regress/test_page_service_batching_regressions.py @@ -16,6 +16,7 @@ def test_slow_flush(neon_env_builder: NeonEnvBuilder, neon_binpath: Path, kind: "mode": "pipelined", "max_batch_size": 32, "execution": "concurrent-futures", + "batching": "uniform-lsn", } neon_env_builder.pageserver_config_override = patch_pageserver_toml diff --git a/test_runner/regress/test_pageserver_config.py b/test_runner/regress/test_pageserver_config.py new file mode 100644 index 0000000000..4035afd9aa --- /dev/null +++ b/test_runner/regress/test_pageserver_config.py @@ -0,0 +1,56 @@ +import re + +import pytest +from fixtures.neon_fixtures import NeonEnv +from fixtures.utils import run_only_on_default_postgres + + +@pytest.mark.parametrize("what", ["default", "top_level", "nested"]) +@run_only_on_default_postgres(reason="does not use postgres") +def test_unknown_config_items_handling(neon_simple_env: NeonEnv, what: str): + """ + Ensure we log unknown config fields and expose a metric for alerting. + There are more unit tests in the Rust code for other TOML items. 
+ """ + env = neon_simple_env + + def edit_fn(config) -> str | None: + if what == "default": + return None + elif what == "top_level": + config["unknown_top_level_config_item"] = 23 + return r"unknown_top_level_config_item" + elif what == "nested": + config["remote_storage"]["unknown_config_item"] = 23 + return r"remote_storage.unknown_config_item" + else: + raise ValueError(f"Unknown what: {what}") + + def get_metric(): + metrics = env.pageserver.http_client().get_metrics() + samples = metrics.query_all("pageserver_config_ignored_items") + by_item = {sample.labels["item"]: sample.value for sample in samples} + assert by_item[""] == 0, "must always contain the empty item with value 0" + del by_item[""] + return by_item + + expected_ignored_item = env.pageserver.edit_config_toml(edit_fn) + + if expected_ignored_item is not None: + expected_ignored_item_log_line_re = r".*ignoring unknown configuration item.*" + re.escape( + expected_ignored_item + ) + env.pageserver.allowed_errors.append(expected_ignored_item_log_line_re) + + if expected_ignored_item is not None: + assert not env.pageserver.log_contains(expected_ignored_item_log_line_re) + assert get_metric() == {} + + # in any way, unknown config items should not fail pageserver to start + # TODO: extend this test with the config validator mode once we introduce it + # https://github.com/neondatabase/cloud/issues/24349 + env.pageserver.restart() + + if expected_ignored_item is not None: + assert env.pageserver.log_contains(expected_ignored_item_log_line_re) + assert get_metric() == {expected_ignored_item: 1} diff --git a/test_runner/regress/test_pageserver_getpage_throttle.py b/test_runner/regress/test_pageserver_getpage_throttle.py index c5d6650ca8..5ef63e2fe9 100644 --- a/test_runner/regress/test_pageserver_getpage_throttle.py +++ b/test_runner/regress/test_pageserver_getpage_throttle.py @@ -1,6 +1,5 @@ from __future__ import annotations -import copy import json import uuid from typing import TYPE_CHECKING @@ 
-43,7 +42,6 @@ def test_pageserver_getpage_throttle(neon_env_builder: NeonEnvBuilder, pg_bin: P "refill_interval": "100ms", "refill_amount": int(rate_limit_rps / 10), "max": int(rate_limit_rps / 10), - "fair": True, }, }, ) @@ -97,17 +95,12 @@ def test_pageserver_getpage_throttle(neon_env_builder: NeonEnvBuilder, pg_bin: P _, marker_offset = wait_until(lambda: env.pageserver.assert_log_contains(marker, offset=None)) log.info("run pagebench") - duration_secs = 10 + duration_secs = 20 actual_ncompleted = run_pagebench_at_max_speed_and_get_total_requests_completed(duration_secs) log.info("validate the client is capped at the configured rps limit") expect_ncompleted = duration_secs * rate_limit_rps - delta_abs = abs(expect_ncompleted - actual_ncompleted) - threshold = 0.05 * expect_ncompleted - assert threshold / rate_limit_rps < 0.1 * duration_secs, ( - "test self-test: unrealistic expecations regarding precision in this test" - ) - assert delta_abs < 0.05 * expect_ncompleted, ( + assert pytest.approx(expect_ncompleted, 0.05) == actual_ncompleted, ( "the throttling deviates more than 5percent from the expectation" ) @@ -121,6 +114,7 @@ def test_pageserver_getpage_throttle(neon_env_builder: NeonEnvBuilder, pg_bin: P timeout=compaction_period, ) + log.info("validate the metrics") smgr_query_seconds_post = ps_http.get_metric_value(smgr_metric_name, smgr_metrics_query) assert smgr_query_seconds_post is not None throttled_usecs_post = ps_http.get_metric_value(throttle_metric_name, throttle_metrics_query) @@ -129,68 +123,13 @@ def test_pageserver_getpage_throttle(neon_env_builder: NeonEnvBuilder, pg_bin: P actual_throttled_usecs = throttled_usecs_post - throttled_usecs_pre actual_throttled_secs = actual_throttled_usecs / 1_000_000 - log.info("validate that the metric doesn't include throttle wait time") - assert duration_secs >= 10 * actual_smgr_query_seconds, ( - "smgr metrics should not include throttle wait time" - ) - - log.info("validate that the throttling wait time 
metrics is correct") assert pytest.approx(actual_throttled_secs + actual_smgr_query_seconds, 0.1) == duration_secs, ( - "most of the time in this test is spent throttled because the rate-limit's contribution to latency dominates" + "throttling and processing latency = total request time; this assert validates thi holds on average" ) - -throttle_config_with_field_fair_set = { - "task_kinds": ["PageRequestHandler"], - "fair": True, - "initial": 27, - "refill_interval": "43s", - "refill_amount": 23, - "max": 42, -} - - -def assert_throttle_config_with_field_fair_set(conf): - """ - Field `fair` is ignored, so, responses don't contain it - """ - without_fair = copy.deepcopy(throttle_config_with_field_fair_set) - without_fair.pop("fair") - - assert conf == without_fair - - -def test_throttle_fair_config_is_settable_but_ignored_in_mgmt_api(neon_env_builder: NeonEnvBuilder): - """ - To be removed after https://github.com/neondatabase/neon/pull/8539 is rolled out. - """ - env = neon_env_builder.init_start() - vps_http = env.storage_controller.pageserver_api() - # with_fair config should still be settable - vps_http.set_tenant_config( - env.initial_tenant, - {"timeline_get_throttle": throttle_config_with_field_fair_set}, + # without this assertion, the test would pass even if the throttling was completely broken + # but the request processing is so slow that it makes up for the latency that a correct throttling + # implementation would add + assert actual_smgr_query_seconds < 0.66 * duration_secs, ( + "test self-test: request processing is consuming most of the wall clock time; this risks that we're not actually testing throttling" ) - conf = vps_http.tenant_config(env.initial_tenant) - assert_throttle_config_with_field_fair_set(conf.effective_config["timeline_get_throttle"]) - assert_throttle_config_with_field_fair_set( - conf.tenant_specific_overrides["timeline_get_throttle"] - ) - - -def test_throttle_fair_config_is_settable_but_ignored_in_config_toml( - neon_env_builder: 
NeonEnvBuilder, -): - """ - To be removed after https://github.com/neondatabase/neon/pull/8539 is rolled out. - """ - - def set_tenant_config(ps_cfg): - tenant_config = ps_cfg.setdefault("tenant_config", {}) - tenant_config["timeline_get_throttle"] = throttle_config_with_field_fair_set - - neon_env_builder.pageserver_config_override = set_tenant_config - env = neon_env_builder.init_start() - ps_http = env.pageserver.http_client() - conf = ps_http.tenant_config(env.initial_tenant) - assert_throttle_config_with_field_fair_set(conf.effective_config["timeline_get_throttle"]) diff --git a/test_runner/regress/test_pageserver_metric_collection.py b/test_runner/regress/test_pageserver_metric_collection.py index f80edced5c..acec0ba44a 100644 --- a/test_runner/regress/test_pageserver_metric_collection.py +++ b/test_runner/regress/test_pageserver_metric_collection.py @@ -107,7 +107,7 @@ def test_metric_collection( ps_metrics = env.pageserver.http_client().get_metrics() total = 0.0 for sample in ps_metrics.query_all( - name="pageserver_remote_operation_seconds_count", + name="pageserver_remote_timeline_client_seconds_global_count", filter={ "file_kind": str(file_kind), "op_kind": str(op_kind), diff --git a/test_runner/regress/test_pageserver_secondary.py b/test_runner/regress/test_pageserver_secondary.py index 3749df2229..d48e731394 100644 --- a/test_runner/regress/test_pageserver_secondary.py +++ b/test_runner/regress/test_pageserver_secondary.py @@ -61,7 +61,7 @@ def evict_random_layers( ) client = pageserver.http_client() for layer in initial_local_layers: - if "ephemeral" in layer.name or "temp_download" in layer.name: + if "ephemeral" in layer.name or "temp_download" in layer.name or ".___temp" in layer.name: continue layer_name = parse_layer_file_name(layer.name) @@ -242,7 +242,13 @@ def test_location_conf_churn(neon_env_builder: NeonEnvBuilder, make_httpserver, pageserver.tenant_location_configure(tenant_id, location_conf) last_state[pageserver.id] = (mode, generation) 
- if mode.startswith("Attached"): + # It's only valid to connect to the last generation. Newer generations may yank layer + # files used in older generations. + last_generation = max( + [s[1] for s in last_state.values() if s[1] is not None], default=None + ) + + if mode.startswith("Attached") and generation == last_generation: # This is a basic test: we are validating that he endpoint works properly _between_ # configuration changes. A stronger test would be to validate that clients see # no errors while we are making the changes. @@ -1099,3 +1105,70 @@ def test_migration_to_cold_secondary(neon_env_builder: NeonEnvBuilder): # Warm up the current secondary. ps_attached.http_client().tenant_secondary_download(tenant_id, wait_ms=100) wait_until(lambda: all_layers_downloaded(ps_secondary, expected_locally)) + + +@run_only_on_default_postgres("PG version is not interesting here") +@pytest.mark.parametrize("action", ["delete_timeline", "detach"]) +def test_io_metrics_match_secondary_timeline_lifecycle( + neon_env_builder: NeonEnvBuilder, action: str +): + """ + Check that IO metrics for secondary timelines are de-registered when the timeline + is removed + """ + neon_env_builder.num_pageservers = 2 + env = neon_env_builder.init_configs() + env.start() + + tenant_id = TenantId.generate() + parent_timeline_id = TimelineId.generate() + + # We do heatmap uploads and pulls manually + tenant_conf = {"heatmap_period": "0s"} + env.create_tenant( + tenant_id, parent_timeline_id, conf=tenant_conf, placement_policy='{"Attached":1}' + ) + + child_timeline_id = env.create_branch("foo", tenant_id) + + attached_to_id = env.storage_controller.locate(tenant_id)[0]["node_id"] + ps_attached = env.get_pageserver(attached_to_id) + ps_secondary = next(p for p in env.pageservers if p != ps_attached) + + ps_attached.http_client().tenant_heatmap_upload(tenant_id) + status, _ = ps_secondary.http_client().tenant_secondary_download(tenant_id, wait_ms=5000) + assert status == 200 + + labels = { + 
"operation": "write", + "tenant_id": str(tenant_id), + "timeline_id": str(child_timeline_id), + } + bytes_written = ( + ps_secondary.http_client() + .get_metrics() + .query_one("pageserver_io_operations_bytes_total", labels) + .value + ) + + assert bytes_written == 0 + + if action == "delete_timeline": + env.storage_controller.pageserver_api().timeline_delete(tenant_id, child_timeline_id) + ps_attached.http_client().tenant_heatmap_upload(tenant_id) + status, _ = ps_secondary.http_client().tenant_secondary_download(tenant_id, wait_ms=5000) + assert status == 200 + elif action == "detach": + env.storage_controller.tenant_policy_update(tenant_id, {"placement": {"Attached": 0}}) + env.storage_controller.reconcile_until_idle() + else: + raise Exception("Unexpected action") + + assert ( + len( + ps_secondary.http_client() + .get_metrics() + .query_all("pageserver_io_operations_bytes_total", labels) + ) + == 0 + ) diff --git a/test_runner/regress/test_pg_regress.py b/test_runner/regress/test_pg_regress.py index a3fae97327..0fea706888 100644 --- a/test_runner/regress/test_pg_regress.py +++ b/test_runner/regress/test_pg_regress.py @@ -239,6 +239,8 @@ def test_isolation( "neon.regress_test_mode = true", # Stack size should be increased for tests to pass with asan. "max_stack_depth = 4MB", + # Neon extensiosn starts 2 BGW so decreasing number of parallel workers which can affect deadlock-parallel test if it hits max_worker_processes. 
+ "max_worker_processes = 16", ], ) endpoint.safe_psql(f"CREATE DATABASE {DBNAME}") diff --git a/test_runner/regress/test_physical_replication.py b/test_runner/regress/test_physical_replication.py index 17819fd367..1ebf70dbf2 100644 --- a/test_runner/regress/test_physical_replication.py +++ b/test_runner/regress/test_physical_replication.py @@ -7,6 +7,7 @@ from typing import TYPE_CHECKING import pytest from fixtures.log_helper import log from fixtures.neon_fixtures import wait_replica_caughtup +from fixtures.utils import shared_buffers_for_max_cu if TYPE_CHECKING: from fixtures.neon_fixtures import NeonEnv @@ -180,7 +181,8 @@ def test_physical_replication_config_mismatch_too_many_known_xids(neon_simple_en endpoint_id="primary", config_lines=[ "max_connections=1000", - "shared_buffers=128MB", # prevent "no unpinned buffers available" error + # use shared_buffers size like in production for 2 CU compute + f"shared_buffers={shared_buffers_for_max_cu(2.0)}", # prevent "no unpinned buffers available" error ], ) secondary = env.endpoints.new_replica_start( diff --git a/test_runner/regress/test_ssl.py b/test_runner/regress/test_ssl.py index 9a7204ca17..39c94c05a9 100644 --- a/test_runner/regress/test_ssl.py +++ b/test_runner/regress/test_ssl.py @@ -1,5 +1,6 @@ import os import ssl +from datetime import datetime, timedelta import pytest import requests @@ -151,3 +152,63 @@ def test_certificate_rotation(neon_env_builder: NeonEnvBuilder): requests.get(addr, verify=str(env.ssl_ca_file)).raise_for_status() cur_cert = ssl.get_server_certificate(("localhost", port)) assert cur_cert == sk_cert + + +def test_server_and_cert_metrics(neon_env_builder: NeonEnvBuilder): + """ + Test metrics exported from http/https server and tls cert reloader. 
+ """ + neon_env_builder.use_https_pageserver_api = True + neon_env_builder.pageserver_config_override = "ssl_cert_reload_period='100 ms'" + env = neon_env_builder.init_start() + + env.pageserver.allowed_errors.append(".*Error reloading certificate.*") + + ps_client = env.pageserver.http_client() + + # 1. Test connection started metric. + filter_https = {"scheme": "https"} + old_https_conn_count = ( + ps_client.get_metric_value("http_server_connection_started_total", filter_https) or 0 + ) + + addr = f"https://localhost:{env.pageserver.service_port.https}/v1/status" + requests.get(addr, verify=str(env.ssl_ca_file)).raise_for_status() + + new_https_conn_count = ( + ps_client.get_metric_value("http_server_connection_started_total", filter_https) or 0 + ) + # The counter should increase after the request, + # but it may increase by more than one because of storcon requests. + assert new_https_conn_count > old_https_conn_count + + # 2. Test tls connection error. + # Request without specified CA cert file should fail. + with pytest.raises(requests.exceptions.SSLError): + requests.get(addr) + + tls_error_cnt = ( + ps_client.get_metric_value("http_server_connection_errors_total", {"type": "tls"}) or 0 + ) + assert tls_error_cnt == 1 + + # 3. Test expiration time metric. + expiration_time = datetime.fromtimestamp( + ps_client.get_metric_value("tls_certs_expiration_time_seconds") or 0 + ) + now = datetime.now() + # neon_local generates certs valid for 100 years. + # Compare with +-1 year to not care about leap years. + assert now + timedelta(days=365 * 99) < expiration_time < now + timedelta(days=365 * 101) + + # 4. Test cert reload failed metric. 
+ reload_error_cnt = ps_client.get_metric_value("tls_certs_reload_failed_total") + assert reload_error_cnt == 0 + + os.remove(env.pageserver.workdir / "server.crt") + + def reload_failed(): + reload_error_cnt = ps_client.get_metric_value("tls_certs_reload_failed_total") or 0 + assert reload_error_cnt > 0 + + wait_until(reload_failed) diff --git a/test_runner/regress/test_storage_controller.py b/test_runner/regress/test_storage_controller.py index 35a75ca607..b2c8415e9a 100644 --- a/test_runner/regress/test_storage_controller.py +++ b/test_runner/regress/test_storage_controller.py @@ -95,6 +95,7 @@ def test_storage_controller_smoke( env.pageservers[1].start() for sk in env.safekeepers: sk.start() + env.object_storage.start() # The pageservers we started should have registered with the sharding service on startup nodes = env.storage_controller.node_list() @@ -346,6 +347,7 @@ def prepare_onboarding_env( env = neon_env_builder.init_configs() env.broker.start() env.storage_controller.start() + env.object_storage.start() # This is the pageserver where we'll initially create the tenant. Run it in emergency # mode so that it doesn't talk to storage controller, and do not register it. @@ -675,7 +677,7 @@ def test_storage_controller_compute_hook( env.storage_controller.tenant_shard_split(env.initial_tenant, shard_count=2) expect = { "tenant_id": str(env.initial_tenant), - "stripe_size": 32768, + "stripe_size": 2048, "shards": [ {"node_id": int(env.pageservers[1].id), "shard_number": 0}, {"node_id": int(env.pageservers[1].id), "shard_number": 1}, @@ -2890,10 +2892,12 @@ def test_storage_controller_leadership_transfer( ) +@pytest.mark.parametrize("step_down_times_out", [False, True]) def test_storage_controller_leadership_transfer_during_split( neon_env_builder: NeonEnvBuilder, storage_controller_proxy: StorageControllerProxy, port_distributor: PortDistributor, + step_down_times_out: bool, ): """ Exercise a race between shard splitting and graceful leadership transfer. 
This is @@ -2934,6 +2938,18 @@ def test_storage_controller_leadership_transfer_during_split( ) env.storage_controller.reconcile_until_idle() + # We are testing scenarios where the step down API does not complete: either because it is stuck + # doing a shard split, or because it totally times out on some other failpoint. + env.storage_controller.allowed_errors.extend( + [ + ".*step_down.*request was dropped before completing.*", + ".*step_down.*operation timed out.*", + ".*Send step down request failed, will retry.*", + ".*Send step down request still failed after.*retries.*", + ".*Leader .+ did not respond to step-down request.*", + ] + ) + with concurrent.futures.ThreadPoolExecutor(max_workers=2) as executor: # Start a shard split env.storage_controller.allowed_errors.extend( @@ -2941,6 +2957,14 @@ def test_storage_controller_leadership_transfer_during_split( ) pause_failpoint = "shard-split-pre-complete" env.storage_controller.configure_failpoints((pause_failpoint, "pause")) + + if not step_down_times_out: + # Prevent the timeout self-terminate code from executing: we will block step down on the + # shard split itself + env.storage_controller.configure_failpoints( + ("step-down-delay-timeout", "return(3600000)") + ) + split_fut = executor.submit( env.storage_controller.tenant_shard_split, list(tenants)[0], shard_count * 2 ) @@ -2959,12 +2983,20 @@ def test_storage_controller_leadership_transfer_during_split( timeout_in_seconds=30, instance_id=2, base_port=storage_controller_2_port ) + if step_down_times_out: + # Step down will time out, original controller will terminate itself + env.storage_controller.allowed_errors.extend([".*terminating process.*"]) + else: + # Step down does not time out: original controller hits its shard split completion + # code path and realises that it must not purge the parent shards from the database. 
+ env.storage_controller.allowed_errors.extend([".*Enqueuing background abort.*"]) + def passed_split_abort(): try: log.info("Checking log for pattern...") - assert env.storage_controller.log_contains( - ".*Using observed state received from leader.*" - ) + # This log is indicative of entering startup_reconcile, which happens + # after the point we would abort shard splits + assert env.storage_controller.log_contains(".*Populating tenant shards.*") except Exception: log.exception("Failed to find pattern in log") raise @@ -2973,34 +3005,42 @@ def test_storage_controller_leadership_transfer_during_split( wait_until(passed_split_abort, interval=0.1, status_interval=1.0) assert env.storage_controller.log_contains(".*Aborting shard split.*") - # Proxy is still talking to original controller here: disable its pause failpoint so - # that its shard split can run to completion. - log.info("Disabling failpoint") - # Bypass the proxy: the python test HTTPServer is single threaded and still blocked - # on handling the shard split request. - env.storage_controller.request( - "PUT", - f"http://127.0.0.1:{storage_controller_1_port}/debug/v1/failpoints", - json=[{"name": "shard-split-pre-complete", "actions": "off"}], - headers=env.storage_controller.headers(TokenScope.ADMIN), - ) + if step_down_times_out: + # We will let the old controller hit a timeout path where it terminates itself, rather than + # completing step_down and trying to complete a shard split + def old_controller_terminated(): + assert env.storage_controller.log_contains(".*terminating process.*") - def previous_stepped_down(): - assert ( - env.storage_controller.get_leadership_status() - == StorageControllerLeadershipStatus.STEPPED_DOWN + wait_until(old_controller_terminated) + else: + # Proxy is still talking to original controller here: disable its pause failpoint so + # that its shard split can run to completion. 
+ log.info("Disabling failpoint") + # Bypass the proxy: the python test HTTPServer is single threaded and still blocked + # on handling the shard split request. + env.storage_controller.request( + "PUT", + f"http://127.0.0.1:{storage_controller_1_port}/debug/v1/failpoints", + json=[{"name": "shard-split-pre-complete", "actions": "off"}], + headers=env.storage_controller.headers(TokenScope.ADMIN), ) - log.info("Awaiting step down") - wait_until(previous_stepped_down) + def previous_stepped_down(): + assert ( + env.storage_controller.get_leadership_status() + == StorageControllerLeadershipStatus.STEPPED_DOWN + ) - # Let the shard split complete: this may happen _after_ the replacement has come up - # and tried to clean up the databases - log.info("Unblocking & awaiting shard split") - with pytest.raises(Exception, match="Unexpected child shard count"): - # This split fails when it tries to persist results, because it encounters - # changes already made by the new controller's abort-on-startup - split_fut.result() + log.info("Awaiting step down") + wait_until(previous_stepped_down) + + # Let the shard split complete: this may happen _after_ the replacement has come up + # and tried to clean up the databases + log.info("Unblocking & awaiting shard split") + with pytest.raises(Exception, match="Unexpected child shard count"): + # This split fails when it tries to persist results, because it encounters + # changes already made by the new controller's abort-on-startup + split_fut.result() log.info("Routing to new leader") storage_controller_proxy.route_to(f"http://127.0.0.1:{storage_controller_2_port}") @@ -3018,13 +3058,14 @@ def test_storage_controller_leadership_transfer_during_split( env.storage_controller.wait_until_ready() env.storage_controller.consistency_check() - # Check that the stepped down instance forwards requests - # to the new leader while it's still running. 
- storage_controller_proxy.route_to(f"http://127.0.0.1:{storage_controller_1_port}") - env.storage_controller.tenant_shard_dump() - env.storage_controller.node_configure(env.pageservers[0].id, {"scheduling": "Pause"}) - status = env.storage_controller.node_status(env.pageservers[0].id) - assert status["scheduling"] == "Pause" + if not step_down_times_out: + # Check that the stepped down instance forwards requests + # to the new leader while it's still running. + storage_controller_proxy.route_to(f"http://127.0.0.1:{storage_controller_1_port}") + env.storage_controller.tenant_shard_dump() + env.storage_controller.node_configure(env.pageservers[0].id, {"scheduling": "Pause"}) + status = env.storage_controller.node_status(env.pageservers[0].id) + assert status["scheduling"] == "Pause" def test_storage_controller_ps_restarted_during_drain(neon_env_builder: NeonEnvBuilder): @@ -4073,6 +4114,102 @@ def test_storage_controller_location_conf_equivalence(neon_env_builder: NeonEnvB assert reconciles_after_restart == 0 +@run_only_on_default_postgres("PG version is not interesting here") +@pytest.mark.parametrize("restart_storcon", [True, False]) +def test_storcon_create_delete_sk_down(neon_env_builder: NeonEnvBuilder, restart_storcon: bool): + """ + Test that the storcon can create and delete tenants and timelines with a safekeeper being down. + - restart_storcon: tests whether the pending ops are persisted. + if we don't restart, we test that we don't require it to come from the db. 
+ """ + + neon_env_builder.num_safekeepers = 3 + neon_env_builder.storage_controller_config = { + "timelines_onto_safekeepers": True, + } + env = neon_env_builder.init_start() + + env.safekeepers[0].stop() + + # Wait for heartbeater to pick up that the safekeeper is gone + # This isn't really neccessary + def logged_offline(): + env.storage_controller.assert_log_contains( + "Heartbeat round complete for 3 safekeepers, 1 offline" + ) + + wait_until(logged_offline) + + tenant_id = TenantId.generate() + timeline_id = TimelineId.generate() + env.create_tenant(tenant_id, timeline_id) + + env.safekeepers[1].assert_log_contains(f"creating new timeline {tenant_id}/{timeline_id}") + env.safekeepers[2].assert_log_contains(f"creating new timeline {tenant_id}/{timeline_id}") + + env.storage_controller.allowed_errors.extend( + [ + ".*Call to safekeeper.* management API still failed after.*", + ".*Call to safekeeper.* management API failed, will retry.*", + ".*reconcile_one.*tenant_id={tenant_id}.*Call to safekeeper.* management API still failed after.*", + ] + ) + + if restart_storcon: + # Restart the storcon to check that we persist operations + env.storage_controller.stop() + env.storage_controller.start() + + config_lines = [ + "neon.safekeeper_proto_version = 3", + ] + with env.endpoints.create("main", tenant_id=tenant_id, config_lines=config_lines) as ep: + # endpoint should start. 
+ ep.start(safekeeper_generation=1, safekeepers=[1, 2, 3]) + ep.safe_psql("CREATE TABLE IF NOT EXISTS t(key int, value text)") + + env.storage_controller.assert_log_contains("writing pending op for sk id 1") + env.safekeepers[0].start() + + # ensure that we applied the operation also for the safekeeper we just brought down + def logged_contains_on_sk(): + env.safekeepers[0].assert_log_contains( + f"pulling timeline {tenant_id}/{timeline_id} from safekeeper" + ) + + wait_until(logged_contains_on_sk) + + env.safekeepers[1].stop() + + env.storage_controller.pageserver_api().tenant_delete(tenant_id) + + # ensure the safekeeper deleted the timeline + def timeline_deleted_on_active_sks(): + env.safekeepers[0].assert_log_contains( + f"deleting timeline {tenant_id}/{timeline_id} from disk" + ) + env.safekeepers[2].assert_log_contains( + f"deleting timeline {tenant_id}/{timeline_id} from disk" + ) + + wait_until(timeline_deleted_on_active_sks) + + if restart_storcon: + # Restart the storcon to check that we persist operations + env.storage_controller.stop() + env.storage_controller.start() + + env.safekeepers[1].start() + + # ensure that there is log msgs for the third safekeeper too + def timeline_deleted_on_sk(): + env.safekeepers[1].assert_log_contains( + f"deleting timeline {tenant_id}/{timeline_id} from disk" + ) + + wait_until(timeline_deleted_on_sk) + + @pytest.mark.parametrize("wrong_az", [True, False]) def test_storage_controller_graceful_migration(neon_env_builder: NeonEnvBuilder, wrong_az: bool): """ @@ -4176,3 +4313,121 @@ def test_storage_controller_graceful_migration(neon_env_builder: NeonEnvBuilder, ) else: assert initial_ps.http_client().tenant_list_locations()["tenant_shards"] == [] + + +@run_only_on_default_postgres("this is like a 'unit test' against storcon db") +def test_storage_controller_migrate_with_pageserver_restart( + neon_env_builder: NeonEnvBuilder, make_httpserver +): + """ + Test that live migrations which fail right after incrementing the 
generation + due to the destination going offline eventually send a compute notification + after the destination re-attaches. + """ + neon_env_builder.num_pageservers = 2 + + neon_env_builder.storage_controller_config = { + # Disable transitions to offline + "max_offline": "600s", + "use_local_compute_notifications": False, + } + + neon_env_builder.control_plane_hooks_api = ( + f"http://{make_httpserver.host}:{make_httpserver.port}/" + ) + + notifications = [] + + def notify(request: Request): + log.info(f"Received notify-attach: {request}") + notifications.append(request.json) + + make_httpserver.expect_request("/notify-attach", method="PUT").respond_with_handler(notify) + + env = neon_env_builder.init_start() + + env.storage_controller.allowed_errors.extend( + [ + ".*Call to node.*management API failed.*", + ".*Call to node.*management API still failed.*", + ".*Reconcile error.*", + ".*request.*PUT.*migrate.*", + ] + ) + + env.storage_controller.tenant_policy_update(env.initial_tenant, {"placement": {"Attached": 1}}) + env.storage_controller.reconcile_until_idle() + + initial_desc = env.storage_controller.tenant_describe(env.initial_tenant)["shards"][0] + log.info(f"{initial_desc=}") + primary = env.get_pageserver(initial_desc["node_attached"]) + secondary = env.get_pageserver(initial_desc["node_secondary"][0]) + + # Pause the migration after incrementing the generation in the database + env.storage_controller.configure_failpoints( + ("reconciler-live-migrate-post-generation-inc", "pause") + ) + + tenant_shard_id = TenantShardId(env.initial_tenant, 0, 0) + + try: + with concurrent.futures.ThreadPoolExecutor(max_workers=2) as executor: + migrate_fut = executor.submit( + env.storage_controller.tenant_shard_migrate, + tenant_shard_id, + secondary.id, + config=StorageControllerMigrationConfig(prewarm=False, override_scheduler=True), + ) + + def has_hit_migration_failpoint(): + expr = "at failpoint reconciler-live-migrate-post-generation-inc" + log.info(expr) + assert 
env.storage_controller.log_contains(expr) + + wait_until(has_hit_migration_failpoint) + + secondary.stop() + + # Eventually migration completes + env.storage_controller.configure_failpoints( + ("reconciler-live-migrate-post-generation-inc", "off") + ) + try: + migrate_fut.result() + except StorageControllerApiException as err: + log.info(f"Migration failed: {err}") + except: + env.storage_controller.configure_failpoints( + ("reconciler-live-migrate-post-generation-inc", "off") + ) + raise + + def process_migration_result(): + dump = env.storage_controller.tenant_shard_dump() + observed = dump[0]["observed"]["locations"] + + log.info(f"{observed=} primary={primary.id} secondary={secondary.id}") + + assert observed[str(primary.id)]["conf"]["mode"] == "AttachedStale" + assert observed[str(secondary.id)]["conf"] is None + + wait_until(process_migration_result) + + # Start and wait for re-attach to be processed + secondary.start() + env.storage_controller.poll_node_status( + secondary.id, + desired_availability=PageserverAvailability.ACTIVE, + desired_scheduling_policy=None, + max_attempts=10, + backoff=1, + ) + + env.storage_controller.reconcile_until_idle() + + assert notifications[-1] == { + "tenant_id": str(env.initial_tenant), + "stripe_size": None, + "shards": [{"node_id": int(secondary.id), "shard_number": 0}], + "preferred_az": DEFAULT_AZ_ID, + } diff --git a/test_runner/regress/test_storage_scrubber.py b/test_runner/regress/test_storage_scrubber.py index 70af299de3..03cd133ccb 100644 --- a/test_runner/regress/test_storage_scrubber.py +++ b/test_runner/regress/test_storage_scrubber.py @@ -75,7 +75,7 @@ def test_scrubber_tenant_snapshot(neon_env_builder: NeonEnvBuilder, shard_count: tenant_shard_ids = [TenantShardId(tenant_id, 0, 0)] # Let shards finish rescheduling to other pageservers: this makes the rest of the test more stable - # is it won't overlap with migrations + # as it won't overlap with migrations 
env.storage_controller.reconcile_until_idle(max_interval=0.1, timeout_secs=120) output_path = neon_env_builder.test_output_dir / "snapshot" @@ -87,6 +87,13 @@ def test_scrubber_tenant_snapshot(neon_env_builder: NeonEnvBuilder, shard_count: workload.stop() + # Disable scheduling, so the storage controller doesn't migrate shards around + # while we are stopping pageservers + env.storage_controller.tenant_policy_update(tenant_id, {"scheduling": "Stop"}) + env.storage_controller.allowed_errors.extend( + [".*Scheduling is disabled by policy Stop.*", ".*Skipping reconcile for policy Stop.*"] + ) + # Stop pageservers for pageserver in env.pageservers: pageserver.stop() @@ -127,9 +134,16 @@ def test_scrubber_tenant_snapshot(neon_env_builder: NeonEnvBuilder, shard_count: for pageserver in env.pageservers: pageserver.start() + # Turn scheduling back on. + # We don't care about optimizations, so enable only essential scheduling + env.storage_controller.tenant_policy_update(tenant_id, {"scheduling": "Essential"}) + # Check we can read everything workload.validate() + # Reconcile to avoid a race between test shutdown and background reconciliation (#11278) + env.storage_controller.reconcile_until_idle() + def drop_local_state(env: NeonEnv, tenant_id: TenantId): env.storage_controller.tenant_policy_update(tenant_id, {"placement": "Detached"}) diff --git a/test_runner/regress/test_subscriber_branching.py b/test_runner/regress/test_subscriber_branching.py index 6175643389..83bebc19be 100644 --- a/test_runner/regress/test_subscriber_branching.py +++ b/test_runner/regress/test_subscriber_branching.py @@ -251,7 +251,7 @@ def test_multiple_subscription_branching(neon_simple_env: NeonEnv): NUMBER_OF_DBS = 5 # Create and start endpoint so that neon_local put all the generated - # stuff into the spec.json file. + # stuff into the config.json file. 
endpoint = env.endpoints.create_start( "main", config_lines=[ @@ -280,13 +280,15 @@ def test_multiple_subscription_branching(neon_simple_env: NeonEnv): } ) - # Update the spec.json file to create the databases + # Update the config.json file to create the databases # and reconfigure the endpoint to apply the changes. endpoint.respec_deep( **{ - "skip_pg_catalog_updates": False, - "cluster": { - "databases": TEST_DB_NAMES, + "spec": { + "skip_pg_catalog_updates": False, + "cluster": { + "databases": TEST_DB_NAMES, + }, }, } ) diff --git a/test_runner/regress/test_tenant_size.py b/test_runner/regress/test_tenant_size.py index a50a1beed6..190dd914ee 100644 --- a/test_runner/regress/test_tenant_size.py +++ b/test_runner/regress/test_tenant_size.py @@ -757,6 +757,47 @@ def test_lsn_lease_size(neon_env_builder: NeonEnvBuilder, test_output_dir: Path, env.stop(immediate=True) +def test_lsn_lease_storcon(neon_env_builder: NeonEnvBuilder): + conf = { + "pitr_interval": "0s", + "gc_period": "0s", + "compaction_period": "0s", + } + env = neon_env_builder.init_start(initial_tenant_conf=conf) + with env.endpoints.create_start( + "main", + ) as ep: + with ep.cursor() as cur: + cur.execute( + "CREATE TABLE t0 AS SELECT i::bigint n FROM generate_series(0, 1000000) s(i)" + ) + last_flush_lsn = wait_for_last_flush_lsn(env, ep, env.initial_tenant, env.initial_timeline) + env.storage_controller.pageserver_api().timeline_lsn_lease( + env.initial_tenant, env.initial_timeline, last_flush_lsn + ) + env.storage_controller.tenant_shard_split(env.initial_tenant, 8) + env.storage_controller.reconcile_until_idle(timeout_secs=120) + # TODO: do we preserve LSN leases across shard splits? 
+ env.storage_controller.pageserver_api().timeline_lsn_lease( + env.initial_tenant, env.initial_timeline, last_flush_lsn + ) + + +def test_mark_invisible_storcon(neon_env_builder: NeonEnvBuilder): + conf = { + "pitr_interval": "0s", + "gc_period": "0s", + "compaction_period": "0s", + } + env = neon_env_builder.init_start(initial_tenant_conf=conf) + env.storage_controller.pageserver_api().timeline_mark_invisible( + env.initial_tenant, env.initial_timeline + ) + env.storage_controller.pageserver_api().timeline_mark_invisible( + env.initial_tenant, env.initial_timeline, True + ) + + def insert_with_action( env: NeonEnv, tenant: TenantId, diff --git a/test_runner/regress/test_tenants.py b/test_runner/regress/test_tenants.py index c613a79374..c00f8f4ca5 100644 --- a/test_runner/regress/test_tenants.py +++ b/test_runner/regress/test_tenants.py @@ -390,6 +390,7 @@ def test_create_churn_during_restart(neon_env_builder: NeonEnvBuilder): # Tenant creation requests which arrive out of order will generate complaints about # generation nubmers out of order. env.pageserver.allowed_errors.append(".*Generation .+ is less than existing .+") + env.pageserver.allowed_errors.append(".*due to stale generation.+") # Timeline::flush_and_shutdown cannot tell if it is hitting a failure because of # an incomplete attach, or some other problem. 
In the field this should be rare, diff --git a/test_runner/regress/test_timeline_archive.py b/test_runner/regress/test_timeline_archive.py index 17abe1ea75..4360b42d68 100644 --- a/test_runner/regress/test_timeline_archive.py +++ b/test_runner/regress/test_timeline_archive.py @@ -318,7 +318,7 @@ def test_timeline_offload_persist(neon_env_builder: NeonEnvBuilder, delete_timel neon_env_builder.pageserver_remote_storage, prefix=f"tenants/{str(tenant_id)}/", ) - assert_prefix_empty( + assert_prefix_not_empty( neon_env_builder.pageserver_remote_storage, prefix=f"tenants/{str(tenant_id)}/tenant-manifest", ) @@ -387,7 +387,7 @@ def test_timeline_offload_persist(neon_env_builder: NeonEnvBuilder, delete_timel sum_again = endpoint.safe_psql("SELECT sum(key) from foo where key < 500") assert sum == sum_again - assert_prefix_empty( + assert_prefix_not_empty( neon_env_builder.pageserver_remote_storage, prefix=f"tenants/{str(env.initial_tenant)}/tenant-manifest", ) @@ -924,7 +924,7 @@ def test_timeline_offload_generations(neon_env_builder: NeonEnvBuilder): neon_env_builder.pageserver_remote_storage, prefix=f"tenants/{str(tenant_id)}/", ) - assert_prefix_empty( + assert_prefix_not_empty( neon_env_builder.pageserver_remote_storage, prefix=f"tenants/{str(tenant_id)}/tenant-manifest", ) diff --git a/test_runner/regress/test_timeline_detach_ancestor.py b/test_runner/regress/test_timeline_detach_ancestor.py index 2a916438e5..a71652af8a 100644 --- a/test_runner/regress/test_timeline_detach_ancestor.py +++ b/test_runner/regress/test_timeline_detach_ancestor.py @@ -343,7 +343,8 @@ def test_ancestor_detach_reparents_earlier(neon_env_builder: NeonEnvBuilder): wait_timeline_detail_404(client, env.initial_tenant, env.initial_timeline) -def test_ancestor_detach_behavior_v2(neon_env_builder: NeonEnvBuilder): +@pytest.mark.parametrize("snapshots_archived", ["archived", "normal"]) +def test_ancestor_detach_behavior_v2(neon_env_builder: NeonEnvBuilder, snapshots_archived: str): """ Test the v2 
behavior of ancestor detach. @@ -385,6 +386,11 @@ def test_ancestor_detach_behavior_v2(neon_env_builder: NeonEnvBuilder): ep.safe_psql("INSERT INTO foo SELECT i::bigint FROM generate_series(0, 8191) g(i);") + branchpoint_y = wait_for_last_flush_lsn(env, ep, env.initial_tenant, env.initial_timeline) + client.timeline_checkpoint(env.initial_tenant, env.initial_timeline) + + ep.safe_psql("INSERT INTO foo SELECT i::bigint FROM generate_series(0, 8191) g(i);") + branchpoint_x = wait_for_last_flush_lsn(env, ep, env.initial_tenant, env.initial_timeline) client.timeline_checkpoint(env.initial_tenant, env.initial_timeline) @@ -395,6 +401,10 @@ def test_ancestor_detach_behavior_v2(neon_env_builder: NeonEnvBuilder): "earlier", ancestor_branch_name="main", ancestor_start_lsn=branchpoint_pipe ) + snapshot_branchpoint_old = env.create_branch( + "snapshot_branchpoint_old", ancestor_branch_name="main", ancestor_start_lsn=branchpoint_y + ) + snapshot_branchpoint = env.create_branch( "snapshot_branchpoint", ancestor_branch_name="main", ancestor_start_lsn=branchpoint_x ) @@ -407,19 +417,32 @@ def test_ancestor_detach_behavior_v2(neon_env_builder: NeonEnvBuilder): after = env.create_branch("after", ancestor_branch_name="main", ancestor_start_lsn=None) + if snapshots_archived == "archived": + # archive the previous snapshot branchpoint + client.timeline_archival_config( + env.initial_tenant, snapshot_branchpoint_old, TimelineArchivalState.ARCHIVED + ) + all_reparented = client.detach_ancestor( env.initial_tenant, branch_to_detach, detach_behavior="v2" ) assert set(all_reparented) == set() + if snapshots_archived == "archived": + # restore the branchpoint so that we can query from the endpoint + client.timeline_archival_config( + env.initial_tenant, snapshot_branchpoint_old, TimelineArchivalState.UNARCHIVED + ) + env.pageserver.quiesce_tenants() # checking the ancestor after is much faster than waiting for the endpoint not start expected_result = [ - ("main", env.initial_timeline, None, 
16384, 1), - ("after", after, env.initial_timeline, 16384, 1), - ("snapshot_branchpoint", snapshot_branchpoint, env.initial_timeline, 8192, 1), - ("branch_to_detach", branch_to_detach, None, 8192, 1), + ("main", env.initial_timeline, None, 24576, 1), + ("after", after, env.initial_timeline, 24576, 1), + ("snapshot_branchpoint_old", snapshot_branchpoint_old, env.initial_timeline, 8192, 1), + ("snapshot_branchpoint", snapshot_branchpoint, env.initial_timeline, 16384, 1), + ("branch_to_detach", branch_to_detach, None, 16384, 1), ("earlier", earlier, env.initial_timeline, 0, 1), ] @@ -1745,6 +1768,87 @@ def test_pageserver_compaction_detach_ancestor_smoke(neon_env_builder: NeonEnvBu workload_child.validate(env.pageserver.id) +def test_timeline_detach_with_aux_files_with_detach_v1( + neon_env_builder: NeonEnvBuilder, +): + """ + Validate that "branches do not inherit their parent" is invariant over detach_ancestor. + + Branches hide parent branch aux files etc by stopping lookup of non-inherited keyspace at the parent-child boundary. + We had a bug where detach_ancestor running on a child branch would copy aux files key range from child to parent, + thereby making parent aux files reappear. 
+ """ + env = neon_env_builder.init_start( + initial_tenant_conf={ + "gc_period": "1s", + "lsn_lease_length": "0s", + } + ) + + env.pageserver.allowed_errors.extend(SHUTDOWN_ALLOWED_ERRORS) + + http = env.pageserver.http_client() + + endpoint = env.endpoints.create_start("main", tenant_id=env.initial_tenant) + lsn0 = wait_for_last_flush_lsn(env, endpoint, env.initial_tenant, env.initial_timeline) + endpoint.safe_psql( + "SELECT pg_create_logical_replication_slot('test_slot_parent_1', 'pgoutput')" + ) + lsn1 = wait_for_last_flush_lsn(env, endpoint, env.initial_tenant, env.initial_timeline) + endpoint.safe_psql( + "SELECT pg_create_logical_replication_slot('test_slot_parent_2', 'pgoutput')" + ) + lsn2 = wait_for_last_flush_lsn(env, endpoint, env.initial_tenant, env.initial_timeline) + assert set(http.list_aux_files(env.initial_tenant, env.initial_timeline, lsn0).keys()) == set( + [] + ) + assert set(http.list_aux_files(env.initial_tenant, env.initial_timeline, lsn1).keys()) == set( + ["pg_replslot/test_slot_parent_1/state"] + ) + assert set(http.list_aux_files(env.initial_tenant, env.initial_timeline, lsn2).keys()) == set( + ["pg_replslot/test_slot_parent_1/state", "pg_replslot/test_slot_parent_2/state"] + ) + + # Restore at LSN1 + branch_timeline_id = env.create_branch("restore", env.initial_tenant, "main", lsn1) + endpoint2 = env.endpoints.create_start("restore", tenant_id=env.initial_tenant) + assert set(http.list_aux_files(env.initial_tenant, branch_timeline_id, lsn1).keys()) == set([]) + + # Add a new slot file to the restore branch (This won't happen in reality because cplane immediately detaches the branch on restore, + # but we want to ensure that aux files on the detached branch are NOT inherited during ancestor detach. We could change the behavior + # in the future. 
+ # TL;DR we should NEVER automatically detach a branch as a background optimization for those tenants that already used the restore + # feature before branch detach was introduced because it will clean up the aux files and stop logical replication. + endpoint2.safe_psql( + "SELECT pg_create_logical_replication_slot('test_slot_restore', 'pgoutput')" + ) + lsn3 = wait_for_last_flush_lsn(env, endpoint, env.initial_tenant, branch_timeline_id) + assert set(http.list_aux_files(env.initial_tenant, branch_timeline_id, lsn1).keys()) == set([]) + assert set(http.list_aux_files(env.initial_tenant, branch_timeline_id, lsn3).keys()) == set( + ["pg_replslot/test_slot_restore/state"] + ) + + print("lsn0=", lsn0) + print("lsn1=", lsn1) + print("lsn2=", lsn2) + print("lsn3=", lsn3) + # Detach the restore branch so that main doesn't have any child branches. + all_reparented = http.detach_ancestor( + env.initial_tenant, branch_timeline_id, detach_behavior="v1" + ) + assert all_reparented == set([]) + + # We need to ensure all safekeeper data are ingested before checking aux files: the API does not wait for LSN. + wait_for_last_flush_lsn(env, endpoint, env.initial_tenant, branch_timeline_id) + assert set(http.list_aux_files(env.initial_tenant, env.initial_timeline, lsn2).keys()) == set( + ["pg_replslot/test_slot_parent_1/state", "pg_replslot/test_slot_parent_2/state"] + ), "main branch unaffected" + assert set(http.list_aux_files(env.initial_tenant, branch_timeline_id, lsn3).keys()) == set( + ["pg_replslot/test_slot_restore/state"] + ) + assert set(http.list_aux_files(env.initial_tenant, branch_timeline_id, lsn1).keys()) == set([]) + + # TODO: # - branch near existing L1 boundary, image layers? # - investigate: why are layers started at uneven lsn? not just after branching, but in general. 
diff --git a/test_runner/regress/test_wal_acceptor.py b/test_runner/regress/test_wal_acceptor.py index e3d39f9315..a9a6699e5c 100644 --- a/test_runner/regress/test_wal_acceptor.py +++ b/test_runner/regress/test_wal_acceptor.py @@ -45,7 +45,7 @@ from fixtures.remote_storage import ( s3_storage, ) from fixtures.safekeeper.http import ( - Configuration, + MembershipConfiguration, SafekeeperHttpClient, SafekeeperId, TimelineCreateRequest, @@ -589,7 +589,7 @@ def test_s3_wal_replay(neon_env_builder: NeonEnvBuilder, remote_storage_kind: Re for sk in env.safekeepers: sk.start() cli = sk.http_client() - mconf = Configuration(generation=0, members=[], new_members=None) + mconf = MembershipConfiguration(generation=0, members=[], new_members=None) # set start_lsn to the beginning of the first segment to allow reading # WAL from there (could you intidb LSN as well). r = TimelineCreateRequest( @@ -1948,7 +1948,7 @@ def test_membership_api(neon_env_builder: NeonEnvBuilder): sk_id_2 = SafekeeperId(11, "localhost", 5434) # just a mock # Request to switch before timeline creation should fail. - init_conf = Configuration(generation=1, members=[sk_id_1], new_members=None) + init_conf = MembershipConfiguration(generation=1, members=[sk_id_1], new_members=None) with pytest.raises(requests.exceptions.HTTPError): http_cli.membership_switch(tenant_id, timeline_id, init_conf) @@ -1960,7 +1960,7 @@ def test_membership_api(neon_env_builder: NeonEnvBuilder): http_cli.timeline_create(create_r) # Switch into some conf. 
- joint_conf = Configuration(generation=4, members=[sk_id_1], new_members=[sk_id_2]) + joint_conf = MembershipConfiguration(generation=4, members=[sk_id_1], new_members=[sk_id_2]) resp = http_cli.membership_switch(tenant_id, timeline_id, joint_conf) log.info(f"joint switch resp: {resp}") assert resp.previous_conf.generation == 1 @@ -1973,24 +1973,26 @@ def test_membership_api(neon_env_builder: NeonEnvBuilder): assert after_restart.generation == 4 # Switch into non joint conf of which sk is not a member, must fail. - non_joint_not_member = Configuration(generation=5, members=[sk_id_2], new_members=None) + non_joint_not_member = MembershipConfiguration( + generation=5, members=[sk_id_2], new_members=None + ) with pytest.raises(requests.exceptions.HTTPError): resp = http_cli.membership_switch(tenant_id, timeline_id, non_joint_not_member) # Switch into good non joint conf. - non_joint = Configuration(generation=6, members=[sk_id_1], new_members=None) + non_joint = MembershipConfiguration(generation=6, members=[sk_id_1], new_members=None) resp = http_cli.membership_switch(tenant_id, timeline_id, non_joint) log.info(f"non joint switch resp: {resp}") assert resp.previous_conf.generation == 4 assert resp.current_conf.generation == 6 # Switch request to lower conf should be rejected. - lower_conf = Configuration(generation=3, members=[sk_id_1], new_members=None) + lower_conf = MembershipConfiguration(generation=3, members=[sk_id_1], new_members=None) with pytest.raises(requests.exceptions.HTTPError): http_cli.membership_switch(tenant_id, timeline_id, lower_conf) # Now, exclude sk from the membership, timeline should be deleted. 
- excluded_conf = Configuration(generation=7, members=[sk_id_2], new_members=None) + excluded_conf = MembershipConfiguration(generation=7, members=[sk_id_2], new_members=None) http_cli.timeline_exclude(tenant_id, timeline_id, excluded_conf) with pytest.raises(requests.exceptions.HTTPError): http_cli.timeline_status(tenant_id, timeline_id) @@ -2010,11 +2012,6 @@ def test_explicit_timeline_creation(neon_env_builder: NeonEnvBuilder): tenant_id = env.initial_tenant timeline_id = env.initial_timeline - ps = env.pageservers[0] - ps_http_cli = ps.http_client() - - http_clis = [sk.http_client() for sk in env.safekeepers] - config_lines = [ "neon.safekeeper_proto_version = 3", ] @@ -2023,22 +2020,11 @@ def test_explicit_timeline_creation(neon_env_builder: NeonEnvBuilder): # expected to fail because timeline is not created on safekeepers with pytest.raises(Exception, match=r".*timed out.*"): ep.start(safekeeper_generation=1, safekeepers=[1, 2, 3], timeout="2s") - # figure out initial LSN. - ps_timeline_detail = ps_http_cli.timeline_detail(tenant_id, timeline_id) - init_lsn = ps_timeline_detail["last_record_lsn"] - log.info(f"initial LSN: {init_lsn}") - # sk timeline creation request expects minor version - pg_version = ps_timeline_detail["pg_version"] * 10000 # create inital mconf - sk_ids = [SafekeeperId(sk.id, "localhost", sk.port.pg_tenant_only) for sk in env.safekeepers] - mconf = Configuration(generation=1, members=sk_ids, new_members=None) - create_r = TimelineCreateRequest( - tenant_id, timeline_id, mconf, pg_version, Lsn(init_lsn), commit_lsn=None + mconf = MembershipConfiguration( + generation=1, members=Safekeeper.sks_to_safekeeper_ids(env.safekeepers), new_members=None ) - log.info(f"sending timeline create: {create_r.to_json()}") - - for sk_http_cli in http_clis: - sk_http_cli.timeline_create(create_r) + Safekeeper.create_timeline(tenant_id, timeline_id, env.pageservers[0], mconf, env.safekeepers) # Once timeline created endpoint should start. 
ep.start(safekeeper_generation=1, safekeepers=[1, 2, 3]) ep.safe_psql("CREATE TABLE IF NOT EXISTS t(key int, value text)") diff --git a/test_runner/regress/test_wal_acceptor_async.py b/test_runner/regress/test_wal_acceptor_async.py index b7c7478e78..c5dd34f64f 100644 --- a/test_runner/regress/test_wal_acceptor_async.py +++ b/test_runner/regress/test_wal_acceptor_async.py @@ -18,6 +18,7 @@ from fixtures.neon_fixtures import ( Safekeeper, ) from fixtures.remote_storage import RemoteStorageKind +from fixtures.safekeeper.http import MembershipConfiguration from fixtures.utils import skip_in_debug_build if TYPE_CHECKING: @@ -452,20 +453,24 @@ def test_concurrent_computes(neon_env_builder: NeonEnvBuilder): asyncio.run(run_concurrent_computes(env)) +async def assert_query_hangs(endpoint: Endpoint, query: str): + """ + Start on endpoint query which is expected to hang and check that it does. + """ + conn = await endpoint.connect_async() + bg_query = asyncio.create_task(conn.execute(query)) + await asyncio.sleep(2) + assert not bg_query.done() + return bg_query + + # Stop safekeeper and check that query cannot be executed while safekeeper is down. # Query will insert a single row into a table. 
-async def check_unavailability( - sk: Safekeeper, conn: asyncpg.Connection, key: int, start_delay_sec: int = 2 -): +async def check_unavailability(sk: Safekeeper, ep: Endpoint, key: int, start_delay_sec: int = 2): # shutdown one of two acceptors, that is, majority sk.stop() - bg_query = asyncio.create_task(conn.execute(f"INSERT INTO t values ({key}, 'payload')")) - - await asyncio.sleep(start_delay_sec) - # ensure that the query has not been executed yet - assert not bg_query.done() - + bg_query = await assert_query_hangs(ep, f"INSERT INTO t values ({key}, 'payload')") # start safekeeper and await the query sk.start() await bg_query @@ -480,10 +485,10 @@ async def run_unavailability(env: NeonEnv, endpoint: Endpoint): await conn.execute("INSERT INTO t values (1, 'payload')") # stop safekeeper and check that query cannot be executed while safekeeper is down - await check_unavailability(env.safekeepers[0], conn, 2) + await check_unavailability(env.safekeepers[0], endpoint, 2) # for the world's balance, do the same with second safekeeper - await check_unavailability(env.safekeepers[1], conn, 3) + await check_unavailability(env.safekeepers[1], endpoint, 3) # check that we can execute queries after restart await conn.execute("INSERT INTO t values (4, 'payload')") @@ -514,15 +519,7 @@ async def run_recovery_uncommitted(env: NeonEnv): # insert with only one safekeeper up to create tail of flushed but not committed WAL sk1.stop() sk2.stop() - conn = await ep.connect_async() - # query should hang, so execute in separate task - bg_query = asyncio.create_task( - conn.execute("insert into t select generate_series(1, 2000), 'payload'") - ) - sleep_sec = 2 - await asyncio.sleep(sleep_sec) - # it must still be not finished - assert not bg_query.done() + await assert_query_hangs(ep, "insert into t select generate_series(1, 2000), 'payload'") # note: destoy will kill compute_ctl, preventing it waiting for hanging sync-safekeepers. 
ep.stop_and_destroy() @@ -559,15 +556,7 @@ async def run_wal_truncation(env: NeonEnv, safekeeper_proto_version: int): # insert with only one sk3 up to create tail of flushed but not committed WAL on it sk1.stop() sk2.stop() - conn = await ep.connect_async() - # query should hang, so execute in separate task - bg_query = asyncio.create_task( - conn.execute("insert into t select generate_series(1, 180000), 'Papaya'") - ) - sleep_sec = 2 - await asyncio.sleep(sleep_sec) - # it must still be not finished - assert not bg_query.done() + await assert_query_hangs(ep, "insert into t select generate_series(1, 180000), 'Papaya'") # note: destoy will kill compute_ctl, preventing it waiting for hanging sync-safekeepers. ep.stop_and_destroy() @@ -607,6 +596,132 @@ def test_wal_truncation(neon_env_builder: NeonEnvBuilder, safekeeper_proto_versi asyncio.run(run_wal_truncation(env, safekeeper_proto_version)) +async def quorum_sanity_single( + env: NeonEnv, + compute_sks_ids: list[int], + members_sks_ids: list[int], + new_members_sks_ids: list[int] | None, + sks_to_stop_ids: list[int], + should_work_when_stopped: bool, +): + """ + *_ids params contain safekeeper node ids; it is assumed they are issued + from 1 and sequentially assigned to env.safekeepers. 
+ """ + members_sks = [env.safekeepers[i - 1] for i in members_sks_ids] + new_members_sks = ( + [env.safekeepers[i - 1] for i in new_members_sks_ids] if new_members_sks_ids else None + ) + sks_to_stop = [env.safekeepers[i - 1] for i in sks_to_stop_ids] + + mconf = MembershipConfiguration( + generation=1, + members=Safekeeper.sks_to_safekeeper_ids(members_sks), + new_members=Safekeeper.sks_to_safekeeper_ids(new_members_sks) if new_members_sks else None, + ) + members_sks = Safekeeper.mconf_sks(env, mconf) + + tenant_id = env.initial_tenant + compute_sks_ids_str = "-".join([str(sk_id) for sk_id in compute_sks_ids]) + members_sks_ids_str = "-".join([str(sk.id) for sk in mconf.members]) + new_members_sks_ids_str = "-".join( + [str(sk.id) for sk in mconf.new_members] if mconf.new_members is not None else [] + ) + sks_to_stop_ids_str = "-".join([str(sk.id) for sk in sks_to_stop]) + log.info( + f"running quorum_sanity_single with compute_sks={compute_sks_ids_str}, members_sks={members_sks_ids_str}, new_members_sks={new_members_sks_ids_str}, sks_to_stop={sks_to_stop_ids_str}, should_work_when_stopped={should_work_when_stopped}" + ) + branch_name = f"test_quorum_single_c{compute_sks_ids_str}_m{members_sks_ids_str}_{new_members_sks_ids_str}_s{sks_to_stop_ids_str}" + timeline_id = env.create_branch(branch_name) + + # create timeline on `members_sks` + Safekeeper.create_timeline(tenant_id, timeline_id, env.pageservers[0], mconf, members_sks) + + config_lines = [ + "neon.safekeeper_proto_version = 3", + ] + ep = env.endpoints.create(branch_name, config_lines=config_lines) + ep.start(safekeeper_generation=1, safekeepers=compute_sks_ids) + ep.safe_psql("create table t(key int, value text)") + + # stop specified sks and check whether writes work + for sk in sks_to_stop: + sk.stop() + if should_work_when_stopped: + log.info("checking that writes still work") + ep.safe_psql("insert into t select generate_series(1, 100), 'Papaya'") + # restarting ep should also be fine + ep.stop() + 
ep.start() + ep.safe_psql("insert into t select generate_series(1, 100), 'plum'") + bg_query = None + else: + log.info("checking that writes hang") + bg_query = await assert_query_hangs( + ep, "insert into t select generate_series(1, 100), 'Papaya'" + ) + # start again; now they should work + for sk in sks_to_stop: + sk.start() + if bg_query: + log.info("awaiting query") + await bg_query + + +# It's a bit tempting to iterate over all possible combinations, but let's stick +# with this for now. +async def run_quorum_sanity(env: NeonEnv): + # 3 members, all up, should work + await quorum_sanity_single(env, [1, 2, 3], [1, 2, 3], None, [], True) + # 3 members, 2/3 up, should work + await quorum_sanity_single(env, [1, 2, 3], [1, 2, 3], None, [3], True) + # 3 members, 1/3 up, should not work + await quorum_sanity_single(env, [1, 2, 3], [1, 2, 3], None, [2, 3], False) + + # 3 members, all up, should work; wp redundantly talks to 4th. + await quorum_sanity_single(env, [1, 2, 3, 4], [1, 2, 3], None, [], True) + # 3 members, all up, should work with wp talking to 2 of these 3 + plus one redundant + await quorum_sanity_single(env, [2, 3, 4], [1, 2, 3], None, [], True) + # 3 members, 2/3 up, could work but wp talks to different 3s, so it shouldn't + await quorum_sanity_single(env, [2, 3, 4], [1, 2, 3], None, [3], False) + + # joint conf of 1-2-3 and 4, all up, should work + await quorum_sanity_single(env, [1, 2, 3, 4], [1, 2, 3], [4], [], True) + # joint conf of 1-2-3 and 4, 4 down, shouldn't work + await quorum_sanity_single(env, [1, 2, 3, 4], [1, 2, 3], [4], [4], False) + + # joint conf of 1-2-3 and 2-3-4, all up, should work + await quorum_sanity_single(env, [1, 2, 3, 4], [1, 2, 3], [2, 3, 4], [], True) + # joint conf of 1-2-3 and 2-3-4, 1 and 4 down, should work + await quorum_sanity_single(env, [1, 2, 3, 4], [1, 2, 3], [2, 3, 4], [1, 4], True) + # joint conf of 1-2-3 and 2-3-4, 2 down, should work + await quorum_sanity_single(env, [1, 2, 3, 4], [1, 2, 3], [2, 3, 4], [2], 
True) + # joint conf of 1-2-3 and 2-3-4, 3 down, should work + await quorum_sanity_single(env, [1, 2, 3, 4], [1, 2, 3], [2, 3, 4], [3], True) + # joint conf of 1-2-3 and 2-3-4, 1 and 2 down, shouldn't work + await quorum_sanity_single(env, [1, 2, 3, 4], [1, 2, 3], [2, 3, 4], [1, 2], False) + # joint conf of 1-2-3 and 2-3-4, 2 and 4 down, shouldn't work + await quorum_sanity_single(env, [1, 2, 3, 4], [1, 2, 3], [2, 3, 4], [2, 4], False) + + # joint conf of 1-2-3 and 2-3-4 with wp talking to 2-3-4 only. + await quorum_sanity_single(env, [2, 3, 4], [1, 2, 3], [2, 3, 4], [], True) + # with 1 down should still be ok + await quorum_sanity_single(env, [2, 3, 4], [1, 2, 3], [2, 3, 4], [1], True) + # but with 2 down not ok + await quorum_sanity_single(env, [2, 3, 4], [1, 2, 3], [2, 3, 4], [2], False) + + +# Test various combinations of membership configurations / neon.safekeepers +# (list of safekeepers endpoint connects to) values / up & down safekeepers and +# check that endpoint can start and write data when we have quorum and can't when +# we don't. 
+def test_quorum_sanity(neon_env_builder: NeonEnvBuilder): + neon_env_builder.num_safekeepers = 4 + env = neon_env_builder.init_start() + + asyncio.run(run_quorum_sanity(env)) + + async def run_segment_init_failure(env: NeonEnv): env.create_branch("test_segment_init_failure") ep = env.endpoints.create_start("test_segment_init_failure") diff --git a/vendor/postgres-v14 b/vendor/postgres-v14 index 35bc1b0cba..a0391901a2 160000 --- a/vendor/postgres-v14 +++ b/vendor/postgres-v14 @@ -1 +1 @@ -Subproject commit 35bc1b0cba55680e3b37abce4e67a46bb15f3315 +Subproject commit a0391901a2af13aa029b905272a5b2024133c926 diff --git a/vendor/postgres-v15 b/vendor/postgres-v15 index 6cea02e23c..aeb292eeac 160000 --- a/vendor/postgres-v15 +++ b/vendor/postgres-v15 @@ -1 +1 @@ -Subproject commit 6cea02e23caa950d5f06932491a91b6af8f54360 +Subproject commit aeb292eeace9072e07071254b6ffc7a74007d4d2 diff --git a/vendor/postgres-v16 b/vendor/postgres-v16 index 473f68210d..d56e79cd5d 160000 --- a/vendor/postgres-v16 +++ b/vendor/postgres-v16 @@ -1 +1 @@ -Subproject commit 473f68210d52ff8508f71c15b0c77c01296f4ace +Subproject commit d56e79cd5d6136c159b1d8d98acb7981d4b69364 diff --git a/vendor/postgres-v17 b/vendor/postgres-v17 index 22533c63fc..66114c23bc 160000 --- a/vendor/postgres-v17 +++ b/vendor/postgres-v17 @@ -1 +1 @@ -Subproject commit 22533c63fc42cdc1dbe138650ba1eca10a70c5d7 +Subproject commit 66114c23bc61205b0e3fb1e77ee76a4abc1eb4b8 diff --git a/vendor/revisions.json b/vendor/revisions.json index 7b2d5fda8e..d7eddf42b7 100644 --- a/vendor/revisions.json +++ b/vendor/revisions.json @@ -1,18 +1,18 @@ { "v17": [ "17.4", - "22533c63fc42cdc1dbe138650ba1eca10a70c5d7" + "66114c23bc61205b0e3fb1e77ee76a4abc1eb4b8" ], "v16": [ "16.8", - "473f68210d52ff8508f71c15b0c77c01296f4ace" + "d56e79cd5d6136c159b1d8d98acb7981d4b69364" ], "v15": [ "15.12", - "6cea02e23caa950d5f06932491a91b6af8f54360" + "aeb292eeace9072e07071254b6ffc7a74007d4d2" ], "v14": [ "14.17", - 
"35bc1b0cba55680e3b37abce4e67a46bb15f3315" + "a0391901a2af13aa029b905272a5b2024133c926" ] }